[med-svn] [Git][med-team/python-treetime][master] 6 commits: New upstream version 0.5.3
Andreas Tille
gitlab at salsa.debian.org
Fri Jan 25 17:24:57 GMT 2019
Andreas Tille pushed to branch master at Debian Med / python-treetime
Commits:
0945a217 by Andreas Tille at 2019-01-25T17:21:34Z
New upstream version 0.5.3
- - - - -
a5597e7e by Andreas Tille at 2019-01-25T17:21:34Z
Update upstream source from tag 'upstream/0.5.3'
Update to upstream version '0.5.3'
with Debian dir 5f5a3a73c0118bdd00c4a9de923cf9ac87b88e30
- - - - -
86dc3c62 by Andreas Tille at 2019-01-25T17:21:35Z
New upstream version
- - - - -
82e478ec by Andreas Tille at 2019-01-25T17:21:35Z
debhelper 12
- - - - -
752084c0 by Andreas Tille at 2019-01-25T17:21:37Z
Standards-Version: 4.3.0
- - - - -
a348b07e by Andreas Tille at 2019-01-25T17:22:32Z
Upload to unstable
- - - - -
16 changed files:
- debian/changelog
- debian/compat
- debian/control
- treetime/__init__.py
- treetime/branch_len_interpolator.py
- treetime/clock_tree.py
- treetime/config.py
- treetime/distribution.py
- treetime/gtr.py
- + treetime/gtr_site_specific.py
- treetime/node_interpolator.py
- treetime/seq_utils.py
- + treetime/seqgen.py
- treetime/treeanc.py
- treetime/treetime.py
- treetime/wrappers.py
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,11 @@
+python-treetime (0.5.3-1) unstable; urgency=medium
+
+ * New upstream version
+ * debhelper 12
+ * Standards-Version: 4.3.0
+
+ -- Andreas Tille <tille at debian.org> Fri, 25 Jan 2019 18:21:37 +0100
+
python-treetime (0.5.2-2) unstable; urgency=medium
* Re-upload due to accidental removal of the package (see #916906)
=====================================
debian/compat
=====================================
@@ -1 +1 @@
-11
+12
=====================================
debian/control
=====================================
@@ -4,7 +4,7 @@ Uploaders: Andreas Tille <tille at debian.org>
Section: science
Testsuite: autopkgtest-pkg-python
Priority: optional
-Build-Depends: debhelper (>= 11~),
+Build-Depends: debhelper (>= 12~),
dh-python,
python3-all,
python3-biopython,
@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11~),
python3-pandas,
python3-scipy,
python3-setuptools
-Standards-Version: 4.2.1
+Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/python-treetime
Vcs-Git: https://salsa.debian.org/med-team/python-treetime.git
Homepage: https://github.com/neherlab/treetime
=====================================
treetime/__init__.py
=====================================
@@ -6,4 +6,4 @@ from .treetime import ttconf as treetime_conf
from .gtr import GTR
from .merger_models import Coalescent
from .treeregression import TreeRegression
-version="0.5.2"
+version="0.5.3"
=====================================
treetime/branch_len_interpolator.py
=====================================
@@ -118,7 +118,7 @@ class BranchLenInterpolator (Distribution):
@gamma.setter
def gamma(self, value):
- self._gamma = value
+ self._gamma = max(ttconf.TINY_NUMBER, value)
@property
def merger_cost(self):
=====================================
treetime/clock_tree.py
=====================================
@@ -720,12 +720,13 @@ class ClockTree(TreeAnc):
return ttconf.ERROR
rate_std = np.sqrt(self.clock_model['cov'][0,0])
- current_rate = self.clock_model['slope']
+ current_rate = np.abs(self.clock_model['slope'])
upper_rate = self.clock_model['slope'] + rate_std
lower_rate = max(0.1*current_rate, self.clock_model['slope'] - rate_std)
for n in self.tree.find_clades():
if n.up:
- n.branch_length_interpolator.gamma*=upper_rate/current_rate
+ n._orig_gamma = n.branch_length_interpolator.gamma
+ n.branch_length_interpolator.gamma = n._orig_gamma*upper_rate/current_rate
self.logger("###ClockTree.calc_rate_susceptibility: run with upper bound of rate estimate", 1)
self.make_time_tree(**params)
@@ -733,7 +734,7 @@ class ClockTree(TreeAnc):
for n in self.tree.find_clades():
n.numdate_rate_variation = [(upper_rate, n.numdate)]
if n.up:
- n.branch_length_interpolator.gamma*=lower_rate/upper_rate
+ n.branch_length_interpolator.gamma = n._orig_gamma*lower_rate/current_rate
self.logger("###ClockTree.calc_rate_susceptibility: run with lower bound of rate estimate", 1)
self.make_time_tree(**params)
@@ -741,7 +742,7 @@ class ClockTree(TreeAnc):
for n in self.tree.find_clades():
n.numdate_rate_variation.append((lower_rate, n.numdate))
if n.up:
- n.branch_length_interpolator.gamma*=current_rate/lower_rate
+ n.branch_length_interpolator.gamma = n._orig_gamma
self.logger("###ClockTree.calc_rate_susceptibility: run with central rate estimate", 1)
self.make_time_tree(**params)
=====================================
treetime/config.py
=====================================
@@ -27,7 +27,7 @@ MIN_INTEGRATION_PEAK = 0.001
# clocktree parameters
BRANCH_LEN_PENALTY = 0
-MAX_BRANCH_LENGTH = 1.5 # only relevant for time trees - upper boundary of interpolator objects
+MAX_BRANCH_LENGTH = 4.0 # only relevant for branch length optimization and time trees - upper boundary of interpolator objects
NINTEGRAL = 300
REL_TOL_PRUNE = 0.01
REL_TOL_REFINE = 0.05
=====================================
treetime/distribution.py
=====================================
@@ -17,7 +17,7 @@ class Distribution(object):
"""
@staticmethod
- def calc_fwhm(distribution, is_log=True):
+ def calc_fwhm(distribution, is_neg_log=True):
"""
Assess the width of the probability distribution. This returns
full-width-half-max
@@ -25,29 +25,34 @@ class Distribution(object):
if isinstance(distribution, interp1d):
- if is_log:
+ if is_neg_log:
ymin = distribution.y.min()
- prob = np.exp(-(distribution.y-ymin))
+ log_prob = distribution.y-ymin
else:
- prob = distribution.y
+ log_prob = -np.log(distribution.y)
+ log_prob -= log_prob.min()
xvals = distribution.x
elif isinstance(distribution, Distribution):
- # Distribution always stores log-prob
+ # Distribution always stores neg log-prob with the peak value subtracted
xvals = distribution._func.x
- prob = distribution.prob_relative(xvals)
+ log_prob = distribution._func.y
else:
raise TypeError("Error in computing the FWHM for the distribution. "
" The input should be either Distribution or interpolation object");
- x_idxs = binary_dilation(prob >= 0.4*(prob.max() - prob.min())+prob.min(), iterations=1)
- xs = xvals[x_idxs]
- if xs.shape[0] < 2:
+ L = xvals.shape[0]
+ # 0.69... is log(2), there is always one value for which this is true since
+ # the minimum is subtracted
+ tmp = np.where(log_prob < 0.693147)[0]
+ x_l, x_u = tmp[0], tmp[-1]
+ if L < 2:
print ("Not enough points to compute FWHM: returning zero")
return min(TINY_NUMBER, distribution.xmax - distribution.xmin)
else:
- return xs.max() - xs.min()
+ # need to guard against out-of-bounds errors
+ return max(TINY_NUMBER, xvals[min(x_u+1,L-1)] - xvals[max(0,x_l-1)])
@classmethod
@@ -60,10 +65,12 @@ class Distribution(object):
distribution.weight = weight
return distribution
+
@classmethod
def shifted_x(cls, dist, delta_x):
return Distribution(dist.x+delta_x, dist.y, kind=dist.kind)
+
@staticmethod
def multiply(dists):
'''
@@ -102,12 +109,13 @@ class Distribution(object):
res = Distribution.delta_function(x_vals[0])
else:
res = Distribution(x_vals[ind], y_vals[ind], is_log=True,
- min_width=min_width, kind='linear')
+ min_width=min_width, kind='linear', assume_sorted=True)
return res
- def __init__(self, x, y, is_log=True, min_width = MIN_INTEGRATION_PEAK, kind='linear'):
+ def __init__(self, x, y, is_log=True, min_width = MIN_INTEGRATION_PEAK,
+ kind='linear', assume_sorted=False):
"""
Create Distribution instance
@@ -117,12 +125,15 @@ class Distribution(object):
if isinstance(x, Iterable) and isinstance (y, Iterable):
self._delta = False # NOTE in classmethod this value is set explicitly to True.
- xvals, yvals = np.array(sorted(zip(x,y))).T
# first, prepare x, y values
+ if assume_sorted:
+ xvals, yvals = x,y
+ else:
+ xvals, yvals = np.array(sorted(zip(x,y))).T
if not is_log:
yvals = -np.log(yvals)
# just for safety
- yvals [np.isnan(yvals)] = BIG_NUMBER
+ yvals[np.isnan(yvals)] = BIG_NUMBER
# set the properties
self._kind=kind
# remember range
@@ -135,7 +146,8 @@ class Distribution(object):
yvals -= self._peak_val
self._ymax = yvals.max()
# store the interpolation object
- self._func= interp1d(xvals, yvals, kind=kind, fill_value=BIG_NUMBER, bounds_error=False)
+ self._func= interp1d(xvals, yvals, kind=kind, fill_value=BIG_NUMBER,
+ bounds_error=False, assume_sorted=True)
self._fwhm = Distribution.calc_fwhm(self)
elif np.isscalar(x):
=====================================
treetime/gtr.py
=====================================
@@ -54,6 +54,7 @@ class GTR(object):
else:
self.profile_map = prof_map
+
if logger is None:
def logger_default(*args,**kwargs):
"""standard logging function if none provided"""
@@ -62,8 +63,29 @@ class GTR(object):
self.logger = logger_default
else:
self.logger = logger
- n_states = len(self.alphabet)
+ self.ambiguous = None
+ self.gap_index = None
+ self.n_states = len(self.alphabet)
+ self.assign_gap_and_ambiguous()
+
+ # NEEDED TO BREAK RATE MATRIX DEGENERACY AND FORCE NP TO RETURN REAL ORTHONORMAL EIGENVECTORS
+ # ugly hack, but works and shouldn't affect results
+ tmp_rng_state = np.random.get_state()
+ np.random.seed(12345)
+ self.break_degen = np.random.random(size=(self.n_states, self.n_states))*1e-6
+ np.random.set_state(tmp_rng_state)
+
+ # init all matrices with dummy values
+ self.logger("GTR: init with dummy values!", 3)
+ self.v = None # right eigenvectors
+ self.v_inv = None # left eigenvectors
+ self.eigenvals =None # eigenvalues
+ self.assign_rates()
+
+
+ def assign_gap_and_ambiguous(self):
+ n_states = len(self.alphabet)
self.logger("GTR: with alphabet: "+str(self.alphabet),1)
# determine if a character exists that corresponds to no info, i.e. all one profile
if any([x.sum()==n_states for x in self.profile_map.values()]):
@@ -78,28 +100,13 @@ class GTR(object):
self.gap_index = list(self.alphabet).index('-')
except:
self.logger("GTR: no gap symbol!", 4, warn=True)
- self.gap_index=-1
-
-
- # NEEDED TO BREAK RATE MATRIX DEGENERACY AND FORCE NP TO RETURN REAL ORTHONORMAL EIGENVECTORS
- # ugly hack, but works and shouldn't affect results
- tmp_rng_state = np.random.get_state()
- np.random.seed(12345)
- self.break_degen = np.random.random(size=(n_states, n_states))*1e-6
- np.random.set_state(tmp_rng_state)
-
- # init all matrices with dummy values
- self.logger("GTR: init with dummy values!", 3)
- self.v = None # right eigenvectors
- self.v_inv = None # left eigenvectors
- self.eigenvals =None # eigenvalues
- self.assign_rates()
+ self.gap_index=None
@property
def Q(self):
- """function that return the product of the transtiion matrix
- and the equilibrium frequencies to option the rate matrix
+ """function that return the product of the transition matrix
+ and the equilibrium frequencies to obtain the rate matrix
of the GTR model
"""
return (self.W*self.Pi).T
@@ -113,24 +120,32 @@ class GTR(object):
'''
String representation of the GTR model for pretty printing
'''
- eq_freq_str = "Substitution rate (mu): "+str(np.round(self.mu,6))+'\n'
+ multi_site = len(self.Pi.shape)==2
- eq_freq_str += "\nEquilibrium frequencies (pi_i):\n"
- for a,p in zip(self.alphabet, self.Pi):
- eq_freq_str+=' '+str(a)+': '+str(np.round(p,4))+'\n'
+ if multi_site:
+ eq_freq_str = "Average substitution rate (mu): "+str(np.round(self.average_rate,6))+'\n'
+ else:
+ eq_freq_str = "Substitution rate (mu): "+str(np.round(self.mu,6))+'\n'
+
+ if not multi_site:
+ eq_freq_str += "\nEquilibrium frequencies (pi_i):\n"
+ for a,p in zip(self.alphabet, self.Pi):
+ eq_freq_str+=' '+str(a)+': '+str(np.round(p,4))+'\n'
W_str = "\nSymmetrized rates from j->i (W_ij):\n"
W_str+='\t'+'\t'.join(map(str, self.alphabet))+'\n'
for a,Wi in zip(self.alphabet, self.W):
W_str+= ' '+str(a)+'\t'+'\t'.join([str(np.round(max(0,p),4)) for p in Wi])+'\n'
- Q_str = "\nActual rates from j->i (Q_ij):\n"
- Q_str+='\t'+'\t'.join(map(str, self.alphabet))+'\n'
- for a,Qi in zip(self.alphabet, self.Q):
- Q_str+= ' '+str(a)+'\t'+'\t'.join([str(np.round(max(0,p),4)) for p in Qi])+'\n'
+ if not multi_site:
+ Q_str = "\nActual rates from j->i (Q_ij):\n"
+ Q_str+='\t'+'\t'.join(map(str, self.alphabet))+'\n'
+ for a,Qi in zip(self.alphabet, self.Q):
+ Q_str+= ' '+str(a)+'\t'+'\t'.join([str(np.round(max(0,p),4)) for p in Qi])+'\n'
return eq_freq_str + W_str + Q_str
+
def assign_rates(self, mu=1.0, pi=None, W=None):
"""
Overwrite the GTR model given the provided data
@@ -402,7 +417,7 @@ class GTR(object):
nij : nxn matrix
The number of times a change in character state is observed
- between state i and j
+ between state j and i
Ti :n vector
The time spent in each character state
@@ -451,11 +466,7 @@ class GTR(object):
+ ttconf.TINY_NUMBER + 2*pc_mat)
np.fill_diagonal(W_ij, 0)
- Wdiag = (((W_ij.T*pi).T).sum(axis=0)+ttconf.TINY_NUMBER)/(pi+ttconf.TINY_NUMBER)
- np.fill_diagonal(W_ij, Wdiag)
- Q1 = np.diag(pi).dot(W_ij)
- scale_factor = np.sum(np.diagonal(Q1*np.diag(pi)))
- np.fill_diagonal(W_ij, 0)
+ scale_factor = np.einsum('i,ij,j',pi,W_ij,pi)
W_ij = W_ij/scale_factor
if fixed_pi is None:
@@ -468,7 +479,7 @@ class GTR(object):
gtr.logger('the iterative scheme has not converged',3,warn=True)
elif np.abs(1-np.max(pi.sum(axis=0))) > dp:
gtr.logger('the iterative scheme has converged, but proper normalization was not reached',3,warn=True)
- if gtr.gap_index>=0:
+ if gtr.gap_index is not None:
if pi[gtr.gap_index]<gap_limit:
gtr.logger('The model allows for gaps which are estimated to occur at a low fraction of %1.3e'%pi[gtr.gap_index]+
'\n\t\tthis can potentially result in artificats.'+
@@ -517,7 +528,23 @@ class GTR(object):
self.v = np.real(eigvecs)
self.v_inv = np.linalg.inv(self.v)
self.eigenvals = np.real(eigvals)
- return
+
+
+ def _eig_sym(self):
+ """
+ Perform eigendecompositon of the rate matrix and stores the left- and right-
+ matrices to convert the sequence profiles to the GTR matrix eigenspace
+ and hence to speed-up the computations.
+ """
+ # eigendecomposition of the rate matrix
+ tmpp = np.sqrt(self.Pi)
+ symQ = self.W*np.outer(tmpp, tmpp)
+ eigvals, eigvecs = np.linalg.eigh(symQ)
+ tmp_v = eigvecs.T*tmpp
+ one_norm = np.sum(np.abs(tmp_v), axis=1)
+ self.v = tmp_v.T/one_norm
+ self.v_inv = (eigvecs*one_norm).T/tmpp
+ self.eigenvals = eigvals
def compress_sequence_pair(self, seq_p, seq_ch, pattern_multiplicity=None,
@@ -572,9 +599,9 @@ class GTR(object):
bs.append(seq==nuc)
for n1,nuc1 in enumerate(self.alphabet):
- if (n1!=self.gap_index or (not ignore_gaps)):
+ if (self.gap_index is None) or (not ignore_gaps) or (n1!=self.gap_index):
for n2,nuc2 in enumerate(self.alphabet):
- if (n2!=self.gap_index or (not ignore_gaps)):
+ if (self.gap_index is None) or (not ignore_gaps) or (n2!=self.gap_index):
count = ((bool_seqs_p[n1]&bool_seqs_ch[n2])*pattern_multiplicity).sum()
if count: pair_count.append(((n1,n2), count))
else: # enumerate state pairs of the sequence for large alphabets
@@ -637,7 +664,8 @@ class GTR(object):
return logP if return_log else np.exp(logP)
- def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None, return_log=False, ignore_gaps=True):
+ def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None,
+ return_log=False, ignore_gaps=True):
"""
Compute the probability to observe seq_ch (child sequence) after time t starting from seq_p
(parent sequence).
@@ -703,7 +731,7 @@ class GTR(object):
return self.optimal_t_compressed(seq_pair, multiplicity)
- def optimal_t_compressed(self, seq_pair, multiplicity, profiles=False):
+ def optimal_t_compressed(self, seq_pair, multiplicity, profiles=False, tol=1e-10):
"""
Find the optimal distance between the two sequences, for compressed sequences
@@ -756,7 +784,8 @@ class GTR(object):
to be separated by the time t.
"""
if profiles:
- return -1.0*self.prob_t_profiles(seq_pair, multiplicity,t**2, return_log=True)
+ res = -1.0*self.prob_t_profiles(seq_pair, multiplicity,t**2, return_log=True)
+ return res
else:
return -1.0*self.prob_t_compressed(seq_pair, multiplicity,t**2, return_log=True)
@@ -764,7 +793,7 @@ class GTR(object):
from scipy.optimize import minimize_scalar
opt = minimize_scalar(_neg_prob,
bounds=[-np.sqrt(ttconf.MAX_BRANCH_LENGTH),np.sqrt(ttconf.MAX_BRANCH_LENGTH)],
- args=(seq_pair, multiplicity), tol=1e-10)
+ args=(seq_pair, multiplicity), tol=tol)
new_len = opt["x"]**2
if 'success' not in opt:
opt['success'] = True
@@ -789,7 +818,8 @@ class GTR(object):
return new_len
- def prob_t_profiles(self, profile_pair, multiplicity, t, return_log=False, ignore_gaps=True):
+ def prob_t_profiles(self, profile_pair, multiplicity, t,
+ return_log=False, ignore_gaps=True):
'''
Calculate the probability of observing a node pair at a distance t
@@ -816,15 +846,17 @@ class GTR(object):
if t<0:
logP = -ttconf.BIG_NUMBER
else:
- Qt = self.expQt(t).T
- res = profile_pair[0].dot(Qt)
- overlap = np.sum(res*profile_pair[1], axis=1)
- if ignore_gaps: # calculate the probability that neither outgroup/node has a gap
+ Qt = self.expQt(t)
+ if len(Qt.shape)==3:
+ res = np.einsum('ai,ija,aj->a', profile_pair[1], Qt, profile_pair[0])
+ else:
+ res = np.einsum('ai,ij,aj->a', profile_pair[1], Qt, profile_pair[0])
+ if ignore_gaps and (self.gap_index is not None): # calculate the probability that neither outgroup/node has a gap
non_gap_frac = (1-profile_pair[0][:,self.gap_index])*(1-profile_pair[1][:,self.gap_index])
# weigh log LH by the non-gap probability
- logP = np.sum(multiplicity*np.log(overlap)*non_gap_frac)
+ logP = np.sum(multiplicity*np.log(res)*non_gap_frac)
else:
- logP = np.sum(multiplicity*np.log(overlap))
+ logP = np.sum(multiplicity*np.log(res))
return logP if return_log else np.exp(logP)
@@ -862,6 +894,37 @@ class GTR(object):
return np.log(res) if return_log else res
+ def evolve(self, profile, t, return_log=False):
+ """
+ Compute the probability of the sequence state of the child
+ at time t later, given the parent profile.
+
+ Parameters
+ ----------
+
+ profile : numpy.array
+ Sequence profile. Shape = (L, a),
+ where L - sequence length, a - alphabet size.
+
+ t : double
+ Time to propagate
+
+ return_log: bool
+ If True, return log-probability
+
+ Returns
+ -------
+
+ res : np.array
+ Profile of the sequence after time t in the future.
+ Shape = (L, a), where L - sequence length, a - alphabet size.
+
+ """
+ Qt = self.expQt(t).T
+ res = profile.dot(Qt)
+ return np.log(res) if return_log else res
+
+
def _exp_lt(self, t):
"""
Parameters
@@ -952,6 +1015,8 @@ class GTR(object):
return np.sum([np.sum((seq==state)*pattern_multiplicity*np.log(self.Pi[si]))
for si,state in enumerate(self.alphabet)])
+ def average_rate(self):
+ return self.mu*np.einsum('i,ij,j',self.Pi, self.W, self.Pi)
def save_to_npz(self, outfile):
full_gtr = self.mu * np.dot(self.Pi, self.W)
=====================================
treetime/gtr_site_specific.py
=====================================
@@ -0,0 +1,421 @@
+from __future__ import division, print_function, absolute_import
+from collections import defaultdict
+import numpy as np
+from treetime import config as ttconf
+from .seq_utils import alphabets, profile_maps, alphabet_synonyms
+from .gtr import GTR
+
+class GTR_site_specific(GTR):
+ """
+ Defines General-Time-Reversible model of character evolution.
+ """
+ def __init__(self, *args, **kwargs):
+ if 'seq_len' in kwargs:
+ self.seq_len = kwargs['seq_len']
+ kwargs.pop('seq_len')
+ else:
+ self.seq_len = 1
+ super(GTR_site_specific, self).__init__(**kwargs)
+
+
+ @property
+ def Q(self):
+ """function that return the product of the transition matrix
+ and the equilibrium frequencies to obtain the rate matrix
+ of the GTR model
+ """
+ tmp = np.einsum('ia,ij->ija', self.Pi, self.W)
+ diag_vals = np.sum(tmp, axis=0)
+ for x in range(tmp.shape[-1]):
+ np.fill_diagonal(tmp[:,:,x], -diag_vals[:,x])
+ return tmp
+
+ def assign_rates(self, mu=1.0, pi=None, W=None):
+ """
+ Overwrite the GTR model given the provided data
+
+ Parameters
+ ----------
+
+ mu : float
+ Substitution rate
+
+ W : nxn matrix
+ Substitution matrix
+
+ pi : n vector
+ Equilibrium frequencies
+
+ """
+ n = len(self.alphabet)
+ self.mu = np.copy(mu)
+
+ if pi is not None and pi.shape[0]==n:
+ self.seq_len = pi.shape[-1]
+ Pi = np.copy(pi)
+ else:
+ if pi is not None and len(pi)!=n:
+ self.logger("length of equilibrium frequency vector does not match alphabet length", 4, warn=True)
+ self.logger("Ignoring input equilibrium frequencies", 4, warn=True)
+ Pi = np.ones(shape=(n,self.seq_len))
+
+ self.Pi = Pi/np.sum(Pi, axis=0)
+
+ if W is None or W.shape!=(n,n):
+ if (W is not None) and W.shape!=(n,n):
+ self.logger("Substitution matrix size does not match alphabet size", 4, warn=True)
+ self.logger("Ignoring input substitution matrix", 4, warn=True)
+ # flow matrix
+ W = np.ones((n,n))
+ else:
+ W=0.5*(np.copy(W)+np.copy(W).T)
+
+ np.fill_diagonal(W,0)
+ avg_pi = self.Pi.mean(axis=-1)
+ average_rate = W.dot(avg_pi).dot(avg_pi)
+ self.W = W/average_rate
+ self.mu *=average_rate
+
+ self._eig()
+
+
+ @classmethod
+ def random(cls, L=1, avg_mu=1.0, alphabet='nuc', pi_dirichlet_alpha=1,
+ W_dirichlet_alpha=3.0, mu_gamma_alpha=3.0):
+ """
+ Creates a random GTR model
+
+ Parameters
+ ----------
+
+ mu : float
+ Substitution rate
+
+ alphabet : str
+ Alphabet name (should be standard: 'nuc', 'nuc_gap', 'aa', 'aa_gap')
+
+
+ """
+ from scipy.stats import gamma
+ alphabet=alphabets[alphabet]
+ gtr = cls(alphabet=alphabet, seq_len=L)
+ n = gtr.alphabet.shape[0]
+
+ if pi_dirichlet_alpha:
+ pi = 1.0*gamma.rvs(pi_dirichlet_alpha, size=(n,L))
+ else:
+ pi = np.ones((n,L))
+
+ pi /= pi.sum(axis=0)
+ if W_dirichlet_alpha:
+ tmp = 1.0*gamma.rvs(W_dirichlet_alpha, size=(n,n))
+ else:
+ tmp = np.ones((n,n))
+ tmp = np.tril(tmp,k=-1)
+ W = tmp + tmp.T
+
+ if mu_gamma_alpha:
+ mu = gamma.rvs(mu_gamma_alpha, size=(L,))
+ else:
+ mu = np.ones(L)
+
+ gtr.assign_rates(mu=mu, pi=pi, W=W)
+ gtr.mu *= avg_mu/np.mean(gtr.mu)
+
+ return gtr
+
+ @classmethod
+ def custom(cls, mu=1.0, pi=None, W=None, **kwargs):
+ """
+ Create a GTR model by specifying the matrix explicitly
+
+ Parameters
+ ----------
+
+ mu : float
+ Substitution rate
+
+ W : nxn matrix
+ Substitution matrix
+
+ pi : n vector
+ Equilibrium frequencies
+
+ **kwargs:
+ Key word arguments to be passed
+
+ Keyword Args
+ ------------
+
+ alphabet : str
+ Specify alphabet when applicable. If the alphabet specification is
+ required, but no alphabet is specified, the nucleotide alphabet will be used as
+ default.
+
+ """
+ gtr = cls(**kwargs)
+ gtr.assign_rates(mu=mu, pi=pi, W=W)
+ return gtr
+
+ @classmethod
+ def infer(cls, sub_ija, T_ia, root_state, pc=0.01,
+ gap_limit=0.01, Nit=30, dp=1e-5, **kwargs):
+ """
+ Infer a GTR model by specifying the number of transitions and time spent in each
+ character. The basic equation that is being solved is
+
+ :math:`n_{ij} = pi_i W_{ij} T_j`
+
+ where :math:`n_{ij}` are the transitions, :math:`pi_i` are the equilibrium
+ state frequencies, :math:`W_{ij}` is the "substitution attempt matrix",
+ while :math:`T_i` is the time on the tree spent in character state
+ :math:`i`. To regularize the process, we add pseudocounts and also need
+ to account for the fact that the root of the tree is in a particular
+ state. the modified equation is
+
+ :math:`n_{ij} + pc = pi_i W_{ij} (T_j+pc+root\_state)`
+
+ Parameters
+ ----------
+
+ nija : nxn matrix
+ The number of times a change in character state is observed
+ between state j and i at position a
+
+ Tia :n vector
+ The time spent in each character state at position a
+
+ root_state : n vector
+ The number of characters in state i in the sequence
+ of the root node.
+
+ pc : float
+ Pseudocounts, this determines the lower cutoff on the rate when
+ no substitutions are observed
+
+ **kwargs:
+ Key word arguments to be passed
+
+ Keyword Args
+ ------------
+
+ alphabet : str
+ Specify alphabet when applicable. If the alphabet specification
+ is required, but no alphabet is specified, the nucleotide alphabet will be used as default.
+
+ """
+ from scipy import linalg as LA
+ gtr = cls(**kwargs)
+ gtr.logger("GTR: model inference ",1)
+ q = len(gtr.alphabet)
+ L = sub_ija.shape[-1]
+
+ n_iter = 0
+ n_ija = np.copy(sub_ija)
+ n_ija[range(q),range(q),:] = 0
+ n_ij = n_ija.sum(axis=-1)
+
+ m_ia = np.sum(n_ija,axis=1) + root_state + pc
+ n_a = n_ija.sum(axis=1).sum(axis=0) + pc
+
+ Lambda = np.sum(root_state,axis=0) + q*pc
+ p_ia_old=np.zeros((q,L))
+ p_ia = np.ones((q,L))/q
+ mu_a = np.ones(L)
+
+ W_ij = np.ones((q,q)) - np.eye(q)
+
+ while (LA.norm(p_ia_old-p_ia)>dp) and n_iter<Nit:
+ n_iter += 1
+ p_ia_old = np.copy(p_ia)
+ S_ij = np.einsum('a,ia,ja',mu_a, p_ia, T_ia)
+ W_ij = (n_ij + n_ij.T + pc)/(S_ij + S_ij.T + pc)
+
+ avg_pi = p_ia.mean(axis=-1)
+ average_rate = W_ij.dot(avg_pi).dot(avg_pi)
+ W_ij = W_ij/average_rate
+ mu_a *=average_rate
+
+ p_ia = m_ia/(mu_a*np.dot(W_ij,T_ia)+Lambda)
+ p_ia = p_ia/p_ia.sum(axis=0)
+
+ mu_a = n_a/(pc+np.einsum('ia,ij,ja->a', p_ia, W_ij, T_ia))
+
+ if n_iter >= Nit:
+ gtr.logger('WARNING: maximum number of iterations has been reached in GTR inference',3, warn=True)
+ if LA.norm(p_ia_old-p_ia) > dp:
+ gtr.logger('the iterative scheme has not converged',3,warn=True)
+ if gtr.gap_index is not None:
+ for p in range(p_ia.shape[-1]):
+ if p_ia[gtr.gap_index,p]<gap_limit:
+ gtr.logger('The model allows for gaps which are estimated to occur at a low fraction of %1.3e'%p_ia[gtr.gap_index]+
+ '\n\t\tthis can potentially result in artificats.'+
+ '\n\t\tgap fraction will be set to %1.4f'%gap_limit,2,warn=True)
+ p_ia[gtr.gap_index,p] = gap_limit
+ p_ia[:,p] /= p_ia[:,p].sum()
+
+ gtr.assign_rates(mu=mu_a, W=W_ij, pi=p_ia)
+ return gtr
+
+
+ def _eig_single_site(self, W, p):
+ tmpp = np.sqrt(p)
+ symQ = W*np.outer(tmpp, tmpp)
+ np.fill_diagonal(symQ, -np.sum(W*p, axis=1))
+ eigvals, eigvecs = np.linalg.eigh(symQ)
+ tmp_v = eigvecs.T*tmpp
+ one_norm = np.sum(np.abs(tmp_v), axis=1)
+ return eigvals, tmp_v.T/one_norm, (eigvecs*one_norm).T/tmpp
+
+
+ def _eig(self):
+ eigvals, vec, vec_inv = [], [], []
+ for pi in range(self.seq_len):
+ if len(self.W.shape)>2:
+ W = np.copy(self.W[:,:,pi])
+ np.fill_diagonal(W, 0)
+ elif pi==0:
+ np.fill_diagonal(self.W, 0)
+ W=self.W
+
+ ev, evec, evec_inv = self._eig_single_site(W,self.Pi[:,pi])
+ eigvals.append(ev)
+ vec.append(evec)
+ vec_inv.append(evec_inv)
+
+ self.eigenvals = np.array(eigvals).T
+ self.v = np.swapaxes(vec,0,-1)
+ self.v_inv = np.swapaxes(vec_inv, 0,-1)
+
+
+ def expQt(self, t):
+ # this is currently very slow.
+ eLambdaT = np.exp(self.mu*t*self.eigenvals)
+ return np.einsum('jia,ja,kja->ika', self.v, eLambdaT, self.v_inv)
+
+
+ def prop_t_compressed(self, seq_pair, multiplicity, t, return_log=False):
+ print("NOT IMPEMENTED")
+
+
+ def propagate_profile(self, profile, t, return_log=False):
+ """
+ Compute the probability of the sequence state of the parent
+ at time (t+t0, backwards), given the sequence state of the
+ child (profile) at time t0.
+
+ Parameters
+ ----------
+
+ profile : numpy.array
+ Sequence profile. Shape = (L, a),
+ where L - sequence length, a - alphabet size.
+
+ t : double
+ Time to propagate
+
+ return_log: bool
+ If True, return log-probability
+
+ Returns
+ -------
+
+ res : np.array
+ Profile of the sequence after time t in the past.
+ Shape = (L, a), where L - sequence length, a - alphabet size.
+
+ """
+ Qt = self.expQt(t)
+ res = np.einsum('ai,ija->aj', profile, Qt)
+
+ return np.log(res) if return_log else res
+
+
+ def evolve(self, profile, t, return_log=False):
+ """
+ Compute the probability of the sequence state of the child
+ at time t later, given the parent profile.
+
+ Parameters
+ ----------
+
+ profile : numpy.array
+ Sequence profile. Shape = (L, a),
+ where L - sequence length, a - alphabet size.
+
+ t : double
+ Time to propagate
+
+ return_log: bool
+ If True, return log-probability
+
+ Returns
+ -------
+
+ res : np.array
+ Profile of the sequence after time t in the future.
+ Shape = (L, a), where L - sequence length, a - alphabet size.
+
+ """
+ Qt = self.expQt(t)
+ res = np.einsum('ai,jia->aj', profile, Qt)
+
+ return np.log(res) if return_log else res
+
+
+ def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None,
+ return_log=False, ignore_gaps=True):
+ """
+ Compute the probability to observe seq_ch (child sequence) after time t starting from seq_p
+ (parent sequence).
+
+ Parameters
+ ----------
+
+ seq_p : character array
+ Parent sequence
+
+ seq_c : character array
+ Child sequence
+
+ t : double
+ Time (branch len) separating the profiles.
+
+ pattern_multiplicity : numpy array
+ If sequences are reduced by combining identical alignment patterns,
+ these multplicities need to be accounted for when counting the number
+ of mutations across a branch. If None, all pattern are assumed to
+ occur exactly once.
+
+ return_log : bool
+ It True, return log-probability.
+
+ Returns
+ -------
+ prob : np.array
+ Resulting probability
+
+ """
+ if t<0:
+ logP = -ttconf.BIG_NUMBER
+ else:
+ tmp_eQT = self.expQt(t)
+ bad_indices=(tmp_eQT==0)
+ logQt = np.log(tmp_eQT + ttconf.TINY_NUMBER*(bad_indices))
+ logQt[np.isnan(logQt) | np.isinf(logQt) | bad_indices] = -ttconf.BIG_NUMBER
+ seq_indices_c = np.zeros(len(seq_ch), dtype=int)
+ seq_indices_p = np.zeros(len(seq_p), dtype=int)
+ for ai, a in self.alphabet:
+ seq_indices_p[seq_p==a] = ai
+ seq_indices_c[seq_ch==a] = ai
+
+ if len(logQt.shape)==2:
+ logP = np.sum(logQt[seq_indices_p, seq_indices_c]*pattern_multiplicity)
+ else:
+ logP = np.sum(logQt[seq_indices_p, seq_indices_c, np.arange(len(seq_c))]*pattern_multiplicity)
+
+ return logP if return_log else np.exp(logP)
+
+
+ def average_rate(self):
+ return np.einsum('a,ia,ij,ja->a',self.mu, self.Pi, self.W, self.Pi)
=====================================
treetime/node_interpolator.py
=====================================
@@ -80,7 +80,7 @@ def _convolution_integrand(t_val, f, g,
# create the interpolation object on this grid
FG = Distribution(tau, fg, is_log=True, min_width = np.max([f.min_width, g.min_width]),
- kind='linear')
+ kind='linear', assume_sorted=True)
return FG
@@ -258,7 +258,6 @@ class NodeInterpolator (Distribution):
insert_point_idx[1:] = refine_factor
insert_point_idx[:-1] += refine_factor
# add additional points if there are any to add
-
if np.sum(insert_point_idx):
add_x = np.concatenate([np.linspace(t1,t2,n+2)[1:-1] for t1,t2,n in
zip(t_grid_0[1:-2], t_grid_0[2:-1], insert_point_idx) if n>0])
=====================================
treetime/seq_utils.py
=====================================
@@ -89,31 +89,29 @@ profile_maps = {
},
'aa_nogap':{
- 'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine Ala
- 'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine Cys
- 'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD Asp
- 'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid Glu
- 'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine Phe
- 'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine Gly
- 'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine His
- 'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine Ile
- 'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine Lys
- 'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine Leu
- 'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine Met
- 'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe Asn
- 'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline Pro
- 'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine Gln
- 'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #ARginine Arg
- 'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Serine Ser
- 'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #Threonine Thr
- 'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Valine Val
- 'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Tryptophan Trp
- 'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Tyrosine Tyr
- '*': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #stop
- '-': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #gap
- 'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
- 'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid Asx
- 'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid Glx
+ 'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine Ala
+ 'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine Cys
+ 'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD Asp
+ 'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid Glu
+ 'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine Phe
+ 'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine Gly
+ 'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine His
+ 'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine Ile
+ 'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine Lys
+ 'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine Leu
+ 'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine Met
+ 'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe Asn
+ 'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline Pro
+ 'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine Gln
+ 'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #ARginine Arg
+ 'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Serine Ser
+ 'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Threonine Thr
+ 'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Valine Val
+ 'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype='float'), #Tryptophan Trp
+ 'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype='float'), #Tyrosine Tyr
+ 'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
+ 'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid Asx
+ 'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid Glx
}
}
@@ -172,20 +170,11 @@ def seq2prof(seq, profile_map):
Profile for the character. Zero array if the character not found
"""
- plength = np.unique([len(x) for x in profile_map.values()])
- if len(plength)==1:
- n_states = plength[0]
- else:
- print("profile contains arrays of different length!")
- return None
- prof = np.array([profile_map[k] if k in profile_map
- else np.ones(n_states) for k in seq ])
+ return np.array([profile_map[k] for k in seq])
- bad_indices=(prof.sum(1) < 1e-5)
- return prof[~bad_indices]
-def prof2seq(profile, gtr, sample_from_prof=False):
+def prof2seq(profile, gtr, sample_from_prof=False, normalize=True):
"""
Convert profile to sequence and normalize profile across sites.
@@ -215,7 +204,10 @@ def prof2seq(profile, gtr, sample_from_prof=False):
"""
# normalize profile such that probabilities at each site sum to one
- tmp_profile=(profile.T/profile.sum(axis=1)).T
+ if normalize:
+ tmp_profile, pre=normalize_profile(profile, return_offset=False)
+ else:
+ tmp_profile = profile
# sample sequence according to the probabilities in the profile
# (sampling from cumulative distribution over the different states)
@@ -231,3 +223,30 @@ def prof2seq(profile, gtr, sample_from_prof=False):
return seq, prof_values, idx
+
+def normalize_profile(in_profile, log=False, return_offset = True):
+ """return a normalized version of a profile matrix
+
+ Parameters
+ ----------
+ in_profile : np.array
+ shape Lxq, will be normalized to one across each row
+ log : bool, optional
+ treat the input as log probabilities
+ return_offset : bool, optional
+ return the log of the scale factor for each row
+
+ Returns
+ -------
+ tuple
+ normalized profile (fresh np object) and offset (if return_offset==True)
+ """
+ if log:
+ tmp_prof = np.exp(in_profile) # - tmp_prefactor)
+ else:
+ tmp_prof = in_profile
+
+ norm_vector = tmp_prof.sum(axis=1)
+ return (np.copy(np.einsum('ai,a->ai',tmp_prof,1.0/norm_vector)),
+ np.log(norm_vector) if return_offset else None)
+
=====================================
treetime/seqgen.py
=====================================
@@ -0,0 +1,54 @@
+from __future__ import division, print_function, absolute_import
+from collections import defaultdict
+import numpy as np
+from treetime import config as ttconf
+from .seq_utils import alphabets, profile_maps, alphabet_synonyms, seq2array, seq2prof
+from .gtr import GTR
+from .treeanc import TreeAnc
+
+
+class SeqGen(TreeAnc):
+ def __init__(self, *args, **kwargs):
+ super(SeqGen, self).__init__(reduce_alignment=False, **kwargs)
+
+
+ def sample_from_profile(self, p):
+ cum_p = p.cumsum(axis=1).T
+ prand = np.random.random(p.shape[0])
+ seq = self.gtr.alphabet[np.argmax(cum_p>prand, axis=0)]
+ return seq
+
+
+ def evolve(self, root_seq=None):
+ self.seq_len = self.gtr.seq_len
+ if root_seq:
+ self.tree.root.sequence = seq2array(root_seq)
+ else:
+ self.tree.root.sequence = self.sample_from_profile(self.gtr.Pi.T)
+
+ for n in self.tree.get_nonterminals(order='preorder'):
+ profile_p = seq2prof(n.sequence, self.gtr.profile_map)
+ for c in n:
+ profile = self.gtr.evolve(profile_p, c.branch_length)
+ c.sequence = self.sample_from_profile(profile)
+ self.make_reduced_alignment()
+
+ for n in self.tree.find_clades():
+ if n==self.tree.root:
+ n.mutations=[]
+ else:
+ n.mutations = self.get_mutations(n)
+
+
+ def get_aln(self, internal=False):
+ from Bio import SeqRecord, Seq
+ from Bio.Align import MultipleSeqAlignment
+
+ tmp = []
+ for n in self.tree.get_terminals():
+ if n.is_terminal() or internal:
+ tmp.append(SeqRecord.SeqRecord(id=n.name, name=n.name, description='', seq=Seq.Seq(''.join(n.sequence))))
+
+ return MultipleSeqAlignment(tmp)
+
+
=====================================
treetime/treeanc.py
=====================================
@@ -1,12 +1,14 @@
from __future__ import print_function, division, absolute_import
import time, sys
+import gc
from collections import defaultdict
import numpy as np
from Bio import Phylo
from Bio import AlignIO
from treetime import config as ttconf
-from .seq_utils import seq2prof,seq2array,prof2seq
+from .seq_utils import seq2prof,seq2array,prof2seq, normalize_profile
from .gtr import GTR
+from .gtr_site_specific import GTR_site_specific
string_types = [str] if sys.version_info[0]==3 else [str, unicode]
@@ -15,12 +17,14 @@ class TreeAnc(object):
"""
Class defines simple tree object with basic interface methods: reading and
saving from/to files, initializing leaves with sequences from the
- alignment, making ancestral state inferrence
+ alignment, making ancestral state inference
"""
def __init__(self, tree=None, aln=None, gtr=None, fill_overhangs=True,
ref=None, verbose = ttconf.VERBOSE, ignore_gaps=True,
- convert_upper=True, seq_multiplicity=None, log=None, **kwargs):
+ convert_upper=True, seq_multiplicity=None, log=None,
+ reduce_alignment=True,
+ **kwargs):
"""
TreeAnc constructor. It prepares the tree, attaches sequences to the leaf nodes,
and sets some configuration parameters.
@@ -92,7 +96,7 @@ class TreeAnc(object):
self.is_vcf = False #this is set true when aln is set, if aln is dict
# if sequences represent multiple samples, this can be added as multiplicity here
self.seq_multiplicity = {} if seq_multiplicity is None else seq_multiplicity
-
+ self.reduce_alignment = reduce_alignment
self.ignore_gaps = ignore_gaps
self._tree = None
@@ -120,7 +124,7 @@ class TreeAnc(object):
if self.aln and self.tree:
if len(self.tree.get_terminals()) != len(self.aln):
- print("**WARNING: Number of sequences in tree differs from number of sequences in alignment!**")
+ self.logger("**WARNING: Number of sequences in tree differs from number of sequences in alignment!**", 3, warn=True)
def logger(self, msg, level, warn=False):
@@ -184,7 +188,7 @@ class TreeAnc(object):
value : GTR
the new GTR object
"""
- if not isinstance(value, GTR):
+ if not (isinstance(value, GTR) or isinstance(value, GTR_site_specific)):
raise TypeError(" GTR instance expected")
self._gtr = value
@@ -212,7 +216,7 @@ class TreeAnc(object):
self._gtr = GTR.standard(model=in_gtr, **kwargs)
self._gtr.logger = self.logger
- elif isinstance(in_gtr, GTR):
+ elif isinstance(in_gtr, GTR) or isinstance(in_gtr, GTR_site_specific):
self._gtr = in_gtr
self._gtr.logger=self.logger
else:
@@ -341,15 +345,14 @@ class TreeAnc(object):
else:
self.seq_len = self.aln.get_alignment_length()
-
# check whether the alignment is consistent with a nucleotide alignment.
likely_alphabet = self._guess_alphabet()
from .seq_utils import alphabets
# if likely alignment is not nucleotide but the gtr alignment is, WARN
- if likely_alphabet=='aa' and len(self.gtr.alphabet)==len(alphabets['nuc']) and np.all(self.gtr.alphabet==alphabets['nuc']):
+ if likely_alphabet=='aa' and self.gtr.n_states==len(alphabets['nuc']) and np.all(self.gtr.alphabet==alphabets['nuc']):
self.logger('WARNING: small fraction of ACGT-N in alignment. Really a nucleotide alignment? if not, rerun with --aa', 1, warn=True)
# conversely, warn if alignment is consistent with nucleotide but gtr has a long alphabet
- if likely_alphabet=='nuc' and len(self.gtr.alphabet)>10:
+ if likely_alphabet=='nuc' and self.gtr.n_states>10:
self.logger('WARNING: almost exclusively ACGT-N in alignment. Really a protein alignment?', 1, warn=True)
if hasattr(self, '_tree') and (self.tree is not None):
@@ -421,6 +424,21 @@ class TreeAnc(object):
"""
self._ref = in_ref
+ def extend_profile(self):
+ if self.aln:
+ if self.is_vcf and self.ref:
+ unique_chars = np.unique(list(self.ref))
+ else:
+ tmp_unique_chars = []
+ for node in self.tree.get_terminals():
+ tmp_unique_chars.extend(np.unique(node.sequence))
+ unique_chars = np.unique(tmp_unique_chars)
+ for c in unique_chars:
+ if c not in self.gtr.profile_map:
+ self.gtr.profile_map[c] = np.ones(self.gtr.n_states)
+ self.logger("WARNING: character %s is unknown. Treating it as missing information"%c,1,warn=True)
+
+
def _guess_alphabet(self):
if self.aln:
if self.is_vcf and self.ref:
@@ -485,6 +503,8 @@ class TreeAnc(object):
self.logger("***WARNING: TreeAnc: %d nodes don't have a matching sequence in the alignment."
" POSSIBLE ERROR."%failed_leaves, 0, warn=True)
+ # extend profile to contain additional unknown characters
+ self.extend_profile()
return self.make_reduced_alignment()
@@ -536,7 +556,7 @@ class TreeAnc(object):
if self.is_vcf:
tmp_reduced_aln, alignment_patterns, positions = self.process_alignment_dict()
seqNames = self.aln.keys() #store seqName order to put back on tree
- else:
+ elif self.reduce_alignment:
# transpose real alignment, for ease of iteration
alignment_patterns = {}
tmp_reduced_aln = []
@@ -550,6 +570,15 @@ class TreeAnc(object):
else:
aln_transpose = np.array(seqs).T
positions = range(aln_transpose.shape[0])
+ else:
+ self.multiplicity = np.ones(self.seq_len, dtype=float)
+ self.full_to_reduced_sequence_map = np.arange(self.seq_len)
+ self.reduced_to_full_sequence_map = {p:np.array([p]) for p in np.arange(self.seq_len)}
+ for n in self.tree.find_clades():
+ if hasattr(n, 'sequence'):
+ n.original_cseq = np.copy(n.sequence)
+ n.cseq = np.copy(n.sequence)
+ return ttconf.SUCCESS
for pi in positions:
if self.is_vcf:
@@ -849,6 +878,7 @@ class TreeAnc(object):
_ml_anc(final=True, **kwargs) # call one of the reconstruction types
alpha = list(self.gtr.alphabet)
n=len(alpha)
+ # matrix of mutations n_{ij}: i = derived state, j=ancestral state
nij = np.zeros((n,n))
Ti = np.zeros(n)
@@ -856,10 +886,10 @@ class TreeAnc(object):
for node in self.tree.find_clades():
if hasattr(node,'mutations'):
for a,pos, d in node.mutations:
- i,j = alpha.index(a), alpha.index(d)
+ i,j = alpha.index(d), alpha.index(a)
nij[i,j]+=1
- Ti[i] += 0.5*self._branch_length_to_gtr(node)
- Ti[j] -= 0.5*self._branch_length_to_gtr(node)
+ Ti[j] += 0.5*self._branch_length_to_gtr(node)
+ Ti[i] -= 0.5*self._branch_length_to_gtr(node)
for ni,nuc in enumerate(node.cseq):
i = alpha.index(nuc)
Ti[i] += self._branch_length_to_gtr(node)*self.multiplicity[ni]
@@ -982,18 +1012,19 @@ class TreeAnc(object):
node_seq = node.original_cseq
muts = []
- for p, (anc, der) in enumerate(zip(node.up.cseq, node_seq)):
- # only if the states in compressed sequences differ:
- if anc!=der:
- # expand to the positions in real sequence
- muts.extend([(anc, pos, der) for pos in self.reduced_to_full_sequence_map[p]])
+ diff_pos = np.where(node.up.cseq!=node_seq)[0]
+ for p in diff_pos:
+ anc = node.up.cseq[p]
+ der = node_seq[p]
+ # expand to the positions in real sequence
+ muts.extend([(anc, pos, der) for pos in self.reduced_to_full_sequence_map[p]])
#sort by position
return sorted(muts, key=lambda x:x[1])
def get_branch_mutation_matrix(self, node, full_sequence=False):
- """uses results from marginal ancesrtal inference to return a joint
+ """uses results from marginal ancestral inference to return a joint
distribution of the sequence states at both ends of the branch.
Parameters
@@ -1010,19 +1041,20 @@ class TreeAnc(object):
numpy.array
an Lxqxq stack of matrices (q=alphabet size, L (reduced)sequence length)
"""
- from itertools import product
pp,pc = self.marginal_branch_profile(node)
- if pp is None or pc is None:
- return None
+ # calculate pc_i [e^Qt]_ij pp_j for each site
expQt = self.gtr.expQt(self._branch_length_to_gtr(node))
- mut_matrix_stack = np.zeros((pp.shape[1], pp.shape[1],pp.shape[0]))
- for i,j in product(range(pp.shape[1]), repeat=2):
- mut_matrix_stack[i,j,:] = pp[:,i]*pc[:,j]*expQt[j,i]
+ if len(expQt.shape)==3: # site specific model
+ mut_matrix_stack = np.einsum('ai,aj,ija->aij', pc, pp, expQt)
+ else:
+ mut_matrix_stack = np.einsum('ai,aj,ij->aij', pc, pp, expQt)
+
+ # normalize this distribution
+ normalizer = mut_matrix_stack.sum(axis=2).sum(axis=1)
+ mut_matrix_stack = np.einsum('aij,a->aij', mut_matrix_stack, 1.0/normalizer)
- normalizer = mut_matrix_stack.sum(axis=1).sum(axis=0)
- mut_matrix_stack = mut_matrix_stack/normalizer
- mut_matrix_stack = np.swapaxes(np.swapaxes(mut_matrix_stack, 1,2), 0,1)
+ # expand to full sequence if requested
if full_sequence:
return mut_matrix_stack[self.full_to_reduced_sequence_map]
else:
@@ -1031,7 +1063,7 @@ class TreeAnc(object):
def expanded_sequence(self, node, include_additional_constant_sites=False):
"""
- Get node's compressed sequence and expand it to the real sequence
+ Expand a nodes compressed sequence into the real sequence
Parameters
----------
@@ -1039,7 +1071,7 @@ class TreeAnc(object):
Tree node
Returns
- -------f
+ -------
seq : np.array
Sequence as np.array of chars
"""
@@ -1047,12 +1079,8 @@ class TreeAnc(object):
L = self.seq_len
else:
L = self.seq_len - self.additional_constant_sites
- seq = np.zeros_like(self.full_to_reduced_sequence_map[:L], dtype='U1')
- for pos, state in enumerate(node.cseq):
- full_pos = self.reduced_to_full_sequence_map[pos]
- seq[full_pos[full_pos<L]] = state
- return seq
+ return node.cseq[self.full_to_reduced_sequence_map[:L]]
def dict_sequence(self, node, keep_var_ambigs=False):
@@ -1293,7 +1321,7 @@ class TreeAnc(object):
return max(ttconf.MIN_BRANCH_LENGTH*self.one_mutation, node.branch_length)
- def _ml_anc_marginal(self, store_compressed=True, final=True, sample_from_profile=False,
+ def _ml_anc_marginal(self, store_compressed=False, final=True, sample_from_profile=False,
debug=False, **kwargs):
"""
Perform marginal ML reconstruction of the ancestral states. In contrast to
@@ -1325,38 +1353,37 @@ class TreeAnc(object):
n_states = self.gtr.alphabet.shape[0]
self.logger("TreeAnc._ml_anc_marginal: type of reconstruction: Marginal", 2)
- self.logger("Walking up the tree, computing likelihoods... ", 3)
+ self.logger("Attaching sequence profiles to leafs... ", 3)
# set the leaves profiles
for leaf in tree.get_terminals():
# in any case, set the profile
leaf.marginal_subtree_LH = seq2prof(leaf.original_cseq, self.gtr.profile_map)
leaf.marginal_subtree_LH_prefactor = np.zeros(L)
+ self.logger("Walking up the tree, computing likelihoods... ", 3)
# propagate leaves --> root, set the marginal-likelihood messages
for node in tree.get_nonterminals(order='postorder'): #leaves -> root
# regardless of what was before, set the profile to ones
- tmp_log_subtree_LH = np.zeros((L,n_states))
- node.marginal_subtree_LH_prefactor = np.zeros(L)
+ tmp_log_subtree_LH = np.zeros((L,n_states), dtype=float)
+ node.marginal_subtree_LH_prefactor = np.zeros(L, dtype=float)
for ch in node.clades:
ch.marginal_log_Lx = self.gtr.propagate_profile(ch.marginal_subtree_LH,
self._branch_length_to_gtr(ch), return_log=True) # raw prob to transfer prob up
tmp_log_subtree_LH += ch.marginal_log_Lx
node.marginal_subtree_LH_prefactor += ch.marginal_subtree_LH_prefactor
- tmp_prefactor = np.max(tmp_log_subtree_LH,axis=1)
- node.marginal_subtree_LH = np.exp(tmp_log_subtree_LH.T-tmp_prefactor).T
- pre = node.marginal_subtree_LH.sum(axis=1) #sum over nucleotide states
- node.marginal_subtree_LH = (node.marginal_subtree_LH.T/pre).T # normalize so that the sum is 1
- node.marginal_subtree_LH_prefactor += np.log(pre) + tmp_prefactor # and store log-prefactor
+ node.marginal_subtree_LH, offset = normalize_profile(tmp_log_subtree_LH, log=True)
+ node.marginal_subtree_LH_prefactor += offset # and store log-prefactor
self.logger("Computing root node sequence and total tree likelihood...",3)
# Msg to the root from the distant part (equ frequencies)
- tree.root.marginal_outgroup_LH = np.repeat([self.gtr.Pi], tree.root.marginal_subtree_LH.shape[0], axis=0)
+ if len(self.gtr.Pi.shape)==1:
+ tree.root.marginal_outgroup_LH = np.repeat([self.gtr.Pi], tree.root.marginal_subtree_LH.shape[0], axis=0)
+ else:
+ tree.root.marginal_outgroup_LH = np.copy(self.gtr.Pi.T)
- tree.root.marginal_profile = tree.root.marginal_outgroup_LH*tree.root.marginal_subtree_LH
- pre = tree.root.marginal_profile.sum(axis=1)
- tree.root.marginal_profile = (tree.root.marginal_profile.T/pre).T
- marginal_LH_prefactor = tree.root.marginal_subtree_LH_prefactor + np.log(pre)
+ tree.root.marginal_profile, pre = normalize_profile(tree.root.marginal_outgroup_LH*tree.root.marginal_subtree_LH)
+ marginal_LH_prefactor = tree.root.marginal_subtree_LH_prefactor + pre
# choose sequence characters from this profile.
# treat root node differently to avoid piling up mutations on the longer branch
@@ -1368,11 +1395,12 @@ class TreeAnc(object):
other_sample_from_profile = sample_from_profile
seq, prof_vals, idxs = prof2seq(tree.root.marginal_profile,
- self.gtr, sample_from_prof=root_sample_from_profile)
+ self.gtr, sample_from_prof=root_sample_from_profile, normalize=False)
self.tree.sequence_LH = marginal_LH_prefactor
self.tree.total_sequence_LH = (self.tree.sequence_LH*self.multiplicity).sum()
self.tree.root.cseq = seq
+ gc.collect()
if final:
if self.is_vcf:
@@ -1389,22 +1417,16 @@ class TreeAnc(object):
# integrate the information coming from parents with the information
# of all children my multiplying it to the prev computed profile
- tmp_msg = np.log(node.up.marginal_profile) - node.marginal_log_Lx
- tmp_prefactor = np.max(tmp_msg, axis=1)
- tmp_msg = np.exp(tmp_msg.T - tmp_prefactor).T
-
- norm_vector = tmp_msg.sum(axis=1)
- node.marginal_outgroup_LH = (tmp_msg.T/norm_vector).T
- tmp_msg_from_parent = self.gtr.propagate_profile(node.marginal_outgroup_LH,
+ node.marginal_outgroup_LH, pre = normalize_profile(np.log(node.up.marginal_profile) - node.marginal_log_Lx,
+ log=True, return_offset=False)
+ tmp_msg_from_parent = self.gtr.evolve(node.marginal_outgroup_LH,
self._branch_length_to_gtr(node), return_log=False)
- node.marginal_profile = node.marginal_subtree_LH * tmp_msg_from_parent
- norm_vector = node.marginal_profile.sum(axis=1)
- node.marginal_profile=(node.marginal_profile.T/norm_vector).T
+ node.marginal_profile, pre = normalize_profile(node.marginal_subtree_LH * tmp_msg_from_parent, return_offset=False)
# choose sequence based maximal marginal LH.
seq, prof_vals, idxs = prof2seq(node.marginal_profile, self.gtr,
- sample_from_prof=other_sample_from_profile)
+ sample_from_prof=other_sample_from_profile, normalize=False)
if hasattr(node, 'cseq') and node.cseq is not None:
N_diff += (seq!=node.cseq).sum()
@@ -1430,11 +1452,11 @@ class TreeAnc(object):
if not debug:
for node in self.tree.find_clades():
try:
- # del node.marginal_profile
- # del node.marginal_outgroup_LH
- del node.marginal_Lx
+ del node.marginal_log_Lx
+ del node.marginal_subtree_LH_prefactor
except:
pass
+ gc.collect()
return N_diff
@@ -1481,8 +1503,7 @@ class TreeAnc(object):
branch_len = self._branch_length_to_gtr(node)
# transition matrix from parent states to the current node states.
# denoted as Pij(i), where j - parent state, i - node state
- log_transitions = np.log(self.gtr.expQt(branch_len))
-
+ log_transitions = np.log(np.maximum(ttconf.TINY_NUMBER, self.gtr.expQt(branch_len)))
if node.is_terminal():
try:
msg_from_children = np.log(np.maximum(seq2prof(node.original_cseq, self.gtr.profile_map), ttconf.TINY_NUMBER))
@@ -1499,7 +1520,7 @@ class TreeAnc(object):
# and compute the likelihood of this state
for char_i, char in enumerate(self.gtr.alphabet):
# Pij(i) * L_ch(i) for given parent state j
- msg_to_parent = (log_transitions.T[char_i, :] + msg_from_children)
+ msg_to_parent = (log_transitions[:,char_i].T + msg_from_children)
# For this parent state, choose the best state of the current node:
node.joint_Cx[:, char_i] = msg_to_parent.argmax(axis=1)
# compute the likelihood of the best state of the current node
@@ -1509,7 +1530,7 @@ class TreeAnc(object):
# root node profile = likelihood of the total tree
msg_from_children = np.sum(np.stack([c.joint_Lx for c in self.tree.root], axis = 0), axis=0)
# Pi(i) * Prod_ch Lch(i)
- self.tree.root.joint_Lx = msg_from_children + np.log(self.gtr.Pi)
+ self.tree.root.joint_Lx = msg_from_children + np.log(self.gtr.Pi).T
normalized_profile = (self.tree.root.joint_Lx.T - self.tree.root.joint_Lx.max(axis=1)).T
# choose sequence characters from this profile.
@@ -1619,8 +1640,8 @@ class TreeAnc(object):
def optimize_branch_length(self, mode='joint', **kwargs):
"""
- Perform optimization for the branch lengths of the whole tree or any
- subtree.
+ Perform optimization for the branch lengths of the entire tree.
+ This method only does a single path and needs to be iterated.
**Note** this method assumes that each node stores information
about its sequence as numpy.array object (node.sequence attribute).
@@ -1694,7 +1715,7 @@ class TreeAnc(object):
# as branch lengths changed, the params must be fixed
self.tree.root.up = None
self.tree.root.dist2root = 0.0
- if max_bl>0.15:
+ if max_bl>0.15 and mode=='joint':
self.logger("TreeAnc.optimize_branch_length: THIS TREE HAS LONG BRANCHES."
" \n\t ****TreeTime IS NOT DESIGNED TO OPTIMIZE LONG BRANCHES."
" \n\t ****PLEASE OPTIMIZE BRANCHES WITH ANOTHER TOOL AND RERUN WITH"
@@ -1802,18 +1823,16 @@ class TreeAnc(object):
'''
parent = node.up
if parent is None:
- self.logger("Branch profiles can't be calculated for the root!",3)
- return None, None
+ raise Exception("Branch profiles can't be calculated for the root!")
if not hasattr(node, 'marginal_outgroup_LH'):
- self.logger("marginal ancestral inference needs to be performed first", 3)
- return None, None
+ raise Exception("marginal ancestral inference needs to be performed first!")
pc = node.marginal_subtree_LH
pp = node.marginal_outgroup_LH
return pp, pc
- def optimal_marginal_branch_length(self, node):
+ def optimal_marginal_branch_length(self, node, tol=1e-10):
'''
calculate the marginal distribution of sequence states on both ends
of the branch leading to node,
@@ -1833,7 +1852,7 @@ class TreeAnc(object):
if node.up is None:
return self.one_mutation
pp, pc = self.marginal_branch_profile(node)
- return self.gtr.optimal_t_compressed((pp, pc), self.multiplicity, profiles=True)
+ return self.gtr.optimal_t_compressed((pp, pc), self.multiplicity, profiles=True, tol=tol)
def prune_short_branches(self):
=====================================
treetime/treetime.py
=====================================
@@ -6,7 +6,8 @@ from treetime import config as ttconf
from .utils import tree_layout
from .clock_tree import ClockTree
-rerooting_mechanisms = ["min_dev","min_dev_ML", "best", "least-squares", "ML"]
+rerooting_mechanisms = ["min_dev", "min_dev_ML", "best", "least-squares", "ML"]
+deprecated_rerooting_mechanisms = {"residual":"least-squares", "res":"least-squares"}
class TreeTime(ClockTree):
"""
@@ -158,7 +159,8 @@ class TreeTime(ClockTree):
else:
plot_rtt=False
reroot_mechanism = 'least-squares' if root=='clock_filter' else root
- self.clock_filter(reroot=reroot_mechanism, n_iqd=n_iqd, plot=plot_rtt)
+ if self.clock_filter(reroot=reroot_mechanism, n_iqd=n_iqd, plot=plot_rtt)==ttconf.ERROR:
+ return ttconf.ERROR
elif root is not None:
if self.reroot(root=root)==ttconf.ERROR:
return ttconf.ERROR
@@ -200,6 +202,7 @@ class TreeTime(ClockTree):
# estimate a relaxed molecular clock
if relaxed_clock:
+ print("relaxed_clock", relaxed_clock)
self.relaxed_clock(**relaxed_clock)
need_new_time_tree = True
@@ -323,7 +326,8 @@ class TreeTime(ClockTree):
if reroot:
if reroot.startswith("ML"):
self.logger("TreeTime.ClockFilter: filtering with covariance aware methods is not recommended.", 0, warn=True)
- self.reroot(root='least-squares' if reroot=='best' else reroot)
+ if self.reroot(root='least-squares' if reroot=='best' else reroot)==ttconf.ERROR:
+ return ttconf.ERROR
else:
self.get_clock_model(covariation=False)
@@ -344,8 +348,8 @@ class TreeTime(ClockTree):
node.bad_branch=False
# redo root estimation after outlier removal
- if reroot:
- self.reroot(root=reroot)
+ if reroot and self.reroot(root=reroot)==ttconf.ERROR:
+ return ttconf.ERROR
if plot:
self.plot_root_to_tip()
@@ -405,7 +409,12 @@ class TreeTime(ClockTree):
for n in self.tree.find_clades():
n.branch_length=n.mutation_length
- if root in rerooting_mechanisms:
+ if root in rerooting_mechanisms or root in deprecated_rerooting_mechanisms:
+ if root in deprecated_rerooting_mechanisms:
+ self.logger('TreeTime.reroot: rerooting mechanisms %s has been renamed to %s'
+ %(root, deprecated_rerooting_mechanisms[root]), 1, warn=True)
+ root = deprecated_rerooting_mechanisms[root]
+
new_root = self._find_best_root(covariation='ML' in root,
force_positive=force_positive and (not root.startswith('min_dev')))
else:
@@ -420,7 +429,7 @@ class TreeTime(ClockTree):
if n.raw_date_constraint is not None],
key=lambda x:np.mean(x.raw_date_constraint))[0]
else:
- self.logger('TreeTime.reroot -- WARNING: unsupported rooting mechanisms or root not found',2,warn=True)
+ self.logger('TreeTime.reroot -- ERROR: unsupported rooting mechanisms or root not found',0,warn=True)
return ttconf.ERROR
#this forces a bifurcating root, as we want. Branch lengths will be reoptimized anyway.
@@ -733,13 +742,13 @@ class TreeTime(ClockTree):
for node in self.tree.find_clades(order='preorder'):
if node.up is None:
- node.gamma =- 0.5*node._k1/node._k2
+ node.gamma = max(0.1, -0.5*node._k1/node._k2)
else:
if node.up.up is None:
g_up = node.up.gamma
else:
g_up = node.up.branch_length_interpolator.gamma
- node.branch_length_interpolator.gamma = (coupling*g_up - 0.5*node._k1)/(coupling+node._k2)
+ node.branch_length_interpolator.gamma = max(0.1,(coupling*g_up - 0.5*node._k1)/(coupling+node._k2))
###############################################################################
### rerooting
=====================================
treetime/wrappers.py
=====================================
@@ -538,6 +538,7 @@ def timetree(params):
branch_length_mode = branch_length_mode,
fixed_pi=fixed_pi)
if success==ttconf.ERROR: # if TreeTime.run failed, exit
+ print("\nTreeTime run FAILED: please check above for errors and/or rerun with --verbose 4.\n")
return 1
###########################################################################
View it on GitLab: https://salsa.debian.org/med-team/python-treetime/compare/444878a998d7b34a5f302850c14a38937fef78e5...a348b07e59c2bd41e514d723cefdfdd85bea81a6
--
View it on GitLab: https://salsa.debian.org/med-team/python-treetime/compare/444878a998d7b34a5f302850c14a38937fef78e5...a348b07e59c2bd41e514d723cefdfdd85bea81a6
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190125/f8e1ef84/attachment-0001.html>
More information about the debian-med-commit
mailing list