[med-svn] [iqtree] 01/04: Imported Upstream version 1.4.3+dfsg
Andreas Tille
tille at debian.org
Mon Aug 15 20:37:21 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository iqtree.
commit d67f0c4bfca8e0fd8475b648d97c71984a432b24
Author: Andreas Tille <tille at debian.org>
Date: Mon Aug 15 22:22:56 2016 +0200
Imported Upstream version 1.4.3+dfsg
---
CMakeLists.txt | 2 +-
alignment.cpp | 88 +++-
alignment.h | 14 +-
checkpoint.cpp | 4 +-
iqtree.cpp | 45 +-
model/modelcodon.cpp | 75 +++
model/modelcodon.h | 10 +
model/modeldna.cpp | 10 +-
model/modelfactory.cpp | 414 +++++++--------
model/modelfactory.h | 8 +-
model/modelgtr.cpp | 8 +-
model/modelgtr.h | 2 +-
model/modelmixture.cpp | 2 +-
model/modelnonrev.cpp | 6 +-
model/ratefree.cpp | 28 +-
model/ratefreeinvar.cpp | 1 +
model/ratefreeinvar.h | 2 +-
model/rategamma.cpp | 29 +-
model/rategammainvar.cpp | 239 ++++++---
model/rategammainvar.h | 53 +-
model/rateheterogeneity.cpp | 18 +-
model/rateheterogeneity.h | 13 +
ncl/nxscharactersblock.cpp | 9 +-
ngs.cpp | 4 +-
ngs.h | 2 +-
pda.cpp | 2 -
phyloanalysis.cpp | 53 +-
phylokernel.h | 10 +-
phylokernelmixrate.h | 6 +
phylokernelmixture.h | 6 +
phylosupertreeplen.cpp | 6 +-
phylotesting.cpp | 69 ++-
phylotree.cpp | 1162 ++----------------------------------------
phylotree.h | 152 +-----
phylotreesse.cpp | 1185 ++++++++++++++++---------------------------
quartet.cpp | 4 +
tools.cpp | 122 +++--
tools.h | 33 +-
38 files changed, 1425 insertions(+), 2471 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22a4864..497a66f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@ add_definitions(-DIQ_TREE)
# The version number.
set (iqtree_VERSION_MAJOR 1)
set (iqtree_VERSION_MINOR 4)
-set (iqtree_VERSION_PATCH "2")
+set (iqtree_VERSION_PATCH "3")
set(BUILD_SHARED_LIBS OFF)
diff --git a/alignment.cpp b/alignment.cpp
index 4a20602..25214c2 100644
--- a/alignment.cpp
+++ b/alignment.cpp
@@ -347,7 +347,10 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : v
readFasta(filename, sequence_type);
} else if (intype == IN_PHYLIP) {
cout << "Phylip format detected" << endl;
- readPhylip(filename, sequence_type);
+ if (Params::getInstance().phylip_sequential_format)
+ readPhylipSequential(filename, sequence_type);
+ else
+ readPhylip(filename, sequence_type);
} else if (intype == IN_CLUSTAL) {
cout << "Clustal format detected" << endl;
readClustal(filename, sequence_type);
@@ -1464,6 +1467,76 @@ int Alignment::readPhylip(char *filename, char *sequence_type) {
return buildPattern(sequences, sequence_type, nseq, nsite);
}
+int Alignment::readPhylipSequential(char *filename, char *sequence_type) {
+
+ StrVector sequences;
+ ostringstream err_str;
+ ifstream in;
+ int line_num = 1;
+ // set the failbit and badbit
+ in.exceptions(ios::failbit | ios::badbit);
+ in.open(filename);
+ int nseq = 0, nsite = 0;
+ int seq_id = 0;
+ string line;
+ // remove the failbit
+ in.exceptions(ios::badbit);
+ num_states = 0;
+
+ for (; !in.eof(); line_num++) {
+ getline(in, line);
+ line = line.substr(0, line.find_first_of("\n\r"));
+ if (line == "") continue;
+
+ //cout << line << endl;
+ if (nseq == 0) { // read number of sequences and sites
+ istringstream line_in(line);
+ if (!(line_in >> nseq >> nsite))
+ throw "Invalid PHYLIP format. First line must contain number of sequences and sites";
+ //cout << "nseq: " << nseq << " nsite: " << nsite << endl;
+ if (nseq < 3)
+ throw "There must be at least 3 sequences";
+ if (nsite < 1)
+ throw "No alignment columns";
+
+ seq_names.resize(nseq, "");
+ sequences.resize(nseq, "");
+
+ } else { // read sequence contents
+ if (seq_id >= nseq)
+ throw "Line " + convertIntToString(line_num) + ": Too many sequences detected";
+
+ if (seq_names[seq_id] == "") { // cut out the sequence name
+ string::size_type pos = line.find_first_of(" \t");
+ if (pos == string::npos) pos = 10; // assume standard phylip
+ seq_names[seq_id] = line.substr(0, pos);
+ line.erase(0, pos);
+ }
+ for (string::iterator it = line.begin(); it != line.end(); it++) {
+ if ((*it) <= ' ') continue;
+ if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.' || (*it) == '*' || (*it) == '~')
+ sequences[seq_id].append(1, toupper(*it));
+ else {
+ err_str << "Line " << line_num <<": Unrecognized character " << *it;
+ throw err_str.str();
+ }
+ }
+ if (sequences[seq_id].length() > nsite)
+ throw ("Line " + convertIntToString(line_num) + ": Sequence " + seq_names[seq_id] + " is too long (" + convertIntToString(sequences[seq_id].length()) + ")");
+ if (sequences[seq_id].length() == nsite) {
+ seq_id++;
+ }
+ }
+ //sequences.
+ }
+ in.clear();
+ // set the failbit again
+ in.exceptions(ios::failbit | ios::badbit);
+ in.close();
+
+ return buildPattern(sequences, sequence_type, nseq, nsite);
+}
+
int Alignment::readFasta(char *filename, char *sequence_type) {
StrVector sequences;
@@ -3140,7 +3213,7 @@ void Alignment::computeCodonFreq(StateFreqType freq, double *state_freq, double
convfreq(state_freq);
}
-void Alignment::computeEmpiricalRate (double *rates) {
+void Alignment::computeDivergenceMatrix(double *rates) {
int i, j, k;
assert(rates);
int nseqs = getNSeq();
@@ -3181,8 +3254,8 @@ void Alignment::computeEmpiricalRate (double *rates) {
for (j = i+1; j < num_states; j++) {
rates[k++] = (pair_rates[i*num_states+j] + pair_rates[j*num_states+i]) / last_rate;
// BIG WARNING: zero rates might cause numerical instability!
- if (rates[k-1] <= 0.0001) rates[k-1] = 0.01;
- if (rates[k-1] > 100.0) rates[k-1] = 50.0;
+// if (rates[k-1] <= 0.0001) rates[k-1] = 0.01;
+// if (rates[k-1] > 100.0) rates[k-1] = 50.0;
}
rates[k-1] = 1;
if (verbose_mode >= VB_MAX) {
@@ -3199,11 +3272,11 @@ void Alignment::computeEmpiricalRate (double *rates) {
delete [] pair_rates;
}
-void Alignment::computeEmpiricalRateNonRev (double *rates) {
+void Alignment::computeDivergenceMatrixNonRev (double *rates) {
double *rates_mat = new double[num_states*num_states];
int i, j, k;
- computeEmpiricalRate(rates);
+ computeDivergenceMatrix(rates);
for (i = 0, k = 0; i < num_states-1; i++)
for (j = i+1; j < num_states; j++)
@@ -3472,3 +3545,6 @@ double Alignment::multinomialProb (IntVector &pattern_freq)
}
return (fac - sumFac + sumProb);
}
+
+
+
diff --git a/alignment.h b/alignment.h
index 1b736c9..fdee2c8 100644
--- a/alignment.h
+++ b/alignment.h
@@ -111,7 +111,7 @@ public:
int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite);
/**
- read the alignment in PHYLIP format
+ read the alignment in PHYLIP format (interleaved)
@param filename file name
@param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
@return 1 on success, 0 on failure
@@ -119,6 +119,14 @@ public:
int readPhylip(char *filename, char *sequence_type);
/**
+ read the alignment in sequential PHYLIP format
+ @param filename file name
+ @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+ @return 1 on success, 0 on failure
+ */
+ int readPhylipSequential(char *filename, char *sequence_type);
+
+ /**
read the alignment in FASTA format
@param filename file name
@param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
@@ -542,13 +550,13 @@ public:
compute empirical rates between state pairs
@param rates (OUT) vector of size num_states*(num_states-1)/2 for the rates
*/
- virtual void computeEmpiricalRate(double *rates);
+ virtual void computeDivergenceMatrix(double *rates);
/**
compute non-reversible empirical rates between state pairs
@param rates (OUT) vector of size num_states*(num_states-1) for the rates
*/
- virtual void computeEmpiricalRateNonRev(double *rates);
+ virtual void computeDivergenceMatrixNonRev(double *rates);
/**
count the fraction of constant sites in the alignment, update the variable frac_const_sites
diff --git a/checkpoint.cpp b/checkpoint.cpp
index 19312a6..1229d3f 100644
--- a/checkpoint.cpp
+++ b/checkpoint.cpp
@@ -99,7 +99,9 @@ void Checkpoint::setDumpInterval(double interval) {
void Checkpoint::dump(bool force) {
- assert(filename != "");
+ if (filename == "")
+ return;
+
if (!force && getRealTime() < prev_dump_time + dump_interval) {
return;
}
diff --git a/iqtree.cpp b/iqtree.cpp
index 2179c96..cf8dc72 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -314,6 +314,19 @@ void IQTree::initSettings(Params ¶ms) {
boot_trees_brlen.resize(params.gbo_replicates);
} else {
cout << "CHECKPOINT: " << boot_trees.size() << " UFBoot trees and " << boot_splits.size() << " UFBootSplits restored" << endl;
+ // TODO: quick and dirty fix, no branch lengths are saved after checkpointing
+ if (params.print_ufboot_trees == 2) {
+ boot_trees_brlen.resize(params.gbo_replicates);
+ string ufboot_file = params.out_prefix + string(".ufboot");
+ if (fileExists(ufboot_file)) {
+ ifstream in(ufboot_file.c_str());
+ for (i = 0; i < params.gbo_replicates && !in.eof(); i++)
+ in >> boot_trees_brlen[i];
+ in.close();
+ } else {
+ outWarning("Cannot properly restore bootstrap trees with branch lengths");
+ }
+ }
}
VerboseMode saved_mode = verbose_mode;
verbose_mode = VB_QUIET;
@@ -1720,8 +1733,8 @@ extern pllUFBootData * pllUFBootDataPtr;
string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
if (logl_epsilon == -1)
logl_epsilon = params->modeps;
-// if (params->test_param)
-// logl_epsilon = 1.0;
+// if (params->opt_gammai)
+// logl_epsilon = 0.1;
cout << "Estimate model parameters (epsilon = " << logl_epsilon << ")" << endl;
double stime = getRealTime();
string newTree;
@@ -1746,13 +1759,9 @@ string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
cout << etime - stime << " seconds (logl: " << curScore << ")" << endl;
} else {
double modOptScore;
- if (params->test_param) { // DO RESTART ON ALPHA AND P_INVAR
- double stime = getRealTime();
+ if (params->opt_gammai) { // DO RESTART ON ALPHA AND P_INVAR
modOptScore = getModelFactory()->optimizeParametersGammaInvar(params->fixed_branch_length, printInfo, logl_epsilon);
- double etime = getRealTime();
- cout << "Testing param took: " << etime -stime << " CPU seconds" << endl;
- cout << endl;
- params->test_param = false;
+ params->opt_gammai = false;
} else {
modOptScore = getModelFactory()->optimizeParameters(params->fixed_branch_length, printInfo, logl_epsilon);
}
@@ -1915,6 +1924,10 @@ double IQTree::doTreeSearch() {
* Perturb the tree
*---------------------------------------*/
double perturbScore = 0.0;
+ int numStableBranches = aln->getNSeq() - 3 - candidateTrees.getStableSplits().size();
+ // Change from floor to ceil to make sure perturbing at least 1 branch
+ int numPerturb = ceil(searchinfo.curPerStrength * numStableBranches);
+ bool treechanged = false;
if (iqp_assess_quartet == IQP_BOOTSTRAP) {
// create bootstrap sample
Alignment* bootstrap_alignment;
@@ -1929,8 +1942,6 @@ double IQTree::doTreeSearch() {
curScore = optimizeAllBranches();
} else {
if (params->snni) {
- int numStableBranches = aln->getNSeq() - 3 - candidateTrees.getStableSplits().size();
- int numNNI = floor(searchinfo.curPerStrength * numStableBranches);
// string candidateTree = candidateTrees.getRandCandTree();
// readTreeString(candidateTree);
readTreeString(candidateTrees.getRandCandTree());
@@ -1939,7 +1950,7 @@ double IQTree::doTreeSearch() {
if (params->iqp) {
doIQP();
} else {
- doRandomNNIs(numNNI);
+ doRandomNNIs(numPerturb);
}
} else {
readTreeString(candidateTrees.getBestTrees()[0]);
@@ -1957,8 +1968,11 @@ double IQTree::doTreeSearch() {
}
}
+ double oldScore = curScore;
computeLogL();
perturbScore = curScore;
+ if (perturbScore < oldScore - 0.01)
+ treechanged = true;
}
/*----------------------------------------
@@ -1968,6 +1982,10 @@ double IQTree::doTreeSearch() {
int nni_steps = 0;
imd_tree = doNNISearch(nni_count, nni_steps);
+
+ if (nni_count == 0 && params->snni && numPerturb > 0 && treechanged) {
+ assert(0 && "BUG: NNI could not improved perturbed tree");
+ }
if (iqp_assess_quartet == IQP_BOOTSTRAP) {
// restore alignment
@@ -2274,8 +2292,9 @@ double IQTree::optimizeNNI(int &nni_count, int &nni_steps) {
numNNIs = 1;
curScore = oldScore;
}
- if (curScore - oldScore < 0.1)
- break;
+ // BUG in following line, causing premature break by rollBack! that's why commented out
+// if (curScore - oldScore < 0.1)
+// break;
}
if (nni_count == 0 && verbose_mode >= VB_MED) {
diff --git a/model/modelcodon.cpp b/model/modelcodon.cpp
index 205b38a..b3c0fa0 100644
--- a/model/modelcodon.cpp
+++ b/model/modelcodon.cpp
@@ -9,6 +9,9 @@
#include <string>
+const double MIN_OMEGA_KAPPA = 0.001;
+const double MAX_OMEGA_KAPPA = 50.0;
+
/* Empirical codon model restricted (Kosiol et al. 2007), source: http://www.ebi.ac.uk/goldman/ECM/ */
string model_ECMrest1 =
"11.192024 \
@@ -454,6 +457,7 @@ StateFreqType ModelCodon::initGY94(bool fix_kappa, CodonKappaStyle kappa_style)
if (fix_kappa)
kappa = 1.0;
fix_kappa2 = true;
+ codon_freq_style = CF_TARGET_CODON;
this->codon_kappa_style = kappa_style;
if (kappa_style == CK_TWO_KAPPA)
fix_kappa2 = false;
@@ -890,6 +894,77 @@ void ModelCodon::setVariables(double *variables) {
}
}
+void ModelCodon::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+ int i, ndim = getNDim();
+
+ for (i = 1; i <= ndim; i++) {
+ //cout << variables[i] << endl;
+ lower_bound[i] = MIN_OMEGA_KAPPA;
+ upper_bound[i] = MAX_OMEGA_KAPPA;
+ bound_check[i] = false;
+ }
+
+ if (freq_type == FREQ_ESTIMATE) {
+ for (i = ndim-num_states+2; i <= ndim; i++) {
+// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state];
+// upper_bound[i] = state_freq[highest_freq_state]/MIN_FREQUENCY;
+ lower_bound[i] = MIN_FREQUENCY;
+// upper_bound[i] = 100.0;
+ upper_bound[i] = 1.0;
+ bound_check[i] = false;
+ }
+ }
+}
+
+double ModelCodon::optimizeParameters(double gradient_epsilon) {
+ int ndim = getNDim();
+
+ // return if nothing to be optimized
+ if (ndim == 0) return 0.0;
+
+ if (verbose_mode >= VB_MAX)
+ cout << "Optimizing " << name << " model parameters..." << endl;
+
+
+ double *variables = new double[ndim+1];
+ double *upper_bound = new double[ndim+1];
+ double *lower_bound = new double[ndim+1];
+ bool *bound_check = new bool[ndim+1];
+ double score;
+
+ for (int i = 0; i < num_states; i++)
+ if (state_freq[i] > state_freq[highest_freq_state])
+ highest_freq_state = i;
+
+ // by BFGS algorithm
+ setVariables(variables);
+ setBounds(lower_bound, upper_bound, bound_check);
+// if (phylo_tree->params->optimize_alg.find("BFGS-B") == string::npos)
+// score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE));
+// else
+ score = -L_BFGS_B(ndim, variables+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_RATE));
+
+ bool changed = getVariables(variables);
+ // BQM 2015-09-07: normalize state_freq
+ if (freq_type == FREQ_ESTIMATE) {
+ scaleStateFreq(true);
+ changed = true;
+ }
+ if (changed) {
+ decomposeRateMatrix();
+ phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
+ }
+
+ delete [] bound_check;
+ delete [] lower_bound;
+ delete [] upper_bound;
+ delete [] variables;
+
+ return score;
+}
+
+
void ModelCodon::writeInfo(ostream &out) {
if (name.find('_') == string::npos)
out << "Nonsynonymous/synonymous ratio (omega): " << omega << endl;
diff --git a/model/modelcodon.h b/model/modelcodon.h
index 134d72a..b36a6be 100644
--- a/model/modelcodon.h
+++ b/model/modelcodon.h
@@ -115,6 +115,16 @@ public:
/** compute the corrected empirical omega (Kosiol et al 2007) */
double computeEmpiricalOmega();
+ /**
+ * setup the bounds for joint optimization with BFGS
+ */
+ virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+ /**
+ optimize model parameters
+ @return the best likelihood
+ */
+ virtual double optimizeParameters(double gradient_epsilon);
/** 3x4 matrix of nucleotide frequencies at 1st,2nd,3rd codon position */
double *ntfreq;
diff --git a/model/modeldna.cpp b/model/modeldna.cpp
index f3ce873..f881bbd 100644
--- a/model/modeldna.cpp
+++ b/model/modeldna.cpp
@@ -219,10 +219,14 @@ void ModelDNA::readRates(string str) throw(const char*) {
if (str[end_pos] == '?') {
param_fixed[i+1] = false;
end_pos++;
- rate = i + 0.4;
+ rate = 1.0;
num_params++;
} else {
- param_fixed[i+1] = true;
+ if (Params::getInstance().optimize_rate_matrix) {
+ num_params++;
+ param_fixed[i+1] = false;
+ } else
+ param_fixed[i+1] = true;
try {
rate = convert_double(str.substr(end_pos).c_str(), new_end_pos);
} catch (string str) {
@@ -233,7 +237,7 @@ void ModelDNA::readRates(string str) throw(const char*) {
if (rate < 0.0)
outError("Negative rates found");
if (i == nrates-1 && end_pos < str.length())
- outError("String too long ", str);
+ outError("More than " + convertIntToString(nrates) + " rate parameters specified in " + str);
if (i < nrates-1 && end_pos >= str.length())
outError("Unexpected end of string ", str);
if (end_pos < str.length() && str[end_pos] != ',')
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
index bb7efbd..ad015c6 100644
--- a/model/modelfactory.cpp
+++ b/model/modelfactory.cpp
@@ -102,8 +102,7 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
is_storing = false;
joint_optimize = params.optimize_model_rate_joint;
fused_mix_rate = false;
-
- string model_str = params.model_name;
+ string model_str = params.model_name;
string rate_str;
try {
@@ -472,57 +471,57 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
//string rate_str = model_str.substr(pos);
if (posI != string::npos && posG != string::npos) {
site_rate = new RateGammaInvar(num_rate_cats, gamma_shape, params.gamma_median,
- p_invar_sites, params.optimize_model_rate_joint, tree);
+ p_invar_sites, params.optimize_alg_gammai, tree, false);
} else if (posI != string::npos && posR != string::npos) {
- site_rate = new RateFreeInvar(num_rate_cats, gamma_shape, freerate_params, p_invar_sites, !fused_mix_rate, params.optimize_alg, tree);
+ site_rate = new RateFreeInvar(num_rate_cats, gamma_shape, freerate_params, !fused_mix_rate, p_invar_sites, params.optimize_alg, tree);
} else if (posI != string::npos) {
site_rate = new RateInvar(p_invar_sites, tree);
} else if (posG != string::npos) {
site_rate = new RateGamma(num_rate_cats, gamma_shape, params.gamma_median, tree);
} else if (posR != string::npos) {
site_rate = new RateFree(num_rate_cats, gamma_shape, freerate_params, !fused_mix_rate, params.optimize_alg, tree);
- } else if ((posX = rate_str.find("+M")) != string::npos) {
- tree->setLikelihoodKernel(LK_NORMAL);
- params.rate_mh_type = true;
- if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
- num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
- if (num_rate_cats < 0) outError("Wrong number of rate categories");
- } else num_rate_cats = -1;
- if (num_rate_cats >= 0)
- site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type,
- params.rate_file, tree, params.rate_mh_type);
- else
- site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
- site_rate->setTree(tree);
- } else if ((posX = rate_str.find("+D")) != string::npos) {
- tree->setLikelihoodKernel(LK_NORMAL);
- params.rate_mh_type = false;
- if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
- num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
- if (num_rate_cats < 0) outError("Wrong number of rate categories");
- } else num_rate_cats = -1;
- if (num_rate_cats >= 0)
- site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type,
- params.rate_file, tree, params.rate_mh_type);
- else
- site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
- site_rate->setTree(tree);
- } else if ((posX = rate_str.find("+NGS")) != string::npos) {
- tree->setLikelihoodKernel(LK_NORMAL);
- if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
- num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
- if (num_rate_cats < 0) outError("Wrong number of rate categories");
- } else num_rate_cats = -1;
- site_rate = new NGSRateCat(tree, num_rate_cats);
- site_rate->setTree(tree);
- } else if ((posX = rate_str.find("+NGS")) != string::npos) {
- tree->setLikelihoodKernel(LK_NORMAL);
- if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
- num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
- if (num_rate_cats < 0) outError("Wrong number of rate categories");
- } else num_rate_cats = -1;
- site_rate = new NGSRate(tree);
- site_rate->setTree(tree);
+// } else if ((posX = rate_str.find("+M")) != string::npos) {
+// tree->setLikelihoodKernel(LK_NORMAL);
+// params.rate_mh_type = true;
+// if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
+// num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
+// if (num_rate_cats < 0) outError("Wrong number of rate categories");
+// } else num_rate_cats = -1;
+// if (num_rate_cats >= 0)
+// site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type,
+// params.rate_file, tree, params.rate_mh_type);
+// else
+// site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
+// site_rate->setTree(tree);
+// } else if ((posX = rate_str.find("+D")) != string::npos) {
+// tree->setLikelihoodKernel(LK_NORMAL);
+// params.rate_mh_type = false;
+// if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
+// num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
+// if (num_rate_cats < 0) outError("Wrong number of rate categories");
+// } else num_rate_cats = -1;
+// if (num_rate_cats >= 0)
+// site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type,
+// params.rate_file, tree, params.rate_mh_type);
+// else
+// site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
+// site_rate->setTree(tree);
+// } else if ((posX = rate_str.find("+NGS")) != string::npos) {
+// tree->setLikelihoodKernel(LK_NORMAL);
+// if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
+// num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
+// if (num_rate_cats < 0) outError("Wrong number of rate categories");
+// } else num_rate_cats = -1;
+// site_rate = new NGSRateCat(tree, num_rate_cats);
+// site_rate->setTree(tree);
+// } else if ((posX = rate_str.find("+NGS")) != string::npos) {
+// tree->setLikelihoodKernel(LK_NORMAL);
+// if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
+// num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
+// if (num_rate_cats < 0) outError("Wrong number of rate categories");
+// } else num_rate_cats = -1;
+// site_rate = new NGSRate(tree);
+// site_rate->setTree(tree);
} else if ((posX = rate_str.find("+K")) != string::npos) {
if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
@@ -705,80 +704,20 @@ double ModelFactory::initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst
}
double ModelFactory::optimizeParametersOnly(double gradient_epsilon) {
- double logl;
- if (Params::getInstance().fai && site_rate != NULL && model != NULL) {
- cout << "Optimize substitutional and site rates with restart ..." << endl;
- PhyloTree* tree = site_rate->phylo_tree;
- double initAlpha = 0.1;
- double maxInitAlpha = 1.0;
- double alphaStep = 0.1;
- double bestLogl = -DBL_MAX;
- double bestAlpha = 0.0;
- double bestPInvar = 0.0;
- double initPInvar = site_rate->getPInvar();
- int numRateEntries = model->getNumRateEntries();
- double *initRates = new double[numRateEntries];
- double *bestRates = new double[numRateEntries];
- model->getRateMatrix(initRates);
- int numStates = model->num_states;
- double *initStateFreqs = new double[numStates];
- model->getStateFrequency(initStateFreqs);
- double *bestStateFreqs = new double[numStates];
- DoubleVector initBranchLengths;
- DoubleVector bestBranchLengths;
- tree->saveBranchLengths(initBranchLengths);
-
- while (initAlpha <= maxInitAlpha) {
- tree->restoreBranchLengths(initBranchLengths);
- double initLogl = initGTRGammaIParameters(site_rate, model, initAlpha, initPInvar, initRates, initStateFreqs);
- if (joint_optimize) {
- logl = optimizeAllParameters(gradient_epsilon);
- } else {
- model->optimizeParameters(gradient_epsilon);
- site_rate->optimizeParameters(gradient_epsilon);
- logl = tree->optimizeAllBranches(1);
- }
- RateHeterogeneity* rateGammaInvar = site_rate;
- ModelGTR* modelGTR = (ModelGTR*)(model);
- double curAlpha = rateGammaInvar->getGammaShape();
- double curPInvar = rateGammaInvar->getPInvar();
- if (logl > bestLogl) {
- bestLogl = logl;
- bestAlpha = curAlpha;
- bestPInvar = curPInvar;
- modelGTR->getRateMatrix(bestRates);
- modelGTR->getStateFrequency(bestStateFreqs);
- tree->saveBranchLengths(bestBranchLengths);
- }
- if (verbose_mode >= VB_MED) {
- cout << "Init. alpha = " << initAlpha << " / Init. PInvar = " << initPInvar << " / Init. Logl = " <<
- initLogl << " / Est. alpha = " << curAlpha
- << "/ Est. pinv = " << curPInvar << " / Final Logl = " << logl << endl;
- }
- initAlpha = initAlpha + alphaStep;
- }
- cout << "Best alpha = " << bestAlpha << " / best p_invar = " << bestPInvar << endl;
- tree->restoreBranchLengths(bestBranchLengths);
- logl = initGTRGammaIParameters(site_rate, model, bestAlpha, bestPInvar, bestRates, bestStateFreqs);
- delete [] initRates;
- delete [] bestRates;
- delete [] initStateFreqs;
- delete [] bestStateFreqs;
- } else {
- /* Optimize substitutional and heterogeneity rates independetly */
- if (!joint_optimize) {
- double model_lh = model->optimizeParameters(gradient_epsilon);
- double rate_lh = site_rate->optimizeParameters(gradient_epsilon);
- if (rate_lh == 0.0)
- logl = model_lh;
- else
- logl = rate_lh;
- } else {
- /* Optimize substitutional and heterogeneity rates jointly using BFGS */
- logl = optimizeAllParameters(gradient_epsilon);
- }
- }
- return logl;
+ double logl;
+ /* Optimize substitution and heterogeneity rates independently */
+ if (!joint_optimize) {
+ double model_lh = model->optimizeParameters(gradient_epsilon);
+ double rate_lh = site_rate->optimizeParameters(gradient_epsilon);
+ if (rate_lh == 0.0)
+ logl = model_lh;
+ else
+ logl = rate_lh;
+ } else {
+ /* Optimize substitution and heterogeneity rates jointly using BFGS */
+ logl = optimizeAllParameters(gradient_epsilon);
+ }
+ return logl;
}
double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
@@ -830,27 +769,20 @@ double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
}
double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
- PhyloTree *tree = site_rate->getTree();
-
- RateHeterogeneity* site_rates = tree->getRate();
- if (site_rates == NULL) {
-// outError("The model must be +I+G");
- // model is not +I+G, call conventional function instead
- return optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
- }
-
+ if (!site_rate->isGammai())
+ return optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
+
+ double begin_time = getRealTime();
+
+ PhyloTree *tree = site_rate->getTree();
double frac_const = tree->aln->frac_const_sites;
- if (fixed_len) {
- tree->setCurScore(tree->computeLikelihood());
- } else {
- tree->optimizeAllBranches(1);
- }
-
+ tree->setCurScore(tree->computeLikelihood());
/* Back up branch lengths and substitutional rates */
- DoubleVector lenvec;
+ DoubleVector initBranLens;
DoubleVector bestLens;
- tree->saveBranchLengths(lenvec);
+ tree->saveBranchLengths(initBranLens);
+ bestLens = initBranLens;
int numRateEntries = tree->getModel()->getNumRateEntries();
double *rates = new double[numRateEntries];
double *bestRates = new double[numRateEntries];
@@ -861,64 +793,100 @@ double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_inf
/* Best estimates found */
double *bestStateFreqs = new double[numStates];
- double bestLogl = tree->getCurScore();
+ double bestLogl = -DBL_MAX;
double bestAlpha = 0.0;
double bestPInvar = 0.0;
- double testInterval = (frac_const - MIN_PINVAR*2) / 10;
+ double testInterval = (frac_const - MIN_PINVAR * 2) / 9;
double initPInv = MIN_PINVAR;
- double initAlpha = site_rates->getGammaShape();
+ double initAlpha = site_rate->getGammaShape();
+
+ if (Params::getInstance().opt_gammai_fast) {
+ initPInv = frac_const/2;
+ bool stop = false;
+ while(!stop) {
+ if (write_info) {
+ cout << endl;
+ cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
+ }
+
+ vector<double> estResults = optimizeGammaInvWithInitValue(fixed_len, logl_epsilon, gradient_epsilon, tree, site_rate, rates, state_freqs,
+ initPInv, initAlpha, initBranLens);
- if (write_info)
- cout << "testInterval: " << testInterval << endl;
- // Now perform testing different inital p_inv values
- while (initPInv <= frac_const) {
- if (write_info) {
- cout << endl;
- cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
+ if (write_info) {
+ cout << "Est. p_inv: " << estResults[0] << " / Est. gamma shape: " << estResults[1]
+ << " / Logl: " << estResults[2] << endl;
+ }
+
+ if (estResults[2] > bestLogl) {
+ bestLogl = estResults[2];
+ bestAlpha = estResults[1];
+ bestPInvar = estResults[0];
+ bestLens.clear();
+ tree->saveBranchLengths(bestLens);
+ tree->getModel()->getRateMatrix(bestRates);
+ tree->getModel()->getStateFrequency(bestStateFreqs);
+ if (estResults[0] < initPInv) {
+ initPInv = estResults[0] - testInterval;
+ if (initPInv < 0.0)
+ initPInv = 0.0;
+ } else {
+ initPInv = estResults[0] + testInterval;
+ if (initPInv > frac_const)
+ initPInv = frac_const;
+ }
+ //cout << "New initPInv = " << initPInv << endl;
+ } else {
+ stop = true;
+ }
}
- tree->restoreBranchLengths(lenvec);
- ((ModelGTR*) tree->getModel())->setRateMatrix(rates);
- ((ModelGTR*) tree->getModel())->setStateFrequency(state_freqs);
- tree->getModel()->decomposeRateMatrix();
- site_rates->setPInvar(initPInv);
- site_rates->setGammaShape(initAlpha);
- tree->clearAllPartialLH();
- optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
- double estAlpha = tree->getRate()->getGammaShape();
- double estPInv = tree->getRate()->getPInvar();
- double logl = tree->getCurScore();
- if (write_info) {
- cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
- << " / Logl: " << logl << endl;
+ } else {
+ // Now perform testing different initial p_inv values
+ while (initPInv <= frac_const) {
+ if (write_info) {
+ cout << endl;
+ cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
+ }
+ vector<double> estResults; // vector of p_inv, alpha and logl
+ if (Params::getInstance().opt_gammai_keep_bran)
+ estResults = optimizeGammaInvWithInitValue(fixed_len, logl_epsilon, gradient_epsilon, tree, site_rate, rates, state_freqs,
+ initPInv, initAlpha, bestLens);
+ else
+ estResults = optimizeGammaInvWithInitValue(fixed_len, logl_epsilon, gradient_epsilon, tree, site_rate, rates, state_freqs,
+ initPInv, initAlpha, initBranLens);
+ if (write_info) {
+ cout << "Est. p_inv: " << estResults[0] << " / Est. gamma shape: " << estResults[1]
+ << " / Logl: " << estResults[2] << endl;
+ }
+
+ initPInv = initPInv + testInterval;
+
+ if (estResults[2] > bestLogl + logl_epsilon) {
+ bestLogl = estResults[2];
+ bestAlpha = estResults[1];
+ bestPInvar = estResults[0];
+ bestLens.clear();
+ tree->saveBranchLengths(bestLens);
+ tree->getModel()->getRateMatrix(bestRates);
+ tree->getModel()->getStateFrequency(bestStateFreqs);
+ }
}
- initPInv = initPInv + testInterval;
-
- if (tree->getCurScore() > bestLogl) {
- bestLogl = logl;
- bestAlpha = estAlpha;
- bestPInvar = estPInv;
- bestLens.clear();
- tree->saveBranchLengths(bestLens);
- tree->getModel()->getRateMatrix(bestRates);
- tree->getModel()->getStateFrequency(bestStateFreqs);
- }
- }
+ }
- site_rates->setGammaShape(bestAlpha);
-// site_rates->setFixGammaShape(false);
- site_rates->setPInvar(bestPInvar);
-// site_rates->setFixPInvar(false);
+ site_rate->setGammaShape(bestAlpha);
+ site_rate->setPInvar(bestPInvar);
((ModelGTR*) tree->getModel())->setRateMatrix(bestRates);
((ModelGTR*) tree->getModel())->setStateFrequency(bestStateFreqs);
tree->restoreBranchLengths(bestLens);
tree->getModel()->decomposeRateMatrix();
+
tree->clearAllPartialLH();
tree->setCurScore(tree->computeLikelihood());
+ assert(fabs(tree->getCurScore() - bestLogl) < 1.0);
if (write_info) {
cout << endl;
- cout << "Best initial alpha: " << bestAlpha << " / initial pinv: " << bestPInvar << " / ";
+ cout << "Best p_inv: " << bestPInvar << " / best gamma shape: " << bestAlpha << " / ";
cout << "Logl: " << tree->getCurScore() << endl;
}
@@ -927,6 +895,10 @@ double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_inf
delete [] bestRates;
delete [] bestStateFreqs;
+ double elapsed_secs = getRealTime() - begin_time;
+ if (write_info)
+ cout << "Parameters optimization took " << elapsed_secs << " sec" << endl;
+
// updating global variable is not safe!
// Params::getInstance().testAlpha = false;
@@ -934,6 +906,29 @@ double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_inf
return tree->getCurScore();
}
+vector<double> ModelFactory::optimizeGammaInvWithInitValue(bool fixed_len, double logl_epsilon, double gradient_epsilon,
+ PhyloTree *tree, RateHeterogeneity *site_rates, double *rates,
+ double *state_freqs, double initPInv, double initAlpha,
+ DoubleVector &lenvec) {
+ tree->restoreBranchLengths(lenvec);
+ ((ModelGTR*) tree->getModel())->setRateMatrix(rates);
+ ((ModelGTR*) tree->getModel())->setStateFrequency(state_freqs);
+ tree->getModel()->decomposeRateMatrix();
+ site_rates->setPInvar(initPInv);
+ site_rates->setGammaShape(initAlpha);
+ tree->clearAllPartialLH();
+ optimizeParameters(fixed_len, false, logl_epsilon, gradient_epsilon);
+
+ vector<double> estResults;
+ double estPInv = tree->getRate()->getPInvar();
+ double estAlpha = tree->getRate()->getGammaShape();
+ double logl = tree->getCurScore();
+ estResults.push_back(estPInv);
+ estResults.push_back(estAlpha);
+ estResults.push_back(logl);
+ return estResults;
+}
+
double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
double logl_epsilon, double gradient_epsilon) {
@@ -948,19 +943,9 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
assert(tree);
stopStoringTransMatrix();
- // modified by Thomas Wong on Sept 11, 15
- // no optimization of branch length in the first round
- cur_lh = tree->computeLikelihood();
- /*
- if (fixed_len || tree->params->num_param_iterations == 0)
- cur_lh = tree->computeLikelihood();
- else {
- if (!Params::getInstance().testAlpha && !Params::getInstance().fai)
- cur_lh = tree->optimizeAllBranches(1);
- else
- cur_lh = tree->computeLikelihood();
- }
- */
+ // modified by Thomas Wong on Sept 11, 15
+ // no optimization of branch length in the first round
+ cur_lh = tree->computeLikelihood();
tree->setCurScore(cur_lh);
if (verbose_mode >= VB_MED || write_info)
cout << "1. Initial log-likelihood: " << cur_lh << endl;
@@ -977,31 +962,11 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
//bool optimize_rate = true;
// double gradient_epsilon = min(logl_epsilon, 0.01); // epsilon for parameters starts at epsilon for logl
for (i = 2; i < tree->params->num_param_iterations; i++) {
-// if (gradient_epsilon < 0.001)
-// gradient_epsilon = 0.001;
- /*
- double model_lh = model->optimizeParameters(param_epsilon);
- double rate_lh = 0.0;
- if (optimize_rate) {
- rate_lh = site_rate->optimizeParameters(param_epsilon);
- if (rate_lh < model_lh+1e-6 && model_lh != 0.0) optimize_rate = false;
- }
- if (model_lh == 0.0 && rate_lh == 0.0) {
- if (!fixed_len) cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
- break;
- }
- double new_lh = (rate_lh != 0.0) ? rate_lh : model_lh;
- */
double new_lh;
- if (Params::getInstance().fai && i > 2) {
- Params::getInstance().fai = false;
- }
-
- // changed to opimise edge length first, and then Q,W,R inside the loop by Thomas on Sept 11, 15
+ // changed to opimise edge length first, and then Q,W,R inside the loop by Thomas on Sept 11, 15
if (!fixed_len)
new_lh = tree->optimizeAllBranches(min(i,3), logl_epsilon); // loop only 3 times in total (previously in v0.9.6 5 times)
-
new_lh = optimizeParametersOnly(gradient_epsilon);
if (new_lh == 0.0) {
@@ -1013,18 +978,6 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
site_rate->writeInfo(cout);
}
if (new_lh > cur_lh + logl_epsilon) {
- if (Params::getInstance().testAlpha && Params::getInstance().testAlphaEpsAdaptive) {
- if (i == 3) {
- double newEpsilon = (new_lh - cur_lh) * 0.01;
- if (newEpsilon > defaultEpsilon) {
- logl_epsilon = newEpsilon;
- cout << "Estimate model parameters with new epsilon = " << logl_epsilon << endl;
- }
- }
- }
-
-// if (gradient_epsilon > (new_lh - cur_lh) * logl_epsilon)
-// gradient_epsilon = (new_lh - cur_lh) * logl_epsilon;
cur_lh = new_lh;
if (verbose_mode >= VB_MED || write_info)
cout << i << ". Current log-likelihood: " << cur_lh << endl;
@@ -1036,12 +989,17 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
}
// normalize rates s.t. branch lengths are #subst per site
- double mean_rate = site_rate->rescaleRates();
- if (mean_rate != 1.0) {
- tree->scaleLength(mean_rate);
- tree->clearAllPartialLH();
+// if (Params::getInstance().optimize_alg_gammai != "EM")
+ {
+ double mean_rate = site_rate->rescaleRates();
+ if (mean_rate != 1.0) {
+ tree->scaleLength(mean_rate);
+ tree->clearAllPartialLH();
+ }
}
+
+
if (verbose_mode >= VB_MED || write_info)
cout << "Optimal log-likelihood: " << cur_lh << endl;
diff --git a/model/modelfactory.h b/model/modelfactory.h
index baedb6e..842d69c 100644
--- a/model/modelfactory.h
+++ b/model/modelfactory.h
@@ -166,7 +166,7 @@ public:
@return the best likelihood
*/
virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
- double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+ double logl_epsilon = 0.1, double gradient_epsilon = 0.0001);
/**
* optimize model parameters and tree branch lengths for the +I+G model
@@ -174,7 +174,8 @@ public:
* @param fixed_len TRUE to fix branch lengths, default is false
* @return the best likelihood
*/
- virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+ virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true,
+ double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
* @return TRUE if parameters are at the boundary that may cause numerical unstability
@@ -259,6 +260,9 @@ protected:
*/
virtual bool getVariables(double *variables);
+ vector<double> optimizeGammaInvWithInitValue(bool fixed_len, double logl_epsilon, double gradient_epsilon, PhyloTree *tree,
+ RateHeterogeneity *site_rates, double *rates, double *state_freqs,
+ double initPInv, double initAlpha, DoubleVector &lenvec);
};
#endif
diff --git a/model/modelgtr.cpp b/model/modelgtr.cpp
index 557a291..7a7b27d 100644
--- a/model/modelgtr.cpp
+++ b/model/modelgtr.cpp
@@ -55,9 +55,9 @@ ModelGTR::ModelGTR(PhyloTree *tree, bool count_rates)
eigen_coeff = aligned_alloc<double>(ncoeff);
- if (count_rates)
- phylo_tree->aln->computeEmpiricalRate(rates);
- else
+// if (count_rates)
+// computeEmpiricalRate();
+// else
for (i=0; i < nrate; i++) rates[i] = 1.0;
//eigen_coeff_derv1 = new double[ncoeff];
//eigen_coeff_derv2 = new double[ncoeff];
@@ -558,7 +558,7 @@ double ModelGTR::optimizeParameters(double gradient_epsilon) {
// return if nothing to be optimized
if (ndim == 0) return 0.0;
-
+
if (verbose_mode >= VB_MAX)
cout << "Optimizing " << name << " model parameters..." << endl;
diff --git a/model/modelgtr.h b/model/modelgtr.h
index c26bc52..a237168 100644
--- a/model/modelgtr.h
+++ b/model/modelgtr.h
@@ -50,7 +50,7 @@ public:
@param tree associated tree for the model
*/
ModelGTR(PhyloTree *tree, bool count_rates = true);
-
+
/**
init the model and decompose the rate matrix. This function should always be called
after creating the class. Otherwise it will not work properly.
diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp
index 2923b3a..0381513 100644
--- a/model/modelmixture.cpp
+++ b/model/modelmixture.cpp
@@ -721,7 +721,7 @@ frequency Fclass1 = 0.02549352 0.01296012 0.005545202 0.006005566 0.01002193 0.0
frequency Fclass2 = 0.09596966 0.008786096 0.02805857 0.01880183 0.005026264 0.006454635 0.01582725 0.7215719 0.003379354 0.002257725 0.003013483 0.01343441 0.001511657 0.002107865 0.006751404 0.04798539 0.01141559 0.000523736 0.002188483 0.004934972;\n\
frequency Fclass3 = 0.01726065 0.005467988 0.01092937 0.3627871 0.001046402 0.01984758 0.5149206 0.004145081 0.002563289 0.002955213 0.005286931 0.01558693 0.002693098 0.002075771 0.003006167 0.01263069 0.01082144 0.000253451 0.001144787 0.004573568;\n\
frequency Fclass4 = 0.1263139 0.09564027 0.07050061 0.03316681 0.02095119 0.05473468 0.02790523 0.009007538 0.03441334 0.005855319 0.008061884 0.1078084 0.009019514 0.05018693 0.07948 0.09447839 0.09258897 0.01390669 0.05367769 0.01230413;\n\
-frequency CF4 = FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4};\n\
+model CF4 = POISSON+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;\n\
model JTTCF4G = JTT+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;\n\
\n\
[ ---------------------------------------------------------\n\
diff --git a/model/modelnonrev.cpp b/model/modelnonrev.cpp
index 315c43b..f63b155 100644
--- a/model/modelnonrev.cpp
+++ b/model/modelnonrev.cpp
@@ -27,9 +27,9 @@ ModelNonRev::ModelNonRev(PhyloTree *tree, bool count_rates)
delete [] rates;
rates = new double [num_params+1];
memset(rates, 0, sizeof(double) * (num_params+1));
- if (count_rates)
- phylo_tree->aln->computeEmpiricalRateNonRev(rates);
- else
+// if (count_rates)
+// computeEmpiricalRate();
+// else
for (int i = 0; i <= num_params; i++)
rates[i] = 1.0;
name = "UNREST";
diff --git a/model/ratefree.cpp b/model/ratefree.cpp
index 3265a2b..4e84f81 100644
--- a/model/ratefree.cpp
+++ b/model/ratefree.cpp
@@ -87,7 +87,7 @@ void RateFree::setNCategory(int ncat) {
int i;
for (i = 0; i < ncategory; i++)
- prop[i] = 1.0/ncategory;
+ prop[i] = (1.0-getPInvar())/ncategory;
// double sum_prop = (ncategory)*(ncategory+1)/2.0;
// double sum = 0.0;
@@ -110,6 +110,7 @@ void RateFree::setNCategory(int ncat) {
void RateFree::setRateAndProp(RateFree *input) {
assert(input->ncategory == ncategory-1);
+ setPInvar(input->getPInvar());
int k = 0, i;
// get the category k with largest proportion
for (i = 1; i < ncategory-1; i++)
@@ -272,7 +273,7 @@ void RateFree::setBounds(double *lower_bound, double *upper_bound, bool *bound_c
}
} else if (optimizing_params == 1){
// rates
- for (i = 1; i <= ncategory; i++) {
+ for (i = 1; i < ncategory; i++) {
lower_bound[i] = MIN_FREE_RATE;
upper_bound[i] = MAX_FREE_RATE;
bound_check[i] = false;
@@ -471,7 +472,7 @@ double RateFree::optimizeWithEM() {
model_fac->site_rate = site_rate;
tree->model_factory = model_fac;
tree->setParams(phylo_tree->params);
-
+ double old_score = 0.0;
// EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
for (int step = 0; step < ncategory; step++) {
// first compute _pattern_lh_cat
@@ -482,13 +483,19 @@ double RateFree::optimizeWithEM() {
writeInfo(cout);
}
assert(score < 0);
+
+ if (step > 0)
+ assert(score > old_score-0.1);
+
+ old_score = score;
+
memset(new_prop, 0, nmix*sizeof(double));
// E-step
// decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
for (ptn = 0; ptn < nptn; ptn++) {
double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
- double lk_ptn = 0.0;
+ double lk_ptn = phylo_tree->ptn_invar[ptn];
for (c = 0; c < nmix; c++) {
lk_ptn += this_lk_cat[c];
}
@@ -505,6 +512,7 @@ double RateFree::optimizeWithEM() {
// M-step, update weights according to (*)
int maxpropid = 0;
+ double new_pinvar = 0.0;
for (c = 0; c < nmix; c++) {
new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
if (new_prop[c] > new_prop[maxpropid])
@@ -530,9 +538,19 @@ double RateFree::optimizeWithEM() {
sum_prop += new_prop[c];
converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
prop[c] = new_prop[c];
+ new_pinvar += new_prop[c];
}
- assert(fabs(sum_prop-1.0) < MIN_PROP);
+ new_pinvar = 1.0 - new_pinvar;
+
+ if (new_pinvar != 0.0) {
+ converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4);
+ setPInvar(new_pinvar);
+// setOptimizePInvar(false);
+ phylo_tree->computePtnInvar();
+ }
+
+ assert(fabs(sum_prop+new_pinvar-1.0) < MIN_PROP);
// now optimize rates one by one
double sum = 0.0;
diff --git a/model/ratefreeinvar.cpp b/model/ratefreeinvar.cpp
index 0ec731c..b70ebb4 100644
--- a/model/ratefreeinvar.cpp
+++ b/model/ratefreeinvar.cpp
@@ -13,6 +13,7 @@ RateFreeInvar::RateFreeInvar(int ncat, double start_alpha, string params, bool s
cur_optimize = 0;
name = "+I" + name;
full_name = "Invar+" + full_name;
+ setNCategory(ncat);
}
void RateFreeInvar::saveCheckpoint() {
diff --git a/model/ratefreeinvar.h b/model/ratefreeinvar.h
index cb1422c..fbd912c 100644
--- a/model/ratefreeinvar.h
+++ b/model/ratefreeinvar.h
@@ -42,7 +42,7 @@ public:
@param category category ID from 0 to #category-1
@return the proportion of the specified category
*/
- virtual double getProp(int category) { return (1.0-p_invar)*prop[category]; }
+ virtual double getProp(int category) { return prop[category]; }
/**
get the rate of a specified category. Default returns 1.0 since it is homogeneous model
diff --git a/model/rategamma.cpp b/model/rategamma.cpp
index 48e0814..030abdc 100644
--- a/model/rategamma.cpp
+++ b/model/rategamma.cpp
@@ -1,6 +1,8 @@
/***************************************************************************
- * Copyright (C) 2009 by BUI Quang Minh *
- * minh.bui at univie.ac.at *
+ * Copyright (C) 2009-2015 by *
+ * BUI Quang Minh <minh.bui at univie.ac.at> *
+ * Lam-Tung Nguyen <nltung at gmail.com> *
+ * *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
@@ -68,6 +70,8 @@ void RateGamma::setNCategory(int ncat) {
ncategory = ncat;
if (rates) delete [] rates;
rates = new double[ncategory];
+ for (int cat = 0; cat < ncategory; cat++)
+ rates[cat] = 1.0;
name = "+G" + convertIntToString(ncategory);
full_name = "Gamma with " + convertIntToString(ncategory) + " categories";
computeRates();
@@ -95,6 +99,10 @@ void RateGamma::computeRates() {
return;
}
+ double curScale = 0.0;
+ for (cat = 0; cat < ncategory; cat++)
+ curScale += rates[cat];
+
if (!cut_median) {
computeRatesMean();
} else {
@@ -119,10 +127,21 @@ void RateGamma::computeRates() {
if (phylo_tree && phylo_tree->params && phylo_tree->params->no_rescale_gamma_invar)
return;
+ double newScale = 0.0;
+ for (cat = 0; cat < ncategory; cat++)
+ newScale += rates[cat];
+
+ if (newScale != curScale) {
+ for (cat = 0; cat < ncategory; cat++)
+ rates[cat] *= curScale/newScale;
+ }
+
/* if invariable sites are present */
- double p_inv = getPInvar();
- for (cat = 0; cat < ncategory; cat++)
- rates[cat] = rates[cat]/(1.0 - p_inv);
+// if (Params::getInstance().optimize_alg_gammai != "EM") {
+// double p_inv = getPInvar();
+// for (cat = 0; cat < ncategory; cat++)
+// rates[cat] = rates[cat]/(1.0 - p_inv);
+// }
/* check for very small rates */
// for (cat = 0; cat < ncategory; cat ++)
diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp
index 2b0641d..70ebaa7 100644
--- a/model/rategammainvar.cpp
+++ b/model/rategammainvar.cpp
@@ -1,6 +1,8 @@
/***************************************************************************
- * Copyright (C) 2009 by BUI Quang Minh *
- * minh.bui at univie.ac.at *
+ * Copyright (C) 2009-2015 by *
+ * BUI Quang Minh <minh.bui at univie.ac.at> *
+ * Lam-Tung Nguyen <nltung at gmail.com> *
+ * *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
@@ -20,15 +22,25 @@
#include "rategammainvar.h"
RateGammaInvar::RateGammaInvar(int ncat, double shape, bool median,
- double p_invar_sites, bool simultaneous, PhyloTree *tree) :
+ double p_invar_sites, string optimize_alg, PhyloTree *tree, bool testParamDone) :
RateInvar(p_invar_sites, tree), RateGamma(ncat, shape, median, tree) {
name = "+I" + name;
full_name = "Invar+" + full_name;
- joint_optimize = simultaneous;
+ this->optimize_alg = optimize_alg;
cur_optimize = 0;
+ this->testParamDone = testParamDone;
+ for (int cat = 0; cat < ncategory; cat++)
+ rates[cat] = 1.0/(1.0-p_invar);
computeRates();
}
+void RateGammaInvar::setPInvar(double pInvar) {
+ p_invar = pInvar;
+ for (int cat = 0; cat < ncategory; cat++)
+ rates[cat] = 1.0/(1.0-p_invar);
+ computeRates();
+}
+
void RateGammaInvar::saveCheckpoint() {
checkpoint->startStruct("RateGammaInvar");
// CKP_SAVE(joint_optimize);
@@ -48,6 +60,8 @@ void RateGammaInvar::restoreCheckpoint() {
void RateGammaInvar::setNCategory(int ncat) {
RateGamma::setNCategory(ncat);
+ for (int cat = 0; cat < ncategory; cat++)
+ rates[cat] = 1.0/(1.0-p_invar);
name = "+I" + name;
full_name = "Invar+" + full_name;
computeRates();
@@ -60,8 +74,11 @@ string RateGammaInvar::getNameParams() {
double RateGammaInvar::computeFunction(double value) {
if (cur_optimize == 0)
gamma_shape = value;
- else
+ else {
p_invar = value;
+ for (int cat = 0; cat < ncategory; cat++)
+ rates[cat] = 1.0/(1.0-p_invar);
+ }
// need to compute rates again if p_inv or Gamma shape changes!
computeRates();
phylo_tree->clearAllPartialLH();
@@ -114,79 +131,53 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
if (ndim == 0)
return phylo_tree->computeLikelihood();
+ if (verbose_mode >= VB_MED)
+ cout << "Optimizing " << name << " model parameters by " << optimize_alg << " algorithm..." << endl;
- if (!joint_optimize) {
+ if (optimize_alg.find("EM_RR") != string::npos) {
+ return randomRestartOptimization(gradient_epsilon);
+ } else if (optimize_alg.find("Brent") != string::npos) {
double lh = phylo_tree->computeLikelihood();
cur_optimize = 0;
- double gamma_lh;
- if (Params::getInstance().testAlpha) {
- gamma_lh = RateGamma::optimizeParameters(gradient_epsilon, 0.05, 10);
- } else {
- gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
- }
+ double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
assert(gamma_lh >= lh-0.1);
cur_optimize = 1;
double invar_lh = -DBL_MAX;
invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
assert(invar_lh >= gamma_lh-0.1);
- //lh = tree_lh;
-
- //assert(gamma_lh >= invar_lh - 0.1);
-// phylo_tree->clearAllPartialLH();
-// return gamma_lh;
cur_optimize = 0;
return invar_lh;
- }
-
-/*
- if (!joint_optimize) {
-// double lh = phylo_tree->computeLikelihood();
- cur_optimize = 1;
- double invar_lh = -DBL_MAX;
- invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
-// assert(tree_lh >= lh-0.1);
-// lh = tree_lh;
- cur_optimize = 0;
- double gamma_lh;
- if (Params::getInstance().testAlpha) {
- gamma_lh = RateGamma::optimizeParameters(gradient_epsilon, 0.05, 10);
- } else {
- gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
- }
- //assert(gamma_lh >= invar_lh - 0.1);
- phylo_tree->clearAllPartialLH();
- return gamma_lh;
- }
-*/
-
- if (verbose_mode >= VB_MAX)
- cout << "Optimizing " << name << " model parameters by BFGS..." << endl;
-
- //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
-
- double *variables = new double[ndim+1];
- double *upper_bound = new double[ndim+1];
- double *lower_bound = new double[ndim+1];
- bool *bound_check = new bool[ndim+1];
- double score;
-
- // by BFGS algorithm
- setVariables(variables);
- setBounds(lower_bound, upper_bound, bound_check);
-
- score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_GAMMA_SHAPE));
-
- getVariables(variables);
-
- phylo_tree->clearAllPartialLH();
- score = phylo_tree->computeLikelihood();
-
- delete [] bound_check;
- delete [] lower_bound;
- delete [] upper_bound;
- delete [] variables;
-
- return score;
+ } else if (optimize_alg.find("EM") != string::npos) {
+ return optimizeWithEM(gradient_epsilon);
+ } else if (optimize_alg.find("BFGS") != string::npos) {
+ //if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
+ double *variables = new double[ndim+1];
+ double *upper_bound = new double[ndim+1];
+ double *lower_bound = new double[ndim+1];
+ bool *bound_check = new bool[ndim+1];
+ double score;
+
+ // by BFGS algorithm
+ setVariables(variables);
+ setBounds(lower_bound, upper_bound, bound_check);
+
+ score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_GAMMA_SHAPE));
+
+ getVariables(variables);
+
+ phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
+
+ delete [] bound_check;
+ delete [] lower_bound;
+ delete [] upper_bound;
+ delete [] variables;
+
+ return score;
+ } else {
+ string errMsg = "Unknown optimization algorithm: " + optimize_alg;
+ outError(errMsg.c_str());
+ }
}
@@ -219,3 +210,115 @@ int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector &
return ncategory+1;
}
+double RateGammaInvar::optimizeWithEM(double gradient_epsilon) {
+ double curlh = phylo_tree->computeLikelihood();
+
+ cur_optimize = 0;
+ double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
+ assert(gamma_lh > curlh - 1.0);
+ curlh = gamma_lh;
+
+ size_t ncat = getNRate();
+ size_t nptn = phylo_tree->aln->getNPattern();
+ size_t nSites = phylo_tree->aln->getNSite();
+
+ // Compute the pattern likelihood for each category (invariable and variable category)
+ phylo_tree->computePatternLhCat(WSL_RATECAT);
+ phylo_tree->computePtnInvar();
+
+ double ppInvar = 0;
+ for (size_t ptn = 0; ptn < nptn; ptn++) {
+ double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn * ncat;
+ double lk_ptn = phylo_tree->ptn_invar[ptn];
+ for (size_t cat = 0; cat < ncat; cat++) {
+ lk_ptn += this_lk_cat[cat];
+ }
+ assert(lk_ptn != 0.0);
+ ppInvar += (phylo_tree->ptn_invar[ptn]) * phylo_tree->ptn_freq[ptn] / lk_ptn;
+ }
+
+ double newPInvar = ppInvar / nSites;
+ assert(newPInvar < 1.0);
+ //double curPInv = getPInvar();
+// setPInvar(newPInvar);
+ p_invar = newPInvar;
+ phylo_tree->clearAllPartialLH();
+// phylo_tree->scaleLength((1-newPInvar)/(1-curPInv));
+ double pinvLH = phylo_tree->computeLikelihood();
+ assert(pinvLH > curlh - 1.0);
+ return pinvLH;
+}
+
+double RateGammaInvar::randomRestartOptimization(double gradient_epsilon) {
+ if (testParamDone) {
+ return optimizeWithEM(gradient_epsilon);
+ }
+ double frac_const = phylo_tree->aln->frac_const_sites;
+ double bestLogl = phylo_tree->getCurScore();
+ double bestAlpha = 0.0;
+ double bestPInvar = 0.0;
+ double testInterval = (frac_const - MIN_PINVAR*2) / 10;
+ double initPInv = MIN_PINVAR;
+ double initAlpha = getGammaShape();
+
+ while (initPInv <= frac_const) {
+ if (verbose_mode >= VB_MED) {
+ cout << endl;
+ cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
+ }
+ setPInvar(initPInv);
+ setGammaShape(initAlpha);
+ phylo_tree->clearAllPartialLH();
+ double logl = optimizeWithEM(gradient_epsilon);
+ double estAlpha = getGammaShape();
+ double estPInv = getPInvar();
+
+ if (verbose_mode >= VB_MED) {
+ cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
+ << " / Logl: " << logl << endl;
+ }
+
+ initPInv = initPInv + testInterval;
+
+ if (logl > bestLogl) {
+ bestLogl = logl;
+ bestAlpha = estAlpha;
+ bestPInvar = estPInv;
+ }
+ }
+
+ if (verbose_mode >= VB_MED) {
+ cout << "Best gamma shape: " << bestAlpha << " / best p_inv: " << bestPInvar
+ << " / logl: " << bestLogl << endl;
+ }
+
+ setPInvar(bestPInvar);
+ setGammaShape(bestAlpha);
+ phylo_tree->clearAllPartialLH();
+ testParamDone = true;
+ return phylo_tree->computeLikelihood();
+}
+
+
+double RateGammaInvar::meanRates() {
+ double ret = 0.0;
+ for (int i = 0; i < ncategory; i++)
+ ret += rates[i];
+ ret *= (1.0-getPInvar())/ncategory;
+ return ret;
+}
+
+/**
+ * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
+ * @return rescaling factor
+ */
+double RateGammaInvar::rescaleRates() {
+ double norm = meanRates();
+ for (int i = 0; i < ncategory; i++)
+ rates[i] /= norm;
+ return norm;
+}
+
+
+
+
diff --git a/model/rategammainvar.h b/model/rategammainvar.h
index 6d7926c..3a7e3bc 100644
--- a/model/rategammainvar.h
+++ b/model/rategammainvar.h
@@ -37,7 +37,14 @@ public:
@param tree associated phylogenetic tree
@param testAlpha turn on option for doing random restart optimization of alpha and p_invar
*/
- RateGammaInvar(int ncat, double shape, bool median, double p_invar_sites, bool simultaneous, PhyloTree *tree);
+ RateGammaInvar(int ncat, double shape, bool median, double p_invar_sites, string optimize_alg, PhyloTree *tree, bool testParamDone);
+
+ /**
+ * check whether +I+G is used
+ */
+ virtual bool isGammai() const {
+ return true;
+ }
/**
save object into the checkpoint
@@ -67,10 +74,20 @@ public:
set the proportion of invariable sites. Default: do nothing
@param pinv the proportion of invariable sites
*/
- virtual void setPInvar(double pInvar) {
- p_invar = pInvar;
- computeRates();
- }
+ virtual void setPInvar(double pInvar);
+
+ /**
+ * used to normal branch lengths if mean rate is not equal to 1 (e.g. FreeRate model)
+ * @return mean rate, default = 1
+ */
+ virtual double meanRates();
+
+ /**
+ * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
+ * @return rescaling factor
+ */
+ virtual double rescaleRates();
+
/**
* @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
@@ -95,6 +112,11 @@ public:
*/
virtual double optimizeParameters(double gradient_epsilon);
+ /**
+ optimize rate parameters using EM algorithm
+ @return log-likelihood of optimized parameters
+ */
+ double optimizeWithEM();
/**
return the number of dimensions
@@ -121,7 +143,7 @@ public:
virtual void writeParameters(ostream &out);
/** TRUE to jointly optimize gamma shape and p_invar using BFGS, default: FALSE */
- bool joint_optimize;
+ //bool joint_optimize;
virtual void setNCategory(int ncat);
@@ -150,16 +172,25 @@ protected:
virtual bool getVariables(double *variables);
private:
-
- /**
- * TRUE to turn on random restart optimization for estimating alpha and p_invar
- */
-// bool rr_ai;
+ /**
+ * Determine which algorithm is used to optimized p_inv and alpha
+ */
+ string optimize_alg;
/**
current parameter to optimize. 0 if gamma shape or 1 if p_invar.
*/
int cur_optimize;
+
+ /**
+ * Optimize p_inv and gamma shape using the EM algorithm
+ */
+ double optimizeWithEM(double gradient_epsilon);
+
+ /**
+ * Start with different initial values of p_inv
+ */
+ double randomRestartOptimization(double gradient_epsilon);
};
#endif
diff --git a/model/rateheterogeneity.cpp b/model/rateheterogeneity.cpp
index fe20164..626ed00 100644
--- a/model/rateheterogeneity.cpp
+++ b/model/rateheterogeneity.cpp
@@ -70,9 +70,23 @@ void RateHeterogeneity::writeSiteRates(ostream &out, DoubleVector &pattern_rates
out << i+1 << "\t";
if (pattern_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << pattern_rates[ptn];
//cout << i << " "<< ptn << " " << pattern_cat[ptn] << endl;
- if (!pattern_cat.empty()) out << "\t" << pattern_cat[ptn]+1 << "\t" << getRate(pattern_cat[ptn]);
+ if (!pattern_cat.empty()) {
+ int site_cat;
+ double cat_rate;
+ if (getPInvar() == 0.0) {
+ site_cat = pattern_cat[ptn]+1;
+ cat_rate = getRate(pattern_cat[ptn]);
+ } else {
+ site_cat = pattern_cat[ptn];
+ if (site_cat == 0)
+ cat_rate = 0.0;
+ else
+ cat_rate = getRate(pattern_cat[ptn]-1);
+ }
+ out << "\t" << site_cat << "\t" << cat_rate;
+ count[pattern_cat[ptn]]++;
+ }
out << endl;
- count[pattern_cat[ptn]]++;
}
cout << "Empirical proportions for each category:";
for (i = 0; i < count.size(); i++)
diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h
index 51bab81..c0c4ab5 100644
--- a/model/rateheterogeneity.h
+++ b/model/rateheterogeneity.h
@@ -179,6 +179,13 @@ public:
*/
virtual int isGammaRate() { return 0; }
+ /**
+ * check whether +I+G is used
+ */
+ virtual bool isGammai() const {
+ return false;
+ }
+
/**
the target function which needs to be optimized
@param x the input vector x
@@ -255,6 +262,12 @@ public:
string name;
+ /**
+ * Specify whether the initial starting value of the gamma shape and p_inv
+ * has already been tested.
+ */
+ bool testParamDone;
+
/**
full name of the rate heterogeneity type
*/
diff --git a/ncl/nxscharactersblock.cpp b/ncl/nxscharactersblock.cpp
index 180d5c3..cb78f84 100644
--- a/ncl/nxscharactersblock.cpp
+++ b/ncl/nxscharactersblock.cpp
@@ -1419,6 +1419,13 @@ void NxsCharactersBlock::HandleFormat(
{
break;
}
+ else
+ {
+ errormsg = "Expecting ';' but found ";
+ errormsg += token.GetToken();
+ errormsg += " at the end of FORMAT command";
+ throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+ }
}
// Perform some last checks before leaving the FORMAT command
@@ -1819,7 +1826,7 @@ void NxsCharactersBlock::HandleStdMatrix(
{
// This should be the taxon label
//
- token.SetLabileFlagBit(token.hyphenNotPunctuation);
+ token.SetLabileFlagBit(NxsToken::hyphenNotPunctuation + NxsToken::preserveUnderscores);
token.GetNextToken();
if (page == 0 && newtaxa)
diff --git a/ngs.cpp b/ngs.cpp
index b8be2af..6b22e1c 100644
--- a/ngs.cpp
+++ b/ngs.cpp
@@ -142,7 +142,7 @@ void NGSAlignment::computeSumPairFreq (double *sum_pair_freq) {
}
}
-void NGSAlignment::computeEmpiricalRate (double *rates) {
+void NGSAlignment::computeDivergenceMatrix (double *rates) {
int i, j, k, cat, id;
assert(rates);
double **pair_rates = (double**) new double[num_states];
@@ -410,7 +410,7 @@ NGSTree::NGSTree(Params ¶ms, NGSAlignment *alignment) {
model_factory = NULL;
optimize_by_newton = params.optimize_by_newton;
//tree.sse = params.SSE;
- setLikelihoodKernel(LK_NORMAL);
+ setLikelihoodKernel(LK_EIGEN);
}
double NGSTree::computeLikelihood(double *pattern_lh) {
diff --git a/ngs.h b/ngs.h
index 89ac89d..255f4bf 100644
--- a/ngs.h
+++ b/ngs.h
@@ -82,7 +82,7 @@ public:
compute empirical rates between state pairs
@param rates (OUT) vector of size num_states*(num_states-1)/2 for the rates
*/
- virtual void computeEmpiricalRate (double *rates);
+ virtual void computeDivergenceMatrix (double *rates);
/**
compute the empirical distance for a category, used to initialize rate scaling factor
diff --git a/pda.cpp b/pda.cpp
index 9f9db7d..88b15f8 100644
--- a/pda.cpp
+++ b/pda.cpp
@@ -2304,8 +2304,6 @@ int main(int argc, char *argv[])
#endif
} else {
switch (Params::getInstance().SSE) {
- case LK_NORMAL: cout << "Slow"; break;
- case LK_SSE: cout << "Slow SSE3"; break;
case LK_EIGEN: cout << "No SSE"; break;
case LK_EIGEN_SSE:
if (instruction_set >= 7) {
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
index ebfffcf..1558228 100644
--- a/phyloanalysis.cpp
+++ b/phyloanalysis.cpp
@@ -61,17 +61,15 @@ void reportReferences(Params ¶ms, ofstream &out, string &original_model) {
<< "maximum likelihood phylogenies. Mol. Biol. Evol., 32:268-274." << endl << endl;
if (params.gbo_replicates)
- out << "Since you also used ultrafast bootstrap (UFBoot) please cite: " << endl << endl
+ out << "Since you used ultrafast bootstrap (UFBoot) please also cite: " << endl << endl
<< "Bui Quang Minh, Minh Anh Thi Nguyen, and Arndt von Haeseler (2013) Ultrafast" << endl
<< "approximation for phylogenetic bootstrap. Mol. Biol. Evol., 30:1188-1195." << endl << endl;
- /* "*** If you use the parallel version, please cite: " << endl << endl <<
- "Bui Quang Minh, Le Sy Vinh, Arndt von Haeseler, and Heiko A. Schmidt (2005)" << endl <<
- "pIQPNNI - parallel reconstruction of large maximum likelihood phylogenies." << endl <<
- "Bioinformatics, 21:3794-3796." << endl << endl;*/
+ if (params.partition_file)
+ out << "Since you used partition models please also cite:" << endl << endl
+ << "Olga Chernomor, Arndt von Haeseler, and Bui Quang Minh (2016) Terrace aware data" << endl
+ << "structure for phylogenomic inference from supermatrices. Syst. Biol., in press." << endl << endl;
-// if (original_model == "TEST" || original_model == "TESTONLY")
-// out << "Since you used Modeltest please also cite Posada and Crandall (1998)" << endl << endl;
}
void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) {
@@ -743,13 +741,21 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model,
<< "NNI log-likelihood cutoff: " << tree.getNNICutoff() << endl
<< endl;
*/
- if (params.compute_ml_tree && (params.min_iterations > 0 || original_model.find("ONLY") != string::npos)) {
- if (original_model.find("ONLY") != string::npos)
+ if (params.compute_ml_tree) {
+ if (original_model.find("ONLY") != string::npos) {
out << "TREE USED FOR MODEL SELECTION" << endl
<< "-----------------------------" << endl << endl;
- else
+ } else if (params.min_iterations == 0) {
+ if (params.user_file)
+ out << "USER TREE" << endl
+ << "---------" << endl << endl;
+ else
+ out << "STARTING TREE" << endl
+ << "-------------" << endl << endl;
+ } else {
out << "MAXIMUM LIKELIHOOD TREE" << endl
<< "-----------------------" << endl << endl;
+ }
tree.setRootNode(params.root);
@@ -1727,7 +1733,7 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
iqtree.initializeAllPartialLh();
double initEpsilon = params.min_iterations == 0 ? params.modeps : (params.modeps*10);
- if (params.test_param)
+ if (params.opt_gammai)
initEpsilon = 0.1;
string initTree;
@@ -1743,14 +1749,6 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
exit(0);
}
- if (params.testAlpha) { // DO RESTART ON ALPHA AND P_INVAR
- double stime = getRealTime();
- searchGAMMAInvarByRestarting(iqtree);
- double etime = getRealTime();
- cout << "Testing alpha took: " << etime -stime << " CPU seconds" << endl;
- cout << endl;
- }
-
}
// Optimize model parameters and branch lengths using ML for the initial tree
@@ -2166,13 +2164,12 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree) {
delete [] state_freqs;
delete [] bestRates;
delete [] bestStateFreqs;
- Params::getInstance().testAlpha = false;
}
// Test alpha fom 0.1 to 15 and p_invar from 0.1 to 0.99, stepsize = 0.01
void exhaustiveSearchGAMMAInvar(Params ¶ms, IQTree &iqtree) {
double alphaMin = 0.01;
- double alphaMax = 15.00;
+ double alphaMax = 2.00;
double p_invarMin = 0.01;
double p_invarMax = 1.00;
double stepSize = 0.01;
@@ -2225,9 +2222,16 @@ void runStandardBootstrap(Params ¶ms, string &original_model, Alignment *ali
vector<ModelInfo> *model_info = new vector<ModelInfo>;
StrVector removed_seqs, twin_seqs;
- // turn off aLRT test
+ // turn off all branch tests
int saved_aLRT_replicates = params.aLRT_replicates;
+ int saved_localbp_replicates = params.localbp_replicates;
+ bool saved_aLRT_test = params.aLRT_test;
+ bool saved_aBayes_test = params.aBayes_test;
params.aLRT_replicates = 0;
+ params.localbp_replicates = 0;
+ params.aLRT_test = false;
+ params.aBayes_test = false;
+
string treefile_name = params.out_prefix;
treefile_name += ".treefile";
string boottrees_name = params.out_prefix;
@@ -2376,7 +2380,12 @@ void runStandardBootstrap(Params ¶ms, string &original_model, Alignment *ali
if (params.compute_ml_tree) {
cout << endl << "===> START ANALYSIS ON THE ORIGINAL ALIGNMENT" << endl << endl;
+ // restore branch tests
params.aLRT_replicates = saved_aLRT_replicates;
+ params.localbp_replicates = saved_localbp_replicates;
+ params.aLRT_test = saved_aLRT_test;
+ params.aBayes_test = saved_aBayes_test;
+
runTreeReconstruction(params, original_model, *tree, *model_info);
cout << endl << "===> ASSIGN BOOTSTRAP SUPPORTS TO THE TREE FROM ORIGINAL ALIGNMENT" << endl << endl;
diff --git a/phylokernel.h b/phylokernel.h
index 678393e..7521c72 100644
--- a/phylokernel.h
+++ b/phylokernel.h
@@ -87,12 +87,12 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
PhyloNode *node = (PhyloNode*)(dad_branch->node);
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
if (node->isLeaf()) {
dad_branch->lh_scale_factor = 0.0;
//memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
-
- if (!tip_partial_lh_computed)
- computeTipPartialLikelihood();
return;
}
@@ -378,7 +378,7 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
}
// check if one should scale partial likelihoods
double lh_max = horizontal_max(vc_max);
- if (lh_max < SCALING_THRESHOLD) {
+ if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0) {
// now do the likelihood scaling
partial_lh -= block; // revert its pointer
VectorClass scale_thres(SCALING_THRESHOLD_INVER);
@@ -460,7 +460,7 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
// check if one should scale partial likelihoods
double lh_max = horizontal_max(vc_max);
- if (lh_max < SCALING_THRESHOLD) {
+ if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0) {
// now do the likelihood scaling
partial_lh -= block; // revert its pointer
VectorClass scale_thres(SCALING_THRESHOLD_INVER);
diff --git a/phylokernelmixrate.h b/phylokernelmixrate.h
index ecad5d1..a92559f 100644
--- a/phylokernelmixrate.h
+++ b/phylokernelmixrate.h
@@ -21,6 +21,12 @@
template <class VectorClass, const int VCSIZE, const int nstates>
void PhyloTree::computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ if (dad_branch->node->degree() > 3) {
+ // TODO: SIMD version for multifurcating node
+ computeMixratePartialLikelihoodEigen(dad_branch, dad);
+ return;
+ }
+
// don't recompute the likelihood
assert(dad);
if (dad_branch->partial_lh_computed & 1)
diff --git a/phylokernelmixture.h b/phylokernelmixture.h
index 6e50635..6b08498 100644
--- a/phylokernelmixture.h
+++ b/phylokernelmixture.h
@@ -19,6 +19,12 @@
template <class VectorClass, const int VCSIZE, const int nstates>
void PhyloTree::computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ if (dad_branch->node->degree() > 3) {
+ // TODO: SIMD version for multifurcating node
+ computeMixturePartialLikelihoodEigen(dad_branch, dad);
+ return;
+ }
+
// don't recompute the likelihood
assert(dad);
if (dad_branch->partial_lh_computed & 1)
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
index bc93d40..c08c6e6 100644
--- a/phylosupertreeplen.cpp
+++ b/phylosupertreeplen.cpp
@@ -138,13 +138,12 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
tree->part_info[part].cur_score = tree->at(part)->computeLikelihood();
cur_lh += tree->part_info[part].cur_score;
-
// normalize rates s.t. branch lengths are #subst per site
double mean_rate = tree->at(part)->getRate()->rescaleRates();
- if (mean_rate != 1.0) {
+ if (fabs(mean_rate-1.0) > 1e-6) {
if (tree->fixed_rates) {
- outError("Partition " + tree->part_info[part].name + " follows FreeRate heterogeneity. Please use proportion edge-linked partition model (-spp)");
+ outError("Unsupported -spj. Please use proportion edge-linked partition model (-spp)");
}
tree->at(part)->scaleLength(mean_rate);
tree->part_info[part].part_rate *= mean_rate;
@@ -1865,6 +1864,7 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
for (it = begin(), part = 0; it != end(); it++, part++) {
(*it)->tip_partial_lh = lh_addr;
uint64_t tip_partial_lh_size = (*it)->aln->num_states * ((*it)->aln->STATE_UNKNOWN+1) * (*it)->model->getNMixtures();
+ tip_partial_lh_size = ((tip_partial_lh_size+3)/4)*4;
lh_addr += tip_partial_lh_size;
}
}
diff --git a/phylotesting.cpp b/phylotesting.cpp
index 43e9a50..5a1b20f 100644
--- a/phylotesting.cpp
+++ b/phylotesting.cpp
@@ -24,6 +24,7 @@
#include "model/rateinvar.h"
#include "model/rategammainvar.h"
#include "model/ratefree.h"
+#include "model/ratefreeinvar.h"
//#include "modeltest_wrapper.h"
#include "model/modelprotein.h"
#include "model/modelbin.h"
@@ -559,7 +560,8 @@ int getModelList(Params ¶ms, Alignment *aln, StrVector &models, bool separat
// disallow MG+F
if (freq_names[i] == "+F" && orig_model_names[j].find("MG") != string::npos)
continue;
- if (freq_names[i] != "" || model_type == 2) // empirical model also allow ""
+ if (freq_names[i] != "" || (model_type == 2 && orig_model_names[j].find("MG") == string::npos))
+ // empirical model also allow ""
model_names.push_back(orig_model_names[j] + freq_names[i]);
}
} else {
@@ -883,7 +885,7 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
#ifdef _OPENMP
// for (i = 0; i < in_tree->size(); i++)
// cout << distID[i]+1 << "\t" << in_tree->part_info[distID[i]].name << "\t" << -dist[i] << endl;
-#pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum)
+#pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum) if(!params.model_test_and_tree)
#endif
for (int j = 0; j < in_tree->size(); j++) {
i = distID[j];
@@ -995,7 +997,7 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
quicksort(dist, 0, num_pairs-1, distID);
#ifdef _OPENMP
-#pragma omp parallel for private(i) schedule(dynamic)
+#pragma omp parallel for private(i) schedule(dynamic) if(!params.model_test_and_tree)
#endif
for (int pair = 0; pair < num_pairs; pair++) {
int part1 = distID[pair] >> 16;
@@ -1034,6 +1036,11 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
PhyloTree *tree = in_tree->extractSubtree(merged_set);
tree->setAlignment(aln);
extractModelInfo(set_name, model_info, part_model_info);
+// TODO
+ tree->num_precision = in_tree->num_precision;
+ if (params.model_test_and_tree) {
+ tree->setCheckpoint(new Checkpoint());
+ }
//#ifdef _OPENMP
model = testModel(params, tree, part_model_info, this_fmodel, models_block, set_name);
//#else
@@ -1042,6 +1049,9 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
logl = part_model_info[0].logl;
df = part_model_info[0].df;
treelen = part_model_info[0].tree_len;
+ if (params.model_test_and_tree) {
+ delete tree->getCheckpoint();
+ }
delete tree;
delete aln;
}
@@ -1143,6 +1153,20 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
in_tree->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
}
+bool isMixtureModel(ModelsBlock *models_block, string &model_str) {
+ size_t mix_pos;
+ for (mix_pos = 0; mix_pos < model_str.length(); mix_pos++) {
+ size_t next_mix_pos = model_str.find_first_of("+*", mix_pos);
+ string sub_model_str = model_str.substr(mix_pos, next_mix_pos-mix_pos);
+ if (models_block->findMixModel(sub_model_str))
+ return true;
+ if (next_mix_pos == string::npos)
+ break;
+ mix_pos = next_mix_pos;
+ }
+ return false;
+}
+
string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel, ModelsBlock *models_block,
string set_name, bool print_mem_usage)
{
@@ -1201,12 +1225,20 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
rate_class[0] = new RateHeterogeneity();
rate_class[1] = new RateInvar(-1, NULL);
rate_class[2] = new RateGamma(params.num_rate_cats, params.gamma_shape, params.gamma_median, NULL);
- rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_model_rate_joint, NULL);
+ rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_alg_gammai, NULL, false);
RateFree ** rate_class_free = new RateFree*[params.max_rate_cats-1];
for (model = 0; model < params.max_rate_cats-1; model++)
rate_class_free[model] = new RateFree(model+2, params.gamma_shape, "", false, params.optimize_alg, NULL);
+
+ RateFreeInvar ** rate_class_freeinvar = new RateFreeInvar*[params.max_rate_cats-1];
+
+ for (model = 0; model < params.max_rate_cats-1; model++) {
+ rate_class_freeinvar[model] = new RateFreeInvar(model+2, params.gamma_shape, "", false, in_tree->aln->frac_const_sites/2.0, params.optimize_alg, NULL);
+ rate_class_freeinvar[model]->setFixPInvar(false);
+ }
+
ModelGTR *subst_model = NULL;
if (seq_type == SEQ_BINARY)
@@ -1281,7 +1313,7 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
int ncat = 0;
string orig_name = params.model_name;
- if (models_block->findMixModel(model_names[model])) {
+ if (isMixtureModel(models_block, model_names[model])) {
// mixture model
try {
mixture_model = true;
@@ -1302,7 +1334,7 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
// normal model
if (model_names[model].find("+ASC") != string::npos) {
model_fac->unobserved_ptns = in_tree->aln->getUnobservedConstPatterns();
- if (model_fac->unobserved_ptns.size() == 0) {
+ if (model_fac->unobserved_ptns.size() < tree->aln->getNumNonstopCodons() || in_tree->aln->frac_const_sites > 0.0) {
cout.width(3);
cout << right << model+1 << " ";
cout.width(13);
@@ -1342,7 +1374,17 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
tree->setModel(subst_model);
// initialize rate
size_t pos;
- if ((pos = model_names[model].find("+R")) != string::npos) {
+ if (model_names[model].find("+I") != string::npos && (pos = model_names[model].find("+R")) != string::npos) {
+ ncat = params.num_rate_cats;
+ if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
+ ncat = convert_int(model_names[model].c_str() + pos+2);
+ // tree->getRate()->setNCategory(ncat);
+ }
+ if (ncat <= 1) outError("Number of rate categories for " + model_names[model] + " is <= 1");
+ if (ncat > params.max_rate_cats)
+ outError("Number of rate categories for " + model_names[model] + " exceeds " + convertIntToString(params.max_rate_cats));
+ tree->setRate(rate_class_freeinvar[ncat-2]);
+ } else if ((pos = model_names[model].find("+R")) != string::npos) {
ncat = params.num_rate_cats;
if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
ncat = convert_int(model_names[model].c_str() + pos+2);
@@ -1394,7 +1436,8 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
if (skip_model) {
assert(prev_model_id>=0);
size_t pos_r = info.name.find("+R");
- if (pos_r == string::npos || info.name.substr(0, pos_r) != model_info[prev_model_id].name.substr(0, pos_r))
+ size_t prev_pos_r = model_info[prev_model_id].name.find("+R");
+ if (pos_r == string::npos || prev_pos_r == string::npos || info.name.substr(0, pos_r) != model_info[prev_model_id].name.substr(0, prev_pos_r))
skip_model = 0;
}
for (int i = 0; i < model_info.size(); i++)
@@ -1481,7 +1524,10 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
if (verbose_mode >= VB_MED)
cout << "reoptimizing from previous parameters of +R...." << endl;
assert(ncat >= 3);
- rate_class_free[ncat-2]->setRateAndProp(rate_class_free[ncat-3]);
+ if (tree->getRate()->getPInvar() != 0.0)
+ rate_class_freeinvar[ncat-2]->setRateAndProp(rate_class_freeinvar[ncat-3]);
+ else
+ rate_class_free[ncat-2]->setRateAndProp(rate_class_free[ncat-3]);
info.logl = tree->getModelFactory()->optimizeParameters(false, false, TOL_LIKELIHOOD_MODELTEST, TOL_GRADIENT_MODELTEST);
info.tree_len = tree->treeLength();
}
@@ -1716,6 +1762,11 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
delete rate_class_free[rate_type];
}
delete [] rate_class_free;
+
+ for (rate_type = params.max_rate_cats-2; rate_type >= 0; rate_type--) {
+ delete rate_class_freeinvar[rate_type];
+ }
+ delete [] rate_class_freeinvar;
// delete tree_hetero;
// delete tree_homo;
diff --git a/phylotree.cpp b/phylotree.cpp
index 2221943..3bbc1ac 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -89,7 +89,7 @@ void PhyloTree::init() {
subTreeDistComputed = false;
dist_matrix = NULL;
var_matrix = NULL;
- setLikelihoodKernel(LK_SSE); // FOR TUNG: you forgot to initialize this variable!
+ setLikelihoodKernel(LK_EIGEN_SSE); // FOR TUNG: you forgot to initialize this variable!
save_all_trees = 0;
nodeBranchDists = NULL;
// FOR: upper bounds
@@ -630,130 +630,10 @@ size_t PhyloTree::getBitsBlockSize() {
return len;
}
-int PhyloTree::getBitsEntrySize() {
- // reserve the last entry for parsimony score
- return (aln->num_states + SIMD_BITS - 1) / UINT_BITS;
-}
-
UINT *PhyloTree::newBitsBlock() {
return aligned_alloc<UINT>(getBitsBlockSize());
}
-void PhyloTree::getBitsBlock(UINT *bit_vec, int index, UINT* &bits_entry) {
- int nstates = aln->num_states;
- int myindex = (index * nstates);
- int bit_pos_begin = myindex >> BITS_DIV;
- int bit_off_begin = myindex & BITS_MODULO;
- int bit_pos_end = (myindex + nstates) >> BITS_DIV;
- int bit_off_end = (myindex + nstates) & BITS_MODULO;
-
- if (bit_pos_begin == bit_pos_end) {
- bits_entry[0] = (bit_vec[bit_pos_begin] >> bit_off_begin) & ((1 << nstates) - 1);
- return;
- }
- UINT part1 = (bit_vec[bit_pos_begin] >> bit_off_begin);
- int rest_bits = nstates;
- int id;
- for (id = 0; rest_bits >= UINT_BITS; id++, rest_bits -= UINT_BITS, bit_pos_begin++) {
- bits_entry[id] = part1;
- if (bit_off_begin > 0)
- bits_entry[id] |= (bit_vec[bit_pos_begin + 1] << (UINT_BITS - bit_off_begin));
- part1 = (bit_vec[bit_pos_begin + 1] >> bit_off_begin);
- }
- if (bit_pos_begin == bit_pos_end) {
- bits_entry[id] = (bit_vec[bit_pos_begin] >> bit_off_begin) & ((1 << rest_bits) - 1);
- return;
- }
- UINT part2 = bit_vec[bit_pos_end];
- if (bit_off_end < UINT_BITS)
- part2 &= ((1 << bit_off_end) - 1);
- bits_entry[id] = part1;
- if (bit_off_begin > 0)
- bits_entry[id] |= (part2 << (UINT_BITS - bit_off_begin));
-}
-
-void PhyloTree::setBitsBlock(UINT* &bit_vec, int index, UINT *bits_entry) {
- int nstates = aln->num_states;
- int myindex = (index * nstates);
- int bit_pos_begin = myindex >> BITS_DIV;
- int bit_off_begin = myindex & BITS_MODULO;
- int bit_pos_end = (myindex + nstates) >> BITS_DIV;
- int bit_off_end = (myindex + nstates) & BITS_MODULO;
-
- //assert(value <= allstates);
-
- if (bit_pos_begin == bit_pos_end) {
- // first clear the bit between bit_off_begin and bit_off_end
- int allstates = (1 << nstates) - 1;
- bit_vec[bit_pos_begin] &= ~(allstates << bit_off_begin);
- // now set the bit
- bit_vec[bit_pos_begin] |= bits_entry[0] << bit_off_begin;
- return;
- }
- int len1 = UINT_BITS - bit_off_begin;
- // clear bit from bit_off_begin to UINT_BITS
- bit_vec[bit_pos_begin] &= (1 << bit_off_begin) - 1;
- // set bit from bit_off_begin to UINT_BITS
- bit_vec[bit_pos_begin] |= (bits_entry[0] << bit_off_begin);
- int rest_bits = nstates - len1;
- int id;
- for (id = 0; rest_bits >= UINT_BITS; bit_pos_begin++, id++, rest_bits -= UINT_BITS) {
- bit_vec[bit_pos_begin + 1] = (bits_entry[id + 1] << bit_off_begin);
- if (len1 < UINT_BITS)
- bit_vec[bit_pos_begin + 1] |= (bits_entry[id] >> len1);
- }
-
- assert(bit_pos_begin == bit_pos_end - 1);
- // clear bit from 0 to bit_off_end
- bit_vec[bit_pos_end] &= ~((1 << bit_off_end) - 1);
- // now set the bit the value
- if (len1 < UINT_BITS)
- bit_vec[bit_pos_end] |= (bits_entry[id] >> len1);
- rest_bits -= bit_off_begin;
- if (rest_bits > 0)
- bit_vec[bit_pos_end] |= (bits_entry[id + 1] << bit_off_begin);
-}
-
-bool PhyloTree::isEmptyBitsEntry(UINT *bits_entry) {
- int rest_bits = aln->num_states;
- int i;
- for (i = 0; rest_bits >= UINT_BITS; rest_bits -= UINT_BITS, i++)
- if (bits_entry[i])
- return false;
- if (bits_entry[i] & ((1 << rest_bits) - 1))
- return false;
- return true;
-}
-
-void PhyloTree::unionBitsEntry(UINT *bits_entry1, UINT *bits_entry2, UINT* &bits_union) {
- int rest_bits = aln->num_states;
- int i;
- for (i = 0; rest_bits > 0; rest_bits -= UINT_BITS, i++)
- bits_union[i] = bits_entry1[i] | bits_entry2[i];
-}
-
-void PhyloTree::setBitsEntry(UINT* &bits_entry, int id) {
- int bit_pos = id >> BITS_DIV;
- int bit_off = id & BITS_MODULO;
- bits_entry[bit_pos] |= (1 << bit_off);
-}
-
-bool PhyloTree::getBitsEntry(UINT* &bits_entry, int id) {
- int bit_pos = id >> BITS_DIV;
- int bit_off = id & BITS_MODULO;
- if (bits_entry[bit_pos] & (1 << bit_off))
- return true;
- return false;
-}
-
-void setBitsAll(UINT* &bit_vec, int num) {
- //int id;
- int size = num / UINT_BITS;
- memset(bit_vec, 255, size * sizeof(UINT));
- num &= BITS_MODULO;
- if (num)
- bit_vec[size] = (1 << num) - 1;
-}
void PhyloTree::computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad) {
(this->*computePartialParsimonyPointer)(dad_branch, dad);
@@ -769,440 +649,16 @@ void PhyloTree::computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad)
}
-void PhyloTree::computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- // don't recompute the parsimony
- if (dad_branch->partial_lh_computed & 2)
- return;
- Node *node = dad_branch->node;
- //assert(node->degree() <= 3);
- int ptn;
- int nstates = aln->num_states;
- int pars_size = getBitsBlockSize();
- int entry_size = getBitsEntrySize();
- assert(dad_branch->partial_pars);
- UINT *bits_entry = new UINT[entry_size];
- UINT *bits_entry_child = new UINT[entry_size];
- //UINT *bits_entry1 = new UINT[entry_size];
- //UINT *bits_entry2 = new UINT[entry_size];
-
- if (node->isLeaf() && dad) {
- // external node
- int ambi_aa[] = {4+8, 32+64, 512+1024};
-
- setBitsAll(dad_branch->partial_pars, nstates * aln->size());
- dad_branch->partial_pars[pars_size - 1] = 0;
- for (ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
- char state;
- if (node->name == ROOT_NAME) {
- state = aln->STATE_UNKNOWN;
- } else {
- assert(node->id < aln->getNSeq());
- state = (aln->at(ptn))[node->id];
- }
- if (state == aln->STATE_UNKNOWN) {
- // fill all entries with bit 1
- //setBitsBlock(dad_branch->partial_pars, ptn, (1 << nstates) - 1);
- } else if (state < nstates) {
- memset(bits_entry, 0, sizeof(UINT) * entry_size);
- setBitsEntry(bits_entry, state);
- setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
- } else if (aln->seq_type == SEQ_DNA) {
- // ambiguous character, for DNA, RNA
- state = state - (nstates - 1);
- memset(bits_entry, 0, sizeof(UINT) * entry_size);
- bits_entry[0] = state;
- setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
- } else if (aln->seq_type == SEQ_PROTEIN) {
- if (state >= 23) return;
- state -= 20;
- memset(bits_entry, 0, sizeof(UINT) * entry_size);
- bits_entry[0] = ambi_aa[(int)state];
- setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
- } else {
- assert(0);
- }
- }
- } else {
- // internal node
- memset(dad_branch->partial_pars, 255, pars_size * sizeof(int));
- UINT *partial_pars_dad = dad_branch->partial_pars;
- int partial_pars = 0;
- //UINT *partial_pars_child1 = NULL, *partial_pars_child2 = NULL;
- // take the intersection of two child states (with &= bit operation)
- FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) {
- computePartialParsimonyNaive((PhyloNeighbor*) (*it), (PhyloNode*) node);
- /*
- if (!partial_pars_child1)
- partial_pars_child1 = ((PhyloNeighbor*) (*it))->partial_pars;
- else
- partial_pars_child2 = ((PhyloNeighbor*) (*it))->partial_pars;
- */
- UINT *partial_pars_child = ((PhyloNeighbor*) (*it))->partial_pars;
- for (int i = 0; i < pars_size - 1; i++)
- partial_pars_dad[i] &= partial_pars_child[i];
- partial_pars += partial_pars_child[pars_size - 1];
- }
- //assert(partial_pars_child1 && partial_pars_child2);
- // take the intersection of two bits block
- //for (int i = 0; i < pars_size - 1; i++)
- // partial_pars_dad[i] = partial_pars_child1[i] & partial_pars_child2[i];
- //int partial_pars = partial_pars_child1[pars_size - 1] + partial_pars_child2[pars_size - 1];
- // now check if some intersection is empty, change to union (Fitch algorithm) and increase the parsimony score
- memset(bits_entry, 0, entry_size * sizeof(UINT));
- for (ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
- getBitsBlock(partial_pars_dad, ptn, bits_entry);
- if (isEmptyBitsEntry(bits_entry)) {
- FOR_NEIGHBOR_IT(node, dad, it2)if ((*it2)->node->name != ROOT_NAME) {
- UINT *partial_pars_child = ((PhyloNeighbor*) (*it2))->partial_pars;
- getBitsBlock(partial_pars_child, ptn, bits_entry_child);
- unionBitsEntry(bits_entry, bits_entry_child, bits_entry);
- }
- //getBitsBlock(partial_pars_child2, ptn, bits_entry2);
- //unionBitsEntry(bits_entry1, bits_entry2, bits_entry);
- //cout << bits_entry[0] << " " << bits_entry[1] << endl;
- setBitsBlock(partial_pars_dad, ptn, bits_entry);
- partial_pars += aln->at(ptn).frequency;
- }
- }
-
- /*
- for (ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
- getBitsBlock(partial_pars_dad, ptn, bits_entry);
- if (isEmptyBitsEntry(bits_entry)) {
- getBitsBlock(partial_pars_child1, ptn, bits_entry1);
- getBitsBlock(partial_pars_child2, ptn, bits_entry2);
- unionBitsEntry(bits_entry1, bits_entry2, bits_entry);
- //cout << bits_entry[0] << " " << bits_entry[1] << endl;
- setBitsBlock(partial_pars_dad, ptn, bits_entry);
- partial_pars += aln->at(ptn).frequency;
- }
- }*/
- partial_pars_dad[pars_size - 1] = partial_pars;
- }
- dad_branch->partial_lh_computed |= 2;
- //delete[] bits_entry2;
- //delete[] bits_entry1;
- delete[] bits_entry_child;
- delete[] bits_entry;
-}
int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
return (this->*computeParsimonyBranchPointer)(dad_branch, dad, branch_subst);
}
-int PhyloTree::computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
-
- PhyloNode *node = (PhyloNode*) dad_branch->node;
- PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
- assert(node_branch);
- if (!central_partial_pars)
- initializeAllPartialPars();
- // swap node and dad if dad is a leaf
- if (node->isLeaf()) {
- PhyloNode *tmp_node = dad;
- dad = node;
- node = tmp_node;
- PhyloNeighbor *tmp_nei = dad_branch;
- dad_branch = node_branch;
- node_branch = tmp_nei;
- //cout << "swapped\n";
- }
- if ((dad_branch->partial_lh_computed & 2) == 0)
- computePartialParsimonyNaive(dad_branch, dad);
- if ((node_branch->partial_lh_computed & 2) == 0)
- computePartialParsimonyNaive(node_branch, node);
- // now combine likelihood at the branch
-
- int pars_size = getBitsBlockSize();
- int entry_size = getBitsEntrySize();
- //int nstates = aln->num_states;
- int i, ptn;
- int tree_pars = 0;
- UINT *partial_pars = newBitsBlock();
- UINT *bits_entry = new UINT[entry_size];
- for (i = 0; i < pars_size - 1; i++)
- partial_pars[i] = (node_branch->partial_pars[i] & dad_branch->partial_pars[i]);
-
- for (ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
- getBitsBlock(partial_pars, ptn, bits_entry);
- if (isEmptyBitsEntry(bits_entry))
- tree_pars += aln->at(ptn).frequency;
- }
- if (branch_subst)
- *branch_subst = tree_pars;
- tree_pars += node_branch->partial_pars[pars_size - 1] + dad_branch->partial_pars[pars_size - 1];
- delete[] bits_entry;
- aligned_free(partial_pars);
- return tree_pars;
-}
int PhyloTree::computeParsimony() {
return computeParsimonyBranch((PhyloNeighbor*) root->neighbors[0], (PhyloNode*) root);
}
-void PhyloTree::printParsimonyStates(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- if (!dad) {
- dad = (PhyloNode*) root;
- dad_branch = (PhyloNeighbor*) root->neighbors[0];
- cout << "Parsimonious states for every node and site: " << endl;
- }
- int site;
- cout << "States for node ";
- int max_len = aln->getMaxSeqNameLength();
- if (max_len < 3)
- max_len = 3;
- cout.width(max_len);
- if (!dad_branch->node->name.empty())
- cout << left << dad_branch->node->name;
- else
- cout << left << dad_branch->node->id;
- cout << " are ";
- UINT *bits_entry = new UINT[getBitsEntrySize()];
- for (site = 0; site < aln->getNSite(); site++) {
- int ptn = aln->getPatternID(site);
- getBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
- if (aln->at(ptn).is_const) {
- int state = aln->at(ptn)[0];
- if (state < aln->num_states)
- setBitsEntry(bits_entry, state);
- else {
- memset(bits_entry, 0, sizeof(UINT) * getBitsEntrySize());
- bits_entry[0] = state - (aln->num_states - 1);
- ;
- }
- }
- cout << "{";
- bool first = true;
- for (int i = 0; i < aln->num_states; i++)
- if (getBitsEntry(bits_entry, i)) {
- cout << ((!first) ? "," : "") << i;
- first = false;
- }
- cout << "}\t";
- }
- cout << endl;
- delete[] bits_entry;
- FOR_NEIGHBOR_IT(dad_branch->node, dad, it)printParsimonyStates((PhyloNeighbor*) (*it), (PhyloNode*) (dad_branch->node));
-}
-
-int PhyloTree::computeParsimonyScore(int ptn, int &states, PhyloNode *node, PhyloNode *dad) {
- int score = 0;
- states = 0;
- if (!node)
- node = (PhyloNode*) root;
- if (node->degree() > 3)
- outError("Does not work with multifurcating tree");
- if (verbose_mode == VB_DEBUG)
- cout << ptn << " " << node->id << " " << node->name << endl;
-
- if (node->isLeaf()) {
- char state;
- if (node->name == ROOT_NAME) {
- state = aln->STATE_UNKNOWN;
- } else {
- assert(node->id < aln->getNSeq());
- state = (*aln)[ptn][node->id];
- }
- if (state == aln->STATE_UNKNOWN) {
- states = (1 << aln->num_states) - 1;
- } else if (state < aln->num_states)
- states = (1 << state);
- else {
- // ambiguous character, for DNA, RNA
- states = state - 3;
- }
- }
- if (!node->isLeaf() || node == root) {
- int union_states = 0;
- int intersect_states = (1 << aln->num_states) - 1;
- if (states != 0) {
- union_states = states;
- intersect_states = states;
- }
-
- FOR_NEIGHBOR_IT(node, dad, it){
- int states_child;
- int score_child = computeParsimonyScore(ptn, states_child, (PhyloNode*) ((*it)->node), node);
- union_states |= states_child;
- intersect_states &= states_child;
- score += score_child;
- }
- if (intersect_states)
- states = intersect_states;
- else {
- states = union_states;
- score++;
- }
- }
- return score;
-}
-
-int PhyloTree::computeParsimonyScore() {
- assert(root && root->isLeaf());
-
- int score = 0;
- for (int ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
- int states;
- score += computeParsimonyScore(ptn, states) * (*aln)[ptn].frequency;
- }
- return score;
-}
-
-/****************************************************************************
- Nearest Neighbor Interchange with parsimony
- ****************************************************************************/
-
-double PhyloTree::swapNNI(double cur_score, PhyloNode *node1, PhyloNode *node2) {
- assert(node1->degree() == 3 && node2->degree() == 3);
- FOR_NEIGHBOR_DECLARE(node1, node2, it1)
- break;
- Node *node1_nei = (*it1)->node;
-
- FOR_NEIGHBOR_IT(node2, node1, it2){
- // do the NNI swap
- Node *node2_nei = (*it2)->node;
- node1->updateNeighbor(node1_nei, node2_nei);
- node1_nei->updateNeighbor(node1, node2);
- node2->updateNeighbor(node2_nei, node1_nei);
- node2_nei->updateNeighbor(node2, node1);
-
- // compute the score of the swapped topology
- double score = computeParsimonyScore();
- // if better: return
- if (score < cur_score) return score;
- // else, swap back
- node1->updateNeighbor(node2_nei, node1_nei);
- node1_nei->updateNeighbor(node2, node1);
- node2->updateNeighbor(node1_nei, node2_nei);
- node2_nei->updateNeighbor(node1, node2);
-}
- return cur_score;
-}
-
-double PhyloTree::searchNNI(double cur_score, PhyloNode *node, PhyloNode *dad) {
- if (!node)
- node = (PhyloNode*) root;
- if (!node->isLeaf() && dad && !dad->isLeaf()) {
- double score = swapNNI(cur_score, node, dad);
- if (score < cur_score)
- return score;
- }
-
- FOR_NEIGHBOR_IT(node, dad, it){
- double score = searchNNI(cur_score, (PhyloNode*) (*it)->node, node);
- if (score < cur_score) return score;
-}
- return cur_score;
-}
-
-void PhyloTree::searchNNI() {
- cout << "Search with Nearest Neighbor Interchange..." << endl;
- double cur_score = computeParsimonyScore();
- do {
- double score = searchNNI(cur_score);
- if (score >= cur_score)
- break;
- cout << "Better score found: " << score << endl;
- cur_score = score;
- } while (true);
-}
-
-
-int PhyloTree::addTaxonMP(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad) {
- Neighbor *dad_nei = dad->findNeighbor(node);
-
- // now insert the new node in the middle of the branch node-dad
- double len = dad_nei->length;
- node->updateNeighbor(dad, added_node, len / 2.0);
- dad->updateNeighbor(node, added_node, len / 2.0);
- added_node->updateNeighbor((Node*) 1, node, len / 2.0);
- added_node->updateNeighbor((Node*) 2, dad, len / 2.0);
- // compute the likelihood
- //clearAllPartialLh();
- int best_score = computeParsimonyScore();
- target_node = node;
- target_dad = dad;
- // remove the added node
- node->updateNeighbor(added_node, dad, len);
- dad->updateNeighbor(added_node, node, len);
- added_node->updateNeighbor(node, (Node*) 1, len);
- added_node->updateNeighbor(dad, (Node*) 2, len);
-
- // now tranverse the tree downwards
-
- FOR_NEIGHBOR_IT(node, dad, it){
- Node *target_node2;
- Node *target_dad2;
- double score = addTaxonMP(added_node, target_node2, target_dad2, (*it)->node, node);
- if (score < best_score) {
- best_score = score;
- target_node = target_node2;
- target_dad = target_dad2;
- }
-}
- return best_score;
-}
-
-void PhyloTree::growTreeMP(Alignment *alignment) {
-
- cout << "Stepwise addition using maximum parsimony..." << endl;
- aln = alignment;
- int size = aln->getNSeq();
- if (size < 3)
- outError(ERR_FEW_TAXA);
-
- root = newNode();
- Node *new_taxon;
-
- // create initial tree with 3 taxa
- for (leafNum = 0; leafNum < 3; leafNum++) {
- if (verbose_mode >= VB_MAX)
- cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
- new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
- root->addNeighbor(new_taxon, 1.0);
- new_taxon->addNeighbor(root, 1.0);
- }
- root = findNodeID(0);
- //optimizeAllBranches();
-
- // stepwise adding the next taxon
- for (leafNum = 3; leafNum < size; leafNum++) {
- if (verbose_mode >= VB_MAX)
- cout << "Add " << aln->getSeqName(leafNum) << " to the tree";
- // allocate a new taxon and a new ajedcent internal node
- new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
- Node *added_node = newNode();
- added_node->addNeighbor(new_taxon, 1.0);
- new_taxon->addNeighbor(added_node, 1.0);
-
- // preserve two neighbors
- added_node->addNeighbor((Node*) 1, 1.0);
- added_node->addNeighbor((Node*) 2, 1.0);
-
- Node *target_node = NULL;
- Node *target_dad = NULL;
- int score = addTaxonMP(added_node, target_node, target_dad, root->neighbors[0]->node, root);
- if (verbose_mode >= VB_MAX)
- cout << ", score = " << score << endl;
- // now insert the new node in the middle of the branch node-dad
- double len = target_dad->findNeighbor(target_node)->length;
- target_node->updateNeighbor(target_dad, added_node, len / 2.0);
- target_dad->updateNeighbor(target_node, added_node, len / 2.0);
- added_node->updateNeighbor((Node*) 1, target_node, len / 2.0);
- added_node->updateNeighbor((Node*) 2, target_dad, len / 2.0);
- // compute the likelihood
- //clearAllPartialLh();
- //optimizeAllBranches();
- //optimizeNNI();
- }
-
- nodeNum = 2 * leafNum - 2;
-}
/****************************************************************************
likelihood function
@@ -1356,11 +812,11 @@ void PhyloTree::getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_
uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
if (params->lh_mem_save == LM_PER_NODE)
- partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+ partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 4 + tip_partial_lh_size;
else
- partial_lh_entries = ((uint64_t)leafNum * 3 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+ partial_lh_entries = ((uint64_t)leafNum * 3 - 6) * (uint64_t) block_size + 4 + tip_partial_lh_size;
} else
- partial_lh_entries = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+ partial_lh_entries = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 4 + tip_partial_lh_size;
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
@@ -1587,15 +1043,15 @@ double PhyloTree::computeLikelihood(double *pattern_lh) {
// assert(current_it_back);
}
double score;
- string root_name = ROOT_NAME;
- Node *vroot = findLeafName(root_name);
- if (root_state != aln->STATE_UNKNOWN && vroot) {
- if (verbose_mode >= VB_DEBUG)
- cout << __func__ << " HIT ROOT STATE " << endl;
- score = computeLikelihoodRooted((PhyloNeighbor*) vroot->neighbors[0], (PhyloNode*) vroot);
- } else {
+// string root_name = ROOT_NAME;
+// Node *vroot = findLeafName(root_name);
+// if (root_state != aln->STATE_UNKNOWN && vroot) {
+// if (verbose_mode >= VB_DEBUG)
+// cout << __func__ << " HIT ROOT STATE " << endl;
+// score = computeLikelihoodRooted((PhyloNeighbor*) vroot->neighbors[0], (PhyloNode*) vroot);
+// } else {
score = computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
- }
+// }
if (pattern_lh)
memmove(pattern_lh, _pattern_lh, aln->size() * sizeof(double));
@@ -1611,26 +1067,27 @@ double PhyloTree::computeLikelihood(double *pattern_lh) {
outError("Scaling error ", __func__);
}*/
}
+ curScore = score;
return score;
}
-double PhyloTree::computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- double score = computeLikelihoodBranchNaive(dad_branch, dad);
- if (verbose_mode >= VB_DEBUG) {
- printTransMatrices(dad_branch->node, dad);
- /*
- FOR_NEIGHBOR_IT(dad_branch->node, dad, it) {
- PhyloNeighbor *pit = (PhyloNeighbor*)(*it);
- cout << pit->node->name << "\t" << pit->partial_lh[0] << endl;
-
- }*/
- }
- double* state_freq = new double[aln->num_states];
- model->getStateFrequency(state_freq);
- score -= log(state_freq[(int) root_state]);
- delete[] state_freq;
- return score;
-}
+//double PhyloTree::computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+// double score = computeLikelihoodBranchNaive(dad_branch, dad);
+// if (verbose_mode >= VB_DEBUG) {
+// printTransMatrices(dad_branch->node, dad);
+// /*
+// FOR_NEIGHBOR_IT(dad_branch->node, dad, it) {
+// PhyloNeighbor *pit = (PhyloNeighbor*)(*it);
+// cout << pit->node->name << "\t" << pit->partial_lh[0] << endl;
+//
+// }*/
+// }
+// double* state_freq = new double[aln->num_states];
+// model->getStateFrequency(state_freq);
+// score -= log(state_freq[(int) root_state]);
+// delete[] state_freq;
+// return score;
+//}
int PhyloTree::getNumLhCat(SiteLoglType wsl) {
int ncat = 0;
@@ -1655,11 +1112,12 @@ double PhyloTree::computePatternLhCat(SiteLoglType wsl) {
current_it = (PhyloNeighbor*)leaf->neighbors[0];
current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
}
- if (sse == LK_NORMAL || sse == LK_SSE) {
- if (getModel()->isMixture())
- outError("Naive kernel does not support mixture models, contact author if you really need this feature");
- return computeLikelihoodBranchNaive(current_it, (PhyloNode*)current_it_back->node);
- } else if (!getModel()->isMixture())
+// if (sse == LK_NORMAL || sse == LK_SSE) {
+// if (getModel()->isMixture())
+// outError("Naive kernel does not support mixture models, contact author if you really need this feature");
+// return computeLikelihoodBranchNaive(current_it, (PhyloNode*)current_it_back->node);
+// } else
+ if (!getModel()->isMixture())
return computeLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node);
else if (getModelFactory()->fused_mix_rate)
return computeMixrateLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node);
@@ -2621,149 +2079,6 @@ void PhyloTree::computeAllBayesianBranchLengths(Node *node, Node *dad) {
}
}
-//double PhyloTree::computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, double *pattern_lh, double *pattern_rate) {
-double PhyloTree::computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- PhyloNode *node = (PhyloNode*) dad_branch->node;
- PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
- //assert(node_branch);
- //assert(!site_rate->isSiteSpecificRate() || !model->isSiteSpecificModel());
- if (!central_partial_lh)
- initializeAllPartialLh();
- // swap node and dad if dad is a leaf
- // NEW: swap if root_state is given
- if (node->isLeaf() || (node->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)) {
- PhyloNode *tmp_node = dad;
- dad = node;
- node = tmp_node;
- PhyloNeighbor *tmp_nei = dad_branch;
- dad_branch = node_branch;
- node_branch = tmp_nei;
- //cout << "swapped\n";
- }
-
- if ((dad_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihood(dad_branch, dad);
- if ((node_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihood(node_branch, node);
- // now combine likelihood at the branch
-
- double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
- int ncat = site_rate->getNRate();
- double p_invar = site_rate->getPInvar();
- double p_var_cat = (1.0 - p_invar) / (double) ncat;
- int nstates = aln->num_states;
- size_t block = ncat * nstates;
- int trans_size = model->getTransMatrixSize();
- size_t ptn; // for big data size > 4GB memory required
- int cat, state1, state2;
- size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
- size_t orig_nptn = aln->size();
- int discrete_cat = site_rate->getNDiscreteRate();
- double *trans_mat = new double[discrete_cat * trans_size];
- double *state_freq = new double[nstates];
- model->getStateFrequency(state_freq);
-
- if (!site_rate->isSiteSpecificRate())
- for (cat = 0; cat < discrete_cat; cat++) {
- //trans_mat[cat] = model->newTransMatrix();
- double *trans_cat = trans_mat + (cat * trans_size);
- model_factory->computeTransMatrixFreq(dad_branch->length * site_rate->getRate(cat), state_freq, trans_cat);
- }
-
- bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
- double prob_const = 0.0; // probability of unobserved const patterns
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, cat, state1, state2)
-#endif
- for (ptn = 0; ptn < nptn; ptn++) {
- double lh_ptn = 0.0; // likelihood of the pattern
- int dad_state = 1000; // just something big enough
- int ptn_cat = site_rate->getPtnCat(ptn);
- if (dad->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN) {
- dad_state = root_state;
- } else if (dad->isLeaf()) {
- if (ptn < orig_nptn)
- dad_state = (*aln)[ptn][dad->id];
- else
- dad_state = model_factory->unobserved_ptns[ptn-orig_nptn];
- }
- int dad_offset = dad_state * nstates;
- if (site_rate->isSiteSpecificRate()) {
- if (ptn < orig_nptn)
- model_factory->computeTransMatrixFreq(dad_branch->length * site_rate->getPtnRate(ptn), state_freq, trans_mat);
- else
- model_factory->computeTransMatrixFreq(dad_branch->length, state_freq, trans_mat);
- }
- for (cat = 0; cat < ncat; cat++) {
- double lh_cat = 0.0; // likelihood of the pattern's category
- size_t lh_offset = cat * nstates + ptn * block;
- double *partial_lh_site = node_branch->partial_lh + lh_offset;
- double *partial_lh_child = dad_branch->partial_lh + lh_offset;
- if (dad_state < nstates) { // single state
- // external node
- double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + dad_offset);
- if (model->isSiteSpecificModel() && ptn < nptn)
- trans_state += (nstates * nstates * model->getPtnModelID(ptn));
- for (state2 = 0; state2 < nstates; state2++)
- lh_cat += partial_lh_child[state2] * trans_state[state2];
- } else {
- // internal node, or external node but ambiguous character
- for (state1 = 0; state1 < nstates; state1++) {
- double lh_state = 0.0; // likelihood of state1
- double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + state1 * nstates);
- if (model->isSiteSpecificModel() && ptn < nptn)
- trans_state += (nstates * nstates * model->getPtnModelID(ptn));
- for (state2 = 0; state2 < nstates; state2++)
- lh_state += partial_lh_child[state2] * trans_state[state2];
- lh_cat += lh_state * partial_lh_site[state1];
- }
- }
- lh_ptn += lh_cat;
- _pattern_lh_cat[ptn * ncat + cat] = lh_cat;
-// if (pattern_rate)
-// rate_ptn += lh_cat * site_rate->getRate(cat);
- }
- if (ptn < orig_nptn) {
-// if (pattern_rate)
-// pattern_rate[ptn] = rate_ptn / lh_ptn;
- lh_ptn *= p_var_cat;
- if ((*aln)[ptn].const_char == nstates)
- lh_ptn += p_invar;
- else if ((*aln)[ptn].const_char < nstates) {
- lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
- }
- //#ifdef DEBUG
- if (lh_ptn <= 0.0)
- cout << "Negative likelihood: " << lh_ptn << " " << site_rate->getPtnRate(ptn) << endl;
- //#endif
- lh_ptn = log(lh_ptn);
- _pattern_lh[ptn] = lh_ptn;
- if (discard_saturated_site && site_rate->isSiteSpecificRate() && site_rate->getPtnRate(ptn) >= MAX_SITE_RATE)
- continue;
- tree_lh += lh_ptn * aln->at(ptn).frequency;
- } else {
- lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_nptn]];
- prob_const += lh_ptn;
- }
-
- }
- if (orig_nptn < nptn) {
- // ascertainment bias correction
- prob_const = log(1.0 - prob_const);
- for (ptn = 0; ptn < orig_nptn; ptn++)
- _pattern_lh[ptn] -= prob_const;
- tree_lh -= aln->getNSite()*prob_const;
- }
-// if (pattern_lh)
-// memmove(pattern_lh, _pattern_lh, aln->size() * sizeof(double));
- delete[] state_freq;
- delete[] trans_mat;
- //for (cat = ncat-1; cat >= 0; cat--)
- //delete trans_mat[cat];
- //delete state_freq;
-
- return tree_lh;
-}
double PhyloTree::computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
double lh_zero_branch;
@@ -2779,406 +2094,6 @@ double PhyloTree::computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNo
return lh_zero_branch;
}
-void PhyloTree::computePartialLikelihoodNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- // don't recompute the likelihood
- if (dad_branch->partial_lh_computed & 1)
- return;
- Node * node = dad_branch->node;
- size_t ptn, cat;
- int ncat = site_rate->getNRate();
- int nstates = aln->num_states;
- size_t block = nstates * site_rate->getNRate();
- int trans_size = model->getTransMatrixSize();
- size_t lh_size = (aln->size()+model_factory->unobserved_ptns.size()) * block;
- double *partial_lh_site;
- size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
- size_t orig_nptn = aln->size();
-
- dad_branch->lh_scale_factor = 0.0;
- memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
-
- assert(dad_branch->partial_lh);
- //if (!dad_branch->partial_lh)
- // dad_branch->partial_lh = newPartialLh();
- if (node->isLeaf() && dad) {
- /* external node */
- memset(dad_branch->partial_lh, 0, lh_size * sizeof(double));
- for (ptn = 0; ptn < nptn; ptn++) {
- char state;
- partial_lh_site = dad_branch->partial_lh + (ptn * block);
- if (node->name == ROOT_NAME) {
- state = aln->STATE_UNKNOWN;
- } else {
- assert(node->id < aln->getNSeq());
- if (ptn < orig_nptn)
- state = (aln->at(ptn))[node->id];
- else // ascertainment bias correction
- state = model_factory->unobserved_ptns[ptn-orig_nptn];
- }
- if (state < nstates) {
- for (cat = 0; cat < ncat; cat++)
- partial_lh_site[cat * nstates + state] = 1.0;
- } else if (state == aln->STATE_UNKNOWN) {
- // fill all entries (also over rate category) with 1.0
- dad_branch->scale_num[ptn] = -1;
- for (int state2 = 0; state2 < block; state2++) {
- partial_lh_site[state2] = 1.0;
- }
- } else if (aln->seq_type == SEQ_DNA) {
- // ambiguous character, for DNA, RNA
- state = state - (nstates - 1);
- for (int state2 = 0; state2 < nstates && state2 <= 6; state2++)
- if (state & (1 << state2)) {
- for (cat = 0; cat < ncat; cat++)
- partial_lh_site[cat * nstates + state2] = 1.0;
- }
- } else if (aln->seq_type == SEQ_PROTEIN) {
- // ambiguous character, for DNA, RNA
- state = state - (nstates);
- assert(state < 3);
- int state_map[] = {4+8,32+64,512+1024};
- for (int state2 = 0; state2 < 11; state2++)
- if (state_map[(int)state] & (1 << state2)) {
- for (cat = 0; cat < ncat; cat++)
- partial_lh_site[cat * nstates + state2] = 1.0;
- }
- } else {
- outError("Internal error ", __func__);
- }
- }
- } else {
- /* internal node */
- int discrete_cat = site_rate->getNDiscreteRate();
- double *trans_mat = new double[discrete_cat * trans_size];
- //for (cat = 0; cat < discrete_cat; cat++) trans_mat[cat] = model->newTransMatrix();
- for (ptn = 0; ptn < lh_size; ptn++) {
- dad_branch->partial_lh[ptn] = 1.0;
- }
- for (ptn = 0; ptn < nptn; ptn++)
- dad_branch->scale_num[ptn] = -1;
-
- FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) {
- computePartialLikelihoodNaive((PhyloNeighbor*) (*it), (PhyloNode*) node);
-
- dad_branch->lh_scale_factor += ((PhyloNeighbor*) (*it))->lh_scale_factor;
-
- if (!site_rate->isSiteSpecificRate())
- for (cat = 0; cat < discrete_cat; cat++)
- model_factory->computeTransMatrix((*it)->length * site_rate->getRate(cat), trans_mat + (cat * trans_size));
-
- bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
-
- double sum_scale = 0.0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, cat, partial_lh_site)
-#endif
- for (ptn = 0; ptn < nptn; ptn++)
- if (((PhyloNeighbor*) (*it))->scale_num[ptn] >= 0) {
- // avoid the case that all child partial likelihoods equal 1.0
- //double *partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + (ptn*block);
- //if (partial_lh_child[0] < 0.0) continue;
- //
- if (dad_branch->scale_num[ptn] < 0) dad_branch->scale_num[ptn] = 0;
- dad_branch->scale_num[ptn] += ((PhyloNeighbor*) (*it))->scale_num[ptn];
- int ptn_cat = 0;
- if (ptn < orig_nptn) {
- ptn_cat = site_rate->getPtnCat(ptn);
- if (site_rate->isSiteSpecificRate())
- model_factory->computeTransMatrix((*it)->length * site_rate->getPtnRate(ptn), trans_mat);
- } else {
- if (site_rate->isSiteSpecificRate())
- model_factory->computeTransMatrix((*it)->length, trans_mat);
- }
- for (cat = 0; cat < ncat; cat++) {
- size_t lh_offset = cat * nstates + ptn*block;
- partial_lh_site = dad_branch->partial_lh + lh_offset;
- double *partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + lh_offset;
- for (int state = 0; state < nstates; state++) {
- double lh_child = 0.0;
- double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + state * nstates);
- if (model->isSiteSpecificModel() && ptn < orig_nptn)
- trans_state += (nstates * nstates * model->getPtnModelID(ptn));
- for (int state2 = 0; state2 < nstates; state2++)
- lh_child += trans_state[state2] * partial_lh_child[state2];
-
- if (!isfinite(lh_child))
- outError("Numerical error with ", __func__);
- partial_lh_site[state] *= lh_child;
- }
- }
- // check if one should scale partial likelihoods
- bool do_scale = true;
- partial_lh_site = dad_branch->partial_lh + (ptn * block);
- for (cat = 0; cat < block; cat++)
- if (partial_lh_site[cat] > SCALING_THRESHOLD) {
- do_scale = false;
- break;
- }
- if (!do_scale) continue;
- // now do the likelihood scaling
- /*
- double lh_max = partial_lh_site[0];
- for (cat = 1; cat < block; cat++)
- if (lh_max < partial_lh_site[cat]) lh_max = partial_lh_site[cat];
- for (cat = 0; cat < block; cat++)
- partial_lh_site[cat] /= lh_max;
- dad_branch->lh_scale_factor += log(lh_max) * (*aln)[ptn].frequency;
-
- */
- for (cat = 0; cat < block; cat++)
- partial_lh_site[cat] /= SCALING_THRESHOLD;
- // unobserved const pattern will never have underflow
- sum_scale += LOG_SCALING_THRESHOLD * (*aln)[ptn].frequency;
- dad_branch->scale_num[ptn] += 1;
-
-// if (pattern_scale)
-// pattern_scale[ptn] += LOG_SCALING_THRESHOLD;
- }
- dad_branch->lh_scale_factor += sum_scale;
- }
- delete[] trans_mat;
- //for (cat = ncat - 1; cat >= 0; cat--)
- // delete [] trans_mat[cat];
- }
- dad_branch->partial_lh_computed |= 1;
-
-}
-
-void PhyloTree::computeLikelihoodDervNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
- PhyloNode *node = (PhyloNode*) dad_branch->node;
- PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
- assert(node_branch);
- // swap node and dad if dad is a leaf
- // NEW: swap if root_state is given
- if (node->isLeaf() || (node->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)) {
- PhyloNode *tmp_node = dad;
- dad = node;
- node = tmp_node;
- PhyloNeighbor *tmp_nei = dad_branch;
- dad_branch = node_branch;
- node_branch = tmp_nei;
- //cout << "swapped\n";
- }
- if ((dad_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodNaive(dad_branch, dad);
- if ((node_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodNaive(node_branch, node);
-
- // now combine likelihood at the branch
-// double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
- df = ddf = 0.0;
- int ncat = site_rate->getNRate();
- double p_invar = site_rate->getPInvar();
- double p_var_cat = (1.0 - p_invar) / (double) ncat;
- int nstates = aln->num_states;
- size_t block = ncat * nstates;
- int trans_size = model->getTransMatrixSize();
- size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
- size_t orig_nptn = aln->size();
- size_t ptn, cat, state1, state2;
-
- int discrete_cat = site_rate->getNDiscreteRate();
-
- double *trans_mat = new double[discrete_cat * trans_size];
- double *trans_derv1 = new double[discrete_cat * trans_size];
- double *trans_derv2 = new double[discrete_cat * trans_size];
- double *state_freq = new double[nstates];
- model->getStateFrequency(state_freq);
-
- if (!site_rate->isSiteSpecificRate())
- for (cat = 0; cat < discrete_cat; cat++) {
- //trans_mat[cat] = model->newTransMatrix();
- double *trans_cat = trans_mat + (cat * trans_size);
- double *derv1_cat = trans_derv1 + (cat * trans_size);
- double *derv2_cat = trans_derv2 + (cat * trans_size);
- double rate_val = site_rate->getRate(cat);
- //double rate_sqr = rate_val * rate_val;
- model_factory->computeTransDervFreq(dad_branch->length, rate_val, state_freq, trans_cat, derv1_cat,
- derv2_cat);
- /*
- for (state1 = 0; state1 < nstates; state1++) {
- double *trans_mat_state = trans_cat + (state1 * nstates);
- double *trans_derv1_state = derv1_cat + (state1 * nstates);
- double *trans_derv2_state = derv2_cat + (state1 * nstates);
-
- for (state2 = 0; state2 < nstates; state2++) {
- trans_mat_state[state2] *= state_freq[state1];
- trans_derv1_state[state2] *= (state_freq[state1] * rate_val);
- trans_derv2_state[state2] *= (state_freq[state1] * rate_sqr);
- }
- }*/
- }
-
- bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
- double derv1_frac;
- double derv2_frac;
-
- double my_df = 0.0;
- double my_ddf = 0.0;
- double prob_const = 0.0;
- double prob_const_derv1 = 0.0, prob_const_derv2 = 0.0;
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, prob_const_derv1, prob_const_derv2) private(ptn, cat, state1, state2, derv1_frac, derv2_frac)
-#endif
- for (ptn = 0; ptn < nptn; ptn++) {
- int ptn_cat = site_rate->getPtnCat(ptn);
- if (discard_saturated_site && site_rate->isSiteSpecificRate() && nptn<orig_nptn &&site_rate->getPtnRate(ptn) >= MAX_SITE_RATE)
- continue;
- double lh_ptn = 0.0; // likelihood of the pattern
- double lh_ptn_derv1 = 0.0;
- double lh_ptn_derv2 = 0.0;
- int dad_state = aln->STATE_UNKNOWN;
-
- if (dad->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)
- dad_state = root_state;
- else if (dad->isLeaf()) {
- if (ptn < orig_nptn)
- dad_state = (*aln)[ptn][dad->id];
- else
- dad_state = model_factory->unobserved_ptns[ptn-orig_nptn];
- }
- int dad_offset = dad_state * nstates;
- if (site_rate->isSiteSpecificRate()) {
- if (ptn < orig_nptn)
- model_factory->computeTransDervFreq(dad_branch->length, site_rate->getPtnRate(ptn), state_freq, trans_mat,
- trans_derv1, trans_derv2);
- else
- model_factory->computeTransDervFreq(dad_branch->length, 1.0, state_freq, trans_mat,
- trans_derv1, trans_derv2);
- }
- for (cat = 0; cat < ncat; cat++) {
- size_t lh_offset = cat * nstates + ptn * block;
- double *partial_lh_site = node_branch->partial_lh + lh_offset;
- double *partial_lh_child = dad_branch->partial_lh + lh_offset;
- if (dad_state < nstates) {
- // external node
- int cat2 = (not_ptn_cat ? cat : ptn_cat) * trans_size + dad_offset;
- if (model->isSiteSpecificModel() && ptn < orig_nptn)
- cat2 += (nstates * nstates * model->getPtnModelID(ptn));
- double *trans_state = trans_mat + cat2;
- double *derv1_state = trans_derv1 + cat2;
- double *derv2_state = trans_derv2 + cat2;
- for (state2 = 0; state2 < nstates; state2++) {
- lh_ptn += partial_lh_child[state2] * trans_state[state2];
- lh_ptn_derv1 += partial_lh_child[state2] * derv1_state[state2];
- lh_ptn_derv2 += partial_lh_child[state2] * derv2_state[state2];
- }
- } else {
- // internal node, or external node but ambiguous character
- for (state1 = 0; state1 < nstates; state1++) {
- double lh_state = 0.0; // likelihood of state1
- double lh_state_derv1 = 0.0;
- double lh_state_derv2 = 0.0;
- int cat2 = (not_ptn_cat ? cat : ptn_cat) * trans_size + state1 * nstates;
- if (model->isSiteSpecificModel() && ptn < orig_nptn)
- cat2 += (nstates * nstates * model->getPtnModelID(ptn));
- double *trans_state = trans_mat + cat2;
- double *derv1_state = trans_derv1 + cat2;
- double *derv2_state = trans_derv2 + cat2;
- for (state2 = 0; state2 < nstates; state2++) {
- lh_state += partial_lh_child[state2] * trans_state[state2];
- lh_state_derv1 += partial_lh_child[state2] * derv1_state[state2];
- lh_state_derv2 += partial_lh_child[state2] * derv2_state[state2];
- }
- lh_ptn += lh_state * partial_lh_site[state1];
- lh_ptn_derv1 += lh_state_derv1 * partial_lh_site[state1];
- lh_ptn_derv2 += lh_state_derv2 * partial_lh_site[state1];
- }
- }
- }
- /* if (p_invar > 0.0) {
- lh_ptn *= p_var_cat;
- lh_ptn_derv1 *= p_var_cat;
- lh_ptn_derv2 *= p_var_cat;
- if ((*aln)[ptn].is_const && (*aln)[ptn].const_char < nstates) {
- lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
- }
- assert(lh_ptn > 0);
- double derv1_frac = lh_ptn_derv1 / lh_ptn;
- double derv2_frac = lh_ptn_derv2 / lh_ptn;
- tree_lh += log(lh_ptn) * (*aln)[ptn].frequency;
- df += derv1_frac * (*aln)[ptn].frequency;
- ddf += (derv2_frac - derv1_frac * derv1_frac) * (*aln)[ptn].frequency;
- } else {
- double derv1_frac = lh_ptn_derv1 / lh_ptn;
- double derv2_frac = lh_ptn_derv2 / lh_ptn;
- lh_ptn *= p_var_cat;
- assert(lh_ptn > 0);
- tree_lh += log(lh_ptn) * (*aln)[ptn].frequency;
- df += derv1_frac * (*aln)[ptn].frequency;
- ddf += (derv2_frac - derv1_frac * derv1_frac) * (*aln)[ptn].frequency;
-
- }
- */
- // Tung beo's trick
- if (lh_ptn<=0) {
- cout << "Abnormal " << __func__;
- abort();
- }
-
- if (ptn < orig_nptn) {
- lh_ptn = lh_ptn * p_var_cat;
- if ((*aln)[ptn].const_char == nstates)
- lh_ptn += p_invar;
- else if ((*aln)[ptn].const_char < nstates) {
- lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
- }
- double pad = p_var_cat / lh_ptn;
- if (std::isinf(pad)) {
- lh_ptn_derv1 *= p_var_cat;
- lh_ptn_derv2 *= p_var_cat;
- derv1_frac = lh_ptn_derv1 / lh_ptn;
- derv2_frac = lh_ptn_derv2 / lh_ptn;
- } else {
- derv1_frac = lh_ptn_derv1 * pad;
- derv2_frac = lh_ptn_derv2 * pad;
- }
- double freq = (*aln)[ptn].frequency;
- double tmp1 = derv1_frac * freq;
- double tmp2 = derv2_frac * freq;
- my_df += tmp1;
- my_ddf += tmp2 - tmp1 * derv1_frac;
-// lh_ptn = log(lh_ptn);
-// tree_lh += lh_ptn * freq;
-// _pattern_lh[ptn] = lh_ptn;
- if (!isfinite(lh_ptn) || !isfinite(my_df) || !isfinite(my_ddf)) {
- cout << "Abnormal " << __func__;
- abort();
- }
- } else {
- lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_nptn]];
- prob_const += lh_ptn;
- prob_const_derv1 += lh_ptn_derv1 * p_var_cat;
- prob_const_derv2 += lh_ptn_derv2 * p_var_cat;
- }
-
- }
- if (orig_nptn < nptn) {
- // ascertainment bias correction
- prob_const = 1.0 - prob_const;
- derv1_frac = prob_const_derv1 / prob_const;
- derv2_frac = prob_const_derv2 / prob_const;
- int nsites = aln->getNSite();
- my_df += nsites * derv1_frac;
- my_ddf += nsites *(derv2_frac + derv1_frac*derv1_frac);
-// prob_const = log(prob_const);
-// tree_lh -= nsites * prob_const;
-// for (ptn = 0; ptn < orig_nptn; ptn++)
-// _pattern_lh[ptn] -= prob_const;
- }
- delete[] state_freq;
- delete[] trans_derv2;
- delete[] trans_derv1;
- delete[] trans_mat;
- //for (cat = ncat-1; cat >= 0; cat--)
- //delete trans_mat[cat];
- //delete state_freq;
- df = my_df;
- ddf = my_ddf;
-
-// return tree_lh;
-}
/****************************************************************************
Branch length optimization by maximum likelihood
@@ -3260,7 +2175,7 @@ void PhyloTree::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clear
} else {
// Brent method
optx = minimizeOneDimen(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, &negative_lh, &ferror);
- if (verbose_mode >= VB_DEBUG) {
+ if (verbose_mode >= VB_MAX) {
cout << "minimizeBrent logl: " << -negative_lh << endl;
}
}
@@ -3391,7 +2306,7 @@ double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int m
// }
- if (new_tree_lh < tree_lh) {
+ if (new_tree_lh < tree_lh - tolerance*0.1) {
// IN RARE CASE: tree log-likelihood decreases, revert the branch length and stop
if (verbose_mode >= VB_MED)
cout << "NOTE: Restoring branch lengths as tree log-likelihood decreases after branch length optimization: "
@@ -3414,6 +2329,7 @@ double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int m
return new_tree_lh;
tree_lh = new_tree_lh;
}
+ curScore = tree_lh;
return tree_lh;
}
diff --git a/phylotree.h b/phylotree.h
index 68b33dd..259f649 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -22,7 +22,7 @@
//#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (512*256)
//#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*512*512)
-#include <Eigen/Core>
+//#include <Eigen/Core>
#include "mtree.h"
#include "alignment.h"
#include "model/modelsubst.h"
@@ -55,7 +55,7 @@ const double TOL_LIKELIHOOD_PARAMOPT = 0.001; // BQM: newly introduced for Model
const int SPR_DEPTH = 2;
-using namespace Eigen;
+//using namespace Eigen;
inline size_t get_safe_upper_limit(size_t cur_limit) {
if (instruction_set >= 7)
@@ -132,7 +132,7 @@ inline void aligned_free(void *mem) {
/**
* Row Major Array For Eigen
*/
-typedef Array<double, Dynamic, Dynamic, RowMajor> RowMajorArrayXXd;
+//typedef Array<double, Dynamic, Dynamic, RowMajor> RowMajorArrayXXd;
typedef std::map< string, double > StringDoubleMap;
@@ -514,7 +514,7 @@ public:
* Return the approximated branch length estimation using corrected parsimony branch length
* This is usually used as the starting point before using Newton-Raphson
*/
- double computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
+// double computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
/**
initialize partial_pars vector of all PhyloNeighbors, allocating central_partial_pars
@@ -544,7 +544,7 @@ public:
@param dad its dad, used to direct the tranversal
*/
virtual void computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad);
- void computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
+// void computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
void computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
template<class VectorClass>
void computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
@@ -562,13 +562,13 @@ public:
@return parsimony score of the tree
*/
virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
- int computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+// int computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
int computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
template<class VectorClass>
int computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
- void printParsimonyStates(PhyloNeighbor *dad_branch = NULL, PhyloNode *dad = NULL);
+// void printParsimonyStates(PhyloNeighbor *dad_branch = NULL, PhyloNode *dad = NULL);
virtual void setParsimonyKernel(LikelihoodKernel lk);
#if defined(BINARY32) || defined(__NOAVX__)
@@ -576,23 +576,6 @@ public:
#else
virtual void setParsimonyKernelAVX();
#endif
- /**
- SLOW VERSION: compute the parsimony score of the tree, given the alignment
- @return the parsimony score
- */
- int computeParsimonyScore();
-
-
- /**
- SLOW VERSION: compute the parsimony score of the tree, given the alignment
- @return the parsimony score
- @param node the current node
- @param dad dad of the node, used to direct the search
- @param ptn pattern ID
- @param states set of admissible states at the current node (in binary code)
- */
- int computeParsimonyScore(int ptn, int &states, PhyloNode *node = NULL, PhyloNode *dad = NULL);
-
/****************************************************************************
likelihood function
@@ -675,16 +658,6 @@ public:
typedef void (PhyloTree::*ComputePartialLikelihoodType)(PhyloNeighbor *, PhyloNode *);
ComputePartialLikelihoodType computePartialLikelihoodPointer;
- /**
- * original naive version in IQ-TREE
- */
- void computePartialLikelihoodNaive(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
-
- /**
- * this implements the SSE version using Eigen library
- */
- template<int NSTATES>
- void computePartialLikelihoodSSE(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
//template <const int nstates>
void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
@@ -725,12 +698,6 @@ public:
ComputeLikelihoodBranchType computeLikelihoodBranchPointer;
/**
- * this implements the SSE version using Eigen library
- */
- template<int NSTATES>
- double computeLikelihoodBranchSSE(PhyloNeighbor *dad_branch, PhyloNode *dad);
-
- /**
* MINH: this implements the fast alternative strategy for reversible model (March 2013)
* where partial likelihoods at nodes store real partial likelihoods times eigenvectors
*/
@@ -760,8 +727,6 @@ public:
template <class VectorClass, const int VCSIZE, const int nstates>
double computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
- double computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
-
/****************************************************************************
computing likelihood on a branch using buffer
****************************************************************************/
@@ -805,7 +770,7 @@ public:
@param dad its dad, used to direct the tranversal
@return tree likelihood
*/
- virtual double computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad);
+// virtual double computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad);
/**
compute the tree likelihood
@@ -978,14 +943,6 @@ public:
computing derivatives of likelihood function
****************************************************************************/
- void computeLikelihoodDervNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
-
- /**
- * this implements the SSE version using Eigen library
- */
- template<int NSTATES>
- void computeLikelihoodDervSSE(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
-
//template <const int nstates>
void computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
@@ -1047,50 +1004,6 @@ public:
*/
int computeParsimonyTree(const char *out_prefix, Alignment *alignment);
- /**
- SLOW VERSION: grow the tree by step-wise addition
- @param alignment input alignment
- */
- void growTreeMP(Alignment *alignment);
-
- /**
- used internally by growTreeMP() to find the best target branch to add into the tree
- @param added_node node to add
- @param target_node (OUT) one end of the best branch found
- @param target_dad (OUT) the other end of the best branch found
- @param node the current node
- @param dad dad of the node, used to direct the search
- @return the parsimony score of the tree
- */
- int addTaxonMP(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad);
-
-
- /****************************************************************************
- Nearest Neighbor Interchange with parsimony
- ****************************************************************************/
- /**
- search by a nearest neigbor interchange with parsimony
- */
- void searchNNI();
-
- /**
- search by a nearest neigbor interchange with parsimony
- @param node the current node
- @param dad dad of the node, used to direct the search
- @param cur_score current score
- @return best score
- */
- double searchNNI(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
-
- /**
- try to swap the tree with nearest neigbor interchange at the branch connecting node1-node2.
- If a swap shows better score, return the swapped tree and the score.
- @param cur_score current score
- @param node1 1st end node of the branch
- @param node2 2nd end node of the branch
- @return best score
- */
- double swapNNI(double cur_score, PhyloNode *node1, PhyloNode *node2);
/****************************************************************************
Branch length optimization by maximum likelihood
@@ -1877,55 +1790,6 @@ protected:
*/
UINT *newBitsBlock();
- /**
- @return size of the bits entry (for storing num_states bits)
- */
- int getBitsEntrySize();
-
- /**
- @param bits_entry
- @return TRUE if bits_entry contains all 0s, FALSE otherwise
- */
- bool isEmptyBitsEntry(UINT *bits_entry);
-
- /**
- @param bits_entry1
- @param bits_entry1
- @param bits_union (OUT) union of bits_entry1 and bits_entry2
- */
- void unionBitsEntry(UINT *bits_entry1, UINT *bits_entry2, UINT* &bits_union);
-
- /**
- set a single bit to 1
- @param bits_entry
- @param id index of the bit in the entry to set to 1
- */
- void setBitsEntry(UINT* &bits_entry, int id);
-
- /**
- get a single bit content
- @param bits_entry
- @param id index of the bit in the entry
- @return TRUE if bit ID is 1, FALSE otherwise
- */
- bool getBitsEntry(UINT* &bits_entry, int id);
-
- /**
- get bit blocks, each block span num_state bits
- @param bit_vec bit block vector
- @param index block index
- @param bits_entry (OUT) content of the block at index
- */
- void getBitsBlock(UINT *bit_vec, int index, UINT* &bits_entry);
-
- /**
- set bit blocks, each block span num_state bits
- @param bit_vec (OUT) bit block vector
- @param index block index
- @param bits_entry the content of the block at index
- */
- void setBitsBlock(UINT* &bit_vec, int index, UINT *bits_entry);
-
virtual void saveCurrentTree(double logl) {
} // save current tree
diff --git a/phylotreesse.cpp b/phylotreesse.cpp
index 3bc4054..7782612 100644
--- a/phylotreesse.cpp
+++ b/phylotreesse.cpp
@@ -34,10 +34,10 @@
void PhyloTree::setParsimonyKernel(LikelihoodKernel lk) {
// set parsimony kernel
switch (lk) {
- case LK_SSE:
- computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
- computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
- break;
+// case LK_SSE:
+// computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
+// computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
+// break;
case LK_EIGEN:
computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFast;
computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFast;
@@ -50,10 +50,10 @@ void PhyloTree::setParsimonyKernel(LikelihoodKernel lk) {
computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFastSIMD<Vec4ui>;
}
break;
- default:
- computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
- computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
- break;
+// default:
+// computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
+// computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
+// break;
}
}
@@ -71,12 +71,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
}
sse = lk;
- if (!aln || lk == LK_NORMAL) {
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
+ if (!aln) {
+ computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
+ computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
+ computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
computeLikelihoodFromBufferPointer = NULL;
- sse = LK_NORMAL;
+ sse = LK_EIGEN;
return;
}
@@ -145,12 +145,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
switch(aln->num_states) {
case 4:
switch(sse) {
- case LK_SSE:
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<4>;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<4>;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<4>;
- computeLikelihoodFromBufferPointer = NULL;
- break;
+// case LK_SSE:
+// computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<4>;
+// computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<4>;
+// computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<4>;
+// computeLikelihoodFromBufferPointer = NULL;
+// break;
case LK_EIGEN_SSE:
if (instruction_set >= 7) {
// CPU supports AVX
@@ -183,12 +183,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
break;
case 20:
switch(sse) {
- case LK_SSE:
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<20>;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<20>;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<20>;
- computeLikelihoodFromBufferPointer = NULL;
- break;
+// case LK_SSE:
+// computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<20>;
+// computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<20>;
+// computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<20>;
+// computeLikelihoodFromBufferPointer = NULL;
+// break;
case LK_EIGEN_SSE:
if (instruction_set >= 7) {
setLikelihoodKernelAVX();
@@ -220,12 +220,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
case 64: // CODON
switch(sse) {
- case LK_SSE:
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<64>;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<64>;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<64>;
- computeLikelihoodFromBufferPointer = NULL;
- break;
+// case LK_SSE:
+// computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<64>;
+// computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<64>;
+// computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<64>;
+// computeLikelihoodFromBufferPointer = NULL;
+// break;
case LK_EIGEN_SSE:
if (instruction_set >= 7) {
setLikelihoodKernelAVX();
@@ -261,12 +261,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
case 2:
switch(sse) {
- case LK_SSE:
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<2>;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<2>;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<2>;
- computeLikelihoodFromBufferPointer = NULL;
- break;
+// case LK_SSE:
+// computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<2>;
+// computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<2>;
+// computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<2>;
+// computeLikelihoodFromBufferPointer = NULL;
+// break;
case LK_EIGEN_SSE:
computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 2>;
computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 2>;
@@ -299,12 +299,12 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
computeLikelihoodFromBufferPointer = NULL;
}
sse = LK_EIGEN;
- } else {
- computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
- computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
- computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
- computeLikelihoodFromBufferPointer = NULL;
- sse = LK_NORMAL;
+// } else {
+// computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
+// computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
+// computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
+// computeLikelihoodFromBufferPointer = NULL;
+// sse = LK_NORMAL;
}
break;
}
@@ -312,18 +312,18 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
void PhyloTree::changeLikelihoodKernel(LikelihoodKernel lk) {
if (sse == lk) return;
- if ((sse == LK_EIGEN || sse == LK_EIGEN_SSE) && (lk == LK_NORMAL || lk == LK_SSE)) {
- // need to increase the memory usage when changing from new kernel to old kernel
- if (params->lh_mem_save == LM_PER_NODE)
- params->lh_mem_save = LM_ALL_BRANCH;
- setLikelihoodKernel(lk);
- deleteAllPartialLh();
- initializeAllPartialLh();
- clearAllPartialLH();
- } else {
+// if ((sse == LK_EIGEN || sse == LK_EIGEN_SSE) && (lk == LK_NORMAL || lk == LK_SSE)) {
+// // need to increase the memory usage when changing from new kernel to old kernel
+// if (params->lh_mem_save == LM_PER_NODE)
+// params->lh_mem_save = LM_ALL_BRANCH;
+// setLikelihoodKernel(lk);
+// deleteAllPartialLh();
+// initializeAllPartialLh();
+// clearAllPartialLH();
+// } else {
// otherwise simply assign variable sse
setLikelihoodKernel(lk);
- }
+// }
}
/*******************************************************
@@ -591,11 +591,11 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
size_t nstates = aln->num_states;
size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
if (node->isLeaf()) {
dad_branch->lh_scale_factor = 0.0;
-
- if (!tip_partial_lh_computed)
- computeTipPartialLikelihood();
return;
}
@@ -787,7 +787,7 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
x++;
}
- } else {
+ } else if (ptn_invar[ptn] == 0.0) {
// now do the likelihood scaling
for (i = 0; i < block; i++) {
partial_lh[i] *= SCALING_THRESHOLD_INVER;
@@ -905,7 +905,7 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
x++;
}
- } else {
+ } else if (ptn_invar[ptn] == 0.0) {
// now do the likelihood scaling
for (i = 0; i < block; i++) {
partial_lh[i] *= SCALING_THRESHOLD_INVER;
@@ -986,7 +986,8 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
x++;
}
- } else {
+ } else if (ptn_invar[ptn] == 0.0) {
+ // BQM 2016-05-03: only scale for non-constant sites
// now do the likelihood scaling
for (i = 0; i < block; i++) {
partial_lh[i] *= SCALING_THRESHOLD_INVER;
@@ -1221,19 +1222,19 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
#endif
for (ptn = 0; ptn < nptn; ptn++) {
double lh_ptn = ptn_invar[ptn];
- double *lh_cat = _pattern_lh_cat + ptn*ncat;
- double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
- int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
- double *lh_node = partial_lh_node + state_dad*block;
- for (c = 0; c < ncat; c++) {
- for (i = 0; i < nstates; i++) {
- *lh_cat += lh_node[i] * partial_lh_dad[i];
- }
- lh_node += nstates;
- partial_lh_dad += nstates;
- lh_ptn += *lh_cat;
- lh_cat++;
- }
+ double *lh_cat = _pattern_lh_cat + ptn*ncat;
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
+ double *lh_node = partial_lh_node + state_dad*block;
+ for (c = 0; c < ncat; c++) {
+ for (i = 0; i < nstates; i++) {
+ *lh_cat += lh_node[i] * partial_lh_dad[i];
+ }
+ lh_node += nstates;
+ partial_lh_dad += nstates;
+ lh_ptn += *lh_cat;
+ lh_cat++;
+ }
// assert(lh_ptn > -1e-10);
if (ptn < orig_nptn) {
lh_ptn = log(fabs(lh_ptn));
@@ -1255,20 +1256,20 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
#endif
for (ptn = 0; ptn < nptn; ptn++) {
double lh_ptn = ptn_invar[ptn];
- double *lh_cat = _pattern_lh_cat + ptn*ncat;
- double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
- double *partial_lh_node = node_branch->partial_lh + ptn*block;
- double *val_tmp = val;
- for (c = 0; c < ncat; c++) {
- for (i = 0; i < nstates; i++) {
- *lh_cat += val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
- }
- lh_ptn += *lh_cat;
- partial_lh_node += nstates;
- partial_lh_dad += nstates;
- val_tmp += nstates;
- lh_cat++;
- }
+ double *lh_cat = _pattern_lh_cat + ptn*ncat;
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_node = node_branch->partial_lh + ptn*block;
+ double *val_tmp = val;
+ for (c = 0; c < ncat; c++) {
+ for (i = 0; i < nstates; i++) {
+ *lh_cat += val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+ }
+ lh_ptn += *lh_cat;
+ partial_lh_node += nstates;
+ partial_lh_dad += nstates;
+ val_tmp += nstates;
+ lh_cat++;
+ }
// assert(lh_ptn > 0.0);
if (ptn < orig_nptn) {
@@ -1338,439 +1339,6 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
}
-/************************************************************************************************
- *
- * SSE vectorized functions of the Naive implementation
- *
- *************************************************************************************************/
-
-template<const int NSTATES>
-inline double PhyloTree::computeLikelihoodBranchSSE(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- PhyloNode *node = (PhyloNode*) dad_branch->node; // Node A
- PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); // Node B
- assert(node_branch);
- if (!central_partial_lh)
- initializeAllPartialLh();
- // swap node and dad if dad is a leaf
- if (node->isLeaf()) {
- PhyloNode *tmp_node = dad;
- dad = node;
- node = tmp_node;
- PhyloNeighbor *tmp_nei = dad_branch;
- dad_branch = node_branch;
- node_branch = tmp_nei;
- }
- if ((dad_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodSSE<NSTATES>(dad_branch, dad);
- if ((node_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodSSE<NSTATES>(node_branch, node);
-
- // now combine likelihood at the branch
- double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
- int ptn, cat, state1, state2;
- double *partial_lh_site;
- double *partial_lh_child;
- double *trans_state;
- double p_invar = site_rate->getPInvar();
- int numCat = site_rate->getNRate();
- int numStates = model->num_states;
- int tranSize = numStates * numStates;
- int alnSize = aln->size() + model_factory->unobserved_ptns.size();
- int orig_alnSize = aln->size();
- int block = numStates * numCat;
-
- double p_var_cat = (1.0 - p_invar) / (double) numCat;
-
- EIGEN_ALIGN16 double *trans_mat_orig = new double[numCat * tranSize + 1];
- double *trans_mat = trans_mat_orig;
- if (((intptr_t) trans_mat) % 16 != 0)
- trans_mat = trans_mat + 1;
- EIGEN_ALIGN16 double state_freq[NSTATES];
- model->getStateFrequency(state_freq);
- for (cat = 0; cat < numCat; cat++) {
- double *trans_cat = trans_mat + (cat * tranSize);
- model_factory->computeTransMatrix(dad_branch->length * site_rate->getRate(cat), trans_cat);
- for (state1 = 0; state1 < NSTATES; state1++) {
- double *trans_mat_state = trans_cat + (state1 * NSTATES);
- for (state2 = 0; state2 < NSTATES; state2++)
- trans_mat_state[state2] *= state_freq[state1];
- }
- }
-
- double prob_const = 0.0; // probability of unobserved const patterns
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, cat) schedule(static)
-#endif
- for (ptn = 0; ptn < alnSize; ++ptn) {
- double lh_ptn = 0.0; // likelihood of the pattern
- for (cat = 0; cat < numCat; cat++) {
- partial_lh_site = node_branch->partial_lh + (ptn * block + cat * NSTATES);
- partial_lh_child = dad_branch->partial_lh + (ptn * block + cat * NSTATES);
- trans_state = trans_mat + cat * tranSize;
- Map<Matrix<double, 1, NSTATES>, Aligned> eigen_partial_lh_child(&partial_lh_child[0]);
- Map<Matrix<double, 1, NSTATES>, Aligned> eigen_partial_lh_site(&partial_lh_site[0]);
- Map<Matrix<double, NSTATES, NSTATES>, Aligned> eigen_trans_state(&trans_state[0]);
- lh_ptn += (eigen_partial_lh_child * eigen_trans_state).dot(eigen_partial_lh_site);
- }
- if (ptn < orig_alnSize) {
- lh_ptn *= p_var_cat;
- if ((*aln)[ptn].const_char == NSTATES)
- lh_ptn += p_invar;
- else if ((*aln)[ptn].const_char < NSTATES) {
- lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
- }
- lh_ptn = log(lh_ptn);
- tree_lh += lh_ptn * (aln->at(ptn).frequency);
- _pattern_lh[ptn] = lh_ptn;
- // BQM: pattern_lh contains the LOG-likelihood, not likelihood
- } else {
- lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_alnSize]];
- prob_const += lh_ptn;
-
- }
- }
- if (orig_alnSize < alnSize) {
- // ascertainment bias correction
- prob_const = log(1.0 - prob_const);
- for (ptn = 0; ptn < orig_alnSize; ptn++)
- _pattern_lh[ptn] -= prob_const;
- tree_lh -= aln->getNSite()*prob_const;
- }
-
- delete[] trans_mat_orig;
- return tree_lh;
-}
-
-template<int NSTATES>
-void PhyloTree::computePartialLikelihoodSSE(PhyloNeighbor *dad_branch, PhyloNode *dad) {
- // don't recompute the likelihood
- if (dad_branch->partial_lh_computed & 1)
- return;
- Node *node = dad_branch->node;
- int ptn, cat;
- //double *trans_state;
- double *partial_lh_site;
- double *partial_lh_child;
- dad_branch->lh_scale_factor = 0.0;
-
- int numCat = site_rate->getNRate();
- int numStates = model->num_states;
- int tranSize = numStates * numStates;
- int alnSize = aln->size() + model_factory->unobserved_ptns.size();
- int orig_alnSize = aln->size();
- int block = numStates * numCat;
- size_t lh_size = alnSize * block;
- memset(dad_branch->scale_num, 0, alnSize * sizeof(UBYTE));
-
- if (node->isLeaf() && dad) {
- // external node
- memset(dad_branch->partial_lh, 0, lh_size * sizeof(double));
- for (ptn = 0; ptn < alnSize; ++ptn) {
- char state;
- partial_lh_site = dad_branch->partial_lh + (ptn * block);
-
- if (node->name == ROOT_NAME) {
- state = aln->STATE_UNKNOWN;
- } else if (ptn < orig_alnSize){
- state = (aln->at(ptn))[node->id];
- } else {
- state = model_factory->unobserved_ptns[ptn-orig_alnSize];
- }
-
- if (state == aln->STATE_UNKNOWN) {
-#ifndef KEEP_GAP_LH
- dad_branch->scale_num[ptn] = -1;
-#endif
- for (int state2 = 0; state2 < block; state2++) {
- partial_lh_site[state2] = 1.0;
- }
- } else if (state < NSTATES) {
- double *_par_lh_site = partial_lh_site + state;
- for (cat = 0; cat < numCat; cat++) {
- *_par_lh_site = 1.0;
- _par_lh_site += NSTATES;
- }
- } else if (aln->seq_type == SEQ_DNA) {
- // ambiguous character, for DNA, RNA
- state = state - (NSTATES - 1);
- for (int state2 = 0; state2 < NSTATES; state2++)
- if (state & (1 << state2)) {
- for (cat = 0; cat < numCat; cat++)
- partial_lh_site[cat * NSTATES + state2] = 1.0;
- }
- } else if (aln->seq_type == SEQ_PROTEIN) {
- // ambiguous character, for DNA, RNA
- state = state - (NSTATES);
- assert(state < 3);
- int state_map[] = {4+8,32+64,512+1024};
- for (int state2 = 0; state2 < 11; state2++)
- if (state_map[(int)state] & (1 << state2)) {
- for (cat = 0; cat < numCat; cat++)
- partial_lh_site[cat * NSTATES + state2] = 1.0;
- }
- } else {
- outError("Internal error ", __func__);
- }
- }
- } else {
- // internal node
- EIGEN_ALIGN16 double *trans_mat_orig = new double[numCat * tranSize + 2];
- double *trans_mat = trans_mat_orig;
- if (((intptr_t) trans_mat) % 16 != 0)
- trans_mat = trans_mat + 1;
- for (ptn = 0; ptn < lh_size; ++ptn)
- dad_branch->partial_lh[ptn] = 1.0;
-#ifndef KEEP_GAP_LH
- for (ptn = 0; ptn < alnSize; ptn++)
- dad_branch->scale_num[ptn] = -1;
-#endif
- FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) {
- computePartialLikelihoodSSE<NSTATES > ((PhyloNeighbor*) (*it), (PhyloNode*) node);
- dad_branch->lh_scale_factor += ((PhyloNeighbor*) (*it))->lh_scale_factor;
- for (cat = 0; cat < numCat; cat++) {
- model_factory->computeTransMatrix((*it)->length * site_rate->getRate(cat), &trans_mat[cat * tranSize]);
- }
- partial_lh_site = dad_branch->partial_lh;
- partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh;
- double sum_scale = 0.0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, cat, partial_lh_site, partial_lh_child) schedule(static)
-#endif
- for (ptn = 0; ptn < alnSize; ++ptn)
-#ifndef KEEP_GAP_LH
- if (((PhyloNeighbor*) (*it))->scale_num[ptn] < 0) {
-#ifndef _OPENMP
- partial_lh_site += NSTATES * numCat;
- partial_lh_child += NSTATES * numCat;
-#endif
- } else
-#endif
- {
-#ifndef KEEP_GAP_LH
- if (dad_branch->scale_num[ptn] < 0)
- dad_branch->scale_num[ptn] = 0;
-#endif
-#ifdef _OPENMP
- int lh_offset = ptn*block;
- partial_lh_site = dad_branch->partial_lh + lh_offset;
- partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + lh_offset;
-#endif
- dad_branch->scale_num[ptn] += ((PhyloNeighbor*) (*it))->scale_num[ptn];
- double *partial_lh_block = partial_lh_site;
- double *trans_state = trans_mat;
- bool do_scale = true;
- for (cat = 0; cat < numCat; cat++)
- {
- MappedRowVec(NSTATES) ei_partial_lh_child(partial_lh_child);
- MappedRowVec(NSTATES) ei_partial_lh_site(partial_lh_site);
- MappedMat(NSTATES) ei_trans_state(trans_state);
- ei_partial_lh_site.array() *= (ei_partial_lh_child * ei_trans_state).array();
- partial_lh_site += NSTATES;
- partial_lh_child += NSTATES;
- trans_state += tranSize;
- }
- for (cat = 0; cat < block; cat++)
- if (partial_lh_block[cat] > SCALING_THRESHOLD) {
- do_scale = false;
- break;
- }
- if (do_scale) {
- // unobserved const pattern will never have underflow
- Map<VectorXd, Aligned> ei_lh_block(partial_lh_block, block);
- ei_lh_block *= SCALING_THRESHOLD_INVER;
- sum_scale += LOG_SCALING_THRESHOLD * (*aln)[ptn].frequency;
- dad_branch->scale_num[ptn] += 1;
- }
- }
- dad_branch->lh_scale_factor += sum_scale;
- }
- delete[] trans_mat_orig;
- }
-
- dad_branch->partial_lh_computed |= 1;
-}
-
-/****************************************************************************
- computing derivatives of likelihood function
- ****************************************************************************/
-template<int NSTATES>
-inline void PhyloTree::computeLikelihoodDervSSE(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
- PhyloNode *node = (PhyloNode*) dad_branch->node;
- PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
- //assert(node_branch);
- // swap node and dad if node is a leaf
- if (node->isLeaf()) {
- PhyloNode *tmp_node = dad;
- dad = node;
- node = tmp_node;
- PhyloNeighbor *tmp_nei = dad_branch;
- dad_branch = node_branch;
- node_branch = tmp_nei;
- }
- if ((dad_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodSSE<NSTATES>(dad_branch, dad);
- if ((node_branch->partial_lh_computed & 1) == 0)
- computePartialLikelihoodSSE<NSTATES>(node_branch, node);
- df = ddf = 0.0;
- int cat = 0;
- double *partial_lh_site = node_branch->partial_lh;
- double *partial_lh_child = dad_branch->partial_lh;
- double lh_ptn; // likelihood of the pattern
- double lh_ptn_derv1;
- double lh_ptn_derv2;
- double derv1_frac;
- double derv2_frac;
- double *trans_state;
- double *derv1_state;
- double *derv2_state;
- double p_invar = site_rate->getPInvar();
-
- int numCat = site_rate->getNRate();
- int numStates = model->num_states;
- int tranSize = numStates * numStates;
- int alnSize = aln->size() + model_factory->unobserved_ptns.size();
- int orig_alnSize = aln->size();
-
- double p_var_cat = (1.0 - p_invar) / (double) numCat;
- double state_freq[NSTATES];
- model->getStateFrequency(state_freq);
- double *trans_mat_orig = new double[numCat * tranSize + 1];
- double *trans_derv1_orig = new double[numCat * tranSize + 1];
- double *trans_derv2_orig = new double[numCat * tranSize + 1];
- // make alignment 16
- double *trans_mat = trans_mat_orig, *trans_derv1 = trans_derv1_orig, *trans_derv2 = trans_derv2_orig;
- if (((intptr_t) trans_mat) % 16 != 0)
- trans_mat = trans_mat + 1;
- if (((intptr_t) trans_derv1) % 16 != 0)
- trans_derv1 = trans_derv1 + 1;
- if (((intptr_t) trans_derv2) % 16 != 0)
- trans_derv2 = trans_derv2 + 1;
-
- int discrete_cat = site_rate->getNDiscreteRate();
- if (!site_rate->isSiteSpecificRate())
- for (cat = 0; cat < discrete_cat; cat++) {
- double *trans_cat = trans_mat + (cat * tranSize);
- double *derv1_cat = trans_derv1 + (cat * tranSize);
- double *derv2_cat = trans_derv2 + (cat * tranSize);
- double rate_val = site_rate->getRate(cat);
- model_factory->computeTransDervFreq(dad_branch->length, rate_val, state_freq, trans_cat, derv1_cat,
- derv2_cat);
- }
- int dad_state = aln->STATE_UNKNOWN;
- double my_df = 0.0;
- double my_ddf = 0.0;
- double prob_const = 0.0, prob_const_derv1 = 0.0, prob_const_derv2 = 0.0;
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: my_df, my_ddf,prob_const, prob_const_derv1, prob_const_derv2) \
- private(cat, partial_lh_child, partial_lh_site,\
- lh_ptn, lh_ptn_derv1, lh_ptn_derv2, derv1_frac, derv2_frac, dad_state, trans_state, derv1_state, derv2_state) schedule(static)
-#endif
- for (int ptn = 0; ptn < alnSize; ++ptn) {
-#ifdef _OPENMP
- int lh_offset = ptn*numCat*numStates;
- partial_lh_site = node_branch->partial_lh + lh_offset;
- partial_lh_child = dad_branch->partial_lh + lh_offset;
-#endif
- lh_ptn = 0.0;
- lh_ptn_derv1 = 0.0;
- lh_ptn_derv2 = 0.0;
- int padding = 0;
- dad_state = aln->STATE_UNKNOWN; // FOR TUNG: This is missing in your codes!
- if (dad->isLeaf()) {
- if (ptn < orig_alnSize)
- dad_state = (*aln)[ptn][dad->id];
- else
- dad_state = model_factory->unobserved_ptns[ptn-orig_alnSize];
- }
- padding = dad_state * NSTATES;
- if (dad_state < NSTATES) {
- //external node
- trans_state = trans_mat + padding;
- derv1_state = trans_derv1 + padding;
- derv2_state = trans_derv2 + padding;
- for (cat = 0; cat < numCat; cat++) {
- MappedVec(NSTATES)ei_partial_lh_child(partial_lh_child);
- MappedVec(NSTATES) ei_trans_state(trans_state);
- MappedVec(NSTATES) ei_derv1_state(derv1_state);
- MappedVec(NSTATES) ei_derv2_state(derv2_state);
- lh_ptn += ei_partial_lh_child.dot(ei_trans_state);
- lh_ptn_derv1 += ei_partial_lh_child.dot(ei_derv1_state);
- lh_ptn_derv2 += ei_partial_lh_child.dot(ei_derv2_state);
- partial_lh_child += NSTATES;
- partial_lh_site += NSTATES;
- trans_state += tranSize;
- derv1_state += tranSize;
- derv2_state += tranSize;
- }
- } else {
- // internal node, or external node but ambiguous character
- trans_state = trans_mat;
- derv1_state = trans_derv1;
- derv2_state = trans_derv2;
- for (cat = 0; cat < numCat; cat++) {
- MappedRowVec(NSTATES) ei_partial_lh_site(partial_lh_site);
- MappedRowVec(NSTATES) ei_partial_lh_child(partial_lh_child);
- MappedMat(NSTATES) ei_trans_state(trans_state);
- MappedMat(NSTATES) ei_derv1_state(derv1_state);
- MappedMat(NSTATES) ei_derv2_state(derv2_state);
- lh_ptn += (ei_partial_lh_child * ei_trans_state).dot(ei_partial_lh_site);
- lh_ptn_derv1 += (ei_partial_lh_child * ei_derv1_state).dot(ei_partial_lh_site);
- lh_ptn_derv2 += (ei_partial_lh_child * ei_derv2_state).dot(ei_partial_lh_site);
- partial_lh_site += NSTATES;
- partial_lh_child += NSTATES;
- trans_state += tranSize;
- derv1_state += tranSize;
- derv2_state += tranSize;
- }
- }
- if (ptn < orig_alnSize) {
- lh_ptn = lh_ptn * p_var_cat;
- if ((*aln)[ptn].const_char == NSTATES)
- lh_ptn += p_invar;
- else if ((*aln)[ptn].const_char < NSTATES) {
- lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
- }
- double pad = p_var_cat / lh_ptn;
- if (std::isinf(pad)) {
- lh_ptn_derv1 *= p_var_cat;
- lh_ptn_derv2 *= p_var_cat;
- derv1_frac = lh_ptn_derv1 / lh_ptn;
- derv2_frac = lh_ptn_derv2 / lh_ptn;
- } else {
- derv1_frac = lh_ptn_derv1 * pad;
- derv2_frac = lh_ptn_derv2 * pad;
- }
- double freq = aln->at(ptn).frequency;
- double tmp1 = derv1_frac * freq;
- double tmp2 = derv2_frac * freq;
- my_df += tmp1;
- my_ddf += tmp2 - tmp1 * derv1_frac;
- } else {
- lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_alnSize]];
- prob_const += lh_ptn;
- prob_const_derv1 += lh_ptn_derv1 * p_var_cat;
- prob_const_derv2 += lh_ptn_derv2 * p_var_cat;
- }
- }
- if (orig_alnSize < alnSize) {
- // ascertainment bias correction
- prob_const = 1.0 - prob_const;
- derv1_frac = prob_const_derv1 / prob_const;
- derv2_frac = prob_const_derv2 / prob_const;
- int nsites = aln->getNSite();
- my_df += nsites * derv1_frac;
- my_ddf += nsites *(derv2_frac + derv1_frac*derv1_frac);
- }
-
- delete[] trans_derv2_orig;
- delete[] trans_derv1_orig;
- delete[] trans_mat_orig;
- df = my_df;
- ddf = my_ddf;
-}
-
/************************************************************************************************
*
@@ -1789,11 +1357,11 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
PhyloNode *node = (PhyloNode*)(dad_branch->node);
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
if (node->isLeaf()) {
dad_branch->lh_scale_factor = 0.0;
-
- if (!tip_partial_lh_computed)
- computeTipPartialLikelihood();
return;
}
@@ -1814,21 +1382,15 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
dad_branch->lh_scale_factor = 0.0;
// internal node
- assert(node->degree() == 3); // it works only for strictly bifurcating tree
+// assert(node->degree() == 3); // it works only for strictly bifurcating tree
PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *nei = (PhyloNeighbor*)*it;
if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+ if ((nei->partial_lh_computed & 1) == 0)
+ computePartialLikelihood(nei, node);
+ dad_branch->lh_scale_factor += nei->lh_scale_factor;
}
-
- if (!left->node->isLeaf() && right->node->isLeaf()) {
- PhyloNeighbor *tmp = left;
- left = right;
- right = tmp;
- }
- if ((left->partial_lh_computed & 1) == 0)
- computeMixratePartialLikelihoodEigen(left, node);
- if ((right->partial_lh_computed & 1) == 0)
- computeMixratePartialLikelihoodEigen(right, node);
if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
// re-orient partial_lh
@@ -1848,66 +1410,186 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
assert(done && "partial_lh is not re-oriented");
}
- dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
- double *eleft = new double[block*nstates], *eright = new double[block*nstates];
+ // precompute buffer to save times
+ double *echildren = new double[block*nstates*(node->degree()-1)];
+ double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)];
+ double *echild = echildren;
+ double *partial_lh_leaf = partial_lh_leaves;
- // precompute information buffer
- for (c = 0; c < ncat; c++) {
- double *expleft = new double[nstates];
- double *expright = new double[nstates];
- double len_left = site_rate->getRate(c) * left->length;
- double len_right = site_rate->getRate(c) * right->length;
- for (i = 0; i < nstates; i++) {
- expleft[i] = exp(eval[c*nstates+i]*len_left);
- expright[i] = exp(eval[c*nstates+i]*len_right);
- }
- for (x = 0; x < nstates; x++)
- for (i = 0; i < nstates; i++) {
- eleft[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expleft[i];
- eright[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expright[i];
- }
- delete [] expright;
- delete [] expleft;
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ double expchild[nstates];
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ // precompute information buffer
+ for (c = 0; c < ncat; c++) {
+ double len_child = site_rate->getRate(c) * child->length;
+ for (i = 0; i < nstates; i++) {
+ expchild[i] = exp(eval[c*nstates+i]*len_child);
+ }
+ for (x = 0; x < nstates; x++)
+ for (i = 0; i < nstates; i++) {
+ echild[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expchild[i];
+ }
+ }
+ // pre compute information for tip
+ if (child->node->isLeaf()) {
+ vector<int>::iterator it;
+ for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+ int state = (*it);
+ for (c = 0; c < ncat; c++)
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+ for (i = 0; i < nstates; i++) {
+ vchild += echild[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+ }
+ partial_lh_leaf[state*block+c*nstates+x] = vchild;
+ }
+ }
+ size_t addr = aln->STATE_UNKNOWN * block;
+ for (x = 0; x < block; x++) {
+ partial_lh_leaf[addr+x] = 1.0;
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ }
+ echild += block*nstates;
+ }
+
+ double *eleft = echildren, *eright = echildren + block*nstates;
+
+ if (!left->node->isLeaf() && right->node->isLeaf()) {
+ PhyloNeighbor *tmp = left;
+ left = right;
+ right = tmp;
+ double *etmp = eleft;
+ eleft = eright;
+ eright = etmp;
}
+
+ if (node->degree() > 3) {
+ /*--------------------- multifurcating node ------------------*/
+ double sum_scale = 0.0;
+
+ // now for-loop computing partial_lh over all site-patterns
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_all[block];
+ for (i = 0; i < block; i++)
+ partial_lh_all[i] = 1.0;
+ dad_branch->scale_num[ptn] = 0;
+
+ double *partial_lh_leaf = partial_lh_leaves;
+ double *echild = echildren;
- if (left->node->isLeaf() && right->node->isLeaf()) {
- // special treatment for TIP-TIP (cherry) case
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ if (child->node->isLeaf()) {
+ // external node
+ int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+ double *child_lh = partial_lh_leaf + state_child*block;
+ for (c = 0; c < block; c++) {
+ partial_lh_all[c] *= child_lh[c];
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ } else {
+ // internal node
+ double *partial_lh = partial_lh_all;
+ double *partial_lh_child = child->partial_lh + ptn*block;
+ dad_branch->scale_num[ptn] += child->scale_num[ptn];
- // pre compute information for both tips
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
- double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
-
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (c = 0; c < ncat; c++)
- for (x = 0; x < nstates; x++) {
- double vleft = 0.0;
+ double *echild_ptr = echild;
+ for (c = 0; c < ncat; c++) {
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+ for (i = 0; i < nstates; i++) {
+ vchild += echild_ptr[i] * partial_lh_child[i];
+ }
+ echild_ptr += nstates;
+ partial_lh[x] *= vchild;
+ }
+ partial_lh += nstates;
+ partial_lh_child += nstates;
+ }
+ } // if
+ echild += block*nstates;
+ } // FOR_NEIGHBOR
+
+
+ // compute dot-product with inv_eigenvector
+ double lh_max = 0.0;
+ double *partial_lh_tmp = partial_lh_all;
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ double *inv_evec_ptr = inv_evec;
+ for (c = 0; c < ncat; c++) {
+ // compute dot-product with inv_eigenvector
for (i = 0; i < nstates; i++) {
- vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[i] = res;
+ lh_max = max(fabs(res), lh_max);
}
- partial_lh_left[state*block+c*nstates+x] = vleft;
+ partial_lh += nstates;
+ partial_lh_tmp += nstates;
}
- }
- for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
- int state = (*it);
- for (c = 0; c < ncat; c++)
- for (x = 0; x < nstates; x++) {
- double vright = 0.0;
- for (i = 0; i < nstates; i++) {
- vright += eright[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+ if (lh_max < SCALING_THRESHOLD) {
+ // now do the likelihood scaling
+ partial_lh = dad_branch->partial_lh + ptn*block;
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
}
- partial_lh_right[state*block+c*nstates+x] = vright;
- }
- }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+
+ } // for ptn
+ dad_branch->lh_scale_factor += sum_scale;
+
+ // end multifurcating treatment
- for (x = 0; x < block; x++) {
- size_t addr = aln->STATE_UNKNOWN * block;
- partial_lh_left[addr+x] = 1.0;
- partial_lh_right[addr+x] = 1.0;
- }
+ } else if (left->node->isLeaf() && right->node->isLeaf()) {
+ // special treatment for TIP-TIP (cherry) case
+ // pre compute information for both tips
+ double *partial_lh_left = partial_lh_leaves;
+ double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
+
+// vector<int>::iterator it;
+// for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+// int state = (*it);
+// for (c = 0; c < ncat; c++)
+// for (x = 0; x < nstates; x++) {
+// double vleft = 0.0;
+// for (i = 0; i < nstates; i++) {
+// vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+// }
+// partial_lh_left[state*block+c*nstates+x] = vleft;
+// }
+// }
+//
+// for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+// int state = (*it);
+// for (c = 0; c < ncat; c++)
+// for (x = 0; x < nstates; x++) {
+// double vright = 0.0;
+// for (i = 0; i < nstates; i++) {
+// vright += eright[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+// }
+// partial_lh_right[state*block+c*nstates+x] = vright;
+// }
+// }
+//
+// for (x = 0; x < block; x++) {
+// size_t addr = aln->STATE_UNKNOWN * block;
+// partial_lh_left[addr+x] = 1.0;
+// partial_lh_right[addr+x] = 1.0;
+// }
+//
// scale number must be ZERO
memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
@@ -1938,33 +1620,33 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
}
}
- delete [] partial_lh_right;
- delete [] partial_lh_left;
+// delete [] partial_lh_right;
+// delete [] partial_lh_left;
} else if (left->node->isLeaf() && !right->node->isLeaf()) {
// special treatment to TIP-INTERNAL NODE case
// only take scale_num from the right subtree
memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
// pre compute information for left tip
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
-
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (c = 0; c < ncat; c++)
- for (x = 0; x < nstates; x++) {
- double vleft = 0.0;
- for (i = 0; i < nstates; i++) {
- vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
- }
- partial_lh_left[state*block+c*nstates+x] = vleft;
- }
- }
- for (x = 0; x < block; x++) {
- size_t addr = aln->STATE_UNKNOWN * block;
- partial_lh_left[addr+x] = 1.0;
- }
-
+ double *partial_lh_left = partial_lh_leaves;
+
+// vector<int>::iterator it;
+// for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+// int state = (*it);
+// for (c = 0; c < ncat; c++)
+// for (x = 0; x < nstates; x++) {
+// double vleft = 0.0;
+// for (i = 0; i < nstates; i++) {
+// vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+// }
+// partial_lh_left[state*block+c*nstates+x] = vleft;
+// }
+// }
+// for (x = 0; x < block; x++) {
+// size_t addr = aln->STATE_UNKNOWN * block;
+// partial_lh_left[addr+x] = 1.0;
+// }
+//
double sum_scale = 0.0;
#ifdef _OPENMP
@@ -2012,7 +1694,7 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
dad_branch->lh_scale_factor += sum_scale;
- delete [] partial_lh_left;
+// delete [] partial_lh_left;
} else {
// both left and right are internal node
@@ -2066,8 +1748,8 @@ void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
- delete [] eright;
- delete [] eleft;
+ delete [] partial_lh_leaves;
+ delete [] echildren;
}
//template <const int nstates>
@@ -2260,14 +1942,6 @@ double PhyloTree::computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch,
double *lh_tip = tip_partial_lh + (*it)*block;
for (i = 0; i < block; i++)
lh_node[i] = val[i]*lh_tip[i];
-// double *val_tmp = val;
-// for (c = 0; c < ncat; c++) {
-// for (i = 0; i < nstates; i++) {
-// lh_node[i] = val_tmp[i] * lh_tip[i];
-// }
-// lh_node += nstates;
-// val_tmp += nstates;
-// }
}
// now do the real computation
@@ -2373,11 +2047,11 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
PhyloNode *node = (PhyloNode*)(dad_branch->node);
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
if (node->isLeaf()) {
dad_branch->lh_scale_factor = 0.0;
-
- if (!tip_partial_lh_computed)
- computeTipPartialLikelihood();
return;
}
@@ -2398,22 +2072,15 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
dad_branch->lh_scale_factor = 0.0;
// internal node
- assert(node->degree() == 3); // it works only for strictly bifurcating tree
PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *nei = (PhyloNeighbor*)*it;
if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+ if ((nei->partial_lh_computed & 1) == 0)
+ computePartialLikelihood(nei, node);
+ dad_branch->lh_scale_factor += nei->lh_scale_factor;
}
- if (!left->node->isLeaf() && right->node->isLeaf()) {
- PhyloNeighbor *tmp = left;
- left = right;
- right = tmp;
- }
- if ((left->partial_lh_computed & 1) == 0)
- computeMixturePartialLikelihoodEigen(left, node);
- if ((right->partial_lh_computed & 1) == 0)
- computeMixturePartialLikelihoodEigen(right, node);
-
if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
// re-orient partial_lh
bool done = false;
@@ -2433,92 +2100,166 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
- dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
-// double partial_lh_tmp[nstates];
- double *eleft = new double[block*nstates], *eright = new double[block*nstates];
+ double *echildren = new double[block*nstates*(node->degree()-1)];
+ double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)];
+ double *echild = echildren;
+ double *partial_lh_leaf = partial_lh_leaves;
- // precompute information buffer
- for (c = 0; c < ncat; c++) {
- double *expleft = new double[nstates];
- double *expright = new double[nstates];
- double len_left = site_rate->getRate(c) * left->length;
- double len_right = site_rate->getRate(c) * right->length;
- for (m = 0; m < nmixture; m++) {
- for (i = 0; i < nstates; i++) {
- expleft[i] = exp(eval[m*nstates+i]*len_left);
- expright[i] = exp(eval[m*nstates+i]*len_right);
- }
- for (x = 0; x < nstates; x++)
- for (i = 0; i < nstates; i++) {
- eleft[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expleft[i];
- eright[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expright[i];
- }
- }
- delete [] expright;
- delete [] expleft;
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ // precompute information buffer
+ double expchild[nstates];
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ for (c = 0; c < ncat; c++) {
+ double len_child = site_rate->getRate(c) * child->length;
+ for (m = 0; m < nmixture; m++) {
+ for (i = 0; i < nstates; i++) {
+ expchild[i] = exp(eval[m*nstates+i]*len_child);
+ }
+ for (x = 0; x < nstates; x++)
+ for (i = 0; i < nstates; i++) {
+ echild[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expchild[i];
+ }
+ }
+ }
+ if (child->node->isLeaf()) {
+ vector<int>::iterator it;
+ for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+ int state = (*it);
+ for (m = 0; m < nmixture; m++) {
+ double *this_echild = &echild[m*nstatesqr*ncat];
+ double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+ double *this_partial_lh_leaf = &partial_lh_leaf[state*block+m*statecat];
+ for (x = 0; x < statecat; x++) {
+ double vchild = 0.0;
+ for (i = 0; i < nstates; i++) {
+ vchild += this_echild[x*nstates+i] * this_tip_partial_lh[i];
+ }
+ this_partial_lh_leaf[x] = vchild;
+ }
+ }
+ }
+ size_t addr = aln->STATE_UNKNOWN * block;
+ for (x = 0; x < block; x++) {
+ partial_lh_leaf[addr+x] = 1.0;
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ }
+ echild += block*nstates;
+ }
+
+ double *eleft = echildren, *eright = echildren + block*nstates;
+
+ if (!left->node->isLeaf() && right->node->isLeaf()) {
+ PhyloNeighbor *tmp = left;
+ left = right;
+ right = tmp;
+ double *etmp = eleft;
+ eleft = eright;
+ eright = etmp;
}
+
+ if (node->degree() > 3) {
+ /*--------------------- multifurcating node ------------------*/
+ // now for-loop computing partial_lh over all site-patterns
+ double sum_scale = 0.0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_all[block];
+ for (i = 0; i < block; i++)
+ partial_lh_all[i] = 1.0;
+ dad_branch->scale_num[ptn] = 0;
+
+ double *partial_lh_leaf = partial_lh_leaves;
+ double *echild = echildren;
- if (left->node->isLeaf() && right->node->isLeaf()) {
- // special treatment for TIP-TIP (cherry) case
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ if (child->node->isLeaf()) {
+ // external node
+ int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+ double *child_lh = partial_lh_leaf + state_child*block;
+ for (c = 0; c < block; c++) {
+ // compute real partial likelihood vector
+ partial_lh_all[c] *= child_lh[c];
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ } else {
+ // internal node
+ double *partial_lh = partial_lh_all;
+ double *partial_lh_child = child->partial_lh + ptn*block;
+ dad_branch->scale_num[ptn] += child->scale_num[ptn];
+ double *echild_ptr = echild;
- // pre compute information for both tips
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
- double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
+ for (m = 0; m < nmixture; m++) {
+ for (c = 0; c < ncat; c++) {
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+// size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vchild += echild_ptr[i] * partial_lh_child[i];
+ }
+ echild_ptr += nstates;
+ partial_lh[x] *= vchild;
+ }
+ partial_lh += nstates;
+ partial_lh_child += nstates;
+ }
+ }
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (m = 0; m < nmixture; m++) {
- double *this_eleft = &eleft[m*nstatesqr*ncat];
- double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
- double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
-
-// for (c = 0; c < ncat; c++)
-// for (x = 0; x < nstates; x++) {
-// double vleft = 0.0;
-// for (i = 0; i < nstates; i++) {
-// vleft += this_eleft[(c*nstates+x)*nstates+i] * this_tip_partial_lh[i];
-// }
-// this_partial_lh_left[c*nstates+x] = vleft;
-// }
-
- for (x = 0; x < statecat; x++) {
- double vleft = 0.0;
- for (i = 0; i < nstates; i++) {
- vleft += this_eleft[x*nstates+i] * this_tip_partial_lh[i];
- }
- this_partial_lh_left[x] = vleft;
- }
- }
- }
+ } // if
+ echild += block*nstates;
+ } // FOR_NEIGHBOR
- for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
- int state = (*it);
- for (m = 0; m < nmixture; m++) {
- double *this_eright = &eright[m*nstatesqr*ncat];
- double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
- double *this_partial_lh_right = &partial_lh_right[state*block+m*statecat];
- for (x = 0; x < statecat; x++) {
- double vright = 0.0;
+ // compute dot-product with inv_eigenvector
+ double lh_max = 0.0;
+ double *partial_lh_tmp = partial_lh_all;
+ double *partial_lh = dad_branch->partial_lh+ptn*block;
+ for (m = 0; m < nmixture; m++) {
+ for (c = 0; c < ncat; c++) {
+ double *inv_evec_ptr = inv_evec + m*nstatesqr;
for (i = 0; i < nstates; i++) {
- vright += this_eright[x*nstates+i] * this_tip_partial_lh[i];
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[i] = res;
+ lh_max = max(fabs(res), lh_max);
}
- this_partial_lh_right[x] = vright;
+ partial_lh += nstates;
+ partial_lh_tmp += nstates;
}
- }
- }
+ }
+ if (lh_max < SCALING_THRESHOLD) {
+ // now do the likelihood scaling
+ partial_lh = dad_branch->partial_lh + ptn*block;
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
- size_t addr = aln->STATE_UNKNOWN * block;
- for (x = 0; x < block; x++) {
- partial_lh_left[addr+x] = 1.0;
- partial_lh_right[addr+x] = 1.0;
- }
+ } // for ptn
+ dad_branch->lh_scale_factor += sum_scale;
+
+ // end multifurcating treatment
+
+ } else if (left->node->isLeaf() && right->node->isLeaf()) {
+ // special treatment for TIP-TIP (cherry) case
+ // pre compute information for both tips
+ double *partial_lh_left = partial_lh_leaves;
+ double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
// scale number must be ZERO
memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
#ifdef _OPENMP
-//#pragma omp parallel for private(ptn, c, x, i, m)
+#pragma omp parallel for private(ptn, c, x, i, m)
#endif
for (ptn = 0; ptn < nptn; ptn++) {
double partial_lh_tmp[nstates];
@@ -2545,36 +2286,13 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
}
}
- delete [] partial_lh_right;
- delete [] partial_lh_left;
} else if (left->node->isLeaf() && !right->node->isLeaf()) {
// special treatment to TIP-INTERNAL NODE case
// only take scale_num from the right subtree
memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
// pre compute information for left tip
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
-
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (m = 0; m < nmixture; m++) {
- double *this_eleft = &eleft[m*nstatesqr*ncat];
- double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
- double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
- for (x = 0; x < statecat; x++) {
- double vleft = 0.0;
- for (i = 0; i < nstates; i++) {
- vleft += this_eleft[x*nstates+i] * this_tip_partial_lh[i];
- }
- this_partial_lh_left[x] = vleft;
- }
- }
- }
- size_t addr = aln->STATE_UNKNOWN * block;
- for (x = 0; x < block; x++) {
- partial_lh_left[addr+x] = 1.0;
- }
+ double *partial_lh_left = partial_lh_leaves;
double sum_scale = 0.0;
#ifdef _OPENMP
@@ -2624,7 +2342,6 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
}
dad_branch->lh_scale_factor += sum_scale;
- delete [] partial_lh_left;
} else {
// both left and right are internal node
@@ -2679,9 +2396,9 @@ void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch,
dad_branch->lh_scale_factor += sum_scale;
}
-
- delete [] eright;
- delete [] eleft;
+
+ delete [] partial_lh_leaves;
+ delete [] echildren;
}
//template <const int nstates>
@@ -2854,7 +2571,6 @@ double PhyloTree::computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch,
size_t block = ncat * nstates * nmixture;
size_t statemix = nstates * nmixture;
- size_t cat_states = ncat * nstates;
size_t catmix = ncat * nmixture;
size_t ptn; // for big data size > 4GB memory required
size_t c, i, m;
@@ -2908,12 +2624,6 @@ double PhyloTree::computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch,
int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
double *lh_node = partial_lh_node + state_dad*block;
for (m = 0; m < nmixture; m++) {
-// for (i = 0; i < cat_states; i++)
-// *lh_cat += lh_node[i] * partial_lh_dad[i];
-// lh_ptn += *lh_cat;
-// lh_node += cat_states;
-// partial_lh_dad += cat_states;
-// lh_cat++;
for (c = 0; c < ncat; c++) {
for (i = 0; i < nstates; i++) {
*lh_cat += lh_node[i] * partial_lh_dad[i];
@@ -2950,13 +2660,6 @@ double PhyloTree::computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch,
double *partial_lh_node = node_branch->partial_lh + ptn*block;
double *val_tmp = val;
for (m = 0; m < nmixture; m++) {
-// for (i = 0; i < cat_states; i++)
-// *lh_cat += val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
-// lh_ptn += *lh_cat;
-// partial_lh_dad += cat_states;
-// partial_lh_node += cat_states;
-// val_tmp += cat_states;
-// lh_cat++;
for (c = 0; c < ncat; c++) {
for (i = 0; i < nstates; i++) {
*lh_cat += val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
diff --git a/quartet.cpp b/quartet.cpp
index eca150d..ca2dc5d 100644
--- a/quartet.cpp
+++ b/quartet.cpp
@@ -21,6 +21,10 @@
#include "myreader.h"
// #include "lmap.c"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
#if 0 /*** moved to phylotree.h ***/
/* Index definition for counter array needed in likelihood mapping analysis (HAS) */
#define LM_REG1 0 /* top corner */
diff --git a/tools.cpp b/tools.cpp
index 85ca06c..c137a81 100644
--- a/tools.cpp
+++ b/tools.cpp
@@ -640,9 +640,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
verbose_mode = VB_MIN;
params.tree_gen = NONE;
params.user_file = NULL;
- params.fai = false;
- params.testAlpha = false;
- params.test_param = false;
+ params.opt_gammai = true;
+ params.opt_gammai_fast = false;
+ params.opt_gammai_keep_bran = false;
params.testAlphaEpsAdaptive = false;
params.randomAlpha = false;
params.testAlphaEps = 0.1;
@@ -717,6 +717,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.bootstrap_spec = NULL;
params.aln_file = NULL;
+ params.phylip_sequential_format = false;
params.treeset_file = NULL;
params.topotest_replicates = 0;
params.do_weighted_test = false;
@@ -768,6 +769,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.model_test_and_tree = 0;
params.model_test_separate_rate = false;
params.optimize_mixmodel_weight = false;
+ params.optimize_rate_matrix = false;
params.store_trans_matrix = false;
//params.freq_type = FREQ_EMPIRICAL;
params.freq_type = FREQ_UNKNOWN;
@@ -780,6 +782,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.optimize_model_rate_joint = false;
params.optimize_by_newton = true;
params.optimize_alg = "2-BFGS-B,EM";
+ params.optimize_alg_gammai = "EM";
params.fixed_branch_length = false;
params.min_branch_length = 0.0; // this is now adjusted later based on alignment length
params.max_branch_length = 100.0;
@@ -1034,6 +1037,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.optimize_alg = argv[cnt];
continue;
}
+ if (strcmp(argv[cnt], "-optalg_gammai") == 0) {
+ cnt++;
+ if (cnt >= argc)
+ throw "Use -optalg_gammai <Brent|BFGS|EM>";
+ params.optimize_alg_gammai = argv[cnt];
+ continue;
+ }
if (strcmp(argv[cnt], "-root") == 0) {
params.is_rooted = true;
continue;
@@ -1507,6 +1517,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.aln_file = argv[cnt];
continue;
}
+ if (strcmp(argv[cnt], "--sequential") == 0) {
+ params.phylip_sequential_format = true;
+ continue;
+ }
if (strcmp(argv[cnt], "-z") == 0) {
cnt++;
if (cnt >= argc)
@@ -1544,6 +1558,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
throw "Use -spp <type of partition model>";
params.partition_file = argv[cnt];
params.partition_type = 'p';
+ params.opt_gammai = false;
continue;
}
if (strcmp(argv[cnt], "-spj") == 0 || strcmp(argv[cnt], "-q") == 0) {
@@ -1552,6 +1567,8 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
throw "Use -q <type of partition model>";
params.partition_file = argv[cnt];
params.partition_type = 'j';
+ params.optimize_alg_gammai = "Brent";
+ params.opt_gammai = false;
continue;
}
if (strcmp(argv[cnt], "-M") == 0) {
@@ -1711,6 +1728,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
cnt++;
if (cnt >= argc)
throw "Use -n <#iterations>";
+ if (params.gbo_replicates != 0) {
+ outError("Ultrafast bootstrap does not work with -n option");
+ }
params.min_iterations = convert_int(argv[cnt]);
params.stop_condition = SC_FIXED_ITERATION;
// params.autostop = false;
@@ -1805,18 +1825,22 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.optimize_mixmodel_weight = true;
continue;
}
- if (strcmp(argv[cnt], "-mh") == 0) {
- params.mvh_site_rate = true;
- params.discard_saturated_site = false;
- params.SSE = LK_NORMAL;
- continue;
- }
- if (strcmp(argv[cnt], "-mhs") == 0) {
- params.mvh_site_rate = true;
- params.discard_saturated_site = true;
- params.SSE = LK_NORMAL;
+ if (strcmp(argv[cnt], "--opt-rate-mat") == 0) {
+ params.optimize_rate_matrix = true;
continue;
}
+// if (strcmp(argv[cnt], "-mh") == 0) {
+// params.mvh_site_rate = true;
+// params.discard_saturated_site = false;
+// params.SSE = LK_NORMAL;
+// continue;
+// }
+// if (strcmp(argv[cnt], "-mhs") == 0) {
+// params.mvh_site_rate = true;
+// params.discard_saturated_site = true;
+// params.SSE = LK_NORMAL;
+// continue;
+// }
if (strcmp(argv[cnt], "-rl") == 0) {
params.rate_mh_type = false;
continue;
@@ -1845,14 +1869,14 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
throw "Lambda must be in (0,1]";
continue;
}
- if (strcmp(argv[cnt], "-nosse") == 0) {
- params.SSE = LK_NORMAL;
- continue;
- }
- if (strcmp(argv[cnt], "-slowsse") == 0) {
- params.SSE = LK_SSE;
- continue;
- }
+// if (strcmp(argv[cnt], "-nosse") == 0) {
+// params.SSE = LK_NORMAL;
+// continue;
+// }
+// if (strcmp(argv[cnt], "-slowsse") == 0) {
+// params.SSE = LK_SSE;
+// continue;
+// }
if (strcmp(argv[cnt], "-fastlk") == 0) {
params.SSE = LK_EIGEN;
continue;
@@ -2381,6 +2405,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
cnt++;
if (cnt >= argc)
throw "Use -bb <#replicates>";
+ if (params.min_iterations != -1) {
+ outError("Ultrafast bootstrap does not work with -te or -n option");
+ }
params.gbo_replicates = convert_int(argv[cnt]);
// params.avoid_duplicated_trees = true;
if (params.gbo_replicates < 1000)
@@ -2546,13 +2573,20 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
continue;
}
- if (strcmp(argv[cnt], "--test-alpha") == 0) {
- params.testAlpha = true;
- continue;
- }
+ if (strcmp(argv[cnt], "-opt_gammai") == 0 || strcmp(argv[cnt], "--opt-gamma-inv") == 0) {
+ params.opt_gammai = true;
+ continue;
+ }
- if (strcmp(argv[cnt], "-test_param") == 0 || strcmp(argv[cnt], "--opt-gamma-inv") == 0) {
- params.test_param = true;
+ if (strcmp(argv[cnt], "--opt-gammai-fast") == 0) {
+ params.opt_gammai_fast = true;
+ params.opt_gammai = true;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "--opt-gammai-kb") == 0) {
+ params.opt_gammai_keep_bran = true;
+ params.opt_gammai = true;
continue;
}
@@ -2564,19 +2598,6 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.randomAlpha = true;
continue;
}
- if (strcmp(argv[cnt], "--test-alpha-eps") == 0) {
- cnt++;
- if (cnt >= argc)
- throw "Use --test-alpha-eps <logl_eps>";
- params.testAlphaEps = convert_double(argv[cnt]);
- params.testAlpha = true;
- continue;
- }
-
- if (strcmp(argv[cnt], "-fai") == 0) {
- params.fai = true;
- continue;
- }
if (strcmp(argv[cnt], "-eai") == 0) {
params.exh_ai = true;
@@ -2813,14 +2834,14 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
throw "At least 1 thread please";
continue;
}
- if (strcmp(argv[cnt], "-rootstate") == 0) {
- cnt++;
- if (cnt >= argc)
- throw "Use -rootstate <rootstate>";
- params.root_state = argv[cnt];
- params.SSE = LK_NORMAL;
- continue;
- }
+// if (strcmp(argv[cnt], "-rootstate") == 0) {
+// cnt++;
+// if (cnt >= argc)
+// throw "Use -rootstate <rootstate>";
+// params.root_state = argv[cnt];
+// params.SSE = LK_NORMAL;
+// continue;
+// }
if (strcmp(argv[cnt], "-ct") == 0) {
params.count_trees = true;
continue;
@@ -2843,6 +2864,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
}
if (strcmp(argv[cnt], "-t") == 0 || strcmp(argv[cnt], "-te") == 0) {
if (strcmp(argv[cnt], "-te") == 0) {
+ if (params.gbo_replicates != 0) {
+ outError("Ultrafast bootstrap does not work with -te option");
+ }
params.min_iterations = 0;
params.stop_condition = SC_FIXED_ITERATION;
}
@@ -3165,6 +3189,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -gmedian Median approximation for +G site rates (default: mean)" << endl
<< " --opt-gamma-inv More thorough estimation for +I+G model parameters" << endl
<< " -i <p_invar> Proportion of invariable sites (default: estimate)" << endl
+ << " -wsr Write site rates to .rate file" << endl
<< " -mh Computing site-specific rates to .mhrate file using" << endl
<< " Meyer & von Haeseler (2003) method" << endl
//<< " -c <#categories> Number of Gamma rate categories (default: 4)" << endl
@@ -3222,6 +3247,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -blfix Fix branch lengths of user tree passed via -te" << endl
<< " -blmin Min branch length for optimization (default 0.000001)" << endl
<< " -blmax Max branch length for optimization (default 100)" << endl
+ << " -wsr Write site rates and categories to .rate file" << endl
<< " -wsl Write site log-likelihoods to .sitelh file" << endl
<< " -wslr Write site log-likelihoods per rate category" << endl
<< " -wslm Write site log-likelihoods per mixture class" << endl
diff --git a/tools.h b/tools.h
index 21a2462..6426a56 100644
--- a/tools.h
+++ b/tools.h
@@ -395,7 +395,7 @@ struct NNIInfo {
};
enum LikelihoodKernel {
- LK_NORMAL, LK_SSE, LK_EIGEN, LK_EIGEN_SSE
+ LK_EIGEN, LK_EIGEN_SSE
};
enum LhMemSave {
@@ -429,20 +429,18 @@ private:
public:
/**
- * Fast and accurate optimiation for alpha and p_invar
- */
- bool fai;
-
- /**
- * Use random restart strategy for estimating alpha and p_invar
- */
- bool testAlpha;
-
- /**
* Restart the optimization of alpha and pinvar from different starting
* pinv values (supercedes the option testAlpha
*/
- bool test_param;
+ bool opt_gammai;
+
+ /**
+ * A faster version of opt_gammai using a heuristic similar to binary search.
+ * Thus, one does not need to perform 10 independent trials as in opt_gammai.
+ */
+ bool opt_gammai_fast;
+
+ bool opt_gammai_keep_bran;
/**
* Automatic adjust the log-likelihood espilon using some heuristic
@@ -657,6 +655,9 @@ public:
*/
char *aln_file;
+ /** true if sequential phylip format is used, default: false (interleaved format) */
+ bool phylip_sequential_format;
+
/**
file containing multiple trees to evaluate at the end
*/
@@ -1164,6 +1165,9 @@ public:
/** TRUE to optimize mixture model weights */
bool optimize_mixmodel_weight;
+ /** TRUE to always optimize rate matrix even if user parameters are specified in e.g. GTR{1,2,3,4,5} */
+ bool optimize_rate_matrix;
+
/**
TRUE to store transition matrix into a hash table for computation efficiency
*/
@@ -1217,6 +1221,11 @@ public:
string optimize_alg;
/**
+ * Optimization algorithm for +I+G
+ */
+ string optimize_alg_gammai;
+
+ /**
TRUE if you want to fix branch lengths during model optimization
*/
bool fixed_branch_length;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git
More information about the debian-med-commit
mailing list