[med-svn] [iqtree] 01/03: Imported Upstream version 1.4.2+dfsg
Andreas Tille
tille at debian.org
Fri Jun 24 20:13:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository iqtree.
commit a68a042d6b808c14b5aa4ff28daa98c9afba848a
Author: Andreas Tille <tille at debian.org>
Date: Fri Jun 24 21:45:20 2016 +0200
Imported Upstream version 1.4.2+dfsg
---
CMakeLists.txt | 2 +-
alignment.cpp | 4 +-
alignment.h | 4 +-
iqtree.cpp | 114 +++++----
model/modelfactory.cpp | 124 +++++++++-
model/modelfactory.h | 8 +
model/modelgtr.cpp | 1 +
model/partitionmodel.cpp | 33 +++
model/partitionmodel.h | 8 +
model/ratefree.cpp | 1 +
model/ratefree.h | 2 +-
model/ratefreeinvar.cpp | 2 +-
model/rategamma.cpp | 1 +
model/rategamma.h | 2 +-
model/rategammainvar.cpp | 36 ++-
model/rategammainvar.h | 9 +
model/rateheterogeneity.h | 16 ++
model/rateinvar.cpp | 6 +-
model/ratekategory.cpp | 1 +
msetsblock.cpp | 1 +
pda.cpp | 2 +-
phyloanalysis.cpp | 34 +--
phylosupertreeplen.cpp | 7 +
phylosupertreeplen.h | 9 +
phylotesting.cpp | 41 +++-
phylotree.cpp | 5 +-
phylotree.h | 34 +--
quartet.cpp | 611 +++++++++++++++++++++++++---------------------
superalignment.cpp | 20 +-
superalignment.h | 4 +-
tools.cpp | 52 +++-
tools.h | 23 +-
32 files changed, 811 insertions(+), 406 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3eea0ea..22a4864 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@ add_definitions(-DIQ_TREE)
# The version number.
set (iqtree_VERSION_MAJOR 1)
set (iqtree_VERSION_MINOR 4)
-set (iqtree_VERSION_PATCH "0")
+set (iqtree_VERSION_PATCH "2")
set(BUILD_SHARED_LIBS OFF)
diff --git a/alignment.cpp b/alignment.cpp
index bf6d195..4a20602 100644
--- a/alignment.cpp
+++ b/alignment.cpp
@@ -1889,7 +1889,7 @@ void Alignment::printFasta(const char *file_name, bool append, const char *aln_s
}
-void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char) {
+void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa, IntVector *kept_partitions) {
IntVector::iterator it;
for (it = seq_id.begin(); it != seq_id.end(); it++) {
assert(*it >= 0 && *it < aln->getNSeq());
@@ -1930,6 +1930,8 @@ void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_t
countConstSite();
buildSeqStates();
assert(size() <= aln->size());
+ if (kept_partitions)
+ kept_partitions->push_back(0);
}
diff --git a/alignment.h b/alignment.h
index a9dba09..1b736c9 100644
--- a/alignment.h
+++ b/alignment.h
@@ -352,8 +352,10 @@ public:
@param aln original input alignment
@param seq_id ID of sequences to extract from
@param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
+ @param min_taxa only keep alignment that has >= min_taxa sequences
+ @param[out] kept_partitions (for SuperAlignment) indices of kept partitions
*/
- virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char);
+ virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL);
/**
extract a sub-set of patterns
diff --git a/iqtree.cpp b/iqtree.cpp
index e38a1ed..2179c96 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -773,54 +773,54 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
}
void IQTree::initializePLL(Params ¶ms) {
- pllAttr.rateHetModel = PLL_GAMMA;
- pllAttr.fastScaling = PLL_FALSE;
- pllAttr.saveMemory = PLL_FALSE;
- pllAttr.useRecom = PLL_FALSE;
- pllAttr.randomNumberSeed = params.ran_seed;
- pllAttr.numberOfThreads = params.num_threads; /* This only affects the pthreads version */
- if (pllInst != NULL) {
- pllDestroyInstance(pllInst);
- }
- /* Create a PLL instance */
- pllInst = pllCreateInstance(&pllAttr);
-
- /* Read in the alignment file */
- stringstream pllAln;
- if (aln->isSuperAlignment()) {
- ((SuperAlignment*) aln)->printCombinedAlignment(pllAln);
- } else {
- aln->printPhylip(pllAln);
- }
- string pllAlnStr = pllAln.str();
- pllAlignment = pllParsePHYLIPString(pllAlnStr.c_str(), pllAlnStr.length());
-
- /* Read in the partition information */
- // BQM: to avoid printing file
- stringstream pllPartitionFileHandle;
- createPLLPartition(params, pllPartitionFileHandle);
- pllQueue *partitionInfo = pllPartitionParseString(pllPartitionFileHandle.str().c_str());
-
- /* Validate the partitions */
- if (!pllPartitionsValidate(partitionInfo, pllAlignment)) {
- outError("pllPartitionsValidate");
- }
-
- /* Commit the partitions and build a partitions structure */
- pllPartitions = pllPartitionsCommit(partitionInfo, pllAlignment);
-
- /* We don't need the the intermediate partition queue structure anymore */
- pllQueuePartitionsDestroy(&partitionInfo);
-
- /* eliminate duplicate sites from the alignment and update weights vector */
- pllAlignmentRemoveDups(pllAlignment, pllPartitions);
-
- pllTreeInitTopologyForAlignment(pllInst, pllAlignment);
-
- /* Connect the alignment and partition structure with the tree structure */
- if (!pllLoadAlignment(pllInst, pllAlignment, pllPartitions)) {
- outError("Incompatible tree/alignment combination");
- }
+ pllAttr.rateHetModel = PLL_GAMMA;
+ pllAttr.fastScaling = PLL_FALSE;
+ pllAttr.saveMemory = PLL_FALSE;
+ pllAttr.useRecom = PLL_FALSE;
+ pllAttr.randomNumberSeed = params.ran_seed;
+ pllAttr.numberOfThreads = params.num_threads; /* This only affects the pthreads version */
+ if (pllInst != NULL) {
+ pllDestroyInstance(pllInst);
+ }
+ /* Create a PLL instance */
+ pllInst = pllCreateInstance(&pllAttr);
+
+ /* Read in the alignment file */
+ stringstream pllAln;
+ if (aln->isSuperAlignment()) {
+ ((SuperAlignment *) aln)->printCombinedAlignment(pllAln);
+ } else {
+ aln->printPhylip(pllAln);
+ }
+ string pllAlnStr = pllAln.str();
+ pllAlignment = pllParsePHYLIPString(pllAlnStr.c_str(), pllAlnStr.length());
+
+ /* Read in the partition information */
+ // BQM: to avoid printing file
+ stringstream pllPartitionFileHandle;
+ createPLLPartition(params, pllPartitionFileHandle);
+ pllQueue *partitionInfo = pllPartitionParseString(pllPartitionFileHandle.str().c_str());
+
+ /* Validate the partitions */
+ if (!pllPartitionsValidate(partitionInfo, pllAlignment)) {
+ outError("pllPartitionsValidate");
+ }
+
+ /* Commit the partitions and build a partitions structure */
+ pllPartitions = pllPartitionsCommit(partitionInfo, pllAlignment);
+
+ /* We don't need the the intermediate partition queue structure anymore */
+ pllQueuePartitionsDestroy(&partitionInfo);
+
+ /* eliminate duplicate sites from the alignment and update weights vector */
+ pllAlignmentRemoveDups(pllAlignment, pllPartitions);
+
+ pllTreeInitTopologyForAlignment(pllInst, pllAlignment);
+
+ /* Connect the alignment and partition structure with the tree structure */
+ if (!pllLoadAlignment(pllInst, pllAlignment, pllPartitions)) {
+ outError("Incompatible tree/alignment combination");
+ }
}
@@ -1424,7 +1424,7 @@ double IQTree::getAlphaFromPLL() {
void IQTree::inputModelPLL2IQTree() {
// TODO add support for partitioned model
- dynamic_cast<RateGamma*>(getRate())->setGammaShape(pllPartitions->partitionData[0]->alpha);
+ getRate()->setGammaShape(pllPartitions->partitionData[0]->alpha);
if (aln->num_states == 4) {
((ModelGTR*) getModel())->setRateMatrix(pllPartitions->partitionData[0]->substRates);
getModel()->decomposeRateMatrix();
@@ -1720,6 +1720,8 @@ extern pllUFBootData * pllUFBootDataPtr;
string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
if (logl_epsilon == -1)
logl_epsilon = params->modeps;
+// if (params->test_param)
+// logl_epsilon = 1.0;
cout << "Estimate model parameters (epsilon = " << logl_epsilon << ")" << endl;
double stime = getRealTime();
string newTree;
@@ -1743,8 +1745,18 @@ string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
if (printInfo)
cout << etime - stime << " seconds (logl: " << curScore << ")" << endl;
} else {
- double modOptScore =
- getModelFactory()->optimizeParameters(params->fixed_branch_length, printInfo, logl_epsilon);
+ double modOptScore;
+ if (params->test_param) { // DO RESTART ON ALPHA AND P_INVAR
+ double stime = getRealTime();
+ modOptScore = getModelFactory()->optimizeParametersGammaInvar(params->fixed_branch_length, printInfo, logl_epsilon);
+ double etime = getRealTime();
+ cout << "Testing param took: " << etime -stime << " CPU seconds" << endl;
+ cout << endl;
+ params->test_param = false;
+ } else {
+ modOptScore = getModelFactory()->optimizeParameters(params->fixed_branch_length, printInfo, logl_epsilon);
+ }
+
if (isSuperTree()) {
((PhyloSuperTree*) this)->computeBranchLengths();
}
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
index fc4de9a..bb7efbd 100644
--- a/model/modelfactory.cpp
+++ b/model/modelfactory.cpp
@@ -693,22 +693,20 @@ bool ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector
double ModelFactory::initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst *model, double initAlpha,
double initPInvar, double *initRates, double *initStateFreqs) {
- RateGammaInvar* rateGammaInvar = dynamic_cast<RateGammaInvar*>(rate);
- ModelGTR* modelGTR = dynamic_cast<ModelGTR*>(model);
+ RateHeterogeneity* rateGammaInvar = rate;
+ ModelGTR* modelGTR = (ModelGTR*)(model);
modelGTR->setRateMatrix(initRates);
modelGTR->setStateFrequency(initStateFreqs);
rateGammaInvar->setGammaShape(initAlpha);
rateGammaInvar->setPInvar(initPInvar);
modelGTR->decomposeRateMatrix();
- rateGammaInvar->computeRates();
site_rate->phylo_tree->clearAllPartialLH();
return site_rate->phylo_tree->computeLikelihood();
}
double ModelFactory::optimizeParametersOnly(double gradient_epsilon) {
double logl;
- if (Params::getInstance().fai && dynamic_cast<RateGammaInvar*>(site_rate) != NULL
- && dynamic_cast<ModelGTR*>(model) != NULL) {
+ if (Params::getInstance().fai && site_rate != NULL && model != NULL) {
cout << "Optimize substitutional and site rates with restart ..." << endl;
PhyloTree* tree = site_rate->phylo_tree;
double initAlpha = 0.1;
@@ -740,8 +738,8 @@ double ModelFactory::optimizeParametersOnly(double gradient_epsilon) {
site_rate->optimizeParameters(gradient_epsilon);
logl = tree->optimizeAllBranches(1);
}
- RateGammaInvar* rateGammaInvar = dynamic_cast<RateGammaInvar*>(site_rate);
- ModelGTR* modelGTR = dynamic_cast<ModelGTR*>(model);
+ RateHeterogeneity* rateGammaInvar = site_rate;
+ ModelGTR* modelGTR = (ModelGTR*)(model);
double curAlpha = rateGammaInvar->getGammaShape();
double curPInvar = rateGammaInvar->getPInvar();
if (logl > bestLogl) {
@@ -821,6 +819,8 @@ double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
model->decomposeRateMatrix();
site_rate->phylo_tree->clearAllPartialLH();
+ score = site_rate->phylo_tree->computeLikelihood();
+
delete [] bound_check;
delete [] lower_bound;
delete [] upper_bound;
@@ -829,6 +829,112 @@ double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
return score;
}
+double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+ PhyloTree *tree = site_rate->getTree();
+
+ RateHeterogeneity* site_rates = tree->getRate();
+ if (site_rates == NULL) {
+// outError("The model must be +I+G");
+ // model is not +I+G, call conventional function instead
+ return optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
+ }
+
+ double frac_const = tree->aln->frac_const_sites;
+ if (fixed_len) {
+ tree->setCurScore(tree->computeLikelihood());
+ } else {
+ tree->optimizeAllBranches(1);
+ }
+
+
+ /* Back up branch lengths and substitutional rates */
+ DoubleVector lenvec;
+ DoubleVector bestLens;
+ tree->saveBranchLengths(lenvec);
+ int numRateEntries = tree->getModel()->getNumRateEntries();
+ double *rates = new double[numRateEntries];
+ double *bestRates = new double[numRateEntries];
+ tree->getModel()->getRateMatrix(rates);
+ int numStates = tree->aln->num_states;
+ double *state_freqs = new double[numStates];
+ tree->getModel()->getStateFrequency(state_freqs);
+
+ /* Best estimates found */
+ double *bestStateFreqs = new double[numStates];
+ double bestLogl = tree->getCurScore();
+ double bestAlpha = 0.0;
+ double bestPInvar = 0.0;
+
+ double testInterval = (frac_const - MIN_PINVAR*2) / 10;
+ double initPInv = MIN_PINVAR;
+ double initAlpha = site_rates->getGammaShape();
+
+ if (write_info)
+ cout << "testInterval: " << testInterval << endl;
+
+ // Now perform testing different inital p_inv values
+ while (initPInv <= frac_const) {
+ if (write_info) {
+ cout << endl;
+ cout << "Testing with init. pinv = " << initPInv << " / init. alpha = " << initAlpha << endl;
+ }
+ tree->restoreBranchLengths(lenvec);
+ ((ModelGTR*) tree->getModel())->setRateMatrix(rates);
+ ((ModelGTR*) tree->getModel())->setStateFrequency(state_freqs);
+ tree->getModel()->decomposeRateMatrix();
+ site_rates->setPInvar(initPInv);
+ site_rates->setGammaShape(initAlpha);
+ tree->clearAllPartialLH();
+ optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
+ double estAlpha = tree->getRate()->getGammaShape();
+ double estPInv = tree->getRate()->getPInvar();
+ double logl = tree->getCurScore();
+ if (write_info) {
+ cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
+ << " / Logl: " << logl << endl;
+ }
+ initPInv = initPInv + testInterval;
+
+ if (tree->getCurScore() > bestLogl) {
+ bestLogl = logl;
+ bestAlpha = estAlpha;
+ bestPInvar = estPInv;
+ bestLens.clear();
+ tree->saveBranchLengths(bestLens);
+ tree->getModel()->getRateMatrix(bestRates);
+ tree->getModel()->getStateFrequency(bestStateFreqs);
+ }
+ }
+
+ site_rates->setGammaShape(bestAlpha);
+// site_rates->setFixGammaShape(false);
+ site_rates->setPInvar(bestPInvar);
+// site_rates->setFixPInvar(false);
+ ((ModelGTR*) tree->getModel())->setRateMatrix(bestRates);
+ ((ModelGTR*) tree->getModel())->setStateFrequency(bestStateFreqs);
+ tree->restoreBranchLengths(bestLens);
+ tree->getModel()->decomposeRateMatrix();
+ tree->clearAllPartialLH();
+ tree->setCurScore(tree->computeLikelihood());
+ if (write_info) {
+ cout << endl;
+ cout << "Best initial alpha: " << bestAlpha << " / initial pinv: " << bestPInvar << " / ";
+ cout << "Logl: " << tree->getCurScore() << endl;
+ }
+
+ delete [] rates;
+ delete [] state_freqs;
+ delete [] bestRates;
+ delete [] bestStateFreqs;
+
+ // updating global variable is not safe!
+// Params::getInstance().testAlpha = false;
+
+ // 2016-03-14: this was missing!
+ return tree->getCurScore();
+}
+
+
double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
double logl_epsilon, double gradient_epsilon) {
assert(model);
@@ -950,13 +1056,14 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
}
double elapsed_secs = getRealTime() - begin_time;
if (write_info)
- cout << "Parameters optimization took " << i-1 << " rounds (" << elapsed_secs << " sec)" << endl << endl;
+ cout << "Parameters optimization took " << i-1 << " rounds (" << elapsed_secs << " sec)" << endl;
startStoringTransMatrix();
// For UpperBounds -----------
tree->mlCheck = 1;
// ---------------------------
+ tree->setCurScore(cur_lh);
return cur_lh;
}
@@ -1105,3 +1212,4 @@ bool ModelFactory::getVariables(double *variables) {
return changed;
}
+
diff --git a/model/modelfactory.h b/model/modelfactory.h
index a4334d5..baedb6e 100644
--- a/model/modelfactory.h
+++ b/model/modelfactory.h
@@ -169,6 +169,14 @@ public:
double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
+ * optimize model parameters and tree branch lengths for the +I+G model
+ * using restart strategy.
+ * @param fixed_len TRUE to fix branch lengths, default is false
+ * @return the best likelihood
+ */
+ virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+ /**
* @return TRUE if parameters are at the boundary that may cause numerical unstability
*/
virtual bool isUnstableParameters();
diff --git a/model/modelgtr.cpp b/model/modelgtr.cpp
index 39e18d8..557a291 100644
--- a/model/modelgtr.cpp
+++ b/model/modelgtr.cpp
@@ -592,6 +592,7 @@ double ModelGTR::optimizeParameters(double gradient_epsilon) {
if (changed) {
decomposeRateMatrix();
phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
}
delete [] bound_check;
diff --git a/model/partitionmodel.cpp b/model/partitionmodel.cpp
index cb853c1..bddbecd 100644
--- a/model/partitionmodel.cpp
+++ b/model/partitionmodel.cpp
@@ -175,6 +175,39 @@ double PartitionModel::optimizeParameters(bool fixed_len, bool write_info, doubl
return tree_lh;
}
+
+double PartitionModel::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+ PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
+ double tree_lh = 0.0;
+ int ntrees = tree->size();
+
+ if (tree->part_order.empty()) tree->computePartitionOrder();
+ #ifdef _OPENMP
+ #pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= tree->params->num_threads)
+ #endif
+ for (int i = 0; i < ntrees; i++) {
+ int part = tree->part_order[i];
+ if (write_info)
+ #ifdef _OPENMP
+ #pragma omp critical
+ #endif
+ {
+ cout << "Optimizing " << tree->at(part)->getModelName() <<
+ " parameters for partition " << tree->part_info[part].name <<
+ " (" << tree->at(part)->getModelFactory()->getNParameters() << " free parameters)" << endl;
+ }
+ tree_lh += tree->at(part)->getModelFactory()->optimizeParametersGammaInvar(fixed_len, write_info && verbose_mode >= VB_MED,
+ logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
+ }
+ //return ModelFactory::optimizeParameters(fixed_len, write_info);
+
+ if (tree->params->link_alpha) {
+ tree_lh = optimizeLinkedAlpha(write_info, gradient_epsilon);
+ }
+ return tree_lh;
+}
+
+
PartitionModel::~PartitionModel()
{
}
diff --git a/model/partitionmodel.h b/model/partitionmodel.h
index 8fe7fda..2a34473 100644
--- a/model/partitionmodel.h
+++ b/model/partitionmodel.h
@@ -73,6 +73,14 @@ public:
virtual double optimizeParameters(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
+ * optimize model parameters and tree branch lengths for the +I+G model
+ * using restart strategy.
+ * @param fixed_len TRUE to fix branch lengths, default is false
+ * @return the best likelihood
+ */
+ virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+ /**
* @return TRUE if parameters are at the boundary that may cause numerical unstability
*/
virtual bool isUnstableParameters();
diff --git a/model/ratefree.cpp b/model/ratefree.cpp
index ba5bcb6..3265a2b 100644
--- a/model/ratefree.cpp
+++ b/model/ratefree.cpp
@@ -248,6 +248,7 @@ double RateFree::optimizeParameters(double gradient_epsilon) {
if (sorted_rates)
quicksort(rates, 0, ncategory-1, prop);
phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
}
optimizing_params = 0;
diff --git a/model/ratefree.h b/model/ratefree.h
index 327012b..062985a 100644
--- a/model/ratefree.h
+++ b/model/ratefree.h
@@ -10,7 +10,7 @@
#include "rategamma.h"
-class RateFree: virtual public RateGamma {
+class RateFree: public RateGamma {
public:
/**
constructor
diff --git a/model/ratefreeinvar.cpp b/model/ratefreeinvar.cpp
index b3b8c01..0ec731c 100644
--- a/model/ratefreeinvar.cpp
+++ b/model/ratefreeinvar.cpp
@@ -8,7 +8,7 @@
#include "ratefreeinvar.h"
RateFreeInvar::RateFreeInvar(int ncat, double start_alpha, string params, bool sorted_rates, double p_invar_sites, string opt_alg, PhyloTree *tree)
-: RateGamma(ncat, 1.0, false, tree), RateInvar(p_invar_sites, tree), RateFree(ncat, start_alpha, params, sorted_rates, opt_alg, tree)
+: RateInvar(p_invar_sites, tree), RateFree(ncat, start_alpha, params, sorted_rates, opt_alg, tree)
{
cur_optimize = 0;
name = "+I" + name;
diff --git a/model/rategamma.cpp b/model/rategamma.cpp
index 6957c3f..48e0814 100644
--- a/model/rategamma.cpp
+++ b/model/rategamma.cpp
@@ -150,6 +150,7 @@ void RateGamma::computeRatesMean () {
void RateGamma::setGammaShape(double gs) {
gamma_shape = gs;
+ computeRates();
}
double RateGamma::computeFunction(double shape) {
diff --git a/model/rategamma.h b/model/rategamma.h
index 8be01b6..19fc809 100644
--- a/model/rategamma.h
+++ b/model/rategamma.h
@@ -186,7 +186,7 @@ public:
return fix_gamma_shape;
}
- void setFixGammaShape(bool fixGammaShape) {
+ virtual void setFixGammaShape(bool fixGammaShape) {
fix_gamma_shape = fixGammaShape;
}
diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp
index dc293bd..2b0641d 100644
--- a/model/rategammainvar.cpp
+++ b/model/rategammainvar.cpp
@@ -60,7 +60,7 @@ string RateGammaInvar::getNameParams() {
double RateGammaInvar::computeFunction(double value) {
if (cur_optimize == 0)
gamma_shape = value;
- else
+ else
p_invar = value;
// need to compute rates again if p_inv or Gamma shape changes!
computeRates();
@@ -114,8 +114,9 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
if (ndim == 0)
return phylo_tree->computeLikelihood();
+
if (!joint_optimize) {
-// double lh = phylo_tree->computeLikelihood();
+ double lh = phylo_tree->computeLikelihood();
cur_optimize = 0;
double gamma_lh;
if (Params::getInstance().testAlpha) {
@@ -123,19 +124,41 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
} else {
gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
}
+ assert(gamma_lh >= lh-0.1);
cur_optimize = 1;
double invar_lh = -DBL_MAX;
invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
-// assert(tree_lh >= lh-0.1);
-// lh = tree_lh;
+ assert(invar_lh >= gamma_lh-0.1);
+ //lh = tree_lh;
-// assert(gamma_lh >= invar_lh - 0.1);
- phylo_tree->clearAllPartialLH();
+ //assert(gamma_lh >= invar_lh - 0.1);
+// phylo_tree->clearAllPartialLH();
// return gamma_lh;
cur_optimize = 0;
return invar_lh;
}
+/*
+ if (!joint_optimize) {
+// double lh = phylo_tree->computeLikelihood();
+ cur_optimize = 1;
+ double invar_lh = -DBL_MAX;
+ invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
+// assert(tree_lh >= lh-0.1);
+// lh = tree_lh;
+ cur_optimize = 0;
+ double gamma_lh;
+ if (Params::getInstance().testAlpha) {
+ gamma_lh = RateGamma::optimizeParameters(gradient_epsilon, 0.05, 10);
+ } else {
+ gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
+ }
+ //assert(gamma_lh >= invar_lh - 0.1);
+ phylo_tree->clearAllPartialLH();
+ return gamma_lh;
+ }
+*/
+
if (verbose_mode >= VB_MAX)
cout << "Optimizing " << name << " model parameters by BFGS..." << endl;
@@ -156,6 +179,7 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
getVariables(variables);
phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
delete [] bound_check;
delete [] lower_bound;
diff --git a/model/rategammainvar.h b/model/rategammainvar.h
index c4092c8..6d7926c 100644
--- a/model/rategammainvar.h
+++ b/model/rategammainvar.h
@@ -64,6 +64,15 @@ public:
virtual double getRate(int category) { return RateGamma::getRate(category); }
/**
+ set the proportion of invariable sites. Default: do nothing
+ @param pinv the proportion of invariable sites
+ */
+ virtual void setPInvar(double pInvar) {
+ p_invar = pInvar;
+ computeRates();
+ }
+
+ /**
* @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
*/
virtual string getNameParams();
diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h
index 85e14c6..51bab81 100644
--- a/model/rateheterogeneity.h
+++ b/model/rateheterogeneity.h
@@ -146,6 +146,11 @@ public:
*/
virtual void setPInvar(double pinv) { }
+ /**
+ set whether to fix p_invar
+ */
+ virtual void setFixPInvar(bool fixPInvar) {}
+
/**
Set whether or not to optimize p_invar
@param opt TRUE to optimize p_invar, FALSE otherwise
@@ -159,6 +164,17 @@ public:
virtual double getGammaShape() { return 0.0; }
/**
+ set the Gamma shape. Default: nothing
+ @param gs Gamma shape
+ */
+ virtual void setGammaShape(double gs) {}
+
+ /**
+ set whether to fix gamma shape
+ */
+ virtual void setFixGammaShape(bool fixGammaShape) {}
+
+ /**
@return >0 if this is a Gamma model (default: 0)
*/
virtual int isGammaRate() { return 0; }
diff --git a/model/rateinvar.cpp b/model/rateinvar.cpp
index 58b7d52..55988c5 100644
--- a/model/rateinvar.cpp
+++ b/model/rateinvar.cpp
@@ -93,10 +93,10 @@ double RateInvar::optimizeParameters(double gradient_epsilon) {
double ferror;
p_invar = minimizeOneDimen(MIN_PINVAR, p_invar, min(phylo_tree->aln->frac_const_sites, 1.0-MIN_PINVAR), max(gradient_epsilon, TOL_PINVAR), &negative_lh, &ferror);
//p_invar = minimizeOneDimen(MIN_PINVAR, p_invar, 1.0 - MIN_PINVAR, TOL_PINVAR, &negative_lh, &ferror);
- phylo_tree->clearAllPartialLH();
- phylo_tree->computePtnInvar();
+// phylo_tree->clearAllPartialLH();
+// phylo_tree->computePtnInvar();
// return -negative_lh;
- return phylo_tree->computeLikelihood();
+ return -computeFunction(p_invar);
}
void RateInvar::writeInfo(ostream &out) {
diff --git a/model/ratekategory.cpp b/model/ratekategory.cpp
index fe20122..64ceb09 100644
--- a/model/ratekategory.cpp
+++ b/model/ratekategory.cpp
@@ -86,6 +86,7 @@ double RateKategory::optimizeParameters(double gradient_epsilon)
getVariables(variables);
//sort(rates, rates+ncategory);
phylo_tree->clearAllPartialLH();
+ score = phylo_tree->computeLikelihood();
delete [] bound_check;
delete [] lower_bound;
diff --git a/msetsblock.cpp b/msetsblock.cpp
index 839467e..d4309f1 100644
--- a/msetsblock.cpp
+++ b/msetsblock.cpp
@@ -227,6 +227,7 @@ void MSetsBlock::Read(NxsToken &token)
errormsg += " instead";
throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
}
+ token.SetLabileFlagBit(NxsToken::preserveUnderscores);
if (token.Equals(";"))
break;
else
diff --git a/pda.cpp b/pda.cpp
index 9a17049..9f9db7d 100644
--- a/pda.cpp
+++ b/pda.cpp
@@ -2189,7 +2189,7 @@ int main(int argc, char *argv[])
bool append_log = false;
- if (!Params::getInstance().ignore_checkpoint) {
+ if (!Params::getInstance().ignore_checkpoint && fileExists(filename)) {
checkpoint->load();
if (checkpoint->hasKey("finished")) {
if (checkpoint->getBool("finished")) {
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
index d321ac3..ebfffcf 100644
--- a/phyloanalysis.cpp
+++ b/phyloanalysis.cpp
@@ -681,7 +681,7 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model,
reportRate(out, tree);
}
- if (params.lmap_num_quartets) {
+ if (params.lmap_num_quartets >= 0) {
tree.reportLikelihoodMapping(out);
}
@@ -1080,7 +1080,7 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model,
cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh" << endl;
}
}
- if (params.lmap_num_quartets) {
+ if (params.lmap_num_quartets >= 0) {
cout << " Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
cout << " Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
}
@@ -1727,6 +1727,9 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
iqtree.initializeAllPartialLh();
double initEpsilon = params.min_iterations == 0 ? params.modeps : (params.modeps*10);
+ if (params.test_param)
+ initEpsilon = 0.1;
+
string initTree;
if (iqtree.getRate()->name.find("+I+G") != string::npos) {
@@ -1747,6 +1750,7 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
cout << "Testing alpha took: " << etime -stime << " CPU seconds" << endl;
cout << endl;
}
+
}
// Optimize model parameters and branch lengths using ML for the initial tree
@@ -1767,11 +1771,16 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
iqtree.getCheckpoint()->dump();
}
- if (params.lmap_num_quartets) {
- cout << "Performing likelihood mapping with " << params.lmap_num_quartets << " quartets..." << endl;
+ if (params.lmap_num_quartets >= 0) {
+ cout << endl << "Performing likelihood mapping with ";
+ if (params.lmap_num_quartets > 0)
+ cout << params.lmap_num_quartets;
+ else
+ cout << "all";
+ cout << " quartets..." << endl;
double lkmap_time = getRealTime();
iqtree.doLikelihoodMapping();
- cout << getRealTime()-lkmap_time << " seconds" << endl;
+ cout << "Likelihood mapping needed " << getRealTime()-lkmap_time << " seconds" << endl << endl;
}
bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
@@ -1942,10 +1951,7 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
} else {
cout << "Performs final model parameters optimization" << endl;
string tree;
- if (params.testAlpha)
- tree = iqtree.optimizeModelParameters(true, 0.001);
- else
- tree = iqtree.optimizeModelParameters(true);
+ tree = iqtree.optimizeModelParameters(true);
iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
iqtree.getCheckpoint()->putBool("finishedModelFinal", true);
iqtree.saveCheckpoint();
@@ -2038,7 +2044,7 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
}
void computeLoglFromUserInputGAMMAInvar(Params ¶ms, IQTree &iqtree) {
- RateGammaInvar *site_rates = dynamic_cast<RateGammaInvar *>(iqtree.getRate());
+ RateHeterogeneity *site_rates = iqtree.getRate();
site_rates->setFixPInvar(true);
site_rates->setFixGammaShape(true);
vector<double> alphas, p_invars, logl;
@@ -2070,7 +2076,6 @@ void computeLoglFromUserInputGAMMAInvar(Params ¶ms, IQTree &iqtree) {
aiFileResults << alphas.at(i) << " " << p_invars.at(i) << " ";
site_rates->setGammaShape(alphas.at(i));
site_rates->setPInvar(p_invars.at(i));
- site_rates->computeRates();
iqtree.clearAllPartialLH();
double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
aiFileResults << lh << " " << iqtree.treeLength() << "\n";
@@ -2086,7 +2091,7 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree) {
iqtree.setCurScore(iqtree.optimizeAllBranches(1));
else
iqtree.setCurScore(iqtree.computeLikelihood());
- RateGammaInvar* site_rates = dynamic_cast<RateGammaInvar*>(iqtree.getRate());
+ RateHeterogeneity* site_rates = (iqtree.getRate());
double values[] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 };
vector<double> initAlphas;
if (Params::getInstance().randomAlpha) {
@@ -2125,7 +2130,6 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree) {
iqtree.getModel()->decomposeRateMatrix();
site_rates->setGammaShape(initAlphas[i]);
site_rates->setPInvar(initPInvar);
- site_rates->computeRates();
iqtree.clearAllPartialLH();
iqtree.optimizeModelParameters(verbose_mode >= VB_MED, Params::getInstance().testAlphaEps);
double estAlpha = iqtree.getRate()->getGammaShape();
@@ -2152,7 +2156,6 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree) {
((ModelGTR*) iqtree.getModel())->setStateFrequency(bestStateFreqs);
iqtree.restoreBranchLengths(bestLens);
iqtree.getModel()->decomposeRateMatrix();
- site_rates->computeRates();
iqtree.clearAllPartialLH();
iqtree.setCurScore(iqtree.computeLikelihood());
cout << endl;
@@ -2183,7 +2186,7 @@ void exhaustiveSearchGAMMAInvar(Params ¶ms, IQTree &iqtree) {
DoubleVector lenvec;
iqtree.saveBranchLengths(lenvec);
- RateGammaInvar* site_rates = dynamic_cast<RateGammaInvar*>(iqtree.getRate());
+ RateHeterogeneity* site_rates = (iqtree.getRate());
site_rates->setFixPInvar(true);
site_rates->setFixGammaShape(true);
@@ -2191,7 +2194,6 @@ void exhaustiveSearchGAMMAInvar(Params ¶ms, IQTree &iqtree) {
for (double p_invar = p_invarMin; p_invar < p_invarMax; p_invar = p_invar + stepSize) {
site_rates->setGammaShape(alpha);
site_rates->setPInvar(p_invar);
- site_rates->computeRates();
iqtree.clearAllPartialLH();
double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
stringstream ss;
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
index 6782e9d..bc93d40 100644
--- a/phylosupertreeplen.cpp
+++ b/phylosupertreeplen.cpp
@@ -137,6 +137,8 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
if (tree->part_info[part].cur_score == 0.0)
tree->part_info[part].cur_score = tree->at(part)->computeLikelihood();
cur_lh += tree->part_info[part].cur_score;
+
+
// normalize rates s.t. branch lengths are #subst per site
double mean_rate = tree->at(part)->getRate()->rescaleRates();
@@ -197,6 +199,11 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
return tree_lh;
}
+double PartitionModelPlen::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+ outError("This option does not work with edge-linked partition model yet");
+ return 0.0;
+}
+
void PartitionModelPlen::writeInfo(ostream &out) {
PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
int ntrees = tree->size();
diff --git a/phylosupertreeplen.h b/phylosupertreeplen.h
index d54c46f..baa2979 100644
--- a/phylosupertreeplen.h
+++ b/phylosupertreeplen.h
@@ -130,6 +130,15 @@ public:
virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+ /**
+ * optimize model parameters and tree branch lengths for the +I+G model
+ * using restart strategy.
+ * @param fixed_len TRUE to fix branch lengths, default is false
+ * @return the best likelihood
+ */
+ virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
double optimizeGeneRate(double tol);
// virtual double targetFunk(double x[]);
diff --git a/phylotesting.cpp b/phylotesting.cpp
index 272b9e9..43e9a50 100644
--- a/phylotesting.cpp
+++ b/phylotesting.cpp
@@ -991,7 +991,7 @@ void testPartitionModel(Params ¶ms, PhyloSuperTree* in_tree, vector<ModelInf
this_aln = in_tree->at(distID[i] & ((1<<16)-1))->aln;
dist[i] -= ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
}
- if (params.num_threads > 1)
+ if (params.num_threads > 1 && num_pairs >= 1)
quicksort(dist, 0, num_pairs-1, distID);
#ifdef _OPENMP
@@ -1195,15 +1195,18 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
in_tree->optimize_by_newton = params.optimize_by_newton;
in_tree->setLikelihoodKernel(params.SSE);
- int num_rate_classes = 3 + params.max_rate_cats;
+// int num_rate_classes = 3 + params.max_rate_cats;
- RateHeterogeneity ** rate_class = new RateHeterogeneity*[num_rate_classes];
+ RateHeterogeneity ** rate_class = new RateHeterogeneity*[4];
rate_class[0] = new RateHeterogeneity();
rate_class[1] = new RateInvar(-1, NULL);
rate_class[2] = new RateGamma(params.num_rate_cats, params.gamma_shape, params.gamma_median, NULL);
rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_model_rate_joint, NULL);
- for (model = 4; model < num_rate_classes; model++)
- rate_class[model] = new RateFree(model-2, params.gamma_shape, "", false, params.optimize_alg, NULL);
+
+ RateFree ** rate_class_free = new RateFree*[params.max_rate_cats-1];
+
+ for (model = 0; model < params.max_rate_cats-1; model++)
+ rate_class_free[model] = new RateFree(model+2, params.gamma_shape, "", false, params.optimize_alg, NULL);
ModelGTR *subst_model = NULL;
if (seq_type == SEQ_BINARY)
@@ -1348,7 +1351,7 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
if (ncat <= 1) outError("Number of rate categories for " + model_names[model] + " is <= 1");
if (ncat > params.max_rate_cats)
outError("Number of rate categories for " + model_names[model] + " exceeds " + convertIntToString(params.max_rate_cats));
- tree->setRate(rate_class[2+ncat]);
+ tree->setRate(rate_class_free[ncat-2]);
} else if (model_names[model].find("+I") != string::npos && (pos = model_names[model].find("+G")) != string::npos) {
tree->setRate(rate_class[3]);
if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
@@ -1428,6 +1431,10 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
outError("-mtree option is not supported for partition model");
}
IQTree *iqtree = new IQTree(in_tree->aln);
+ // set checkpoint
+ iqtree->setCheckpoint(in_tree->getCheckpoint());
+ iqtree->num_precision = in_tree->num_precision;
+
cout << endl << "===> Testing model " << model+1 << ": " << params.model_name << endl;
runTreeReconstruction(params, original_model, *iqtree, model_info);
info.logl = iqtree->computeLikelihood();
@@ -1436,6 +1443,16 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
params.model_name = original_model;
params.user_file = orig_user_tree;
tree = iqtree;
+
+ // clear all checkpointed information
+ Checkpoint *newCheckpoint = new Checkpoint;
+ tree->getCheckpoint()->getSubCheckpoint(newCheckpoint, "iqtree");
+ tree->getCheckpoint()->clear();
+ tree->getCheckpoint()->insert(newCheckpoint->begin(), newCheckpoint->end());
+ tree->getCheckpoint()->putBool("finished", false);
+ tree->getCheckpoint()->dump(true);
+ delete newCheckpoint;
+
} else {
if (tree->getMemoryRequired() > RAM_requirement) {
tree->deleteAllPartialLh();
@@ -1463,7 +1480,8 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
{
if (verbose_mode >= VB_MED)
cout << "reoptimizing from previous parameters of +R...." << endl;
- dynamic_cast<RateFree*>(rate_class[2+ncat])->setRateAndProp(dynamic_cast<RateFree*>(rate_class[1+ncat]));
+ assert(ncat >= 3);
+ rate_class_free[ncat-2]->setRateAndProp(rate_class_free[ncat-3]);
info.logl = tree->getModelFactory()->optimizeParameters(false, false, TOL_LIKELIHOOD_MODELTEST, TOL_GRADIENT_MODELTEST);
info.tree_len = tree->treeLength();
}
@@ -1688,10 +1706,17 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
delete model_fac;
delete subst_model;
- for (int rate_type = num_rate_classes-1; rate_type >= 0; rate_type--) {
+ int rate_type;
+ for (rate_type = 3; rate_type >= 0; rate_type--) {
delete rate_class[rate_type];
}
delete [] rate_class;
+
+ for (rate_type = params.max_rate_cats-2; rate_type >= 0; rate_type--) {
+ delete rate_class_free[rate_type];
+ }
+ delete [] rate_class_free;
+
// delete tree_hetero;
// delete tree_homo;
in_tree->deleteAllPartialLh();
diff --git a/phylotree.cpp b/phylotree.cpp
index bdc9da4..2221943 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -360,7 +360,10 @@ void PhyloTree::readTreeString(const string &tree_string) {
// str(tree_string);
// str.seekg(0, ios::beg);
freeNode();
- readTree(str, rooted);
+
+ // bug fix 2016-04-14: in case taxon name happens to be ID
+ MTree::readTree(str, rooted);
+
assignLeafNames();
// setAlignment(aln);
setRootNode(params->root);
diff --git a/phylotree.h b/phylotree.h
index 2ed8388..68b33dd 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -264,22 +264,22 @@ const double TP_MAX_EXP_DIFF = 40.0;
#define LM_MAX 10
struct QuartetGroups{
- int numGroups; // number of clusters:
- // 0: not initialized, default -> 1
- // 1: no clusters - any (a,b)|(c,d)
- // 2: 2 clusters - (a,a')|(b,b')
- // 3: 3 clusters - (a,a')|(b,c) [rare]
- // 4: 4 clusters - (a,b)|(c,d)
- int numSeqs; // number of seqs in alignment (should be #A+#B+#C+#D+#X)
- int numQuartSeqs; // number of seqs in analysis (should be #A+#B+#C+#D)
- int numGrpSeqs[5]; // number of seqs in cluster A, B, C, D, and X (exclude)
- int uniqueQuarts; // number of existing unique quartets for this grouping
- string Name[5]; // seqIDs of cluster A
- vector<int> GroupA; // seqIDs of cluster A
- vector<int> GroupB; // seqIDs of cluster B
- vector<int> GroupC; // seqIDs of cluster C
- vector<int> GroupD; // seqIDs of cluster D
- vector<int> GroupX; // seqIDs of cluster X
+ int numGroups; // number of clusters:
+ // 0: not initialized, default -> 1
+ // 1: no clusters - any (a,b)|(c,d)
+ // 2: 2 clusters - (a,a')|(b,b')
+ // 3: 3 clusters - (a,a')|(b,c) [rare]
+ // 4: 4 clusters - (a,b)|(c,d)
+ int numSeqs; // number of seqs in alignment (should be #A+#B+#C+#D+#X)
+ int numQuartSeqs; // number of seqs in analysis (should be #A+#B+#C+#D)
+ int numGrpSeqs[5]; // number of seqs in cluster A, B, C, D, and X (exclude)
+ int64_t uniqueQuarts; // number of existing unique quartets for this grouping
+ string Name[5]; // seqIDs of cluster A
+ vector<int> GroupA; // seqIDs of cluster A
+ vector<int> GroupB; // seqIDs of cluster B
+ vector<int> GroupC; // seqIDs of cluster C
+ vector<int> GroupD; // seqIDs of cluster D
+ vector<int> GroupX; // seqIDs of cluster X
};
struct QuartetInfo {
@@ -292,7 +292,7 @@ struct QuartetInfo {
};
struct SeqQuartetInfo {
- unsigned long countarr[LM_MAX]; // the 7 areas of the simplex triangle [0-6; corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center] and the 3 corners [7-9; 7:top, 8:right, 9:left]
+ int64_t countarr[LM_MAX]; // the 7 areas of the simplex triangle [0-6; corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center] and the 3 corners [7-9; 7:top, 8:right, 9:left]
};
// ********************************************
diff --git a/quartet.cpp b/quartet.cpp
index 2fe668d..eca150d 100644
--- a/quartet.cpp
+++ b/quartet.cpp
@@ -201,7 +201,7 @@ void plotlmpointsvg(FILE *ofp, double w1, double w2)
// void finishsvg(FILE *ofp, unsigned long **countarr)
-void finishsvg(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, unsigned long Numquartets)
+void finishsvg(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, int64_t Numquartets)
{
fprintf(ofp," </g>\n");
/* end triangle 1 (top) */
@@ -550,7 +550,7 @@ void plotlmpointcolor(FILE *epsofp, FILE *svgofp, double w1, double w2, int red,
/* last lines of EPSF likelihood mapping file */
//void finisheps(FILE *ofp, unsigned long **countarr)
-void finisheps(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, unsigned long Numquartets)
+void finisheps(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, int64_t Numquartets)
{
fprintf(ofp, "stroke\n");
fprintf(ofp, "%% second triangle (the one with 3 basins)\n");
@@ -674,8 +674,6 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
if (leafNum < 4)
outError("Tree must have 4 or more taxa with unique sequences!");
- lmap_quartet_info.resize(params->lmap_num_quartets);
-
int qc[] = {0, 1, 2, 3, 0, 2, 1, 3, 0, 3, 1, 2};
double onethird = 1.0/3.0;
@@ -733,21 +731,97 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
size2 = sizeA-3;
size1 = sizeA-2;
size0 = sizeA-1;
- LMGroups.uniqueQuarts = 1 + size3 +
- size2 * (size2-1) / 2 +
- size1 * (size1-1) * (size1-2) / 6 +
- size0 * (size0-1) * (size0-2) * (size0-3) / 24;
+ LMGroups.uniqueQuarts = (int64_t)1 + size3 +
+ (int64_t)size2 * (size2-1) / 2 +
+ (int64_t)size1 * (size1-1) * (size1-2) / 6 +
+ (int64_t)size0 * (size0-1) * (size0-2) * (size0-3) / 24;
break;
case 2:
- LMGroups.uniqueQuarts = (sizeA * (sizeA - 1)) / 2 * (sizeB * (sizeB - 1)) / 2; break;
+ LMGroups.uniqueQuarts = ((int64_t)sizeA * (sizeA - 1)) / 2 * (sizeB * (sizeB - 1)) / 2; break;
case 3:
- LMGroups.uniqueQuarts = sizeA * sizeB * (sizeC * (sizeC - 1)) / 2; break;
+ LMGroups.uniqueQuarts = (int64_t)sizeA * sizeB * (sizeC * (sizeC - 1)) / 2; break;
case 4:
- LMGroups.uniqueQuarts = sizeA * sizeB * sizeC * sizeD; break;
+ LMGroups.uniqueQuarts = (int64_t)sizeA * sizeB * sizeC * sizeD; break;
default:
outError("Unknown Likelihood Mapping mode! PLEASE report this to the developers!");
break;
}
+
+ if (params->lmap_num_quartets == 0)
+ params->lmap_num_quartets = LMGroups.uniqueQuarts;
+ if (params->lmap_num_quartets > LMGroups.uniqueQuarts) {
+ cout << "INFO: Number of quartets is reduced to all unique quartets " << LMGroups.uniqueQuarts << endl;
+ }
+
+ cout << "Computing " << params->lmap_num_quartets << " quartet likelihoods (one dot represents 100 quartets)." << endl << endl;
+
+ lmap_quartet_info.resize(params->lmap_num_quartets);
+
+ bool quartets_drawn = false;
+
+ if (params->lmap_num_quartets == LMGroups.uniqueQuarts) {
+ // draw all unique quartets now
+ quartets_drawn = true;
+ int64_t qid = 0;
+ switch (numGroups) {
+ case 1:
+ for (int i0 = 0; i0 < sizeA-3; i0++)
+ for (int i1 = i0+1; i1 < sizeA-2; i1++)
+ for (int i2 = i1+1; i2 < sizeA-1; i2++)
+ for (int i3 = i2+1; i3 < sizeA; i3++) {
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[i0];
+ lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[i1];
+ lmap_quartet_info[qid].seqID[2] = LMGroups.GroupA[i2];
+ lmap_quartet_info[qid].seqID[3] = LMGroups.GroupA[i3];
+ qid++;
+ }
+ break;
+
+ case 2:
+ for (int i0 = 0; i0 < sizeA-1; i0++)
+ for (int i1 = i0+1; i1 < sizeA; i1++)
+ for (int i2 = 0; i2 < sizeB-1; i2++)
+ for (int i3 = i2+1; i3 < sizeB; i3++) {
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[i0];
+ lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[i1];
+ lmap_quartet_info[qid].seqID[2] = LMGroups.GroupB[i2];
+ lmap_quartet_info[qid].seqID[3] = LMGroups.GroupB[i3];
+ qid++;
+ }
+ break;
+
+ case 3:
+ for (int i0 = 0; i0 < sizeA; i0++)
+ for (int i1 = 0; i1 < sizeB; i1++)
+ for (int i2 = 0; i2 < sizeC-1; i2++)
+ for (int i3 = i2+1; i3 < sizeC; i3++) {
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[i0];
+ lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[i1];
+ lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[i2];
+ lmap_quartet_info[qid].seqID[3] = LMGroups.GroupC[i3];
+ qid++;
+ }
+ break;
+ case 4:
+ for (int i0 = 0; i0 < sizeA; i0++)
+ for (int i1 = 0; i1 < sizeB; i1++)
+ for (int i2 = 0; i2 < sizeC; i2++)
+ for (int i3 = 0; i3 < sizeD; i3++) {
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[i0];
+ lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[i1];
+ lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[i2];
+ lmap_quartet_info[qid].seqID[3] = LMGroups.GroupD[i3];
+ qid++;
+ }
+ break;
+
+ default:
+ break;
+ }
+ // sanity check
+ assert(qid == LMGroups.uniqueQuarts);
+ }
+
// fprintf(stderr,"XXX - #quarts: %d; #groups: %d, A: %d, B:%d, C:%d, D:%d\n", LMGroups.uniqueQuarts, LMGroups.numGroups, sizeA, sizeB, sizeC, sizeD);
@@ -763,57 +837,58 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
#ifdef _OPENMP
#pragma omp for schedule(guided)
#endif
- for (int qid = 0; qid < params->lmap_num_quartets; qid++) { /*** draw lmap_num_quartets quartets randomly ***/
- // fprintf(stderr, "%d\n", qid);
+ for (int64_t qid = 0; qid < params->lmap_num_quartets; qid++) { /*** draw lmap_num_quartets quartets randomly ***/
+ // fprintf(stderr, "%I64d\n", qid);
// uniformly draw 4 taxa
// (a) sample taxon 1
// was: lmap_quartet_info[qid].seqID[0] = random_int(leafNum);
- lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[random_int(sizeA, rstream)];
-
- do {
- // (b) sample taxon 2 according to the number of clusters
- // was: lmap_quartet_info[qid].seqID[1] = random_int(leafNum);
- switch(numGroups){
- case 1: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|a3,a4
- case 2: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|b1,b2
- case 3: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c1,c2
- case 4: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c, d
- default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
- }
- } while (lmap_quartet_info[qid].seqID[1] == lmap_quartet_info[qid].seqID[0]);
- do {
- // (c) sample taxon 3 according to the number of clusters
- // was: lmap_quartet_info[qid].seqID[2] = random_int(leafNum);
- switch(numGroups){
- case 1: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|A3,a4
- case 2: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|B1,b2
- case 3: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C1,c2
- case 4: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C, d
- default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
- }
- } while (lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[1]);
- do {
- // (d) sample taxon 4 according to the number of clusters
- // was: lmap_quartet_info[qid].seqID[3] = random_int(leafNum);
- switch(numGroups){
- case 1: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|a3,A4
- case 2: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|b1,B2
- case 3: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |c1,C2
- case 4: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupD[random_int(sizeD, rstream)]; break; // a ,b |c, D
- default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
- }
- } while (lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[1]
- || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[2]);
-
+ if (!quartets_drawn) {
+ // draw a random quartet
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[random_int(sizeA, rstream)];
+
+ do {
+ // (b) sample taxon 2 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[1] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|a3,a4
+ case 2: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|b1,b2
+ case 3: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c1,c2
+ case 4: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c, d
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[1] == lmap_quartet_info[qid].seqID[0]);
+ do {
+ // (c) sample taxon 3 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[2] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|A3,a4
+ case 2: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|B1,b2
+ case 3: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C1,c2
+ case 4: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C, d
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[1]);
+ do {
+ // (d) sample taxon 4 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[3] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|a3,A4
+ case 2: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|b1,B2
+ case 3: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |c1,C2
+ case 4: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupD[random_int(sizeD, rstream)]; break; // a ,b |c, D
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[1]
+ || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[2]);
+ }
// fprintf(stderr, "qqq%d: %d, %d, %d, %d\n", qid, lmap_quartet_info[qid].seqID[0], lmap_quartet_info[qid].seqID[1], lmap_quartet_info[qid].seqID[2], lmap_quartet_info[qid].seqID[3]);
// *** taxa should not be sorted, because that changes the corners a dot is assigned to - removed HAS ;^)
// obsolete: sort(lmap_quartet_info[qid].seqID, lmap_quartet_info[qid].seqID+4); // why sort them?!? HAS ;^)
-
+
// initialize sub-alignment and sub-tree
Alignment *quartet_aln;
- PhyloTree *quartet_tree;
if (aln->isSuperAlignment()) {
quartet_aln = new SuperAlignment;
} else {
@@ -821,63 +896,75 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
}
IntVector seq_id;
seq_id.insert(seq_id.begin(), lmap_quartet_info[qid].seqID, lmap_quartet_info[qid].seqID+4);
- quartet_aln->extractSubAlignment(aln, seq_id, 0);
- if (isSuperTree()) {
- quartet_tree = new PhyloSuperTree((SuperAlignment*)quartet_aln, (PhyloSuperTree*)this);
+ IntVector kept_partitions;
+ // only keep partitions with at least 3 sequences
+ quartet_aln->extractSubAlignment(aln, seq_id, 0, 3, &kept_partitions);
+
+ if (kept_partitions.size() == 0) {
+ // nothing kept
+ for (int k = 0; k < 3; k++) {
+ lmap_quartet_info[qid].logl[k] = -1.0;
+ }
} else {
- quartet_tree = new PhyloTree(quartet_aln);
- }
+ // something partition kept, do computations
+ PhyloTree *quartet_tree;
+ if (isSuperTree()) {
+ quartet_tree = new PhyloSuperTree((SuperAlignment*)quartet_aln, (PhyloSuperTree*)this);
+ } else {
+ quartet_tree = new PhyloTree(quartet_aln);
+ }
- // set up parameters
- quartet_tree->setParams(params);
- quartet_tree->optimize_by_newton = params->optimize_by_newton;
- quartet_tree->setLikelihoodKernel(params->SSE);
-
- // set up partition model
- if (isSuperTree()) {
- PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
- PhyloSuperTree *super_tree = (PhyloSuperTree*)this;
- for (int i = 0; i < super_tree->size(); i++) {
- quartet_super_tree->at(i)->setModelFactory(super_tree->at(i)->getModelFactory());
- quartet_super_tree->at(i)->setModel(super_tree->at(i)->getModel());
- quartet_super_tree->at(i)->setRate(super_tree->at(i)->getRate());
+ // set up parameters
+ quartet_tree->setParams(params);
+ quartet_tree->optimize_by_newton = params->optimize_by_newton;
+ quartet_tree->setLikelihoodKernel(params->SSE);
+
+ // set up partition model
+ if (isSuperTree()) {
+ PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
+ PhyloSuperTree *super_tree = (PhyloSuperTree*)this;
+ for (int i = 0; i < quartet_super_tree->size(); i++) {
+ quartet_super_tree->at(i)->setModelFactory(super_tree->at(kept_partitions[i])->getModelFactory());
+ quartet_super_tree->at(i)->setModel(super_tree->at(kept_partitions[i])->getModel());
+ quartet_super_tree->at(i)->setRate(super_tree->at(kept_partitions[i])->getRate());
+ }
}
- }
-
- // set model and rate
- quartet_tree->setModelFactory(model_factory);
- quartet_tree->setModel(getModel());
- quartet_tree->setRate(getRate());
- // NOTE: we don't need to set phylo_tree in model and rate because parameters are not reoptimized
-
-
-
- // loop over 3 quartets to compute likelihood
- for (int k = 0; k < 3; k++) {
- string quartet_tree_str;
- quartet_tree_str = "(" + quartet_aln->getSeqName(qc[k*4]) + "," + quartet_aln->getSeqName(qc[k*4+1]) + ",(" +
- quartet_aln->getSeqName(qc[k*4+2]) + "," + quartet_aln->getSeqName(qc[k*4+3]) + "));";
- quartet_tree->readTreeStringSeqName(quartet_tree_str);
- quartet_tree->initializeAllPartialLh();
- quartet_tree->wrapperFixNegativeBranch(true);
- // optimize branch lengths with logl_epsilon=0.1 accuracy
- lmap_quartet_info[qid].logl[k] = quartet_tree->optimizeAllBranches(10, 0.1);
- }
- // reset model & rate so that they are not deleted
- quartet_tree->setModel(NULL);
- quartet_tree->setModelFactory(NULL);
- quartet_tree->setRate(NULL);
-
- if (isSuperTree()) {
- PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
- for (int i = 0; i < quartet_super_tree->size(); i++) {
- quartet_super_tree->at(i)->setModelFactory(NULL);
- quartet_super_tree->at(i)->setModel(NULL);
- quartet_super_tree->at(i)->setRate(NULL);
+
+ // set model and rate
+ quartet_tree->setModelFactory(model_factory);
+ quartet_tree->setModel(getModel());
+ quartet_tree->setRate(getRate());
+ // NOTE: we don't need to set phylo_tree in model and rate because parameters are not reoptimized
+
+
+
+ // loop over 3 quartets to compute likelihood
+ for (int k = 0; k < 3; k++) {
+ string quartet_tree_str;
+ quartet_tree_str = "(" + quartet_aln->getSeqName(qc[k*4]) + "," + quartet_aln->getSeqName(qc[k*4+1]) + ",(" +
+ quartet_aln->getSeqName(qc[k*4+2]) + "," + quartet_aln->getSeqName(qc[k*4+3]) + "));";
+ quartet_tree->readTreeStringSeqName(quartet_tree_str);
+ quartet_tree->initializeAllPartialLh();
+ quartet_tree->wrapperFixNegativeBranch(true);
+ // optimize branch lengths with logl_epsilon=0.1 accuracy
+ lmap_quartet_info[qid].logl[k] = quartet_tree->optimizeAllBranches(10, 0.1);
}
+ // reset model & rate so that they are not deleted
+ quartet_tree->setModel(NULL);
+ quartet_tree->setModelFactory(NULL);
+ quartet_tree->setRate(NULL);
+
+ if (isSuperTree()) {
+ PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
+ for (int i = 0; i < quartet_super_tree->size(); i++) {
+ quartet_super_tree->at(i)->setModelFactory(NULL);
+ quartet_super_tree->at(i)->setModel(NULL);
+ quartet_super_tree->at(i)->setRate(NULL);
+ }
+ }
+ delete quartet_tree;
}
-
- delete quartet_tree;
+
delete quartet_aln;
// determine likelihood order
@@ -1036,7 +1123,22 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
lmap_quartet_info[qid].area=6; // LM_REG7 - center
}
+ {
+ int64_t count = (qid+1);
+ if ((count % 100) == 0) {
+ cout << '.';
+ if ((count % 1000) == 0) { // separator after 10 dots
+ cout << ' ';
+ if ((count % 5000) == 0) // new-line after 50 dots
+ cout << " : " << count << endl;
+ }
+ cout.flush();
+ }
+ }
} /*** end draw lmap_num_quartets quartets randomly ***/
+ if ((params->lmap_num_quartets % 5000) != 0) {
+ cout << ". : " << params->lmap_num_quartets << flush << endl << endl;
+ } else cout << endl;
#ifdef _OPENMP
finish_random(rstream);
@@ -1048,6 +1150,79 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
//**************************************
+/**
+ read groups in following format "(A, B, C, D), (E, F), (G, H), (I);"
+**/
+void readGroupNewick(char *filename, MSetsBlock *sets_block) {
+ try {
+ ifstream in;
+ in.exceptions(ios::failbit | ios::badbit);
+ in.open(filename);
+ in.exceptions(ios::badbit);
+ char ch;
+ string name;
+ TaxaSetNameVector *sets = sets_block->getSets();
+ while (!in.eof()) {
+ // read the cluster
+ TaxaSetName *myset = new TaxaSetName;
+ sets->push_back(myset);
+ in >> ch;
+ if (ch != '(')
+ throw "Cluster does not start with '('";
+ // read taxon name
+ do {
+ in >> ch;
+ name = "";
+ do {
+ name += ch;
+ ch = in.get();
+ if (controlchar(ch)) {
+ in >> ch;
+ break;
+ }
+ } while (ch != ',' && ch != ')');
+ myset->taxlist.push_back(name);
+ if (ch == ',') continue; // continue to read next taxon name
+ if (ch == ')') break;
+ throw "No ',' or ')' found after " + name;
+ } while (ch != ')');
+ in >> ch;
+ if (isalnum(ch)) {
+ // read cluster name
+ name = "";
+ do {
+ name += ch;
+ ch = in.get();
+ if (controlchar(ch)) {
+ in >> ch;
+ break;
+ }
+ } while (ch != ',' && ch != ';');
+ myset->name = name;
+ } else {
+ myset->name = "Cluster" + convertIntToString(sets->size());
+ }
+ // check for duplicated name
+ for (TaxaSetNameVector::iterator it = sets->begin(); it != sets->end()-1; it++)
+ if ((*it)->name == myset->name)
+ throw "Duplicated cluster name " + myset->name;
+ if (ch == ',') continue; // continue to read next cluster
+ if (ch == ';') break;
+ throw "No ',' or ';' found after " + name + ")";
+ }
+
+ in.clear();
+ // set the failbit again
+ in.exceptions(ios::failbit | ios::badbit);
+ in.close();
+ } catch (ios::failure) {
+ outError(ERR_READ_INPUT);
+ } catch (const char* str) {
+ outError(str);
+ } catch (string &str) {
+ outError(str);
+ }
+}
void PhyloTree::readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGroups) {
@@ -1056,17 +1231,26 @@ void PhyloTree::readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGro
MSetsBlock *lmclusters;
lmclusters = new MSetsBlock();
cout << endl << "Reading likelihood mapping cluster file " << filename << "..." << endl;
- cout << "(The leading numbers represent the order from the master alignment.)" << endl << endl;
- MyReader nexus(filename);
-
- nexus.Add(lmclusters);
-
- MyToken token(nexus.inf);
- nexus.Execute(token);
+ InputType intype = detectInputFile(filename);
+
+ if (intype == IN_NEXUS) {
+ MyReader nexus(filename);
+ nexus.Add(lmclusters);
+ MyToken token(nexus.inf);
+ nexus.Execute(token);
+ } else if (intype == IN_NEWICK) {
+ readGroupNewick(filename, lmclusters);
+ } else
+ outError("Unknown cluster file format, please use NEXUS or RAxML-style format");
+
+ if (lmclusters->getNSets() == 0)
+ outError("No cluster found");
// lmclusters->Report(cout);
+ cout << "(The leading numbers represent the order from the master alignment.)" << endl << endl;
+
TaxaSetNameVector *allsets = lmclusters->getSets();
numsets = allsets->size();
@@ -1148,6 +1332,7 @@ void PhyloTree::readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGro
}
LMGroups.numGroups = n;
+ delete lmclusters;
} // end PhyloTree::readLikelihoodMappingGroups
@@ -1159,16 +1344,27 @@ void PhyloTree::doLikelihoodMapping() {
// vector<SeqQuartetInfo> lmap_seq_quartet_info;
// int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
// int cornercount[4] = {0, 0, 0, 0};
- int resolved, partly, unresolved;
- int qid;
+ int64_t resolved, partly, unresolved;
+ int64_t qid;
ofstream out;
string filename;
-
+
if(params->lmap_cluster_file != NULL) {
// cout << "YYY: test reading" << params->lmap_cluster_file << endl;
- readLikelihoodMappingGroups(params->lmap_cluster_file, LMGroups);
+ readLikelihoodMappingGroups(params->lmap_cluster_file, LMGroups);
} else {
- LMGroups.numGroups = 0; /* no clusterfile -> un-initialized */
+ LMGroups.numGroups = 0; /* no clusterfile -> un-initialized */
+ int64_t recommended_quartets;
+ if (aln->getNSeq() > 10) {
+ recommended_quartets = (int64_t)25*aln->getNSeq();
+ } else {
+ int64_t n = aln->getNSeq();
+ recommended_quartets = n*(n-1)*(n-2)*(n-3)/24;
+ }
+ if (params->lmap_num_quartets > 0 && params->lmap_num_quartets < recommended_quartets) {
+ outWarning("Number of quartets is recommended to be at least " + convertInt64ToString(recommended_quartets) + " s.t. each sequence is sampled sufficiently");
+ }
+
}
areacount[0] = 0;
@@ -1198,8 +1394,6 @@ void PhyloTree::doLikelihoodMapping() {
lmap_seq_quartet_info[qid].countarr[9] = 0;
}
- cout << "Computing quartet likelihoods..." << endl << endl;
-
computeQuartetLikelihoods(lmap_quartet_info, LMGroups);
for (qid = 0; qid < params->lmap_num_quartets; qid++) {
@@ -1254,11 +1448,12 @@ void PhyloTree::doLikelihoodMapping() {
if (params->print_lmap_quartet_lh) {
// print quartet file
+ out << "SeqIDs\tlh1\tlh2\tlh3\tweight1\tweight2\tweight3" << endl;
for (qid = 0; qid < params->lmap_num_quartets; qid++) {
- out << "(" << lmap_quartet_info[qid].seqID[0] << ","
- << lmap_quartet_info[qid].seqID[1] << ","
- << lmap_quartet_info[qid].seqID[2] << ","
- << lmap_quartet_info[qid].seqID[3] << ")"
+ out << "(" << lmap_quartet_info[qid].seqID[0]+1 << ","
+ << lmap_quartet_info[qid].seqID[1]+1 << ","
+ << lmap_quartet_info[qid].seqID[2]+1 << ","
+ << lmap_quartet_info[qid].seqID[3]+1 << ")"
<< "\t" << lmap_quartet_info[qid].logl[0]
<< "\t" << lmap_quartet_info[qid].logl[1]
<< "\t" << lmap_quartet_info[qid].logl[2]
@@ -1267,128 +1462,15 @@ void PhyloTree::doLikelihoodMapping() {
<< "\t" << lmap_quartet_info[qid].qweight[2] << endl;
}
- PhyloTree::reportLikelihoodMapping(out);
-
- /**** begin of report output ****/
- /**** moved to PhyloTree::reportLikelihoodMapping ****/
-#if 0
- // LM_REG1 0 /* top corner */
- // LM_REG2 1 /* bottom-right corner */
- // LM_REG3 2 /* bottom-left corner */
- // LM_REG4 3 /* right rectangle */
- // LM_REG5 4 /* bottom rectangle */
- // LM_REG6 5 /* left rectangle */
- // LM_REG7 6 /* center */
- // LM_AR1 7 /* top third */
- // LM_AR2 8 /* bottom-right third */
- // LM_AR3 9 /* bottom-left third */
-
- out << "LIKELIHOOD MAPPING ANALYSIS" << endl << endl;
- out << "Number of quartets: " << params->lmap_num_quartets << "(random choice)" << endl << endl;
- out << "Quartet trees are based on the selected model of substitution." << endl << endl;
- out << "Sequences are not grouped in clusters." << endl;
-
- out << endl << endl;
- out << "LIKELIHOOD MAPPING STATISTICS" << endl << endl;
-
- out << " (a,b)-(c,d) (a,b)-(c,d) " << endl;
- out << " /\\ /\\ " << endl;
- out << " / \\ / \\ " << endl;
- out << " / \\ / 1 \\ " << endl;
- out << " / a1 \\ / \\ / \\ " << endl;
- out << " /\\ /\\ / \\/ \\ " << endl;
- out << " / \\ / \\ / /\\ \\ " << endl;
- out << " / \\ / \\ / 6 / \\ 4 \\ " << endl;
- out << " / \\/ \\ /\\ / 7 \\ /\\ " << endl;
- out << " / | \\ / \\ /______\\ / \\ " << endl;
- out << " / a3 | a2 \\ / 3 | 5 | 2 \\ " << endl;
- out << " /__________|_________\\ /_____|________|_____\\ " << endl;
- out << "(a,d)-(b,c) (a,c)-(b,d) (a,b)-(c,d) (a,c)-(b,d) " << endl << endl;
- out << "For more information about likelihood-mapping refer to" << endl;
- out << " Strimmer and von Haeseler (1997) PNAS 94:6815-6819" << endl;
- out << " http://www.ncbi.nlm.nih.gov/pubmed/9192648" << endl;
- out << "and/or" << endl;
- out << " Schmidt and von Haeseler (2003) Current Protocols in Bioinformatics" << endl;
- out << " (by Baxevanis et al., Eds.), Unit 6, Wiley&Sons, New York." << endl;
- out << " http://dx.doi.org/10.1002/0471250953.bi0606s17" << endl;
-
-
- out << endl << endl;
- out << "Quartet support of regions a1, a2, a3:" << endl << endl;
- out << " #quartets a1 (% a1) a2 (% a2) a3 (% a3) name" << endl;
- for (qid = 0; qid < leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
-
- out.setf(ios::fixed, ios::floatfield); // set fixed floating format
- out.precision(2);
- out << setw(4) << qid+1
- << setw(9) << sumq
- << setw(7) << lmap_seq_quartet_info[qid].countarr[7]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[7]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[8]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[8]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[9]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[9]/sumq << ") "
- << PhyloTree::aln->getSeqName(qid) << endl;
- }
-
- out << endl << endl << "Quartet support of areas 1-7:" << endl << endl;
- out << " resolved partly unresolved name" << endl;
- out << " #quartets 1 (% 1) 2 (% 2) 3 (% 3) 4 (% 4) 5 (% 5) 6 (% 6) 7 (% 7)" << endl;
- for (qid = 0; qid < leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
-
- out.setf(ios::fixed, ios::floatfield); // set fixed floating format
- out.precision(2);
- out << setw(4) << qid+1
- << setw(9) << sumq
- << setw(7) << lmap_seq_quartet_info[qid].countarr[0]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[0]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[1]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[1]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[2]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[2]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[3]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[3]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[4]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[4]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[5]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[5]/sumq << ") "
- << setw(7) << lmap_seq_quartet_info[qid].countarr[6]
- << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[6]/sumq << ") "
- << PhyloTree::aln->getSeqName(qid) << endl;
- }
+// PhyloTree::reportLikelihoodMapping(out);
- out << endl << endl << "Quartet resolution per sequence:" << endl << endl;
- out << " #quartets resolved partly unresolved name" << endl;
- for (qid = 0; qid < leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long resolved = lmap_seq_quartet_info[qid].countarr[LM_REG1] + lmap_seq_quartet_info[qid].countarr[LM_REG2] + lmap_seq_quartet_info[qid].countarr[LM_REG3];
- unsigned long partly = lmap_seq_quartet_info[qid].countarr[LM_REG4] + lmap_seq_quartet_info[qid].countarr[LM_REG5] + lmap_seq_quartet_info[qid].countarr[LM_REG6];
- unsigned long unres = lmap_seq_quartet_info[qid].countarr[LM_REG7];
- unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
-
- out.setf(ios::fixed, ios::floatfield); // set fixed floating format
- out.precision(2);
- out << setw(4) << qid+1
- << setw(9) << sumq
- << setw(7) << resolved
- << " (" << setw(6) << (double) 100.0*resolved/sumq << ") "
- << setw(7) << partly
- << " (" << setw(6) << (double) 100.0*partly/sumq << ") "
- << setw(7) << unres
- << " (" << setw(6) << (double) 100.0*unres/sumq << ") "
- << PhyloTree::aln->getSeqName(qid) << endl;
- }
-#endif
}
resolved = areacount[0] + areacount[1] + areacount[2];
partly = areacount[3] + areacount[4] + areacount[5];
unresolved = areacount[6];
+#if 0 // for debugging purposes only (HAS)
out << endl << "LIKELIHOOD MAPPING SUMMARY" << endl << endl;
out << "Number of quartets: " << (resolved+partly+unresolved)
<< " (randomly drawn with replacement)" << endl << endl;
@@ -1400,13 +1482,8 @@ void PhyloTree::doLikelihoodMapping() {
out << "Number of unresolved quartets: " << unresolved
<< " (" << 100.0 * unresolved/(resolved+partly+unresolved) << "%)" << endl << endl;
-#if 0
#endif
- /**** end of report output ****/
- /**** moved to PhyloTree::reportLikelihoodMapping ****/
-
-
if (params->print_lmap_quartet_lh) {
// print quartet file
out.close();
@@ -1421,25 +1498,14 @@ void PhyloTree::doLikelihoodMapping() {
fclose(epsout);
cout << "likelihood mapping plot (EPS) written to " << lmap_epsfilename << endl;
-
-// cout << "\nOverall quartet resolution: (from " << (resolved+partly+unresolved) << " randomly drawn quartets)" << endl;
-// cout << "Fully resolved quartets: " << resolved << " (= "
-// << (double) resolved * 100.0 / (resolved+partly+unresolved) << "%)" << endl;
-// cout << "Partly resolved quartets: " << partly << " (= "
-// << (double) partly * 100.0 / (resolved+partly+unresolved) << "%)" << endl;
-// cout << "Unresolved quartets: " << unresolved << " (= "
-// << (double) unresolved * 100.0 / (resolved+partly+unresolved) << "%)" << endl << endl;
-
} // end PhyloTree::doLikelihoodMapping
void PhyloTree::reportLikelihoodMapping(ofstream &out) {
- // int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
- // int cornercount[4] = {0, 0, 0, 0};
- int resolved, partly, unresolved;
- int qid;
+ int64_t resolved, partly, unresolved;
+ int64_t qid;
int leafNum;
leafNum = PhyloTree::aln->getNSeq();
// vector<QuartetInfo> lmap_quartet_info;
@@ -1456,23 +1522,16 @@ void PhyloTree::reportLikelihoodMapping(ofstream &out) {
// LM_AR1 7 /* top third */
// LM_AR2 8 /* bottom-right third */
// LM_AR3 9 /* bottom-left third */
-#if 0
-#define LM_REG1 0 /* top corner */
-#define LM_REG2 1 /* bottom-right corner */
-#define LM_REG3 2 /* bottom-left corner */
-#define LM_REG4 3 /* right rectangle */
-#define LM_REG5 4 /* bottom rectangle */
-#define LM_REG6 5 /* left rectangle */
-#define LM_REG7 6 /* center */
-#define LM_AR1 7 /* top third */
-#define LM_AR2 8 /* bottom-right third */
-#define LM_AR3 9 /* bottom-left third */
-#endif
out << "LIKELIHOOD MAPPING ANALYSIS" << endl;
out << "---------------------------" << endl << endl;
- out << "Number of quartets: " << params->lmap_num_quartets << " (randomly chosen from "
+ out << "Number of quartets: " << params->lmap_num_quartets;
+ if (params->lmap_num_quartets < LMGroups.uniqueQuarts)
+ cout << " (randomly chosen with replacement from "
<< LMGroups.uniqueQuarts << " existing unique quartets)" << endl << endl;
+ else
+ out << " (all unique quartets)" << endl << endl;
+
out << "Quartet trees are based on the selected model of substitution." << endl << endl;
if(LMGroups.numGroups == 1) {
@@ -1613,8 +1672,7 @@ void PhyloTree::reportLikelihoodMapping(ofstream &out) {
out << " #quartets a1 (% a1) a2 (% a2) a3 (% a3) name" << endl;
out << "-----------------------------------------------------------------------------" << endl;
for (qid = 0; qid <= leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ int64_t sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
if (sumq>0) sumq0=sumq;
else sumq0=1;
@@ -1650,8 +1708,7 @@ void PhyloTree::reportLikelihoodMapping(ofstream &out) {
out << " #quartets 1 (% 1) 2 (% 2) 3 (% 3) 4 (% 4) 5 (% 5) 6 (% 6) 7 (% 7)" << endl;
out << "------------------------------------------------------------------------------------------------------------------------------------------------" << endl;
for (qid = 0; qid <= leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ int64_t sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
if (sumq>0) sumq0=sumq;
else sumq0=1;
@@ -1703,11 +1760,10 @@ void PhyloTree::reportLikelihoodMapping(ofstream &out) {
out << " #quartets resolved partly unresolved name" << endl;
out << "-----------------------------------------------------------------------------" << endl;
for (qid = 0; qid <= leafNum; qid++) {
- //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
- unsigned long resolved = lmap_seq_quartet_info[qid].countarr[LM_REG1] + lmap_seq_quartet_info[qid].countarr[LM_REG2] + lmap_seq_quartet_info[qid].countarr[LM_REG3];
- unsigned long partly = lmap_seq_quartet_info[qid].countarr[LM_REG4] + lmap_seq_quartet_info[qid].countarr[LM_REG5] + lmap_seq_quartet_info[qid].countarr[LM_REG6];
- unsigned long unres = lmap_seq_quartet_info[qid].countarr[LM_REG7];
- unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ int64_t resolved = lmap_seq_quartet_info[qid].countarr[LM_REG1] + lmap_seq_quartet_info[qid].countarr[LM_REG2] + lmap_seq_quartet_info[qid].countarr[LM_REG3];
+ int64_t partly = lmap_seq_quartet_info[qid].countarr[LM_REG4] + lmap_seq_quartet_info[qid].countarr[LM_REG5] + lmap_seq_quartet_info[qid].countarr[LM_REG6];
+ int64_t unres = lmap_seq_quartet_info[qid].countarr[LM_REG7];
+ int64_t sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
if (sumq>0) sumq0=sumq;
else sumq0=1;
@@ -1754,3 +1810,4 @@ void PhyloTree::reportLikelihoodMapping(ofstream &out) {
<< " (=" << 100.0 * unresolved/(resolved+partly+unresolved) << "%)" << endl << endl;
} // end PhyloTree::reportLikelihoodMapping
+
diff --git a/superalignment.cpp b/superalignment.cpp
index 6d71ca0..7311455 100644
--- a/superalignment.cpp
+++ b/superalignment.cpp
@@ -101,10 +101,11 @@ void SuperAlignment::linkSubAlignment(int part) {
}
}
-void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char) {
+void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa, IntVector *kept_partitions) {
assert(aln->isSuperAlignment());
SuperAlignment *saln = (SuperAlignment*)aln;
+ int i;
IntVector::iterator it;
for (it = seq_id.begin(); it != seq_id.end(); it++) {
assert(*it >= 0 && *it < aln->getNSeq());
@@ -115,24 +116,33 @@ void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int
//Alignment::extractSubAlignment(aln, seq_id, 0);
taxa_index.resize(getNSeq());
- for (int i = 0; i < getNSeq(); i++)
+ for (i = 0; i < getNSeq(); i++)
taxa_index[i].resize(saln->partitions.size(), -1);
int part = 0;
- partitions.resize(saln->partitions.size());
+// partitions.resize(saln->partitions.size());
+ partitions.resize(0);
for (vector<Alignment*>::iterator ait = saln->partitions.begin(); ait != saln->partitions.end(); ait++, part++) {
IntVector sub_seq_id;
for (IntVector::iterator it = seq_id.begin(); it != seq_id.end(); it++)
if (saln->taxa_index[*it][part] >= 0)
sub_seq_id.push_back(saln->taxa_index[*it][part]);
+ if (sub_seq_id.size() < min_taxa)
+ continue;
Alignment *subaln = new Alignment;
subaln->extractSubAlignment(*ait, sub_seq_id, 0);
- partitions[part] = subaln;
- linkSubAlignment(part);
+ partitions.push_back(subaln);
+ linkSubAlignment(partitions.size()-1);
+ if (kept_partitions) kept_partitions->push_back(part);
// cout << subaln->getNSeq() << endl;
// subaln->printPhylip(cout);
}
+ if (partitions.size() < saln->partitions.size()) {
+ for (i = 0; i < getNSeq(); i++)
+ taxa_index[i].resize(partitions.size());
+ }
+
// now build the patterns based on taxa_index
buildPattern();
}
diff --git a/superalignment.h b/superalignment.h
index c3552ad..583dd0a 100644
--- a/superalignment.h
+++ b/superalignment.h
@@ -108,8 +108,10 @@ public:
@param aln original input alignment
@param seq_id ID of sequences to extract from
@param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
+ @param min_taxa only keep alignment that has >= min_taxa sequences
+ @param[out] kept_partitions (for SuperAlignment) indices of kept partitions
*/
- virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char);
+ virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL);
/**
* remove identical sequences from alignment
diff --git a/tools.cpp b/tools.cpp
index fc01036..85ca06c 100644
--- a/tools.cpp
+++ b/tools.cpp
@@ -263,6 +263,35 @@ void convert_int_vec(const char *str, IntVector &vec) throw (string) {
}
+int64_t convert_int64(const char *str) throw (string) {
+ char *endptr;
+ int64_t i = (int64_t)strtoll(str, &endptr, 10); // casted because 'long long' may be larger than int64_t
+
+ if ((i == 0 && endptr == str) || abs(i) == HUGE_VALL || *endptr != 0) {
+ string err = "Expecting large integer , but found \"";
+ err += str;
+ err += "\" instead";
+ throw err;
+ }
+
+ return i;
+}
+
+int64_t convert_int64(const char *str, int &end_pos) throw (string) {
+ char *endptr;
+ int64_t i = (int64_t)strtoll(str, &endptr, 10); // casted because 'long long' may be larger than int64_t
+
+ if ((i == 0 && endptr == str) || abs(i) == HUGE_VALL) {
+ string err = "Expecting large integer, but found \"";
+ err += str;
+ err += "\" instead";
+ throw err;
+ }
+ end_pos = endptr - str;
+ return i;
+}
+
+
double convert_double(const char *str) throw (string) {
char *endptr;
double d = strtod(str, &endptr);
@@ -613,6 +642,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.user_file = NULL;
params.fai = false;
params.testAlpha = false;
+ params.test_param = false;
params.testAlphaEpsAdaptive = false;
params.randomAlpha = false;
params.testAlphaEps = 0.1;
@@ -882,7 +912,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.freq_const_patterns = NULL;
params.no_rescale_gamma_invar = false;
params.compute_seq_identity_along_tree = false;
- params.lmap_num_quartets = 0;
+ params.lmap_num_quartets = -1;
params.lmap_cluster_file = NULL;
params.print_lmap_quartet_lh = false;
params.link_alpha = false;
@@ -2520,6 +2550,12 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.testAlpha = true;
continue;
}
+
+ if (strcmp(argv[cnt], "-test_param") == 0 || strcmp(argv[cnt], "--opt-gamma-inv") == 0) {
+ params.test_param = true;
+ continue;
+ }
+
if (strcmp(argv[cnt], "--adaptive-eps") == 0) {
params.testAlphaEpsAdaptive = true;
continue;
@@ -2830,9 +2866,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
cnt++;
if (cnt >= argc)
throw "Use -lmap <likelihood_mapping_num_quartets>";
- params.lmap_num_quartets = convert_int(argv[cnt]);
- if (params.lmap_num_quartets < 1)
- throw "Number of quartets must be >= 1";
+ if (strcmp(argv[cnt],"ALL") == 0) {
+ params.lmap_num_quartets = 0;
+ } else {
+ params.lmap_num_quartets = convert_int64(argv[cnt]);
+ if (params.lmap_num_quartets < 0)
+ throw "Number of quartets must be >= 1";
+ }
continue;
}
@@ -2844,6 +2884,8 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
// '-keep_ident' is currently required to allow a 1-to-1 mapping of the
// user-given groups (HAS) - possibly obsolete in the future versions
params.ignore_identical_seqs = false;
+ if (params.lmap_num_quartets < 0)
+ params.lmap_num_quartets = 0;
continue;
}
@@ -3121,7 +3163,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " number of categories (default: n=4)" << endl
<< " -a <Gamma_shape> Gamma shape parameter for site rates (default: estimate)" << endl
<< " -gmedian Median approximation for +G site rates (default: mean)" << endl
- << " --test-alpha More thorough estimation for +I+G model parameters" << endl
+ << " --opt-gamma-inv More thorough estimation for +I+G model parameters" << endl
<< " -i <p_invar> Proportion of invariable sites (default: estimate)" << endl
<< " -mh Computing site-specific rates to .mhrate file using" << endl
<< " Meyer & von Haeseler (2003) method" << endl
diff --git a/tools.h b/tools.h
index ef24f80..21a2462 100644
--- a/tools.h
+++ b/tools.h
@@ -439,6 +439,12 @@ public:
bool testAlpha;
/**
+ * Restart the optimization of alpha and pinvar from different starting
+ * pinv values (supercedes the option testAlpha
+ */
+ bool test_param;
+
+ /**
* Automatic adjust the log-likelihood espilon using some heuristic
*/
bool testAlphaEpsAdaptive;
@@ -1685,7 +1691,7 @@ public:
/** true to ignore checkpoint file */
bool ignore_checkpoint;
/** number of quartets for likelihood mapping */
- int lmap_num_quartets;
+ int64_t lmap_num_quartets;
/**
file containing the cluster information for clustered likelihood mapping
@@ -1903,6 +1909,21 @@ int convert_int(const char *str, int &end_pos) throw (string);
void convert_int_vec(const char *str, IntVector &vec) throw (string);
/**
+ convert string to int64_t, with error checking
+ @param str original string
+ @return the number
+ */
+int64_t convert_int64(const char *str) throw (string);
+
+/**
+ convert string to int64_t, with error checking
+ @param str original string
+ @param end_pos end position
+ @return the number
+ */
+int64_t convert_int64(const char *str, int64_t &end_pos) throw (string);
+
+/**
convert string to double, with error checking
@param str original string
@return the double
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git
More information about the debian-med-commit
mailing list