[med-svn] [iqtree] 01/04: New upstream version 1.4.4+dfsg
Andreas Tille
tille at debian.org
Fri Sep 9 09:31:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository iqtree.
commit cda5433cc2e2ed7163cb5c4435d18521be942f36
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 9 11:05:29 2016 +0200
New upstream version 1.4.4+dfsg
---
CMakeLists.txt | 11 +-
alignment.cpp | 209 ++++++++++++++----
alignment.h | 23 ++
checkpoint.cpp | 42 +++-
checkpoint.h | 18 ++
gzstream.cpp | 5 +-
gzstream.h | 2 +
iqtree.cpp | 10 +-
iqtree.h | 4 +-
model/modelcodon.cpp | 30 +++
model/modelfactory.cpp | 191 ++++++-----------
model/modelfactory.h | 27 ++-
model/modelgtr.cpp | 84 ++++----
model/modelgtr.h | 14 +-
model/modelmixture.cpp | 86 ++++++--
model/modelmixture.h | 11 +
model/modelprotein.cpp | 38 +++-
model/modelprotein.h | 10 +
model/modelset.cpp | 4 +-
model/modelset.h | 13 +-
model/modelsubst.h | 8 +
model/partitionmodel.cpp | 4 +-
model/partitionmodel.h | 11 +-
model/ratemeyerhaeseler.cpp | 6 +-
modelsblock.h | 11 +-
mpdablock.cpp | 2 +-
msplitsblock.cpp | 2 +-
mtreeset.cpp | 4 +-
ncl/nxscharactersblock.cpp | 1 +
ncl/nxsdistancesblock.cpp | 1 +
ncl/nxstaxablock.cpp | 1 +
pattern.cpp | 10 +-
pattern.h | 22 +-
pda.cpp | 94 ++++++---
phyloanalysis.cpp | 336 ++++++++++++++++++++---------
phylokernel.h | 4 +
phylokernelsitemodel.cpp | 230 ++++++++++----------
phylokernelsitemodel.h | 6 +
phylosupertree.cpp | 24 ++-
phylosupertree.h | 12 ++
phylosupertreeplen.cpp | 24 ++-
phylosupertreeplen.h | 12 +-
phylotesting.cpp | 505 ++++++++++++++++++++++++++++++++++++++------
phylotesting.h | 9 +-
phylotree.cpp | 122 ++++++++---
phylotree.h | 20 +-
pll/pll.h | 2 +-
pruning.h | 2 +-
tinatree.cpp | 2 +-
tools.cpp | 131 +++++++++---
tools.h | 63 +++++-
51 files changed, 1837 insertions(+), 676 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 497a66f..73688ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@ add_definitions(-DIQ_TREE)
# The version number.
set (iqtree_VERSION_MAJOR 1)
set (iqtree_VERSION_MINOR 4)
-set (iqtree_VERSION_PATCH "3")
+set (iqtree_VERSION_PATCH "4")
set(BUILD_SHARED_LIBS OFF)
@@ -270,12 +270,10 @@ elseif (IQTREE_FLAGS MATCHES "avx") # AVX instruction set
message("Vectorization : AVX")
add_definitions(-D__SSE3 -D__AVX) # define both SSE3 and AVX directive
set(COMBINED_FLAGS "${COMBINED_FLAGS} ${AVX_FLAGS}")
-
SET(EXE_SUFFIX "${EXE_SUFFIX}-avx")
else() #SSE intruction set
message("Vectorization : SSE3")
add_definitions(-D__SSE3)
-
endif()
@@ -581,7 +579,12 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
endif()
endif()
-set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
+if (BINARY32)
+ set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
+else()
+ set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
+endif()
+
#if (NOT IQTREE_FLAGS MATCHES "omp" AND NOT IQTREE_FLAGS MATCHES "avx" AND NOT IQTREE_FLAGS MATCHES "fma")
# set (PROJECT_NAME_SUFFIX "${PROJECT_NAME_SUFFIX}-sse")
#endif()
diff --git a/alignment.cpp b/alignment.cpp
index 25214c2..dcab0ff 100644
--- a/alignment.cpp
+++ b/alignment.cpp
@@ -56,6 +56,7 @@ Alignment::Alignment()
{
num_states = 0;
frac_const_sites = 0.0;
+ frac_invariant_sites = 0.0;
// codon_table = NULL;
genetic_code = NULL;
// non_stop_codon = NULL;
@@ -328,6 +329,7 @@ void Alignment::checkGappySeq(bool force_error) {
Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : vector<Pattern>() {
num_states = 0;
frac_const_sites = 0.0;
+ frac_invariant_sites = 0.0;
// codon_table = NULL;
genetic_code = NULL;
// non_stop_codon = NULL;
@@ -610,8 +612,9 @@ void Alignment::extractDataBlock(NxsCharactersBlock *data_block) {
determine if the pattern is constant. update the is_const variable.
*/
void Alignment::computeConst(Pattern &pat) {
- pat.is_const = false;
- pat.is_informative = false;
+ bool is_const = true;
+ bool is_invariant = false;
+ bool is_informative = false;
// critical fix: const_char was set wrongly to num_states in some data type (binary, codon),
// causing wrong log-likelihood computation for +I or +I+G model
if (STATE_UNKNOWN == num_states)
@@ -625,8 +628,8 @@ void Alignment::computeConst(Pattern &pat) {
state_app[j] = 1;
// number of appearance for each state, to compute is_informative
- int *num_app = new int[num_states];
- memset(num_app, 0, num_states*sizeof(int));
+ size_t *num_app = new size_t[num_states];
+ memset(num_app, 0, num_states*sizeof(size_t));
for (Pattern::iterator i = pat.begin(); i != pat.end(); i++) {
StateBitset this_app;
@@ -634,43 +637,48 @@ void Alignment::computeConst(Pattern &pat) {
state_app &= this_app;
if (*i < num_states) {
num_app[(int)(*i)]++;
- continue;
+ } else if (*i != STATE_UNKNOWN) {
+ // ambiguous characters
+ is_const = false;
}
- if (*i == STATE_UNKNOWN) continue;
- for (j = 0; j < num_states; j++)
- if (this_app[j])
- num_app[j]++;
}
- int count = 0;
- pat.num_chars = 0;
+ int count = 0; // number of states with >= 2 appearances
+ pat.num_chars = 0; // number of states with >= 1 appearance
for (j = 0; j < num_states; j++) if (num_app[j]) {
pat.num_chars++;
if (num_app[j] >= 2) {
count++;
}
}
+
// at least 2 states, each appearing at least twice
- if (count >= 2) pat.is_informative = true;
- delete [] num_app;
-
- count = state_app.count();
- if (count == 0) {
- return;
- }
- if (count == num_states) {
- // all-gap pattern
- pat.is_const = true;
- pat.const_char = num_states;
- return;
- }
- if (count == 1) {
- for (j = 0; j < num_states; j++)
- if (state_app.test(j)) {
- pat.is_const = true;
- pat.const_char = j;
- return;
- }
+ is_informative = (count >= 2);
+
+ // compute is_const
+ is_const = is_const && (pat.num_chars <= 1);
+ if (is_const) {
+ if (pat.num_chars == 0) // all-gap pattern
+ pat.const_char = num_states;
+ else {
+ // pat.num_chars is 1
+ for (j = 0; j < num_states; j++)
+ if (num_app[j]) {
+ pat.const_char = j;
+ break;
+ }
+ }
}
+
+ delete [] num_app;
+
+ // compute is_invariant
+ is_invariant = (state_app.count() >= 1);
+ assert(is_invariant >= is_const);
+
+ pat.flag = 0;
+ if (is_const) pat.flag |= PAT_CONST;
+ if (is_invariant) pat.flag |= PAT_INVARIANT;
+ if (is_informative) pat.flag |= PAT_INFORMATIVE;
}
@@ -742,13 +750,13 @@ void Alignment::orderPatternByNumChars() {
UINT sum = 0;
memset(pars_lower_bound, 0, (maxi+1)*sizeof(UINT));
for (ptn = 0; ptn < nptn; ptn++) {
- num_chars[ptn] = -at(ptn).num_chars + (!at(ptn).is_informative)*1024;
+ num_chars[ptn] = -at(ptn).num_chars + (!at(ptn).isInformative())*1024;
ptn_order[ptn] = ptn;
}
quicksort(num_chars, 0, nptn-1, ptn_order);
ordered_pattern.clear();
for (ptn = 0, site = 0, i = 0; ptn < nptn; ptn++) {
- if (!at(ptn_order[ptn]).is_informative)
+ if (!at(ptn_order[ptn]).isInformative())
break;
ordered_pattern.push_back(at(ptn_order[ptn]));
int freq = ordered_pattern.back().frequency;
@@ -1874,7 +1882,7 @@ int Alignment::buildRetainingSites(const char *aln_site_list, IntVector &kept_si
}
if (exclude_const_sites) {
for (j = 0; j < kept_sites.size(); j++)
- if (at(site_pattern[j]).is_const)
+ if (at(site_pattern[j]).isConst())
kept_sites[j] = 0;
}
@@ -2295,6 +2303,13 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
pattern_freq->resize(0);
pattern_freq->resize(aln->getNPattern(), 0);
}
+
+ if (!aln->site_state_freq.empty()) {
+ // resampling also the per-site state frequency vector
+ if (aln->site_state_freq.size() != aln->getNPattern() || spec)
+ outError("Unsupported bootstrap feature, pls contact the developers");
+ }
+
IntVector site_vec;
if (!spec) {
// standard bootstrap
@@ -2302,7 +2317,14 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
int site_id = random_int(nsite);
int ptn_id = aln->getPatternID(site_id);
Pattern pat = aln->at(ptn_id);
+ int nptn = getNPattern();
addPattern(pat, site);
+ if (!aln->site_state_freq.empty() && getNPattern() > nptn) {
+ // a new pattern is added, copy state frequency vector
+ double *state_freq = new double[num_states];
+ memcpy(state_freq, aln->site_state_freq[ptn_id], num_states*sizeof(double));
+ site_state_freq.push_back(state_freq);
+ }
if (pattern_freq) ((*pattern_freq)[ptn_id])++;
}
} else if (strncmp(spec, "GENESITE,", 9) == 0) {
@@ -2374,6 +2396,10 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq
out_site += site_vec[part+1];
}
}
+ if (!aln->site_state_freq.empty()) {
+ site_model = site_pattern;
+ assert(site_state_freq.size() == getNPattern());
+ }
verbose_mode = save_mode;
countConstSite();
buildSeqStates();
@@ -2574,13 +2600,17 @@ void Alignment::copyAlignment(Alignment *aln) {
void Alignment::countConstSite() {
int num_const_sites = 0;
num_informative_sites = 0;
+ int num_invariant_sites = 0;
for (iterator it = begin(); it != end(); it++) {
- if ((*it).is_const)
+ if ((*it).isConst())
num_const_sites += (*it).frequency;
- if (it->is_informative)
+ if (it->isInformative())
num_informative_sites += it->frequency;
+ if (it->isInvariant())
+ num_invariant_sites += it->frequency;
}
frac_const_sites = ((double)num_const_sites) / getNSite();
+ frac_invariant_sites = ((double)num_invariant_sites) / getNSite();
}
string Alignment::getUnobservedConstPatterns() {
@@ -2620,6 +2650,10 @@ Alignment::~Alignment()
delete [] pars_lower_bound;
pars_lower_bound = NULL;
}
+ for (vector<double*>::reverse_iterator it = site_state_freq.rbegin(); it != site_state_freq.rend(); it++)
+ if (*it) delete [] (*it);
+ site_state_freq.clear();
+ site_model.clear();
}
double Alignment::computeObsDist(int seq1, int seq2) {
@@ -3546,5 +3580,104 @@ double Alignment::multinomialProb (IntVector &pattern_freq)
return (fac - sumFac + sumProb);
}
-
-
+bool Alignment::readSiteStateFreq(const char* site_freq_file)
+{
+ cout << endl << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl;
+ site_model.resize(getNSite(), -1);
+ int i;
+ IntVector pattern_to_site; // vector from pattern to the first site
+ pattern_to_site.resize(getNPattern(), -1);
+ for (i = 0; i < getNSite(); i++)
+ if (pattern_to_site[getPatternID(i)] == -1)
+ pattern_to_site[getPatternID(i)] = i;
+
+ bool aln_changed = false;
+
+ try {
+ ifstream in;
+ in.exceptions(ios::failbit | ios::badbit);
+ in.open(site_freq_file);
+ double freq;
+ string site_spec;
+ int specified_sites = 0;
+ in.exceptions(ios::badbit);
+ for (int model_id = 0; !in.eof(); model_id++) {
+ // remove the failbit
+ in >> site_spec;
+ if (in.eof()) break;
+ IntVector site_id;
+ extractSiteID(this, site_spec.c_str(), site_id);
+ specified_sites += site_id.size();
+ if (site_id.size() == 0) throw "No site ID specified";
+ for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) {
+ if (site_model[*it] != -1) throw "Duplicated site ID";
+ site_model[*it] = site_state_freq.size();
+ }
+ double *site_freq_entry = new double[num_states];
+ double sum = 0;
+ for (i = 0; i < num_states; i++) {
+ in >> freq;
+ if (freq <= 0.0 || freq >= 1.0) throw "Frequencies must be strictly positive and smaller than 1";
+ site_freq_entry[i] = freq;
+ sum += freq;
+ }
+ if (fabs(sum-1.0) > 1e-4) {
+ if (fabs(sum-1.0) > 1e-3)
+ outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized");
+ sum = 1.0/sum;
+ for (i = 0; i < num_states; i++)
+ site_freq_entry[i] *= sum;
+ }
+ convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0)
+
+ // 2016-02-01: now check for equality of sites with same site-pattern and same freq
+ int prev_site = pattern_to_site[getPatternID(site_id[0])];
+ if (site_id.size() == 1 && prev_site < site_id[0] && site_model[prev_site] != -1) {
+ // compare freq with prev_site
+ bool matched_freq = true;
+ double *prev_freq = site_state_freq[site_model[prev_site]];
+ for (i = 0; i < num_states; i++) {
+ if (site_freq_entry[i] != prev_freq[i]) {
+ matched_freq = false;
+ break;
+ }
+ }
+ if (matched_freq) {
+ site_model[site_id[0]] = site_model[prev_site];
+ } else
+ aln_changed = true;
+ }
+
+ if (site_model[site_id[0]] == site_state_freq.size())
+ site_state_freq.push_back(site_freq_entry);
+ else
+ delete [] site_freq_entry;
+ }
+ if (specified_sites < site_model.size()) {
+ aln_changed = true;
+ // there are some unspecified sites
+ cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl;
+ for (i = 0; i < site_model.size(); i++)
+ if (site_model[i] == -1)
+ site_model[i] = site_state_freq.size();
+ site_state_freq.push_back(NULL);
+ }
+ in.clear();
+ // set the failbit again
+ in.exceptions(ios::failbit | ios::badbit);
+ in.close();
+ } catch (const char* str) {
+ outError(str);
+ } catch (string str) {
+ outError(str);
+ } catch(ios::failure) {
+ outError(ERR_READ_INPUT);
+ }
+
+ if (aln_changed) {
+ cout << "Regrouping alignment sites..." << endl;
+ regroupSitePattern(site_state_freq.size(), site_model);
+ }
+ cout << site_state_freq.size() << " distinct per-site state frequency vectors detected" << endl;
+ return aln_changed;
+}
diff --git a/alignment.h b/alignment.h
index fdee2c8..781a378 100644
--- a/alignment.h
+++ b/alignment.h
@@ -593,6 +593,11 @@ public:
fraction of constant sites
*/
double frac_const_sites;
+
+ /**
+ fraction of invariant sites, incl. const sites and site like G-S-GG-GGGG
+ */
+ double frac_invariant_sites;
/** number of informative sites */
int num_informative_sites;
@@ -616,6 +621,14 @@ public:
vector<vector<int> > seq_states; // state set for each sequence in the alignment
+ /* for site-specific state frequency model with Huaichun, Edward, Andrew */
+
+ /* site to model ID map */
+ IntVector site_model;
+
+ /** site to state frequency vector */
+ vector<double*> site_state_freq;
+
/**
* @return true if data type is SEQ_CODON and state is a stop codon
*/
@@ -672,6 +685,16 @@ public:
void getAppearance(char state, StateBitset &state_app);
+ /**
+ * read site specific state frequency vectors from a file to create corresponding model
+ * update site_model and site_state_freq variables for this class
+ * @param aln input alignment
+ * @param site_freq_file file name
+ * @return TRUE if alignment needs to be changed, FALSE otherwise
+ */
+ bool readSiteStateFreq(const char* site_freq_file);
+
+
protected:
diff --git a/checkpoint.cpp b/checkpoint.cpp
index 1229d3f..8517ef2 100644
--- a/checkpoint.cpp
+++ b/checkpoint.cpp
@@ -10,12 +10,16 @@
#include "timeutil.h"
#include "gzstream.h"
+const char* CKP_HEADER = "--- # IQ-TREE Checkpoint";
+
Checkpoint::Checkpoint() {
filename = "";
prev_dump_time = 0;
dump_interval = 30; // dumping at most once per 30 seconds
struct_name = "";
+ compression = true;
+ header = CKP_HEADER;
}
@@ -23,8 +27,6 @@ Checkpoint::~Checkpoint() {
}
-const char* CKP_HEADER = "--- # IQ-TREE Checkpoint";
-
void Checkpoint::setFileName(string filename) {
this->filename = filename;
}
@@ -43,7 +45,7 @@ void Checkpoint::load() {
in.close();
return;
}
- if (line != CKP_HEADER)
+ if (line != header)
throw ("Invalid checkpoint file " + filename);
string struct_name;
size_t pos;
@@ -93,6 +95,18 @@ void Checkpoint::load() {
}
}
+void Checkpoint::setCompression(bool compression) {
+ this->compression = compression;
+}
+
+/**
+ set the header line to overwrite the default header
+ @param header header line
+*/
+void Checkpoint::setHeader(string header) {
+ this->header = "--- # " + header;
+}
+
void Checkpoint::setDumpInterval(double interval) {
dump_interval = interval;
}
@@ -107,10 +121,13 @@ void Checkpoint::dump(bool force) {
}
prev_dump_time = getRealTime();
try {
- ogzstream out;
- out.exceptions(ios::failbit | ios::badbit);
- out.open(filename.c_str());
- out << CKP_HEADER << endl;
+ ostream *out;
+ if (compression)
+ out = new ogzstream(filename.c_str());
+ else
+ out = new ofstream(filename.c_str());
+ out->exceptions(ios::failbit | ios::badbit);
+ *out << header << endl;
string struct_name;
size_t pos;
int listid = 0;
@@ -118,15 +135,18 @@ void Checkpoint::dump(bool force) {
if ((pos = i->first.find('.')) != string::npos) {
if (struct_name != i->first.substr(0, pos)) {
struct_name = i->first.substr(0, pos);
- out << struct_name << ":" << endl;
+ *out << struct_name << ":" << endl;
listid = 0;
}
// check if key is a collection
- out << " " << i->first.substr(pos+1) << ": " << i->second << endl;
+ *out << " " << i->first.substr(pos+1) << ": " << i->second << endl;
} else
- out << i->first << ": " << i->second << endl;
+ *out << i->first << ": " << i->second << endl;
}
- out.close();
+ if (compression)
+ ((ogzstream*)out)->close();
+ else
+ ((ofstream*)out)->close();
// cout << "Checkpoint dumped" << endl;
} catch (ios::failure &) {
outError(ERR_WRITE_OUTPUT, filename.c_str());
diff --git a/checkpoint.h b/checkpoint.h
index 20ac08c..a708d36 100644
--- a/checkpoint.h
+++ b/checkpoint.h
@@ -64,6 +64,18 @@ public:
*/
void setFileName(string filename);
+ /**
+ set compression for checkpoint file
+ @param compression true to compress checkpoint file, or false: no compression
+ */
+ void setCompression(bool compression);
+
+ /**
+ set the header line to overwrite the default header
+ @param header header line
+ */
+ void setHeader(string header);
+
/**
* load checkpoint information from file
*/
@@ -325,6 +337,12 @@ protected:
/** dumping time interval */
double dump_interval;
+ /** true (default) to compress checkpoint file, false: no compression */
+ bool compression;
+
+ /** header line of checkpoint file */
+ string header;
+
private:
/** name of the current nested key */
diff --git a/gzstream.cpp b/gzstream.cpp
index d9c81a3..648cdfe 100644
--- a/gzstream.cpp
+++ b/gzstream.cpp
@@ -57,7 +57,10 @@ gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
else if ( mode & std::ios::out)
*fmodeptr++ = 'w';
*fmodeptr++ = 'b';
- *fmodeptr++ = '9'; // best compression ratio
+ if (mode & GZ_NO_COMPRESSION)
+ *fmodeptr++ = '0'; // no compression
+ else
+ *fmodeptr++ = '9'; // best compression ratio
*fmodeptr = '\0';
file = gzopen( name, fmode);
if (file == 0)
diff --git a/gzstream.h b/gzstream.h
index 9116b02..b00355c 100644
--- a/gzstream.h
+++ b/gzstream.h
@@ -35,6 +35,8 @@
//#include "zlib-1.2.7/zlib.h"
#include <zlib.h>
+#define GZ_NO_COMPRESSION (1L << 11)
+
#ifdef GZSTREAM_NAMESPACE
namespace GZSTREAM_NAMESPACE {
#endif
diff --git a/iqtree.cpp b/iqtree.cpp
index cf8dc72..0a9f34c 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -506,7 +506,7 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
readTree(params->user_file, myrooted);
setAlignment(aln);
if (isSuperTree())
- wrapperFixNegativeBranch(!params->fixed_branch_length);
+ wrapperFixNegativeBranch(params->fixed_branch_length == BRLEN_OPTIMIZE);
else
fixed_number = wrapperFixNegativeBranch(false);
params->numInitTrees = 1;
@@ -1784,7 +1784,7 @@ string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
curScore = modOptScore;
newTree = getTreeString();
}
- if (params->print_site_posterior)
+ if (params->print_trees_site_posterior)
computePatternCategories();
}
@@ -1844,8 +1844,6 @@ double IQTree::doTreeSearch() {
cout << "--------------------------------------------------------------------" << endl;
cout << "| OPTIMIZING CANDIDATE TREE SET |" << endl;
cout << "--------------------------------------------------------------------" << endl;
- string tree_file_name = params->out_prefix;
- tree_file_name += ".treefile";
// PLEASE PRINT TREE HERE!
printResultTree();
string treels_name = params->out_prefix;
@@ -2134,7 +2132,7 @@ string IQTree::doNNISearch(int& nniCount, int& nniSteps) {
((PhyloSuperTree*) this)->computeBranchLengths();
}
treeString = getTreeString();
- if (params->print_site_posterior)
+ if (params->print_trees_site_posterior)
computePatternCategories();
}
return treeString;
@@ -3081,6 +3079,8 @@ void IQTree::addPositiveNNIMove(NNIMove myMove) {
void IQTree::printResultTree(string suffix) {
setRootNode(params->root);
+ if (params->suppress_output_flags & OUT_TREEFILE)
+ return;
string tree_file_name = params->out_prefix;
tree_file_name += ".treefile";
if (suffix.compare("") != 0) {
diff --git a/iqtree.h b/iqtree.h
index b3b6939..1a34ff0 100644
--- a/iqtree.h
+++ b/iqtree.h
@@ -684,13 +684,13 @@ public:
/** corresponding log-likelihood on original alignment */
DoubleVector boot_orig_logl;
- /** Set of splits occuring in bootstrap trees */
+ /** Set of splits occurring in bootstrap trees */
vector<SplitGraph*> boot_splits;
/** log-likelihood of bootstrap consensus tree */
double boot_consense_logl;
- /** Corresponding map for set of splits occuring in bootstrap trees */
+ /** Corresponding map for set of splits occurring in bootstrap trees */
//SplitIntMap boot_splits_map;
/** summarize all bootstrap trees */
diff --git a/model/modelcodon.cpp b/model/modelcodon.cpp
index b3c0fa0..e866e63 100644
--- a/model/modelcodon.cpp
+++ b/model/modelcodon.cpp
@@ -467,12 +467,15 @@ StateFreqType ModelCodon::initGY94(bool fix_kappa, CodonKappaStyle kappa_style)
void ModelCodon::computeRateAttributes() {
+ char symbols_protein[] = "ARNDCQEGHILKMFPSTWYVX"; // X for unknown AA
int i, j, ts, tv;
int nrates = getNumRateEntries();
if (!rate_attr) {
rate_attr = new int[nrates];
memset(rate_attr, 0, sizeof(int)*nrates);
}
+ char aa_cost_change[400];
+ memset(aa_cost_change, 10, sizeof(char)*400);
for (i = 0; i < num_states; i++) {
int *rate_attr_row = &rate_attr[i*num_states];
if (phylo_tree->aln->isStopCodon(i)) {
@@ -493,6 +496,15 @@ void ModelCodon::computeRateAttributes() {
else
attr |= CA_NONSYNONYMOUS;
+
+ int nt_changes = ((i/16) != (j/16)) + (((i%16)/4) != ((j%16)/4)) + ((i%4) != (j%4));
+ int aa1 = strchr(symbols_protein, phylo_tree->aln->genetic_code[i]) - symbols_protein;
+ int aa2 = strchr(symbols_protein, phylo_tree->aln->genetic_code[j]) - symbols_protein;
+ assert(aa1 >= 0 && aa1 < 20 && aa2 >= 0 && aa2 < 20);
+ if (nt_changes < aa_cost_change[aa1*20+aa2]) {
+ aa_cost_change[aa1*20+aa2] = aa_cost_change[aa2*20+aa1] = nt_changes;
+ }
+
if ((nuc1=i/16) != (nuc2=j/16)) {
if (abs(nuc1-nuc2)==2) { // transition
attr |= CA_TRANSITION_1NT;
@@ -530,6 +542,24 @@ void ModelCodon::computeRateAttributes() {
rate_attr_row[j] = attr;
}
}
+
+ if (verbose_mode >= VB_MAX) {
+ cout << "cost matrix by number of nt changes for TNT use" << endl;
+ cout << "smatrix =1 (aa_nt_changes)";
+ for (i = 0; i < 19; i++)
+ for (j = i+1; j < 20; j++)
+ cout << " " << symbols_protein[i] << "/" << symbols_protein[j] << " " << (int)aa_cost_change[i*20+j];
+ cout << ";" << endl;
+ cout << 20 << endl;
+ for (i = 0; i < 20; i++) {
+ aa_cost_change[i*20+i] = 0;
+ for (j = 0; j < 20; j++)
+ cout << (int)aa_cost_change[i*20+j] << " ";
+ cout << endl;
+ }
+
+ }
+
}
void ModelCodon::combineRateNTFreq() {
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
index ad015c6..8bbe92b 100644
--- a/model/modelfactory.cpp
+++ b/model/modelfactory.cpp
@@ -110,12 +110,12 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
if (model_str == "") {
if (tree->aln->seq_type == SEQ_DNA) model_str = "HKY";
- else if (tree->aln->seq_type == SEQ_PROTEIN) model_str = "WAG";
+ else if (tree->aln->seq_type == SEQ_PROTEIN) model_str = "LG";
else if (tree->aln->seq_type == SEQ_BINARY) model_str = "GTR2";
else if (tree->aln->seq_type == SEQ_CODON) model_str = "GY";
else if (tree->aln->seq_type == SEQ_MORPH) model_str = "MK";
else model_str = "JC";
- outWarning("Default model may be under-fitting. Use option '-m TEST' to select best-fit model.");
+ outWarning("Default model "+model_str + " may be under-fitting. Use option '-m TEST' to determine the best-fit model.");
}
/********* preprocessing model string ****************/
@@ -210,7 +210,13 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
}
// then normal frequency
- posfreq = rate_str.find("+F");
+ if (rate_str.find("+FO") != string::npos)
+ posfreq = rate_str.find("+FO");
+ else if (rate_str.find("+Fo") != string::npos)
+ posfreq = rate_str.find("+Fo");
+ else
+ posfreq = rate_str.find("+F");
+
bool optimize_mixmodel_weight = params.optimize_mixmodel_weight;
if (posfreq != string::npos) {
@@ -251,9 +257,10 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
else
freq_type = FREQ_EQUAL;
} else if (freq_str == "+FO" || freq_str == "+Fo") {
- if (freq_type == FREQ_MIXTURE)
- outError("Mixture frequency with optimized frequency is not allowed");
- else
+ if (freq_type == FREQ_MIXTURE) {
+ freq_params = "optimize," + freq_params;
+ optimize_mixmodel_weight = true;
+ } else
freq_type = FREQ_ESTIMATE;
} else if (freq_str == "+F1x4" || freq_str == "+F1X4") {
if (freq_type == FREQ_MIXTURE)
@@ -276,7 +283,7 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
/******************** initialize model ****************************/
- if (!params.site_freq_file) {
+ if (tree->aln->site_state_freq.empty()) {
if (model_str.substr(0, 3) == "MIX" || freq_type == FREQ_MIXTURE) {
string model_list;
if (model_str.substr(0, 3) == "MIX") {
@@ -303,24 +310,15 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
model = new ModelSet(model_str.c_str(), tree);
ModelSet *models = (ModelSet*)model; // assign pointer for convenience
models->init((params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL);
- IntVector site_model;
- vector<double*> freq_vec;
- bool aln_changed = readSiteFreq(tree->aln, params.site_freq_file, site_model, freq_vec);
- if (aln_changed) {
- cout << "Regrouping alignment sites..." << endl;
- tree->aln->regroupSitePattern(freq_vec.size(), site_model);
- //tree->aln->ungroupSitePattern();
- tree->setAlignment(tree->aln);
- }
int i;
models->pattern_model_map.resize(tree->aln->getNPattern(), -1);
for (i = 0; i < tree->aln->getNSite(); i++) {
- models->pattern_model_map[tree->aln->getPatternID(i)] = site_model[i];
+ models->pattern_model_map[tree->aln->getPatternID(i)] = tree->aln->site_model[i];
//cout << "site " << i << " ptn " << tree->aln->getPatternID(i) << " -> model " << site_model[i] << endl;
}
double *state_freq = new double[model->num_states];
double *rates = new double[model->getNumRateEntries()];
- for (i = 0; i < freq_vec.size(); i++) {
+ for (i = 0; i < tree->aln->site_state_freq.size(); i++) {
ModelGTR *modeli;
if (i == 0) {
modeli = (ModelGTR*)createModel(model_str, models_block, (params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL, "", tree, true);
@@ -331,21 +329,18 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
modeli->setStateFrequency(state_freq);
modeli->setRateMatrix(rates);
}
- if (freq_vec[i])
- modeli->setStateFrequency (freq_vec[i]);
+ if (tree->aln->site_state_freq[i])
+ modeli->setStateFrequency (tree->aln->site_state_freq[i]);
modeli->init(FREQ_USER_DEFINED);
models->push_back(modeli);
}
delete [] rates;
delete [] state_freq;
- cout << "Alignment is divided into " << models->size() << " partitions with " << tree->aln->getNPattern() << " patterns" << endl;
- for (vector<double*>::reverse_iterator it = freq_vec.rbegin(); it != freq_vec.rend(); it++)
- if (*it) delete [] (*it);
-
+
// delete information of the old alignment
- tree->aln->ordered_pattern.clear();
- tree->deleteAllPartialLh();
+// tree->aln->ordered_pattern.clear();
+// tree->deleteAllPartialLh();
}
// if (model->isMixture())
@@ -362,8 +357,17 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_
tree->aln->buildSeqStates(true);
// if (unobserved_ptns.size() <= 0)
// outError("Invalid use of +ASC because all constant patterns are observed in the alignment");
- if (unobserved_ptns.size() < tree->aln->getNumNonstopCodons())
- outError("Invalid use of +ASC because constant patterns are observed in the alignment");
+ if (tree->aln->frac_invariant_sites > 0) {
+ cerr << tree->aln->frac_invariant_sites*tree->aln->getNSite() << " invariant sites are observed in the alignment (see below)" << endl;
+ for (Alignment::iterator pit = tree->aln->begin(); pit != tree->aln->end(); pit++)
+ if (pit->isInvariant()) {
+ string pat_str = "";
+ for (Pattern::iterator it = pit->begin(); it != pit->end(); it++)
+ pat_str += tree->aln->convertStateBackStr(*it);
+ cerr << pat_str << " is invariant site pattern" << endl;
+ }
+ outError("Invalid use of +ASC in the presence of invariant sites");
+ }
cout << "Ascertainment bias correction: " << unobserved_ptns.size() << " unobservable constant patterns"<< endl;
rate_str = rate_str.substr(0, posasc) + rate_str.substr(posasc+4);
}
@@ -593,101 +597,6 @@ int ModelFactory::getNParameters() {
int df = model->getNDim() + model->getNDimFreq() + site_rate->getNDim() + site_rate->phylo_tree->branchNum;
return df;
}
-bool ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec)
-{
- cout << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl;
- site_model.resize(aln->getNSite(), -1);
- int i;
- IntVector pattern_to_site; // vector from pattern to the first site
- pattern_to_site.resize(aln->getNPattern(), -1);
- for (i = 0; i < aln->getNSite(); i++)
- if (pattern_to_site[aln->getPatternID(i)] == -1)
- pattern_to_site[aln->getPatternID(i)] = i;
-
- bool aln_changed = false;
-
- try {
- ifstream in;
- in.exceptions(ios::failbit | ios::badbit);
- in.open(site_freq_file);
- double freq;
- string site_spec;
- int specified_sites = 0;
- in.exceptions(ios::badbit);
- for (int model_id = 0; !in.eof(); model_id++) {
- // remove the failbit
- in >> site_spec;
- if (in.eof()) break;
- IntVector site_id;
- extractSiteID(aln, site_spec.c_str(), site_id);
- specified_sites += site_id.size();
- if (site_id.size() == 0) throw "No site ID specified";
- for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) {
- if (site_model[*it] != -1) throw "Duplicated site ID";
- site_model[*it] = freq_vec.size();
- }
- double *site_freq_entry = new double[aln->num_states];
- double sum = 0;
- for (i = 0; i < aln->num_states; i++) {
- in >> freq;
- if (freq <= 0.0 || freq >= 1.0) throw "Invalid frequency entry";
- site_freq_entry[i] = freq;
- sum += freq;
- }
- if (fabs(sum-1.0) > 1e-4) {
- if (fabs(sum-1.0) > 1e-3)
- outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized");
- sum = 1.0/sum;
- for (i = 0; i < aln->num_states; i++)
- site_freq_entry[i] *= sum;
- }
- aln->convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0)
-
- // 2016-02-01: now check for equality of sites with same site-pattern and same freq
- int prev_site = pattern_to_site[aln->getPatternID(site_id[0])];
- if (site_id.size() == 1 && prev_site < site_id[0] && site_model[prev_site] != -1) {
- // compare freq with prev_site
- bool matched_freq = true;
- double *prev_freq = freq_vec[site_model[prev_site]];
- for (i = 0; i < aln->num_states; i++) {
- if (site_freq_entry[i] != prev_freq[i]) {
- matched_freq = false;
- break;
- }
- }
- if (matched_freq) {
- site_model[site_id[0]] = site_model[prev_site];
- } else
- aln_changed = true;
- }
-
- if (site_model[site_id[0]] == freq_vec.size())
- freq_vec.push_back(site_freq_entry);
- else
- delete [] site_freq_entry;
- }
- if (specified_sites < site_model.size()) {
- aln_changed = true;
- // there are some unspecified sites
- cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl;
- for (i = 0; i < site_model.size(); i++)
- if (site_model[i] == -1)
- site_model[i] = freq_vec.size();
- freq_vec.push_back(NULL);
- }
- in.clear();
- // set the failbit again
- in.exceptions(ios::failbit | ios::badbit);
- in.close();
- } catch (const char* str) {
- outError(str);
- } catch (string str) {
- outError(str);
- } catch(ios::failure) {
- outError(ERR_READ_INPUT);
- }
- return aln_changed;
-}
double ModelFactory::initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst *model, double initAlpha,
double initPInvar, double *initRates, double *initStateFreqs) {
@@ -768,7 +677,7 @@ double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
return score;
}
-double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+double ModelFactory::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
if (!site_rate->isGammai())
return optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
@@ -862,7 +771,7 @@ double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_inf
initPInv = initPInv + testInterval;
- if (estResults[2] > bestLogl + logl_epsilon) {
+ if (estResults[2] > bestLogl) {
bestLogl = estResults[2];
bestAlpha = estResults[1];
bestPInvar = estResults[0];
@@ -906,7 +815,7 @@ double ModelFactory::optimizeParametersGammaInvar(bool fixed_len, bool write_inf
return tree->getCurScore();
}
-vector<double> ModelFactory::optimizeGammaInvWithInitValue(bool fixed_len, double logl_epsilon, double gradient_epsilon,
+vector<double> ModelFactory::optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon,
PhyloTree *tree, RateHeterogeneity *site_rates, double *rates,
double *state_freqs, double initPInv, double initAlpha,
DoubleVector &lenvec) {
@@ -930,7 +839,7 @@ vector<double> ModelFactory::optimizeGammaInvWithInitValue(bool fixed_len, doubl
}
-double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
+double ModelFactory::optimizeParameters(int fixed_len, bool write_info,
double logl_epsilon, double gradient_epsilon) {
assert(model);
assert(site_rate);
@@ -965,17 +874,28 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
double new_lh;
// changed to opimise edge length first, and then Q,W,R inside the loop by Thomas on Sept 11, 15
- if (!fixed_len)
+ if (fixed_len == BRLEN_OPTIMIZE)
new_lh = tree->optimizeAllBranches(min(i,3), logl_epsilon); // loop only 3 times in total (previously in v0.9.6 5 times)
+ else if (fixed_len == BRLEN_SCALE) {
+ double scaling = 1.0;
+ new_lh = tree->optimizeTreeLengthScaling(MIN_BRLEN_SCALE, scaling, MAX_BRLEN_SCALE, gradient_epsilon);
+ }
new_lh = optimizeParametersOnly(gradient_epsilon);
if (new_lh == 0.0) {
- if (!fixed_len) cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
+ if (fixed_len == BRLEN_OPTIMIZE)
+ cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
+ else if (fixed_len == BRLEN_SCALE) {
+ double scaling = 1.0;
+ cur_lh = tree->optimizeTreeLengthScaling(MIN_BRLEN_SCALE, scaling, MAX_BRLEN_SCALE, gradient_epsilon);
+ }
break;
}
if (verbose_mode >= VB_MED) {
model->writeInfo(cout);
site_rate->writeInfo(cout);
+ if (fixed_len == BRLEN_SCALE)
+ cout << "Scaled tree length: " << tree->treeLength() << endl;
}
if (new_lh > cur_lh + logl_epsilon) {
cur_lh = new_lh;
@@ -983,8 +903,13 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
cout << i << ". Current log-likelihood: " << cur_lh << endl;
} else {
site_rate->classifyRates(new_lh);
- if (!fixed_len) cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
- break;
+ if (fixed_len == BRLEN_OPTIMIZE)
+ cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
+ else if (fixed_len == BRLEN_SCALE) {
+ double scaling = 1.0;
+ cur_lh = tree->optimizeTreeLengthScaling(MIN_BRLEN_SCALE, scaling, MAX_BRLEN_SCALE, gradient_epsilon);
+ }
+ break;
}
}
@@ -993,6 +918,8 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
{
double mean_rate = site_rate->rescaleRates();
if (mean_rate != 1.0) {
+ if (fixed_len == BRLEN_FIX)
+ outError("Fixing branch lengths not supported under specified site rate model");
tree->scaleLength(mean_rate);
tree->clearAllPartialLH();
}
@@ -1011,6 +938,8 @@ double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
if (verbose_mode <= VB_MIN && write_info) {
model->writeInfo(cout);
site_rate->writeInfo(cout);
+ if (fixed_len == BRLEN_SCALE)
+ cout << "Scaled tree length: " << tree->treeLength() << endl;
}
double elapsed_secs = getRealTime() - begin_time;
if (write_info)
diff --git a/model/modelfactory.h b/model/modelfactory.h
index 842d69c..5218f0b 100644
--- a/model/modelfactory.h
+++ b/model/modelfactory.h
@@ -27,6 +27,9 @@
#include "checkpoint.h"
+const double MIN_BRLEN_SCALE = 0.01;
+const double MAX_BRLEN_SCALE = 100.0;
+
ModelsBlock *readModelsDefinition(Params ¶ms);
/**
@@ -73,16 +76,6 @@ public:
virtual void restoreCheckpoint();
/**
- * read site specific state frequency vectors from a file to create corresponding model (Ingo's idea)
- * @param aln input alignment
- * @param site_freq_file file name
- * @param site_model (OUT) site to model ID map
- * @param freq_vec (OUT) vector of frequency vectors
- * @return TRUE if alignment needs to be changed, FALSE otherwise
- */
- bool readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec);
-
- /**
get the name of the model
*/
//string getModelName();
@@ -162,11 +155,15 @@ public:
/**
optimize model parameters and tree branch lengths
- @param fixed_len TRUE to fix branch lengths, default is false
+ NOTE 2016-08-20: refactor the semantic of fixed_len
+ @param fixed_len 0: optimize branch lengths, 1: fix branch lengths, 2: scale branch lengths
+ @param write_info TRUE to write model parameters every optimization step, FALSE to only print at the end
+ @param logl_epsilon log-likelihood epsilon to stop
+ @param gradient_epsilon gradient (derivative) epsilon to stop
@return the best likelihood
*/
- virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
- double logl_epsilon = 0.1, double gradient_epsilon = 0.0001);
+ virtual double optimizeParameters(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true,
+ double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
* optimize model parameters and tree branch lengths for the +I+G model
@@ -174,7 +171,7 @@ public:
* @param fixed_len TRUE to fix branch lengths, default is false
* @return the best likelihood
*/
- virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true,
+ virtual double optimizeParametersGammaInvar(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true,
double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
@@ -260,7 +257,7 @@ protected:
*/
virtual bool getVariables(double *variables);
- vector<double> optimizeGammaInvWithInitValue(bool fixed_len, double logl_epsilon, double gradient_epsilon, PhyloTree *tree,
+ vector<double> optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon, PhyloTree *tree,
RateHeterogeneity *site_rates, double *rates, double *state_freqs,
double initPInv, double initAlpha, DoubleVector &lenvec);
};
diff --git a/model/modelgtr.cpp b/model/modelgtr.cpp
index 7a7b27d..47bf217 100644
--- a/model/modelgtr.cpp
+++ b/model/modelgtr.cpp
@@ -19,7 +19,6 @@
***************************************************************************/
#include "modelgtr.h"
#include <stdlib.h>
-#include <assert.h>
#include <string.h>
//const double MIN_FREQ_RATIO = MIN_FREQUENCY;
@@ -31,7 +30,7 @@ ModelGTR::ModelGTR(PhyloTree *tree, bool count_rates)
half_matrix = true;
int i;
int nrate = getNumRateEntries();
- int ncoeff = num_states*num_states*num_states;
+// int ncoeff = num_states*num_states*num_states;
highest_freq_state = num_states-1;
name = "GTR";
@@ -53,7 +52,7 @@ ModelGTR::ModelGTR(PhyloTree *tree, bool count_rates)
// for (i = 0; i < num_states; i++)
// inv_eigenvectors[i] = new double[num_states];
- eigen_coeff = aligned_alloc<double>(ncoeff);
+// eigen_coeff = aligned_alloc<double>(ncoeff);
// if (count_rates)
// computeEmpiricalRate();
@@ -237,10 +236,10 @@ void ModelGTR::computeTransMatrix(double time, double *trans_matrix) {
for (j = i+1; j < num_states; j ++) {
// compute upper triangle entries
double *trans_entry = trans_row + j;
- double *coeff_entry = eigen_coeff + ((row_offset+j)*num_states);
+// double *coeff_entry = eigen_coeff + ((row_offset+j)*num_states);
*trans_entry = 0.0;
for (k = 0; k < num_states; k ++) {
- *trans_entry += coeff_entry[k] * exptime[k];
+ *trans_entry += eigenvectors[i*num_states+k] * inv_eigenvectors[k*num_states+j] * exptime[k];
}
if (*trans_entry < 0.0) {
*trans_entry = 0.0;
@@ -272,10 +271,10 @@ double ModelGTR::computeTrans(double time, int state1, int state2) {
double evol_time = time / total_num_subst;
int i;
- double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
+// double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
double trans_prob = 0.0;
for (i = 0; i < num_states; i++) {
- trans_prob += coeff_entry[i] * exp(evol_time * eigenvalues[i]);
+ trans_prob += eigenvectors[state1*num_states+i] * inv_eigenvectors[i*num_states+state2] * exp(evol_time * eigenvalues[i]);
}
return trans_prob;
}
@@ -284,11 +283,11 @@ double ModelGTR::computeTrans(double time, int state1, int state2, double &derv1
double evol_time = time / total_num_subst;
int i;
- double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
+// double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
double trans_prob = 0.0;
derv1 = derv2 = 0.0;
for (i = 0; i < num_states; i++) {
- double trans = coeff_entry[i] * exp(evol_time * eigenvalues[i]);
+ double trans = eigenvectors[state1*num_states+i] * inv_eigenvectors[i*num_states+state2] * exp(evol_time * eigenvalues[i]);
double trans2 = trans * eigenvalues[i];
trans_prob += trans;
derv1 += trans2;
@@ -317,13 +316,13 @@ void ModelGTR::computeTransDerv(double time, double *trans_matrix,
double *derv1_entry = trans_derv1 + offset;
double *derv2_entry = trans_derv2 + offset;
- int coeff_offset = offset*num_states;
- double *coeff_entry = eigen_coeff + coeff_offset;
+// int coeff_offset = offset*num_states;
+// double *coeff_entry = eigen_coeff + coeff_offset;
*trans_entry = 0.0;
*derv1_entry = 0.0;
*derv2_entry = 0.0;
for (k = 0; k < num_states; k ++) {
- double trans = coeff_entry[k] * exptime[k];
+ double trans = eigenvectors[i*num_states+k] * inv_eigenvectors[k*num_states+j] * exptime[k];
double trans2 = trans * eigenvalues[k];
*trans_entry += trans;
*derv1_entry += trans2;
@@ -699,25 +698,25 @@ void ModelGTR::decomposeRateMatrix(){
delete [] rate_matrix[i];
delete [] rate_matrix;
}
- for (i = 0; i < num_states; i++)
- for (j = 0; j < num_states; j++) {
- int offset = (i*num_states+j)*num_states;
- double sum = 0.0;
- for (k = 0; k < num_states; k++) {
- eigen_coeff[offset+k] = eigenvectors[i*num_states+k] * inv_eigenvectors[k*num_states+j];
- sum += eigen_coeff[offset+k];
- //eigen_coeff_derv1[offset+k] = eigen_coeff[offset+k] * eigenvalues[k];
- //eigen_coeff_derv2[offset+k] = eigen_coeff_derv1[offset+k] * eigenvalues[k];
- }
- if (i == j) {
- if (fabs(sum-1.0) > 1e-6) {
- cout << "sum = " << sum << endl;
- assert(0);
- }
- }
- else assert(fabs(sum) < 1e-6);
- }
-
+// for (i = 0; i < num_states; i++)
+// for (j = 0; j < num_states; j++) {
+// int offset = (i*num_states+j)*num_states;
+// double sum = 0.0;
+// for (k = 0; k < num_states; k++) {
+// eigen_coeff[offset+k] = eigenvectors[i*num_states+k] * inv_eigenvectors[k*num_states+j];
+// sum += eigen_coeff[offset+k];
+// //eigen_coeff_derv1[offset+k] = eigen_coeff[offset+k] * eigenvalues[k];
+// //eigen_coeff_derv2[offset+k] = eigen_coeff_derv1[offset+k] * eigenvalues[k];
+// }
+// if (i == j) {
+// if (fabs(sum-1.0) > 1e-6) {
+// cout << "sum = " << sum << endl;
+// assert(0);
+// }
+// }
+// else assert(fabs(sum) < 1e-6);
+// }
+//
}
@@ -813,8 +812,9 @@ void ModelGTR::readStateFreq(string str) throw(const char*) {
void ModelGTR::readParameters(const char *file_name) {
try {
ifstream in(file_name);
- if (in.fail())
+ if (in.fail()) {
outError("Invalid model name ", file_name);
+ }
cout << "Reading model parameters from file " << file_name << endl;
readRates(in);
readStateFreq(in);
@@ -837,7 +837,7 @@ void ModelGTR::freeMem()
// int i;
//delete eigen_coeff_derv2;
//delete eigen_coeff_derv1;
- aligned_free(eigen_coeff);
+// aligned_free(eigen_coeff);
// for (i = num_states-1; i>=0; i--)
// delete [] inv_eigenvectors[i];
@@ -851,11 +851,11 @@ void ModelGTR::freeMem()
if (rates) delete [] rates;
}
-double *ModelGTR::getEigenCoeff() const
-{
- return eigen_coeff;
-}
-
+//double *ModelGTR::getEigenCoeff() const
+//{
+// return eigen_coeff;
+//}
+//
double *ModelGTR::getEigenvalues() const
{
return eigenvalues;
@@ -870,10 +870,10 @@ double* ModelGTR::getInverseEigenvectors() const {
return inv_eigenvectors;
}
-void ModelGTR::setEigenCoeff(double *eigenCoeff)
-{
- eigen_coeff = eigenCoeff;
-}
+//void ModelGTR::setEigenCoeff(double *eigenCoeff)
+//{
+// eigen_coeff = eigenCoeff;
+//}
void ModelGTR::setEigenvalues(double *eigenvalues)
{
diff --git a/model/modelgtr.h b/model/modelgtr.h
index a237168..59a58a5 100644
--- a/model/modelgtr.h
+++ b/model/modelgtr.h
@@ -289,19 +289,27 @@ public:
*/
virtual void decomposeRateMatrix();
- double *getEigenCoeff() const;
+// double *getEigenCoeff() const;
virtual double *getEigenvalues() const;
virtual double *getEigenvectors() const;
virtual double *getInverseEigenvectors() const;
- void setEigenCoeff(double *eigenCoeff);
+// void setEigenCoeff(double *eigenCoeff);
void setEigenvalues(double *eigenvalues);
void setEigenvectors(double *eigenvectors);
+ /**
+ * compute the memory size for the model, can be large for site-specific models
+ * @return memory size required in bytes
+ */
+ virtual uint64_t getMemoryRequired() {
+ return ModelSubst::getMemoryRequired() + sizeof(double)*num_states*num_states*3;
+ }
+
/** default TRUE: store only upper half of the rate matrix */
bool half_matrix;
@@ -358,7 +366,7 @@ protected:
/**
coefficient cache, served for fast computation of the P(t) matrix
*/
- double *eigen_coeff;
+// double *eigen_coeff;
/** state with highest frequency, used when optimizing state frequencies +FO */
int highest_freq_state;
diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp
index 0381513..bca673d 100644
--- a/model/modelmixture.cpp
+++ b/model/modelmixture.cpp
@@ -721,7 +721,7 @@ frequency Fclass1 = 0.02549352 0.01296012 0.005545202 0.006005566 0.01002193 0.0
frequency Fclass2 = 0.09596966 0.008786096 0.02805857 0.01880183 0.005026264 0.006454635 0.01582725 0.7215719 0.003379354 0.002257725 0.003013483 0.01343441 0.001511657 0.002107865 0.006751404 0.04798539 0.01141559 0.000523736 0.002188483 0.004934972;\n\
frequency Fclass3 = 0.01726065 0.005467988 0.01092937 0.3627871 0.001046402 0.01984758 0.5149206 0.004145081 0.002563289 0.002955213 0.005286931 0.01558693 0.002693098 0.002075771 0.003006167 0.01263069 0.01082144 0.000253451 0.001144787 0.004573568;\n\
frequency Fclass4 = 0.1263139 0.09564027 0.07050061 0.03316681 0.02095119 0.05473468 0.02790523 0.009007538 0.03441334 0.005855319 0.008061884 0.1078084 0.009019514 0.05018693 0.07948 0.09447839 0.09258897 0.01390669 0.05367769 0.01230413;\n\
-model CF4 = POISSON+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;\n\
+model CF4 = POISSON+FMIX{Fclass1,Fclass2,Fclass3,Fclass4}+F+G;\n\
model JTTCF4G = JTT+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;\n\
\n\
[ ---------------------------------------------------------\n\
@@ -1063,6 +1063,11 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
size_t cur_pos;
int m;
+ NxsModel *nxs_freq_empirical = new NxsModel("empirical");
+ NxsModel *nxs_freq_optimize = new NxsModel("optimize");
+
+ int num_pre_freq = 0;
+
vector<NxsModel*> freq_vec;
DoubleVector freq_rates;
DoubleVector freq_weights;
@@ -1096,8 +1101,12 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
freq_rates.push_back(rate);
freq_weights.push_back(weight);
cur_pos = pos+1;
- if (this_name == "empirical") {
- freq_vec.push_back(NULL);
+ if (this_name == nxs_freq_empirical->name) {
+ freq_vec.push_back(nxs_freq_empirical);
+ num_pre_freq++;
+ } else if (this_name == nxs_freq_optimize->name) {
+ freq_vec.push_back(nxs_freq_optimize);
+ num_pre_freq++;
} else {
NxsModel *freq_mod = models_block->findModel(this_name);
if (!freq_mod)
@@ -1108,13 +1117,15 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
}
freq_vec.push_back(freq_mod);
}
+ if (num_pre_freq >= 2)
+ outError("Defining both empirical and optimize frequencies not allowed");
}
double sum_weights = 0.0;
for (m = 0; m < freq_weights.size(); m++)
- if (freq_vec[m])
+ if (freq_vec[m] != nxs_freq_empirical && freq_vec[m] != nxs_freq_optimize)
sum_weights += freq_weights[m];
for (m = 0; m < freq_weights.size(); m++)
- if (!freq_vec[m])
+ if (freq_vec[m] == nxs_freq_empirical || freq_vec[m] == nxs_freq_optimize)
freq_weights[m] = sum_weights/freq_weights.size();
ModelGTR::init(FREQ_USER_DEFINED);
} else {
@@ -1155,10 +1166,12 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
ModelGTR* model;
if (freq == FREQ_MIXTURE) {
for(int f = 0; f != freq_vec.size(); f++) {
- if (freq_vec[f])
- model = (ModelGTR*)createModel(this_name, models_block, FREQ_USER_DEFINED, freq_vec[f]->description, tree, count_rates);
- else
+ if (freq_vec[f] == nxs_freq_empirical)
model = (ModelGTR*)createModel(this_name, models_block, FREQ_EMPIRICAL, "", tree, count_rates);
+ else if (freq_vec[f] == nxs_freq_optimize)
+ model = (ModelGTR*)createModel(this_name, models_block, FREQ_ESTIMATE, "", tree, count_rates);
+ else
+ model = (ModelGTR*)createModel(this_name, models_block, FREQ_USER_DEFINED, freq_vec[f]->description, tree, count_rates);
model->total_num_subst = rate * freq_rates[f];
push_back(model);
weights.push_back(weight * freq_weights[f]);
@@ -1166,12 +1179,15 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
// name += ',';
full_name += ',';
}
- if (freq_vec[f]) {
- model->name += "+F" +freq_vec[f]->name + "";
- model->full_name += "+F" +freq_vec[f]->name + "";
- } else {
+ if (freq_vec[f] == nxs_freq_empirical) {
model->name += "+F";
model->full_name += "+F";
+ } else if (freq_vec[f] == nxs_freq_optimize) {
+ model->name += "+FO";
+ model->full_name += "+FO";
+ } else {
+ model->name += "+F" +freq_vec[f]->name + "";
+ model->full_name += "+F" +freq_vec[f]->name + "";
}
// name += model->name;
full_name += model->name;
@@ -1234,13 +1250,13 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
if (eigenvalues) aligned_free(eigenvalues);
if (eigenvectors) aligned_free(eigenvectors);
if (inv_eigenvectors) aligned_free(inv_eigenvectors);
- if (eigen_coeff) aligned_free(eigen_coeff);
+// if (eigen_coeff) aligned_free(eigen_coeff);
eigenvalues = aligned_alloc<double>(num_states*nmixtures);
eigenvectors = aligned_alloc<double>(num_states*num_states*nmixtures);
inv_eigenvectors = aligned_alloc<double>(num_states*num_states*nmixtures);
- int ncoeff = num_states*num_states*num_states;
- eigen_coeff = aligned_alloc<double>(ncoeff*nmixtures);
+// int ncoeff = num_states*num_states*num_states;
+// eigen_coeff = aligned_alloc<double>(ncoeff*nmixtures);
// assigning memory for individual models
m = 0;
@@ -1249,27 +1265,31 @@ void ModelMixture::initMixture(string orig_model_name, string model_name, string
memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double));
memcpy(&eigenvectors[m*num_states*num_states], (*it)->eigenvectors, num_states*num_states*sizeof(double));
memcpy(&inv_eigenvectors[m*num_states*num_states], (*it)->inv_eigenvectors, num_states*num_states*sizeof(double));
- memcpy(&eigen_coeff[m*ncoeff], (*it)->eigen_coeff, ncoeff*sizeof(double));
+// memcpy(&eigen_coeff[m*ncoeff], (*it)->eigen_coeff, ncoeff*sizeof(double));
// then delete
if ((*it)->eigenvalues) aligned_free((*it)->eigenvalues);
if ((*it)->eigenvectors) aligned_free((*it)->eigenvectors);
if ((*it)->inv_eigenvectors) aligned_free((*it)->inv_eigenvectors);
- if ((*it)->eigen_coeff) aligned_free((*it)->eigen_coeff);
+// if ((*it)->eigen_coeff) aligned_free((*it)->eigen_coeff);
// and assign new memory
(*it)->eigenvalues = &eigenvalues[m*num_states];
(*it)->eigenvectors = &eigenvectors[m*num_states*num_states];
(*it)->inv_eigenvectors = &inv_eigenvectors[m*num_states*num_states];
- (*it)->eigen_coeff = &eigen_coeff[m*ncoeff];
+// (*it)->eigen_coeff = &eigen_coeff[m*ncoeff];
}
decomposeRateMatrix();
+
+ delete nxs_freq_optimize;
+ delete nxs_freq_empirical;
+
}
ModelMixture::~ModelMixture() {
if (prop)
aligned_free(prop);
for (reverse_iterator rit = rbegin(); rit != rend(); rit++) {
- (*rit)->eigen_coeff = NULL;
+// (*rit)->eigen_coeff = NULL;
(*rit)->eigenvalues = NULL;
(*rit)->eigenvectors = NULL;
(*rit)->inv_eigenvectors = NULL;
@@ -1331,8 +1351,32 @@ int ModelMixture::getNDim() {
int ModelMixture::getNDimFreq() {
int dim = 0;
- for (iterator it = begin(); it != end(); it++)
- dim += (*it)->getNDimFreq();
+ int num_empirical = 0;
+ int num_codon_1x4 = 0;
+ int num_codon_3x4 = 0;
+ for (iterator it = begin(); it != end(); it++) {
+ // 2016-03-06: count empirical freq only once (thanks to Stephen Crotty)
+ switch ((*it)->freq_type) {
+ case FREQ_EMPIRICAL:
+ num_empirical++;
+ if (num_empirical==1)
+ dim += (*it)->getNDimFreq();
+ break;
+ case FREQ_CODON_1x4:
+ num_codon_1x4++;
+ if (num_codon_1x4==1)
+ dim += (*it)->getNDimFreq();
+ break;
+ case FREQ_CODON_3x4:
+ case FREQ_CODON_3x4C:
+ num_codon_3x4++;
+ if (num_codon_3x4==1)
+ dim += (*it)->getNDimFreq();
+ break;
+ default:
+ dim += (*it)->getNDimFreq();
+ }
+ }
return dim;
}
diff --git a/model/modelmixture.h b/model/modelmixture.h
index f38be63..0ace5c5 100644
--- a/model/modelmixture.h
+++ b/model/modelmixture.h
@@ -155,6 +155,17 @@ public:
*/
virtual string getNameParams();
+ /**
+ * compute the memory size for the model, can be large for site-specific models
+ * @return memory size required in bytes
+ */
+ virtual uint64_t getMemoryRequired() {
+ uint64_t mem = ModelGTR::getMemoryRequired();
+ for (iterator it = begin(); it != end(); it++)
+ mem += (*it)->getMemoryRequired();
+ return mem;
+ }
+
/**
rates of mixture components
*/
diff --git a/model/modelprotein.cpp b/model/modelprotein.cpp
index 66379b4..c49bedc 100644
--- a/model/modelprotein.cpp
+++ b/model/modelprotein.cpp
@@ -3125,6 +3125,7 @@ void ModelProtein::init(const char *model_name, string model_params, StateFreqTy
//string model_str;
//bool user_model = false;
double daa[400];
+ double f[20];
string name_upper = model_name;
for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
(*it) = toupper(*it);
@@ -3155,13 +3156,27 @@ void ModelProtein::init(const char *model_name, string model_params, StateFreqTy
for (i = 0, k = 0; i < num_states-1; i++)
for (j = i+1; j < num_states; j++)
rates[k++] = daa[i*20+j];
+ num_params = 0;
} else if (!model_params.empty()) {
stringstream ss(model_params);
readRates(ss);
readStateFreq(ss);
+ num_params = 0;
+ } else if (name_upper == "GTR20") {
+ outWarning("GTR20 model will estimate 189 substitution rates that might be overfitting!");
+ outWarning("Please only use GTR20 with very large data and always test for model fit!");
+ if (freq == FREQ_UNKNOWN || freq == FREQ_USER_DEFINED)
+ freq = FREQ_EMPIRICAL;
+ // initialize rate matrix with LG
+ int i, j, k;
+ initProtMat(f, daa, "LG");
+ for (i = 0, k = 0; i < num_states-1; i++)
+ for (j = i+1; j < num_states; j++)
+ rates[k++] = daa[i*20+j];
} else {
// if name does not match, read the user-defined model
readParameters(model_name);
+ num_params = 0;
}
if (freq_params != "") {
// stringstream ss(freq_params);
@@ -3212,12 +3227,33 @@ void ModelProtein::init(const char *model_name, string model_params, StateFreqTy
readParameters(model_name);
}*/
- num_params = 0;
//assert(freq != FREQ_ESTIMATE);
if (freq == FREQ_UNKNOWN) freq = FREQ_USER_DEFINED;
ModelGTR::init(freq);
}
+void ModelProtein::saveCheckpoint() {
+ if (num_params > 0) {
+ checkpoint->startStruct("ModelProtein");
+ CKP_ARRAY_SAVE(getNumRateEntries(), rates);
+ checkpoint->endStruct();
+ }
+ ModelGTR::saveCheckpoint();
+}
+
+void ModelProtein::restoreCheckpoint() {
+ ModelGTR::restoreCheckpoint();
+
+ if (num_params > 0) {
+ checkpoint->startStruct("ModelProtein");
+ CKP_ARRAY_RESTORE(getNumRateEntries(), rates);
+ checkpoint->endStruct();
+ decomposeRateMatrix();
+ if (phylo_tree)
+ phylo_tree->clearAllPartialLH();
+ }
+}
+
void ModelProtein::readRates(istream &in) throw(const char*, string) {
int nrates = getNumRateEntries();
int row = 1, col = 0;
diff --git a/model/modelprotein.h b/model/modelprotein.h
index b363fff..f8a4df9 100644
--- a/model/modelprotein.h
+++ b/model/modelprotein.h
@@ -45,6 +45,16 @@ public:
*/
virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
read the rates from an input stream. it will throw error messages if failed
@param in input stream
diff --git a/model/modelset.cpp b/model/modelset.cpp
index a75e549..83c7d1a 100644
--- a/model/modelset.cpp
+++ b/model/modelset.cpp
@@ -124,6 +124,8 @@ void ModelSet::setVariables(double* variables)
ModelSet::~ModelSet()
{
-
+ for (reverse_iterator rit = rbegin(); rit != rend(); rit++)
+ delete (*rit);
+
}
diff --git a/model/modelset.h b/model/modelset.h
index ba77370..6c2ae9f 100644
--- a/model/modelset.h
+++ b/model/modelset.h
@@ -153,7 +153,18 @@ public:
*/
virtual void decomposeRateMatrix();
- ~ModelSet();
+ virtual ~ModelSet();
+
+ /**
+ * compute the memory size for the model, can be large for site-specific models
+ * @return memory size required in bytes
+ */
+ virtual uint64_t getMemoryRequired() {
+ uint64_t mem = ModelGTR::getMemoryRequired();
+ for (iterator it = begin(); it != end(); it++)
+ mem += (*it)->getMemoryRequired();
+ return mem;
+ }
/** map from pattern ID to model ID */
IntVector pattern_model_map;
diff --git a/model/modelsubst.h b/model/modelsubst.h
index 45444f2..a2ca4ba 100644
--- a/model/modelsubst.h
+++ b/model/modelsubst.h
@@ -257,6 +257,14 @@ public:
return NULL;
}
+ /**
+ * compute the memory size for the model, can be large for site-specific models
+ * @return memory size required in bytes
+ */
+ virtual uint64_t getMemoryRequired() {
+ return num_states*sizeof(double);
+ }
+
/*****************************************************
Checkpointing facility
*****************************************************/
diff --git a/model/partitionmodel.cpp b/model/partitionmodel.cpp
index bddbecd..a2942d6 100644
--- a/model/partitionmodel.cpp
+++ b/model/partitionmodel.cpp
@@ -144,7 +144,7 @@ double PartitionModel::optimizeLinkedAlpha(bool write_info, double gradient_epsi
}
-double PartitionModel::optimizeParameters(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+double PartitionModel::optimizeParameters(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
double tree_lh = 0.0;
int ntrees = tree->size();
@@ -176,7 +176,7 @@ double PartitionModel::optimizeParameters(bool fixed_len, bool write_info, doubl
}
-double PartitionModel::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+double PartitionModel::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
double tree_lh = 0.0;
int ntrees = tree->size();
diff --git a/model/partitionmodel.h b/model/partitionmodel.h
index 2a34473..bb4c106 100644
--- a/model/partitionmodel.h
+++ b/model/partitionmodel.h
@@ -67,10 +67,15 @@ public:
/**
optimize model parameters and tree branch lengths
- @param fixed_len TRUE to fix branch lengths, default is false
+ NOTE 2016-08-20: refactor the semantic of fixed_len
+ @param fixed_len 0: optimize branch lengths, 1: fix branch lengths, 2: scale branch lengths
+ @param write_info TRUE to write model parameters every optimization step, FALSE to only print at the end
+ @param logl_epsilon log-likelihood epsilon to stop
+ @param gradient_epsilon gradient (derivative) epsilon to stop
@return the best likelihood
*/
- virtual double optimizeParameters(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+ virtual double optimizeParameters(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true,
+ double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
* optimize model parameters and tree branch lengths for the +I+G model
@@ -78,7 +83,7 @@ public:
* @param fixed_len TRUE to fix branch lengths, default is false
* @return the best likelihood
*/
- virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+ virtual double optimizeParametersGammaInvar(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
/**
* @return TRUE if parameters are at the boundary that may cause numerical unstability
diff --git a/model/ratemeyerhaeseler.cpp b/model/ratemeyerhaeseler.cpp
index 9b61b92..34bad12 100644
--- a/model/ratemeyerhaeseler.cpp
+++ b/model/ratemeyerhaeseler.cpp
@@ -218,7 +218,7 @@ double RateMeyerHaeseler::optimizeRate(int pattern) {
double current_rate = at(pattern);
double ferror, optx;
/* constant site alway have ZERO rates */
- if (phylo_tree->aln->at(pattern).is_const) {
+ if (phylo_tree->aln->at(pattern).isConst()) {
return (at(pattern) = MIN_SITE_RATE);
}
@@ -231,7 +231,7 @@ double RateMeyerHaeseler::optimizeRate(int pattern) {
if (phylo_tree->optimize_by_newton && rate_mh) // Newton-Raphson method
{
optx = minimizeNewtonSafeMode(MIN_SITE_RATE, current_rate, max_rate, TOL_SITE_RATE, negative_lh);
- if (optx > MAX_SITE_RATE*0.99 || (optx < MIN_SITE_RATE*2 && !phylo_tree->aln->at(pattern).is_const))
+ if (optx > MAX_SITE_RATE*0.99 || (optx < MIN_SITE_RATE*2 && !phylo_tree->aln->at(pattern).isConst()))
{
double optx2, negative_lh2;
optx2 = minimizeOneDimen(MIN_SITE_RATE, current_rate, max_rate, TOL_SITE_RATE, &negative_lh2, &ferror);
@@ -268,7 +268,7 @@ double RateMeyerHaeseler::optimizeRate(int pattern) {
}
//#ifndef NDEBUG
- if (optx == MAX_SITE_RATE || (optx == MIN_SITE_RATE && !phylo_tree->aln->at(pattern).is_const)) {
+ if (optx == MAX_SITE_RATE || (optx == MIN_SITE_RATE && !phylo_tree->aln->at(pattern).isConst())) {
ofstream out;
if (verbose_mode >= VB_MED) {
diff --git a/modelsblock.h b/modelsblock.h
index 4cca2af..f62fe21 100644
--- a/modelsblock.h
+++ b/modelsblock.h
@@ -21,9 +21,18 @@ public:
/* model description */
string description;
- /* true if model the basic model (no mixture etc.) */
+ /* flag as NM_ATOMIC or NM_FREQ or both */
int flag;
+ NxsModel() {
+ flag = 0;
+ }
+
+ NxsModel(string name) {
+ this->name = name;
+ flag = 0;
+ }
+
virtual ~NxsModel() {}
};
diff --git a/mpdablock.cpp b/mpdablock.cpp
index 7e9bca3..0ca8220 100644
--- a/mpdablock.cpp
+++ b/mpdablock.cpp
@@ -56,7 +56,7 @@ void MPdaBlock::Read(NxsToken &token)
int ntax = sgraph->getNTaxa();
if (ntax <= 0) {
- errormsg = "PDA Block should be preceeded by Splits Block";
+ errormsg = "PDA Block should be preceded by Splits Block";
throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
}
diff --git a/msplitsblock.cpp b/msplitsblock.cpp
index 4f1a6da..756da63 100644
--- a/msplitsblock.cpp
+++ b/msplitsblock.cpp
@@ -274,7 +274,7 @@ void MSplitsBlock::Read(NxsToken &token)
errormsg += " instead";
throw NxsException(errormsg, token);
}
- } // if (token.Equals("TAXLABELS"))
+ } // if (token.Equals("MATRIX"))
else if (token.Equals("END") || token.Equals("ENDBLOCK"))
{
diff --git a/mtreeset.cpp b/mtreeset.cpp
index 01c1926..fd997da 100644
--- a/mtreeset.cpp
+++ b/mtreeset.cpp
@@ -677,11 +677,11 @@ void MTreeSet::computeRFDist(int *rfdist, MTreeSet *treeset2,
if (info_file) {
oinfo.close();
- cout << "Detailed split occurences printed to " << info_file << endl;
+ cout << "Detailed split occurrences printed to " << info_file << endl;
}
if (tree_file) {
otree.close();
- cout << "Detailed split occurences on tree printed to " << tree_file << endl;
+ cout << "Detailed split occurrences on tree printed to " << tree_file << endl;
}
}
diff --git a/ncl/nxscharactersblock.cpp b/ncl/nxscharactersblock.cpp
index cb78f84..3163526 100644
--- a/ncl/nxscharactersblock.cpp
+++ b/ncl/nxscharactersblock.cpp
@@ -2330,6 +2330,7 @@ void NxsCharactersBlock::HandleTaxlabels(
for (;;)
{
+ token.SetLabileFlagBit(NxsToken::hyphenNotPunctuation + NxsToken::preserveUnderscores);
token.GetNextToken();
// Token should either be ';' or the name of a taxon
diff --git a/ncl/nxsdistancesblock.cpp b/ncl/nxsdistancesblock.cpp
index dd4cfde..68d2ea0 100644
--- a/ncl/nxsdistancesblock.cpp
+++ b/ncl/nxsdistancesblock.cpp
@@ -494,6 +494,7 @@ void NxsDistancesBlock::HandleTaxlabelsCommand(
for (unsigned i = 0; i < ntax; i++)
{
+ token.SetLabileFlagBit(NxsToken::hyphenNotPunctuation + NxsToken::preserveUnderscores);
token.GetNextToken();
taxa->AddTaxonLabel(token.GetToken());
}
diff --git a/ncl/nxstaxablock.cpp b/ncl/nxstaxablock.cpp
index 51ff10f..5240705 100644
--- a/ncl/nxstaxablock.cpp
+++ b/ncl/nxstaxablock.cpp
@@ -128,6 +128,7 @@ void NxsTaxaBlock::Read(
for (unsigned i = 0; (int)i < nominal_ntax; i++)
{
+ token.SetLabileFlagBit(NxsToken::hyphenNotPunctuation + NxsToken::preserveUnderscores);
token.GetNextToken();
//@pol should check to make sure this is not punctuation
AddTaxonLabel(token.GetToken());
diff --git a/pattern.cpp b/pattern.cpp
index faef093..18ef62c 100644
--- a/pattern.cpp
+++ b/pattern.cpp
@@ -16,8 +16,9 @@ Pattern::Pattern()
: string()
{
frequency = 0;
- is_const = false;
- is_informative = false;
+// is_const = false;
+// is_informative = false;
+ flag = 0;
const_char = 255;
num_chars = 0;
}
@@ -26,8 +27,9 @@ Pattern::Pattern(const Pattern &pat)
: string(pat)
{
frequency = pat.frequency;
- is_const = pat.is_const;
- is_informative = pat.is_informative;
+// is_const = pat.is_const;
+// is_informative = pat.is_informative;
+ flag = pat.flag;
const_char = pat.const_char;
num_chars = pat.num_chars;
}
diff --git a/pattern.h b/pattern.h
index c854259..47bb8c5 100644
--- a/pattern.h
+++ b/pattern.h
@@ -17,6 +17,10 @@
using namespace std;
+const int PAT_CONST = 1; // const site pattern, e.g. AAAAAA, CC-C-CCCC
+const int PAT_INVARIANT = 2; // invariant site pattern, including const patterns and e.g., GS--G-GGG (S = G/C)
+const int PAT_INFORMATIVE = 4; // parsimony informative sites
+
/**
Site-patterns in a multiple sequence alignment
@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
@@ -50,6 +54,18 @@ public:
*/
virtual ~Pattern();
+ inline bool isConst() {
+ return (flag & PAT_CONST) != 0;
+ }
+
+ inline bool isInvariant() {
+ return (flag & PAT_INVARIANT) != 0;
+ }
+
+ inline bool isInformative() {
+ return (flag & PAT_INFORMATIVE) != 0;
+ }
+
/**
frequency appearance of the pattern
*/
@@ -59,10 +75,12 @@ public:
true if this is a constant pattern
2015-03-04: is_const will also be true for pattern like "AA-A--AAA"
*/
- bool is_const;
+// bool is_const;
/** true if pattern is informative, false otherwise */
- bool is_informative;
+// bool is_informative;
+
+ int flag;
/** 2015-03-04: if is_const is true, this will store the const character for the pattern */
char const_char;
diff --git a/pda.cpp b/pda.cpp
index 88b15f8..13df75c 100644
--- a/pda.cpp
+++ b/pda.cpp
@@ -1711,36 +1711,40 @@ public:
outstreambuf* open( const char* name, ios::openmode mode = ios::out);
outstreambuf* close();
~outstreambuf() { close(); }
+ streambuf *get_fout_buf() {
+ return fout_buf;
+ }
+ ofstream *get_fout() {
+ return &fout;
+ }
protected:
ofstream fout;
streambuf *cout_buf;
- streambuf *cerr_buf;
streambuf *fout_buf;
virtual int overflow( int c = EOF);
virtual int sync();
};
-
outstreambuf* outstreambuf::open( const char* name, ios::openmode mode) {
- fout.open(name, mode);
- if (!fout.is_open()) {
- cout << "Could not open " << name << " for logging" << endl;
- return NULL;
- }
+ if (!(Params::getInstance().suppress_output_flags & OUT_LOG)) {
+ fout.open(name, mode);
+ if (!fout.is_open()) {
+ cout << "Could not open " << name << " for logging" << endl;
+ return NULL;
+ }
+ fout_buf = fout.rdbuf();
+ }
cout_buf = cout.rdbuf();
- cerr_buf = cerr.rdbuf();
- fout_buf = fout.rdbuf();
cout.rdbuf(this);
cerr.rdbuf(this);
return this;
}
outstreambuf* outstreambuf::close() {
+ cout.rdbuf(cout_buf);
if ( fout.is_open()) {
sync();
- cout.rdbuf(cout_buf);
- cerr.rdbuf(cerr_buf);
fout.close();
return this;
}
@@ -1750,6 +1754,8 @@ outstreambuf* outstreambuf::close() {
int outstreambuf::overflow( int c) { // used for output buffer only
if (verbose_mode >= VB_MIN)
if (cout_buf->sputc(c) == EOF) return EOF;
+ if (Params::getInstance().suppress_output_flags & OUT_LOG)
+ return c;
if (fout_buf->sputc(c) == EOF) return EOF;
return c;
}
@@ -1757,10 +1763,47 @@ int outstreambuf::overflow( int c) { // used for output buffer only
int outstreambuf::sync() { // used for output buffer only
if (verbose_mode >= VB_MIN)
cout_buf->pubsync();
+ if (Params::getInstance().suppress_output_flags & OUT_LOG)
+ return 0;
return fout_buf->pubsync();
}
+class errstreambuf : public streambuf {
+public:
+ void init(streambuf *fout_buf) {
+ this->fout_buf = fout_buf;
+ cerr_buf = cerr.rdbuf();
+ }
+
+ ~errstreambuf() {
+ cerr.rdbuf(cerr_buf);
+ }
+
+protected:
+ streambuf *cerr_buf;
+ streambuf *fout_buf;
+
+ virtual int overflow( int c = EOF) {
+ if (cerr_buf->sputc(c) == EOF) return EOF;
+ if (Params::getInstance().suppress_output_flags & OUT_LOG)
+ return c;
+ if (fout_buf->sputc(c) == EOF) return EOF;
+ return c;
+ }
+
+ virtual int sync() {
+ cerr_buf->pubsync();
+ if (Params::getInstance().suppress_output_flags & OUT_LOG)
+ return 0;
+ return fout_buf->pubsync();
+ }
+};
+
+
+
+
outstreambuf _out_buf;
+errstreambuf _err_buf;
string _log_file;
int _exit_wait_optn = FALSE;
@@ -1770,10 +1813,12 @@ extern "C" void startLogFile(bool append_log) {
_out_buf.open(_log_file.c_str(), ios::app);
else
_out_buf.open(_log_file.c_str());
+ _err_buf.init(_out_buf.get_fout_buf());
}
extern "C" void endLogFile() {
_out_buf.close();
+
}
void funcExit(void) {
@@ -1783,7 +1828,7 @@ void funcExit(void) {
while (getchar() != '\n');
}
- endLogFile();
+ endLogFile();
}
extern "C" void funcAbort(int signal_number)
@@ -1797,16 +1842,16 @@ extern "C" void funcAbort(int signal_number)
print_stacktrace(cerr);
#endif
- cout << endl << "*** IQ-TREE CRASHES WITH SIGNAL ";
+ cerr << endl << "*** IQ-TREE CRASHES WITH SIGNAL ";
switch (signal_number) {
- case SIGABRT: cout << "ABORTED"; break;
- case SIGFPE: cout << "ERRONEOUS NUMERIC"; break;
- case SIGILL: cout << "ILLEGAL INSTRUCTION"; break;
- case SIGSEGV: cout << "SEGMENTATION FAULT"; break;
- }
- cout << endl;
- cout << "*** For bug report please send to developers:" << endl << "*** Log file: " << _log_file;
- cout << endl << "*** Alignment files (if possible)" << endl;
+ case SIGABRT: cerr << "ABORTED"; break;
+ case SIGFPE: cerr << "ERRONEOUS NUMERIC"; break;
+ case SIGILL: cerr << "ILLEGAL INSTRUCTION"; break;
+ case SIGSEGV: cerr << "SEGMENTATION FAULT"; break;
+ }
+ cerr << endl;
+ cerr << "*** For bug report please send to developers:" << endl << "*** Log file: " << _log_file;
+ cerr << endl << "*** Alignment files (if possible)" << endl;
funcExit();
signal(signal_number, SIG_DFL);
}
@@ -2210,10 +2255,9 @@ int main(int argc, char *argv[])
}
}
-
- _log_file = Params::getInstance().out_prefix;
- _log_file += ".log";
- startLogFile(append_log);
+ _log_file = Params::getInstance().out_prefix;
+ _log_file += ".log";
+ startLogFile(append_log);
if (append_log) {
cout << endl << "******************************************************"
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
index 1558228..2eaabe6 100644
--- a/phyloanalysis.cpp
+++ b/phyloanalysis.cpp
@@ -58,17 +58,20 @@ void reportReferences(Params ¶ms, ofstream &out, string &original_model) {
out << "To cite IQ-TREE please use:" << endl << endl
<< "Lam-Tung Nguyen, Heiko A. Schmidt, Arndt von Haeseler, and Bui Quang Minh (2015)" << endl
<< "IQ-TREE: A fast and effective stochastic algorithm for estimating" << endl
- << "maximum likelihood phylogenies. Mol. Biol. Evol., 32:268-274." << endl << endl;
+ << "maximum likelihood phylogenies. Mol. Biol. Evol., 32:268-274." << endl
+ << "http://dx.doi.org/10.1093/molbev/msu300" << endl << endl;
if (params.gbo_replicates)
out << "Since you used ultrafast bootstrap (UFBoot) please also cite: " << endl << endl
<< "Bui Quang Minh, Minh Anh Thi Nguyen, and Arndt von Haeseler (2013) Ultrafast" << endl
- << "approximation for phylogenetic bootstrap. Mol. Biol. Evol., 30:1188-1195." << endl << endl;
+ << "approximation for phylogenetic bootstrap. Mol. Biol. Evol., 30:1188-1195." << endl
+ << "http://dx.doi.org/10.1093/molbev/mst024" << endl << endl;
if (params.partition_file)
out << "Since you used partition models please also cite:" << endl << endl
<< "Olga Chernomor, Arndt von Haeseler, and Bui Quang Minh (2016) Terrace aware data" << endl
- << "structure for phylogenomic inference from supermatrices. Syst. Biol., in press." << endl << endl;
+ << "structure for phylogenomic inference from supermatrices. Syst. Biol., in press." << endl
+ << "http://dx.doi.org/10.1093/sysbio/syw037" << endl << endl;
}
@@ -80,10 +83,14 @@ void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) {
((alignment.seq_type == SEQ_DNA) ? "nucleotide" :
(alignment.seq_type == SEQ_PROTEIN) ? "amino-acid" :
(alignment.seq_type == SEQ_CODON) ? "codon": "morphological"))
- << " sites" << endl << "Number of constant sites: "
+ << " sites"
+ << endl << "Number of constant sites: "
<< round(alignment.frac_const_sites * alignment.getNSite())
<< " (= " << alignment.frac_const_sites * 100 << "% of all sites)"
- << endl << "Number of site patterns: " << alignment.size() << endl
+ << endl << "Number of invariant (constant or ambiguous constant) sites: "
+ << round(alignment.frac_invariant_sites * alignment.getNSite())
+ << " (= " << alignment.frac_invariant_sites * 100 << "% of all sites)"
+ << endl << "Number of distinct site patterns: " << alignment.size() << endl
<< endl;
}
@@ -170,14 +177,15 @@ void reportModelSelection(ofstream &out, Params ¶ms, vector<ModelInfo> &mode
void reportModel(ofstream &out, Alignment *aln, ModelSubst *m) {
int i, j, k;
assert(aln->num_states == m->num_states);
+ double *rate_mat = new double[m->num_states * m->num_states];
+ if (!m->isSiteSpecificModel())
+ m->getRateMatrix(rate_mat);
+ else
+ ((ModelSet*)m)->front()->getRateMatrix(rate_mat);
+
if (m->num_states <= 4) {
out << "Rate parameter R:" << endl << endl;
- double *rate_mat = new double[m->num_states * m->num_states];
- if (!m->isSiteSpecificModel())
- m->getRateMatrix(rate_mat);
- else
- ((ModelSet*)m)->front()->getRateMatrix(rate_mat);
if (m->num_states > 4)
out << fixed;
if (m->isReversible()) {
@@ -209,8 +217,29 @@ void reportModel(ofstream &out, Alignment *aln, ModelSubst *m) {
//if (tree.aln->num_states > 4)
out << endl;
out.unsetf(ios_base::fixed);
- delete[] rate_mat;
- }
+ } else if (aln->seq_type == SEQ_PROTEIN && m->getNDim() > 20) {
+ assert(m->num_states == 20);
+ out << "WARNING: This model has " << m->getNDim() + m->getNDimFreq() << " parameters that may be overfitting. Please use with caution!" << endl << endl;
+ double full_mat[400];
+ for (i = 0, k = 0; i < m->num_states - 1; i++)
+ for (j = i + 1; j < m->num_states; j++, k++) {
+ full_mat[i*m->num_states+j] = rate_mat[k];
+ }
+ out << "Substitution parameters (lower-diagonal) and state frequencies in PAML format (can be used as input for IQ-TREE): " << endl << endl;
+ for (i = 1; i < m->num_states; i++) {
+ for (j = 0; j < i; j++)
+ out << "\t" << full_mat[j*m->num_states+i];
+ out << endl;
+ }
+ double state_freq[20];
+ m->getStateFrequency(state_freq);
+ for (i = 0; i < m->num_states; i++)
+ out << "\t" << state_freq[i];
+ out << endl << endl;
+ }
+
+ delete[] rate_mat;
+
out << "State frequencies: ";
if (m->isSiteSpecificModel())
out << "(site specific frequencies)" << endl << endl;
@@ -455,8 +484,8 @@ void reportTree(ofstream &out, Params ¶ms, PhyloTree &tree, double tree_lh,
}
}
if (is_codon)
- out << endl << "NOTE: Branch lengths are intepreted as number of nucleotide substitutions per codon site!"
- << endl << " Rescale them by 1/3 if you want to have #nt substitutions per nt site" << endl;
+ out << endl << "NOTE: Branch lengths are interpreted as number of nucleotide substitutions per codon site!"
+ << endl << " Rescale them by 1/3 if you want to have #nt substitutions per nt site" << endl;
if (main_tree)
if (params.aLRT_replicates > 0 || params.gbo_replicates || (params.num_bootstrap_samples && params.compute_ml_tree)) {
out << "Numbers in parentheses are ";
@@ -534,8 +563,109 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree);
void computeLoglFromUserInputGAMMAInvar(Params ¶ms, IQTree &iqtree);
+void printOutfilesInfo(Params ¶ms, string &original_model, IQTree &tree) {
+
+ cout << endl << "Analysis results written to: " << endl;
+ if (!(params.suppress_output_flags & OUT_IQTREE))
+ cout<< " IQ-TREE report: " << params.out_prefix << ".iqtree"
+ << endl;
+ if (params.compute_ml_tree) {
+ if (!(params.suppress_output_flags & OUT_TREEFILE)) {
+ if (original_model.find("ONLY") == string::npos)
+ cout << " Maximum-likelihood tree: " << params.out_prefix << ".treefile" << endl;
+ else
+ cout << " Tree used for model selection: " << params.out_prefix << ".treefile" << endl;
+ }
+ if (params.snni && params.write_local_optimal_trees) {
+ cout << " Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "): " << params.out_prefix << ".suboptimal_trees" << endl;
+ }
+ }
+ if (!params.user_file && params.start_tree == STT_BIONJ) {
+ cout << " BIONJ tree: " << params.out_prefix << ".bionj"
+ << endl;
+ }
+ if (!params.dist_file) {
+ //cout << " Juke-Cantor distances: " << params.out_prefix << ".jcdist" << endl;
+ if (params.compute_ml_dist)
+ cout << " Likelihood distances: " << params.out_prefix
+ << ".mldist" << endl;
+ if (params.print_conaln)
+ cout << " Concatenated alignment: " << params.out_prefix
+ << ".conaln" << endl;
+ }
+ if (original_model.find("TEST") != string::npos && tree.isSuperTree()) {
+ cout << " Best partitioning scheme: " << params.out_prefix << ".best_scheme.nex" << endl;
+ bool raxml_format_printed = true;
+
+ for (vector<PartitionInfo>::iterator it = ((PhyloSuperTree*)&tree)->part_info.begin();
+ it != ((PhyloSuperTree*)&tree)->part_info.end(); it++)
+ if (!it->aln_file.empty()) {
+ raxml_format_printed = false;
+ break;
+ }
+ if (raxml_format_printed)
+ cout << " in RAxML format: " << params.out_prefix << ".best_scheme" << endl;
+ }
+ if (tree.getRate()->getGammaShape() > 0 && params.print_site_rate)
+ cout << " Gamma-distributed rates: " << params.out_prefix << ".rate"
+ << endl;
+
+ if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate)
+ cout << " Site-rates by MH model: " << params.out_prefix << ".rate"
+ << endl;
+
+ if (params.print_site_lh)
+ cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh"
+ << endl;
+
+ if (params.print_site_prob)
+ cout << " Site probability per rate/mix: " << params.out_prefix << ".siteprob"
+ << endl;
+
+ if (params.write_intermediate_trees)
+ cout << " All intermediate trees: " << params.out_prefix << ".treels"
+ << endl;
+
+ if (params.gbo_replicates) {
+ cout << endl << "Ultrafast bootstrap approximation results written to:" << endl
+ << " Split support values: " << params.out_prefix << ".splits.nex" << endl
+ << " Consensus tree: " << params.out_prefix << ".contree" << endl;
+ if (params.print_ufboot_trees)
+ cout << " UFBoot trees: " << params.out_prefix << ".ufboot" << endl;
+
+ }
+
+ if (params.treeset_file) {
+ cout << " Evaluated user trees: " << params.out_prefix << ".trees" << endl;
+
+ if (params.print_tree_lh) {
+ cout << " Tree log-likelihoods: " << params.out_prefix << ".treelh" << endl;
+ }
+ if (params.print_site_lh) {
+ cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh" << endl;
+ }
+ }
+ if (params.lmap_num_quartets >= 0) {
+ cout << " Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
+ cout << " Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
+ }
+ if (!(params.suppress_output_flags & OUT_LOG))
+ cout << " Screen log file: " << params.out_prefix << ".log" << endl;
+ /* if (original_model == "WHTEST")
+ cout <<" WH-TEST report: " << params.out_prefix << ".whtest" << endl;*/
+ cout << endl;
+
+}
+
+
void reportPhyloAnalysis(Params ¶ms, string &original_model,
IQTree &tree, vector<ModelInfo> &model_info) {
+
+ if (params.suppress_output_flags & OUT_IQTREE) {
+ printOutfilesInfo(params, original_model, tree);
+ return;
+ }
+
if (params.count_trees) {
// addon: print #distinct trees
cout << endl << "NOTE: " << pllTreeCounter.size() << " distinct trees evaluated during whole tree search" << endl;
@@ -548,7 +678,7 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model,
}
for (IntVector::iterator i2 = counts.begin(); i2 != counts.end(); i2++) {
if (*i2 != 0) {
- cout << "#Trees occuring " << (i2-counts.begin()) << " times: " << *i2 << endl;
+ cout << "#Trees occurring " << (i2-counts.begin()) << " times: " << *i2 << endl;
}
}
}
@@ -1012,89 +1142,8 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model,
} catch (ios::failure) {
outError(ERR_WRITE_OUTPUT, outfile);
}
-
- cout << endl << "Analysis results written to: " << endl
- << " IQ-TREE report: " << params.out_prefix << ".iqtree"
- << endl;
- if (params.compute_ml_tree) {
- if (original_model.find("ONLY") == string::npos)
- cout << " Maximum-likelihood tree: " << params.out_prefix << ".treefile" << endl;
- else
- cout << " Tree used for model selection: " << params.out_prefix << ".treefile" << endl;
- if (params.snni && params.write_local_optimal_trees) {
- cout << " Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "): " << params.out_prefix << ".suboptimal_trees" << endl;
- }
- }
- if (!params.user_file && params.start_tree == STT_BIONJ) {
- cout << " BIONJ tree: " << params.out_prefix << ".bionj"
- << endl;
- }
- if (!params.dist_file) {
- //cout << " Juke-Cantor distances: " << params.out_prefix << ".jcdist" << endl;
- if (params.compute_ml_dist)
- cout << " Likelihood distances: " << params.out_prefix
- << ".mldist" << endl;
- if (params.print_conaln)
- cout << " Concatenated alignment: " << params.out_prefix
- << ".conaln" << endl;
- }
- if (original_model.find("TEST") != string::npos && tree.isSuperTree()) {
- cout << " Best partitioning scheme: " << params.out_prefix << ".best_scheme.nex" << endl;
- bool raxml_format_printed = true;
-
- for (vector<PartitionInfo>::iterator it = ((PhyloSuperTree*)&tree)->part_info.begin();
- it != ((PhyloSuperTree*)&tree)->part_info.end(); it++)
- if (!it->aln_file.empty()) {
- raxml_format_printed = false;
- break;
- }
- if (raxml_format_printed)
- cout << " in RAxML format: " << params.out_prefix << ".best_scheme" << endl;
- }
- if (tree.getRate()->getGammaShape() > 0 && params.print_site_rate)
- cout << " Gamma-distributed rates: " << params.out_prefix << ".rate"
- << endl;
-
- if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate)
- cout << " Site-rates by MH model: " << params.out_prefix << ".rate"
- << endl;
-
- if (params.print_site_lh)
- cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh"
- << endl;
-
- if (params.write_intermediate_trees)
- cout << " All intermediate trees: " << params.out_prefix << ".treels"
- << endl;
-
- if (params.gbo_replicates) {
- cout << endl << "Ultrafast bootstrap approximation results written to:" << endl
- << " Split support values: " << params.out_prefix << ".splits.nex" << endl
- << " Consensus tree: " << params.out_prefix << ".contree" << endl;
- if (params.print_ufboot_trees)
- cout << " UFBoot trees: " << params.out_prefix << ".ufboot" << endl;
-
- }
-
- if (params.treeset_file) {
- cout << " Evaluated user trees: " << params.out_prefix << ".trees" << endl;
-
- if (params.print_tree_lh) {
- cout << " Tree log-likelihoods: " << params.out_prefix << ".treelh" << endl;
- }
- if (params.print_site_lh) {
- cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh" << endl;
- }
- }
- if (params.lmap_num_quartets >= 0) {
- cout << " Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
- cout << " Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
- }
- cout << " Screen log file: " << params.out_prefix << ".log" << endl;
- /* if (original_model == "WHTEST")
- cout <<" WH-TEST report: " << params.out_prefix << ".whtest" << endl;*/
- cout << endl;
-
+
+ printOutfilesInfo(params, original_model, tree);
}
void checkZeroDist(Alignment *aln, double *dist) {
@@ -1462,14 +1511,18 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) {
else
printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh);
}
+
+ if (params.print_site_prob && !params.pll) {
+ printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob);
+ }
- if (params.print_site_state_freq) {
+ if (params.print_site_state_freq != WSF_NONE) {
string site_freq_file = params.out_prefix;
site_freq_file += ".sitesf";
printSiteStateFreq(site_freq_file.c_str(), &iqtree);
}
- if (params.print_site_posterior) {
+ if (params.print_trees_site_posterior) {
cout << "Computing mixture posterior probabilities" << endl;
IntVector pattern_cat;
int num_mix = iqtree.computePatternCategories(&pattern_cat);
@@ -1569,6 +1622,12 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) {
}
}
+ if (params.fixed_branch_length == BRLEN_SCALE) {
+ string filename = (string)params.out_prefix + ".blscale";
+ iqtree.printTreeLengthScaling(filename.c_str());
+ cout << "Scaled tree length and model parameters printed to " << filename << endl;
+ }
+
}
void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time, double search_real_time) {
@@ -1733,8 +1792,6 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre
iqtree.initializeAllPartialLh();
double initEpsilon = params.min_iterations == 0 ? params.modeps : (params.modeps*10);
- if (params.opt_gammai)
- initEpsilon = 0.1;
string initTree;
@@ -2232,6 +2289,8 @@ void runStandardBootstrap(Params ¶ms, string &original_model, Alignment *ali
params.aLRT_test = false;
params.aBayes_test = false;
+ if (params.suppress_output_flags & OUT_TREEFILE)
+ outError("Suppress .treefile not allowed for standard bootstrap");
string treefile_name = params.out_prefix;
treefile_name += ".treefile";
string boottrees_name = params.out_prefix;
@@ -2457,6 +2516,63 @@ void convertAlignment(Params ¶ms, IQTree *iqtree) {
params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name);
}
+/**
+ 2016-08-04: compute a site frequency model for profile mixture model
+*/
+void computeSiteFrequencyModel(Params ¶ms, Alignment *alignment) {
+
+ cout << endl << "===> COMPUTING SITE FREQUENCY MODEL BASED ON TREE FILE " << params.tree_freq_file << endl;
+ assert(params.tree_freq_file);
+ PhyloTree *tree = new PhyloTree(alignment);
+ tree->setParams(¶ms);
+ bool myrooted = params.is_rooted;
+ tree->readTree(params.tree_freq_file, myrooted);
+ tree->setAlignment(alignment);
+ tree->setRootNode(params.root);
+
+ ModelsBlock *models_block = readModelsDefinition(params);
+ tree->setModelFactory(new ModelFactory(params, tree, models_block));
+ delete models_block;
+ tree->setModel(tree->getModelFactory()->model);
+ tree->setRate(tree->getModelFactory()->site_rate);
+ tree->setLikelihoodKernel(params.SSE);
+
+ if (!tree->getModel()->isMixture())
+ outError("No mixture model was specified!");
+ uint64_t mem_size = tree->getMemoryRequired();
+ uint64_t total_mem = getMemorySize();
+ cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl;
+ if (mem_size >= total_mem) {
+ outError("Memory required exceeds your computer RAM size!");
+ }
+#ifdef BINARY32
+ if (mem_size >= 2000000000) {
+ outError("Memory required exceeds 2GB limit of 32-bit executable");
+ }
+#endif
+
+ tree->initializeAllPartialLh();
+ tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modeps);
+
+ size_t nptn = alignment->getNPattern(), nstates = alignment->num_states;
+ double *ptn_state_freq = new double[nptn*nstates];
+ tree->computePatternStateFreq(ptn_state_freq);
+ alignment->site_state_freq.resize(nptn);
+ for (size_t ptn = 0; ptn < nptn; ptn++) {
+ double *f = new double[nstates];
+ memcpy(f, ptn_state_freq+ptn*nstates, sizeof(double)*nstates);
+ alignment->site_state_freq[ptn] = f;
+ }
+ alignment->getSitePatternIndex(alignment->site_model);
+ printSiteStateFreq(((string)params.out_prefix+".sitefreq").c_str(), tree, ptn_state_freq);
+ params.print_site_state_freq = WSF_NONE;
+
+ delete [] ptn_state_freq;
+ delete tree;
+
+ cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE FREQUENCY MODEL" << endl;
+}
+
/**********************************************************
* TOP-LEVEL FUNCTION
@@ -2492,6 +2608,22 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint) {
alignment->addConstPatterns(params.freq_const_patterns);
cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl;
}
+
+ if (params.tree_freq_file) {
+ if (checkpoint->getBool("finishedSiteFreqFile")) {
+ alignment->readSiteStateFreq(((string)params.out_prefix + ".sitefreq").c_str());
+ params.print_site_state_freq = WSF_NONE;
+ cout << "CHECKPOINT: Site frequency model restored" << endl;
+ } else {
+ computeSiteFrequencyModel(params, alignment);
+ checkpoint->putBool("finishedSiteFreqFile", true);
+ checkpoint->dump();
+ }
+ }
+ if (params.site_freq_file) {
+ alignment->readSiteStateFreq(params.site_freq_file);
+ }
+
tree = new IQTree(alignment);
}
diff --git a/phylokernel.h b/phylokernel.h
index 7521c72..e0b9f7c 100644
--- a/phylokernel.h
+++ b/phylokernel.h
@@ -15,6 +15,10 @@
inline Vec2d horizontal_add(Vec2d x[2]) {
#if INSTRSET >= 3 // SSE3
return _mm_hadd_pd(x[0],x[1]);
+#elif INSTRSET >= 2
+ Vec2d help0 = _mm_shuffle_pd(x[0], x[1], _MM_SHUFFLE2(0,0));
+ Vec2d help1 = _mm_shuffle_pd(x[0], x[1], _MM_SHUFFLE2(1,1));
+ return _mm_add_pd(help0, help1);
#else
#error "You must compile with SSE3 enabled!"
#endif
diff --git a/phylokernelsitemodel.cpp b/phylokernelsitemodel.cpp
index a79c178..d681d6f 100644
--- a/phylokernelsitemodel.cpp
+++ b/phylokernelsitemodel.cpp
@@ -50,7 +50,7 @@ void PhyloTree::computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch
PhyloNeighbor *nei = (PhyloNeighbor*)*it;
if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
if ((nei->partial_lh_computed & 1) == 0)
- computeSitemodelPartialLikelihoodEigen(nei, node);
+ computePartialLikelihood(nei, node);
dad_branch->lh_scale_factor += nei->lh_scale_factor;
}
@@ -81,114 +81,128 @@ void PhyloTree::computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch
right = tmp;
}
- assert(node->degree() == 3); // does not work with multifurcating tree yet
+ if (node->degree() > 3) {
+
+ /*--------------------- multifurcating node ------------------*/
-// if (node->degree() > 3) {
-//
-// /*--------------------- multifurcating node ------------------*/
-//
-// // now for-loop computing partial_lh over all site-patterns
-//#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
-//#endif
-// for (ptn = 0; ptn < nptn; ptn++) {
-// double partial_lh_all[block];
-// for (i = 0; i < block; i++)
-// partial_lh_all[i] = 1.0;
-// dad_branch->scale_num[ptn] = 0;
-//
-// double *partial_lh_leaf = partial_lh_leaves;
-// double *echild = echildren;
-//
-// FOR_NEIGHBOR_IT(node, dad, it) {
-// PhyloNeighbor *child = (PhyloNeighbor*)*it;
-// if (child->node->isLeaf()) {
-// // external node
-// int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-// double *child_lh = partial_lh_leaf + state_child*block;
-// for (c = 0; c < block; c++) {
-// // compute real partial likelihood vector
-// partial_lh_all[c] *= child_lh[c];
-// }
-// partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
-// } else {
-// // internal node
-// double *partial_lh = partial_lh_all;
-// double *partial_lh_child = child->partial_lh + ptn*block;
-// dad_branch->scale_num[ptn] += child->scale_num[ptn];
-//
-// double *echild_ptr = echild;
-// for (c = 0; c < ncat; c++) {
-// // compute real partial likelihood vector
-// for (x = 0; x < nstates; x++) {
-// double vchild = 0.0;
-//// double *echild_ptr = echild + (c*nstatesqr+x*nstates);
-// for (i = 0; i < nstates; i++) {
-// vchild += echild_ptr[i] * partial_lh_child[i];
-// }
-// echild_ptr += nstates;
-// partial_lh[x] *= vchild;
-// }
-// partial_lh += nstates;
-// partial_lh_child += nstates;
-// }
-// } // if
-// echild += block*nstates;
-// } // FOR_NEIGHBOR
-//
-//
-// // compute dot-product with inv_eigenvector
-// double lh_max = 0.0;
-// double *partial_lh_tmp = partial_lh_all;
-// double *partial_lh = dad_branch->partial_lh + ptn*block;
-// for (c = 0; c < ncat; c++) {
-// double *inv_evec_ptr = inv_evec;
-// for (i = 0; i < nstates; i++) {
-// double res = 0.0;
-// for (x = 0; x < nstates; x++) {
-// res += partial_lh_tmp[x]*inv_evec_ptr[x];
-// }
-// inv_evec_ptr += nstates;
-// partial_lh[i] = res;
-// lh_max = max(lh_max, fabs(res));
-// }
-// partial_lh += nstates;
-// partial_lh_tmp += nstates;
-// }
-// // check if one should scale partial likelihoods
-// if (lh_max < SCALING_THRESHOLD) {
-// partial_lh = dad_branch->partial_lh + ptn*block;
-// if (lh_max == 0.0) {
-// // for very shitty data
-// for (c = 0; c < ncat; c++)
-// memcpy(&partial_lh[c*nstates], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
-// sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
-// //sum_scale += log(lh_max) * ptn_freq[ptn];
-// dad_branch->scale_num[ptn] += 4;
-// int nsite = aln->getNSite();
-// for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
-// if (aln->getPatternID(i) == ptn) {
-// outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
-// x++;
-// }
-// } else {
-// // now do the likelihood scaling
-// for (i = 0; i < block; i++) {
-// partial_lh[i] *= SCALING_THRESHOLD_INVER;
-// //partial_lh[i] /= lh_max;
-// }
-// // unobserved const pattern will never have underflow
-// sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-// //sum_scale += log(lh_max) * ptn_freq[ptn];
-// dad_branch->scale_num[ptn] += 1;
-// }
-// }
-//
-// } // for ptn
-// dad_branch->lh_scale_factor += sum_scale;
-//
-// // end multifurcating treatment
-// } else
+ // now for-loop computing partial_lh over all site-patterns
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_all[block];
+ for (i = 0; i < block; i++)
+ partial_lh_all[i] = 1.0;
+ dad_branch->scale_num[ptn] = 0;
+
+ double expchild[nstates];
+ double *eval = models->at(ptn)->getEigenvalues();
+ double *evec = models->at(ptn)->getEigenvectors();
+ double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ if (child->node->isLeaf()) {
+ // external node
+ double *tip_partial_lh_child = tip_partial_lh + (child->node->id*tip_block_size)+ptn*nstates;
+ double *partial_lh = partial_lh_all;
+ for (c = 0; c < ncat; c++) {
+ double len_child = site_rate->getRate(c) * child->length;
+ for (i = 0; i < nstates; i++) {
+ expchild[i] = exp(eval[i]*len_child) * tip_partial_lh_child[i];
+ }
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+ double *this_evec = evec + x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vchild += this_evec[i] * expchild[i];
+ }
+ partial_lh[x] *= vchild;
+ }
+ partial_lh += nstates;
+ }
+ } else {
+ // internal node
+ dad_branch->scale_num[ptn] += child->scale_num[ptn];
+
+ double *partial_lh_child = child->partial_lh + ptn*block;
+ double *partial_lh = partial_lh_all;
+ for (c = 0; c < ncat; c++) {
+ double len_child = site_rate->getRate(c) * child->length;
+ for (i = 0; i < nstates; i++) {
+ expchild[i] = exp(eval[i]*len_child) * partial_lh_child[i];
+ }
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+ double *this_evec = evec + x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vchild += this_evec[i] * expchild[i];
+ }
+ partial_lh[x] *= vchild;
+ }
+ partial_lh_child += nstates;
+ partial_lh += nstates;
+ }
+ } // if
+
+
+ } // FOR_NEIGHBOR
+
+
+ // compute dot-product with inv_eigenvector
+ double lh_max = 0.0;
+ double *partial_lh_tmp = partial_lh_all;
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ for (c = 0; c < ncat; c++) {
+ double *inv_evec_ptr = inv_evec;
+ for (i = 0; i < nstates; i++) {
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[i] = res;
+ lh_max = max(lh_max, fabs(res));
+ }
+ partial_lh += nstates;
+ partial_lh_tmp += nstates;
+ }
+ // check if one should scale partial likelihoods
+ if (lh_max < SCALING_THRESHOLD) {
+ partial_lh = dad_branch->partial_lh + ptn*block;
+ if (lh_max == 0.0) {
+ // for very shitty data
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
+
+ } // for ptn
+ dad_branch->lh_scale_factor += sum_scale;
+
+ // end multifurcating treatment
+ } else
if (left->node->isLeaf() && right->node->isLeaf()) {
/*--------------------- TIP-TIP (cherry) case ------------------*/
diff --git a/phylokernelsitemodel.h b/phylokernelsitemodel.h
index 2bf4460..929e3aa 100644
--- a/phylokernelsitemodel.h
+++ b/phylokernelsitemodel.h
@@ -18,6 +18,12 @@ inline double horizontal_add(double x) {
template <class VectorClass, const int VCSIZE, const int nstates>
void PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ if (dad_branch->node->degree() > 3) {
+ // TODO: SIMD version for multifurcating node
+ computeSitemodelPartialLikelihoodEigen(dad_branch, dad);
+ return;
+ }
+
// don't recompute the likelihood
assert(dad);
if (dad_branch->partial_lh_computed & 1)
diff --git a/phylosupertree.cpp b/phylosupertree.cpp
index 6ed385b..9dbf3e1 100644
--- a/phylosupertree.cpp
+++ b/phylosupertree.cpp
@@ -978,8 +978,17 @@ double PhyloSuperTree::computeLikelihood(double *pattern_lh) {
return tree_lh;
}
+int PhyloSuperTree::getNumLhCat(SiteLoglType wsl) {
+ int ncat = 0, it_cat;
+ for (iterator it = begin(); it != end(); it++)
+ if ((it_cat = (*it)->getNumLhCat(wsl)) > ncat)
+ ncat = it_cat;
+ return ncat;
+}
+
+
void PhyloSuperTree::computePatternLikelihood(double *pattern_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) {
- int offset = 0, offset_lh_cat = 0;
+ size_t offset = 0, offset_lh_cat = 0;
iterator it;
for (it = begin(); it != end(); it++) {
if (ptn_lh_cat)
@@ -987,10 +996,7 @@ void PhyloSuperTree::computePatternLikelihood(double *pattern_lh, double *cur_lo
else
(*it)->computePatternLikelihood(pattern_lh + offset);
offset += (*it)->aln->getNPattern();
- if ((*it)->getModel()->isMixture() && !(*it)->getModelFactory()->fused_mix_rate)
- offset_lh_cat += (*it)->aln->getNPattern() * (*it)->site_rate->getNDiscreteRate() * (*it)->model->getNMixtures();
- else
- offset_lh_cat += (*it)->aln->getNPattern() * (*it)->site_rate->getNDiscreteRate();
+ offset_lh_cat += (*it)->aln->getNPattern() * (*it)->getNumLhCat(wsl);
}
if (cur_logl) { // sanity check
double sum_logl = 0;
@@ -1009,6 +1015,14 @@ void PhyloSuperTree::computePatternLikelihood(double *pattern_lh, double *cur_lo
}
}
+void PhyloSuperTree::computePatternProbabilityCategory(double *ptn_prob_cat, SiteLoglType wsl) {
+ size_t offset = 0;
+ for (iterator it = begin(); it != end(); it++) {
+ (*it)->computePatternProbabilityCategory(ptn_prob_cat + offset, wsl);
+ offset += (*it)->aln->getNPattern() * (*it)->getNumLhCat(wsl);
+ }
+}
+
double PhyloSuperTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
double tree_lh = 0.0;
int ntrees = size();
diff --git a/phylosupertree.h b/phylosupertree.h
index 6dc299a..e850977 100644
--- a/phylosupertree.h
+++ b/phylosupertree.h
@@ -213,6 +213,11 @@ public:
virtual double computeLikelihood(double *pattern_lh = NULL);
/**
+ * @return number of elements per site lhl entry, used in conjunction with computePatternLhCat
+ */
+ virtual int getNumLhCat(SiteLoglType wsl);
+
+ /**
compute pattern likelihoods only if the accumulated scaling factor is non-zero.
Otherwise, copy the pattern_lh attribute
@param pattern_lh (OUT) pattern log-likelihoods,
@@ -224,6 +229,13 @@ public:
double *pattern_lh_cat = NULL, SiteLoglType wsl = WSL_RATECAT);
/**
+ compute pattern posterior probabilities per rate/mixture category
+ @param pattern_prob_cat (OUT) all pattern-probabilities per category
+ @param wsl either WSL_RATECAT, WSL_MIXTURE or WSL_MIXTURE_RATECAT
+ */
+ virtual void computePatternProbabilityCategory(double *pattern_prob_cat, SiteLoglType wsl);
+
+ /**
optimize all branch lengths of all subtrees, then compute branch lengths
of supertree as weighted average over all subtrees
@param iterations number of iterations to loop through all branches
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
index c08c6e6..929be9a 100644
--- a/phylosupertreeplen.cpp
+++ b/phylosupertreeplen.cpp
@@ -104,7 +104,7 @@ void PartitionModelPlen::restoreCheckpoint() {
}
-double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+double PartitionModelPlen::optimizeParameters(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
double tree_lh = 0.0, cur_lh = 0.0;
int ntrees = tree->size();
@@ -115,11 +115,12 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
for(int part = 0; part < ntrees; part++){
tree->part_info[part].cur_score = 0.0;
}
- if (fixed_len) {
- tree_lh = tree->computeLikelihood();
- } else {
+
+ if (fixed_len == BRLEN_OPTIMIZE) {
tree_lh = tree->optimizeAllBranches(1);
- }
+ } else {
+ tree_lh = tree->computeLikelihood();
+ }
cout<<"Initial log-likelihood: "<<tree_lh<<endl;
double begin_time = getRealTime();
@@ -176,11 +177,16 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
// Optimizing branch lengths
int my_iter = min(5,i+1);
- if(!fixed_len){
+ if (fixed_len == BRLEN_OPTIMIZE){
double new_lh = tree->optimizeAllBranches(my_iter, logl_epsilon);
assert(new_lh > cur_lh - 1.0);
cur_lh = new_lh;
- }
+ } else if (fixed_len == BRLEN_SCALE) {
+ double scaling = 1.0;
+ double new_lh = tree->optimizeTreeLengthScaling(MIN_BRLEN_SCALE, scaling, MAX_BRLEN_SCALE, gradient_epsilon);
+ assert(new_lh > cur_lh - 1.0);
+ cur_lh = new_lh;
+ }
cout<<"Current log-likelihood at step "<<i<<": "<<cur_lh<<endl;
if(fabs(cur_lh-tree_lh) < logl_epsilon) {
tree_lh = cur_lh;
@@ -198,8 +204,8 @@ double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, d
return tree_lh;
}
-double PartitionModelPlen::optimizeParametersGammaInvar(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
- outError("This option does not work with edge-linked partition model yet");
+double PartitionModelPlen::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+ outError("Thorough I+G parameter optimization does not work with edge-linked partition model yet");
return 0.0;
}
diff --git a/phylosupertreeplen.h b/phylosupertreeplen.h
index baa2979..9c844d3 100644
--- a/phylosupertreeplen.h
+++ b/phylosupertreeplen.h
@@ -124,10 +124,14 @@ public:
/**
optimize model parameters and tree branch lengths
- @param fixed_len TRUE to fix branch lengths, default is false
- @return the best likelihood
+ NOTE 2016-08-20: refactor the semantic of fixed_len
+ @param fixed_len 0: optimize branch lengths, 1: fix branch lengths, 2: scale branch lengths
+ @param write_info TRUE to write model parameters every optimization step, FALSE to only print at the end
+ @param logl_epsilon log-likelihood epsilon to stop
+ @param gradient_epsilon gradient (derivative) epsilon to stop
+ @return the best likelihood
*/
- virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
+ virtual double optimizeParameters(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true,
double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
@@ -137,7 +141,7 @@ public:
* @param fixed_len TRUE to fix branch lengths, default is false
* @return the best likelihood
*/
- virtual double optimizeParametersGammaInvar(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+ virtual double optimizeParametersGammaInvar(int fixed_len = BRLEN_OPTIMIZE, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
double optimizeGeneRate(double tol);
diff --git a/phylotesting.cpp b/phylotesting.cpp
index 5a1b20f..0a318cd 100644
--- a/phylotesting.cpp
+++ b/phylotesting.cpp
@@ -243,6 +243,11 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh,
void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
+ if (tree->isSuperTree()) {
+ cout << "WARNING: -wslm, -wslr do not work with partition models yet" << endl;
+ return;
+ }
+
if (wsl == WSL_NONE || wsl == WSL_SITE)
return;
// error checking
@@ -316,7 +321,7 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
cout << "Log-likelihood of constant sites: " << endl;
double const_prob = 0.0;
for (i = 0; i < tree->aln->getNPattern(); i++)
- if (tree->aln->at(i).is_const) {
+ if (tree->aln->at(i).isConst()) {
Pattern pat = tree->aln->at(i);
for (Pattern::iterator it = pat.begin(); it != pat.end(); it++)
cout << tree->aln->convertStateBackStr(*it);
@@ -334,12 +339,85 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
}
-void printSiteStateFreq(const char*filename, PhyloTree *tree) {
+void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
- int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states;
- double *ptn_state_freq = new double[tree->getAlnNPattern() * nstates];
+ if (wsl == WSL_NONE || wsl == WSL_SITE)
+ return;
+ // error checking
+ if (!tree->getModel()->isMixture()) {
+ if (wsl != WSL_RATECAT) {
+ outWarning("Switch now to '-wspr' as it is the only option for non-mixture model");
+ wsl = WSL_RATECAT;
+ }
+ } else {
+ // mixture model
+ if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) {
+ outWarning("-wspmr is not suitable for fused mixture model, switch now to -wspm");
+ wsl = WSL_MIXTURE;
+ }
+ }
+ size_t cat, ncat = tree->getNumLhCat(wsl);
+ double *ptn_prob_cat = new double[tree->getAlnNPattern()*ncat];
+ tree->computePatternProbabilityCategory(ptn_prob_cat, wsl);
- tree->computePatternStateFreq(ptn_state_freq);
+ try {
+ ofstream out;
+ out.exceptions(ios::failbit | ios::badbit);
+ out.open(filename);
+ if (tree->isSuperTree())
+ out << "Set\t";
+ out << "Site";
+ for (cat = 0; cat < ncat; cat++)
+ out << "\tp" << cat+1;
+ out << endl;
+ IntVector pattern_index;
+ if (tree->isSuperTree()) {
+ PhyloSuperTree *super_tree = (PhyloSuperTree*)tree;
+ size_t offset = 0;
+ for (PhyloSuperTree::iterator it = super_tree->begin(); it != super_tree->end(); it++) {
+ size_t part_ncat = (*it)->getNumLhCat(wsl);
+ (*it)->aln->getSitePatternIndex(pattern_index);
+ size_t site, nsite = (*it)->aln->getNSite();
+ for (site = 0; site < nsite; site++) {
+ out << (it-super_tree->begin())+1 << "\t" << site+1;
+ double *prob_cat = ptn_prob_cat + (offset+pattern_index[site]*part_ncat);
+ for (cat = 0; cat < part_ncat; cat++)
+ out << "\t" << prob_cat[cat];
+ out << endl;
+ }
+ offset += (*it)->aln->getNPattern()*(*it)->getNumLhCat(wsl);
+ }
+ } else {
+ tree->aln->getSitePatternIndex(pattern_index);
+ int nsite = tree->getAlnNSite();
+ for (int site = 0; site < nsite; site++) {
+ out << site+1;
+ double *prob_cat = ptn_prob_cat + pattern_index[site]*ncat;
+ for (cat = 0; cat < ncat; cat++) {
+ out << "\t" << prob_cat[cat];
+ }
+ out << endl;
+ }
+ }
+ out.close();
+ cout << "Site probabilities per category printed to " << filename << endl;
+ } catch (ios::failure) {
+ outError(ERR_WRITE_OUTPUT, filename);
+ }
+
+}
+
+
+void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs) {
+
+ int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states;
+ double *ptn_state_freq;
+ if (state_freqs) {
+ ptn_state_freq = state_freqs;
+ } else {
+ ptn_state_freq = new double[tree->getAlnNPattern() * nstates];
+ tree->computePatternStateFreq(ptn_state_freq);
+ }
try {
ofstream out;
@@ -362,7 +440,8 @@ void printSiteStateFreq(const char*filename, PhyloTree *tree) {
} catch (ios::failure) {
outError(ERR_WRITE_OUTPUT, filename);
}
- delete [] ptn_state_freq;
+ if (!state_freqs)
+ delete [] ptn_state_freq;
}
bool checkModelFile(ifstream &in, bool is_partitioned, vector<ModelInfo> &infos) {
@@ -578,7 +657,7 @@ int getModelList(Params ¶ms, Alignment *aln, StrVector &models, bool separat
// for (i = 0; i < noptions; i++)
// test_options[i] = test_options_codon[i];
// } else
- if (aln->frac_const_sites == 0.0) {
+ if (aln->frac_invariant_sites == 0.0) {
// morphological or SNP data: activate +ASC
if (with_new) {
if (with_asc)
@@ -605,6 +684,12 @@ int getModelList(Params ¶ms, Alignment *aln, StrVector &models, bool separat
test_options = test_options_asc;
} else
test_options = test_options_default;
+ if (aln->frac_const_sites == 0.0) {
+ // deactivate +I
+ for (j = 0; j < noptions; j++)
+ if (strstr(rate_options[j], "+I"))
+ test_options[j] = false;
+ }
}
@@ -1334,7 +1419,7 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
// normal model
if (model_names[model].find("+ASC") != string::npos) {
model_fac->unobserved_ptns = in_tree->aln->getUnobservedConstPatterns();
- if (model_fac->unobserved_ptns.size() < tree->aln->getNumNonstopCodons() || in_tree->aln->frac_const_sites > 0.0) {
+ if (model_fac->unobserved_ptns.size() < tree->aln->getNumNonstopCodons() || in_tree->aln->frac_invariant_sites > 0.0) {
cout.width(3);
cout << right << model+1 << " ";
cout.width(13);
@@ -1343,8 +1428,6 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
continue;
}
tree->aln->buildSeqStates(true);
- if (model_fac->unobserved_ptns.size() < tree->aln->getNumNonstopCodons())
- outError("Invalid use of +ASC because constant patterns are observed in the alignment");
} else {
model_fac->unobserved_ptns = "";
tree->aln->buildSeqStates(false);
@@ -1869,7 +1952,7 @@ the solution is:
@param[out] y y-value
@return least square value
*/
-void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, double &x, double &y) {
+void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, double &x, double &y, double &se) {
int k;
double BC = 0.0, AB = 0.0, AC = 0.0, A2 = 0.0, B2 = 0.0;
double denom;
@@ -1885,6 +1968,9 @@ void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, do
denom = 1.0/(AB*AB - A2*B2);
x = (BC*AB - AC*B2) * denom;
y = (AC*AB - BC*A2) * denom;
+
+ se = -denom*(B2+A2+2*AB);
+ assert(se >= 0.0);
}
/**
@@ -1946,13 +2032,171 @@ public:
double *rr_inv;
};
+/* BEGIN CODE WAS TAKEN FROM CONSEL PROGRAM */
+
+/* binary search for a sorted vector
+ find k s.t. vec[k-1] <= t < vec[k]
+ */
+int cntdist2(double *vec, int bb, double t)
+{
+ int i,i0,i1;
+
+ i0=0; i1=bb-1;
+ if(t < vec[0]) return 0;
+ else if(vec[bb-1] <= t) return bb;
+
+ while(i1-i0>1) {
+ i=(i0+i1)/2;
+ if(vec[i] <= t) i0=i;
+ else i1=i;
+ }
+
+ return i1;
+}
+
+/*
+ smoothing the counting for a sorted vector
+ the piecewise linear function connecting
+ F(v[i]) = 1/(2n) + i/n, for i=0,...,n-1
+ F(1.5v[0]-0.5v[1]) = 0
+ F(1.5v[n-1]-0.5v[n-2]) = 1.
+
+ 1. F(x)=0 for x<=1.5v[0]-0.5v[1]
+
+ 2. F(x)=1/(2n) + (1/n)*(x-v[0])/(v[1]-v[0])
+ for 1.5v[0]-0.5v[1] < x <= v[0]
+
+ 3. F(x)=1/(2n) + i/n + (1/n)*(x-v[i])/(v[i]-v[i+1])
+ for v[i] < x <= v[i+1], i=0,...,
+
+ 4. F(x)=1-(1/2n) + (1/n)*(x-v[n-1])/(v[n-1]-v[n-2])
+ for v[n-1] < x <= 1.5v[n-1]-0.5v[n-2]
+
+ 5. F(x)=1 for x > 1.5v[n-1]-0.5v[n-2]
+ */
+double cntdist3(double *vec, int bb, double t)
+{
+ double p,n;
+ int i;
+ i=cntdist2(vec,bb,t)-1; /* to find vec[i] <= t < vec[i+1] */
+ n=(double)bb;
+ if(i<0) {
+ if(vec[1]>vec[0]) p=0.5+(t-vec[0])/(vec[1]-vec[0]);
+ else p=0.0;
+ } else if(i<bb-1) {
+ if(vec[i+1]>vec[i]) p=0.5+(double)i+(t-vec[i])/(vec[i+1]-vec[i]);
+ else p=0.5+(double)i; /* <- should never happen */
+ } else {
+ if(vec[bb-1]-vec[bb-2]>0) p=n-0.5+(t-vec[bb-1])/(vec[bb-1]-vec[bb-2]);
+ else p=n;
+ }
+ if(p>n) p=n; else if(p<0.0) p=0.0;
+ return p;
+}
+
+double log3(double x)
+{
+ double y,z1,z2,z3,z4,z5;
+ if(fabs(x)>1.0e-3) {
+ y=-log(1.0-x);
+ } else {
+ z1=x; z2=z1*x; z3=z2*x; z4=z3*x; z5=z4*x;
+ y=((((z5/5.0)+z4/4.0)+z3/3.0)+z2/2.0)+z1;
+ }
+ return y;
+}
+
+int mleloopmax=30;
+double mleeps=1e-10;
+int mlecoef(double *cnts, double *rr, double bb, int kk,
+ double *coef0, /* set initinal value (size=2) */
+ double *lrt, double *df, /* LRT statistic */
+ double *se
+ )
+{
+ int i,m,loop;
+ double coef[2], update[2];
+ double d1f, d2f, d11f, d12f, d22f; /* derivatives */
+ double v11, v12, v22; /* inverse of -d??f */
+ double a,e;
+ double s[kk], r[kk],c[kk], b[kk],z[kk],p[kk],d[kk],g[kk],h[kk];
+
+ m=0;
+ for(i=0;i<kk;i++)
+ {
+ r[m]=rr[i]; s[m]=sqrt(rr[i]); c[m]=cnts[i]*bb; b[m]=bb;
+ m++;
+ }
+ if(m<2) return 1;
+
+ coef[0]=coef0[0]; /* signed distance */
+ coef[1]=coef0[1]; /* curvature */
+
+ for(loop=0;loop<mleloopmax;loop++) {
+ d1f=d2f=d11f=d12f=d22f=0.0;
+ for(i=0;i<m;i++) {
+ z[i]=coef[0]*s[i]+coef[1]/s[i];
+ p[i]=gsl_cdf_ugaussian_P(-z[i]);
+ d[i]=gsl_ran_ugaussian_pdf(z[i]);
+ if(p[i]>0.0 && p[i]<1.0) {
+ g[i]=d[i]*( d[i]*(-c[i]+2.0*c[i]*p[i]-b[i]*p[i]*p[i])/
+ (p[i]*p[i]*(1.0-p[i])*(1.0-p[i]))
+ + z[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i])) );
+ h[i]=d[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i]));
+ } else { g[i]=h[i]=0.0; }
+ d1f+= -h[i]*s[i]; d2f+= -h[i]/s[i];
+ d11f+= g[i]*r[i]; d12f+= g[i]; d22f+= g[i]/r[i];
+ }
+
+ a=d11f*d22f-d12f*d12f;
+ if(a==0.0) {
+ return 2;
+ }
+ v11=-d22f/a; v12=d12f/a; v22=-d11f/a;
+
+ /* Newton-Raphson update */
+ update[0]=v11*d1f+v12*d2f; update[1]=v12*d1f+v22*d2f;
+ coef[0]+=update[0]; coef[1]+=update[1];
+
+ /* check convergence */
+ e=-d11f*update[0]*update[0]-2.0*d12f*update[0]*update[1]
+ -d22f*update[1]*update[1];
+
+ if(e<mleeps) break;
+ }
+
+ /* calc log-likelihood */
+ *lrt=0.0; *df=0.0;
+ for(i=0;i<m;i++) {
+ if(p[i]>0.0 && p[i]<1.0) {
+ *df+=1.0;
+ if(c[i]>0.0) a=c[i]*log(c[i]/b[i]/p[i]); else a=0.0;
+ if(c[i]<b[i]) a+=(b[i]-c[i])*(log3(p[i])-log3(c[i]/b[i]));
+ *lrt += a;
+ }
+ }
+ *lrt *= 2.0; *df -= 2.0;
+
+ /* write back the results */
+ coef0[0]=coef[0]; coef0[1]=coef[1];
+ *se = v11 + v22 - 2*v12;
+// vmat[0][0]=v11;vmat[0][1]=vmat[1][0]=v12;vmat[1][1]=v22;
+ if(loop==mleloopmax || *df< -0.01) i=1; else i=0;
+ return i;
+}
+
+/* END CODE WAS TAKEN FROM CONSEL PROGRAM */
+
/**
@param tree_lhs RELL score matrix of size #trees x #replicates
*/
void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<TreeInfo> &info) {
+ if (params.topotest_replicates < 10000)
+ outWarning("Too few replicates for AU test. At least -zb 10000 for reliable results!");
+
/* STEP 1: specify scale factors */
- int nscales = 10;
+ size_t nscales = 10;
double r[] = {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4};
double rr[] = {sqrt(0.5), sqrt(0.6), sqrt(0.7), sqrt(0.8), sqrt(0.9), 1.0,
sqrt(1.1), sqrt(1.2), sqrt(1.3), sqrt(1.4)};
@@ -1960,17 +2204,28 @@ void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<
sqrt(1/1.1), sqrt(1/1.2), sqrt(1/1.3), sqrt(1/1.4)};
/* STEP 2: compute bootstrap proportion */
- int ntrees = info.size();
+ size_t ntrees = info.size();
size_t nboot = params.topotest_replicates;
- double nboot_inv = 1.0 / nboot;
+// double nboot_inv = 1.0 / nboot;
- int nptn = tree->getAlnNPattern();
- int maxnptn = get_safe_upper_limit(nptn);
+ size_t nptn = tree->getAlnNPattern();
+ size_t maxnptn = get_safe_upper_limit(nptn);
- double *bp = new double[ntrees*nscales];
- memset(bp, 0, sizeof(double)*ntrees*nscales);
+// double *bp = new double[ntrees*nscales];
+// memset(bp, 0, sizeof(double)*ntrees*nscales);
- int k, tid, ptn;
+ double *treelhs;
+ cout << (ntrees*nscales*nboot*sizeof(double) >> 20) << " MB required for AU test" << endl;
+ treelhs = new double[ntrees*nscales*nboot];
+ if (!treelhs)
+ outError("Not enough memory to perform AU test!");
+
+ size_t k, tid, ptn;
+
+ double start_time = getRealTime();
+
+ cout << "Generating " << nscales << " x " << nboot << " multiscale bootstrap replicates... ";
+
#ifdef _OPENMP
#pragma omp parallel private(k, tid, ptn)
{
@@ -1986,7 +2241,7 @@ void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<
double *boot_sample_dbl = aligned_alloc<double>(maxnptn);
#ifdef _OPENMP
- #pragma omp for schedule(static)
+ #pragma omp for schedule(dynamic)
#endif
for (k = 0; k < nscales; k++) {
string str = "SCALE=" + convertDoubleToString(r[k]);
@@ -1994,26 +2249,53 @@ void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<
tree->aln->createBootstrapAlignment(boot_sample, str.c_str(), rstream);
for (ptn = 0; ptn < maxnptn; ptn++)
boot_sample_dbl[ptn] = boot_sample[ptn];
- double max_lh = -1e20;
+ double max_lh = -DBL_MAX, second_max_lh = -DBL_MAX;
int max_tid = -1;
for (tid = 0; tid < ntrees; tid++) {
double *pattern_lh = pattern_lhs + (tid*maxnptn);
double tree_lh;
+ if (params.SSE == LK_EIGEN) {
+ tree_lh = 0.0;
+ for (ptn = 0; ptn < nptn; ptn++)
+ tree_lh += pattern_lh[ptn] * boot_sample_dbl[ptn];
+ } else {
#ifdef BINARY32
- tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
-#else
- if (instruction_set >= 7)
- tree_lh = tree->dotProductSIMD<double, Vec4d, 4>(pattern_lh, boot_sample_dbl, nptn);
- else
tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
+#else
+ if (instruction_set >= 7)
+ tree_lh = tree->dotProductSIMD<double, Vec4d, 4>(pattern_lh, boot_sample_dbl, nptn);
+ else
+ tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
#endif
+ }
+ // rescale lh
+ tree_lh /= r[k];
+
+ // find the max and second max
if (tree_lh > max_lh) {
+ second_max_lh = max_lh;
max_lh = tree_lh;
max_tid = tid;
- }
+ } else if (tree_lh > second_max_lh)
+ second_max_lh = tree_lh;
+
+ treelhs[(tid*nscales+k)*nboot + boot] = tree_lh;
}
- bp[k*ntrees+max_tid] += nboot_inv;
+
+ // compute difference from max_lh
+ for (tid = 0; tid < ntrees; tid++)
+ if (tid != max_tid)
+ treelhs[(tid*nscales+k)*nboot + boot] = max_lh - treelhs[(tid*nscales+k)*nboot + boot];
+ else
+ treelhs[(tid*nscales+k)*nboot + boot] = second_max_lh - max_lh;
+// bp[k*ntrees+max_tid] += nboot_inv;
+ }
+
+ // sort the replicates
+ for (tid = 0; tid < ntrees; tid++) {
+ quicksort<double,int>(treelhs + (tid*nscales+k)*nboot, 0, nboot-1);
}
+
}
aligned_free(boot_sample_dbl);
@@ -2024,54 +2306,147 @@ void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<
}
#endif
- if (verbose_mode >= VB_MED) {
- cout << "scale";
- for (k = 0; k < nscales; k++)
- cout << "\t" << r[k];
- cout << endl;
- for (tid = 0; tid < ntrees; tid++) {
- cout << tid;
- for (k = 0; k < nscales; k++) {
- cout << "\t" << bp[tid+k*ntrees];
- }
- cout << endl;
- }
- }
+// if (verbose_mode >= VB_MED) {
+// cout << "scale";
+// for (k = 0; k < nscales; k++)
+// cout << "\t" << r[k];
+// cout << endl;
+// for (tid = 0; tid < ntrees; tid++) {
+// cout << tid;
+// for (k = 0; k < nscales; k++) {
+// cout << "\t" << bp[tid+k*ntrees];
+// }
+// cout << endl;
+// }
+// }
+
+ cout << getRealTime() - start_time << " seconds" << endl;
/* STEP 3: weighted least square fit */
double *cc = new double[nscales];
double *w = new double[nscales];
double *this_bp = new double[nscales];
- cout << "TreeID\tAU\tRSS\td_WLS\tc_WLS\td_MLE\tc_MLE" << endl;
+ cout << "TreeID\tAU\tRSS\td\tc" << endl;
for (tid = 0; tid < ntrees; tid++) {
- for (k = 0; k < nscales; k++) {
- this_bp[k] = bp[tid + k*ntrees];
- double bp_val = min(max(bp[tid + k*ntrees], nboot_inv),1.0-nboot_inv);
- double bp_cdf = gsl_cdf_ugaussian_Pinv(bp_val);
- double bp_pdf = gsl_ran_ugaussian_pdf(bp_cdf);
- cc[k] = gsl_cdf_ugaussian_Pinv(1.0 - bp_val);
- w[k] = bp_pdf*bp_pdf*nboot / (bp_val*(1.0-bp_val));
- }
+ double *this_stat = treelhs + tid*nscales*nboot;
+ double xn = this_stat[(nscales/2)*nboot + nboot/2], x;
double c, d; // c, d in original paper
- // first obtain d and c by weighted least square
- doWeightedLeastSquare(nscales, w, rr, rr_inv, cc, d, c);
+ int idf0 = -2;
+ double z = 0.0, z0 = 0.0, thp = 0.0, th = 0.0, ze = 0.0, ze0 = 0.0;
+ double pval, se;
+ int df;
+ double rss = 0.0;
+ int step;
+ const int max_step = 30;
+ bool failed = false;
+ for (step = 0; step < max_step; step++) {
+ x = xn;
+ int num_k = 0;
+ for (k = 0; k < nscales; k++) {
+ this_bp[k] = cntdist3(this_stat + k*nboot, nboot, x) / nboot;
+ if (this_bp[k] <= 0 || this_bp[k] >= 1) {
+ cc[k] = w[k] = 0.0;
+ } else {
+ double bp_val = this_bp[k];
+ cc[k] = -gsl_cdf_ugaussian_Pinv(bp_val);
+ double bp_pdf = gsl_ran_ugaussian_pdf(cc[k]);
+ w[k] = bp_pdf*bp_pdf*nboot / (bp_val*(1.0-bp_val));
+ num_k++;
+ }
+ }
+ df = num_k-2;
+ if (num_k >= 2) {
+ // first obtain d and c by weighted least square
+ doWeightedLeastSquare(nscales, w, rr, rr_inv, cc, d, c, se);
+
+ se = gsl_ran_ugaussian_pdf(d-c)*sqrt(se);
+
+ // second, perform MLE estimate of d and c
+ // OptimizationAUTest mle(d, c, nscales, this_bp, rr, rr_inv);
+ // mle.optimizeDC();
+ // d = mle.d;
+ // c = mle.c;
+
+ /* STEP 4: compute p-value according to Eq. 11 */
+ pval = 1.0 - gsl_cdf_ugaussian_P(d-c);
+ z = -pval;
+ ze = se;
+ // compute sum of squared difference
+ rss = 0.0;
+ for (k = 0; k < nscales; k++) {
+ double diff = cc[k] - (rr[k]*d + rr_inv[k]*c);
+ rss += w[k] * diff * diff;
+ }
+
+ } else {
+ // not enough data for WLS
+ double sum = 0.0;
+ for (k = 0; k < nscales; k++)
+ sum += cc[k];
+ if (sum >= 0.0)
+ pval = 0.0;
+ else
+ pval = 1.0;
+ se = 0.0;
+ d = c = 0.0;
+ rss = 0.0;
+ if (verbose_mode >= VB_MED)
+ cout << " error in wls" << endl;
+ }
+
+ // maximum likelhood fit
+// double coef0[2] = {d, c};
+// double df;
+// int mlefail = mlecoef(this_bp, r, nboot, nscales, coef0, &rss, &df, &se);
+//
+// if (!mlefail) {
+// d = coef0[0];
+// c = coef0[1];
+// pval = 1.0 - gsl_cdf_ugaussian_P(d-c);
+// z = -pval;
+// ze = se;
+// }
+
+ if (verbose_mode >= VB_MED) {
+ cout.unsetf(ios::fixed);
+ cout << "\t" << step << "\t" << th << "\t" << x << "\t" << pval << "\t" << se << "\t" << nscales-2 << "\t" << d << "\t" << c << "\t" << z << "\t" << ze << "\t" << rss << endl;
+ }
+
+ if(df < 0 && idf0 < 0) { failed = true; break;} /* degenerated */
+
+ if ((df < 0) || (idf0 >= 0 && (z-z0)*(x-thp) > 0.0 && fabs(z-z0)>0.1*ze0)) {
+ if (verbose_mode >= VB_MED)
+ cout << " non-monotone" << endl;
+ th=x;
+ xn=0.5*x+0.5*thp;
+ continue;
+ }
+ if(idf0 >= 0 && (fabs(z-z0)<0.01*ze0)) {
+ if(fabs(th)<1e-10)
+ xn=th;
+ else th=x;
+ } else
+ xn=0.5*th+0.5*x;
+ info[tid].au_pvalue = pval;
+ thp=x;
+ z0=z;
+ ze0=ze;
+ idf0 = nscales-2;
+ if(fabs(x-th)<1e-10) break;
+ }
- // second, perform MLE estimate of d and c
- OptimizationAUTest mle(d, c, nscales, this_bp, rr, rr_inv);
- mle.optimizeDC();
+ if (failed && verbose_mode >= VB_MED)
+ cout << " degenerated" << endl;
- // compute sum of squared difference
- double rss = 0.0;
- for (k = 0; k < nscales; k++) {
- double diff = cc[k] - (rr[k]*d + rr_inv[k]*c);
- rss += w[k] * diff * diff;
+ if (step == max_step) {
+ if (verbose_mode >= VB_MED)
+ cout << " non-convergence" << endl;
+ failed = true;
}
- double pchi2 = computePValueChiSquare(rss, nscales-2);
- /* STEP 4: compute p-value according to Eq. 11 */
- info[tid].au_pvalue = 1.0 - gsl_cdf_ugaussian_P(mle.d-mle.c);
- cout << tid+1 << "\t" << info[tid].au_pvalue << "\t" << rss << "\t" << d << "\t" << c << "\t" << mle.d << "\t" << mle.c;
+ double pchi2 = (failed) ? 0.0 : computePValueChiSquare(rss, df);
+ cout << tid+1 << "\t" << info[tid].au_pvalue << "\t" << rss << "\t" << d << "\t" << c;
// warning if p-value of chi-square < 0.01 (rss too high)
if (pchi2 < 0.01)
@@ -2082,7 +2457,7 @@ void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<
delete [] this_bp;
delete [] w;
delete [] cc;
- delete [] bp;
+// delete [] bp;
}
diff --git a/phylotesting.h b/phylotesting.h
index 413c8da..657a06d 100644
--- a/phylotesting.h
+++ b/phylotesting.h
@@ -90,11 +90,18 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL,
void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl);
/**
+ * print site posterior probabilities per rate/mixture category to a file
+ * @param filename output file name
+ * @param tree phylogenetic tree
+ */
+void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl);
+
+/**
* print site state frequency vectors (for Huaichun)
* @param filename output file name
* @param tree phylogenetic tree
*/
-void printSiteStateFreq(const char*filename, PhyloTree *tree);
+void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs = NULL);
/**
* Evaluate user-trees with possibility of tree topology tests
diff --git a/phylotree.cpp b/phylotree.cpp
index 3bbc1ac..b9143ce 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -791,6 +791,8 @@ uint64_t PhyloTree::getMemoryRequired(size_t ncategory) {
mem_size += tip_partial_lh_size;
if (params->gbo_replicates)
mem_size += params->gbo_replicates*nptn*sizeof(BootValType);
+ if (model)
+ mem_size += model->getMemoryRequired();
return mem_size;
}
@@ -992,7 +994,7 @@ double *PhyloTree::newPartialLh() {
return ret;
}
-int PhyloTree::getPartialLhBytes() {
+size_t PhyloTree::getPartialLhBytes() {
size_t nptn = aln->size()+aln->num_states; // +num_states for ascertainment bias correction
size_t block_size;
if (instruction_set >= 7)
@@ -1007,7 +1009,7 @@ int PhyloTree::getPartialLhBytes() {
return block_size * sizeof(double);
}
-int PhyloTree::getScaleNumBytes() {
+size_t PhyloTree::getScaleNumBytes() {
return (aln->size()+aln->num_states) * sizeof(UBYTE);
}
@@ -1169,30 +1171,52 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) {
size_t state, nstates = aln->num_states;
ModelMixture *models = (ModelMixture*)model;
- // loop over all site-patterns
- for (ptn = 0; ptn < nptn; ptn++) {
-
- // first compute posterior for each mixture component
- double sum_lh = 0.0;
- for (m = 0; m < nmixture; m++) {
- sum_lh += lh_cat[m];
- }
- sum_lh = 1.0/sum_lh;
- for (m = 0; m < nmixture; m++) {
- lh_cat[m] *= sum_lh;
- }
+ if (params->print_site_state_freq == WSF_POSTERIOR_MEAN) {
+ cout << "Computing posterior mean site frequencies...." << endl;
+ // loop over all site-patterns
+ for (ptn = 0; ptn < nptn; ptn++) {
- // now compute state frequencies
- for (state = 0; state < nstates; state++) {
- double freq = 0;
- for (m = 0; m < nmixture; m++)
- freq += models->at(m)->state_freq[state] * lh_cat[m];
- ptn_freq[state] = freq;
+ // first compute posterior for each mixture component
+ double sum_lh = 0.0;
+ for (m = 0; m < nmixture; m++) {
+ sum_lh += lh_cat[m];
+ }
+ sum_lh = 1.0/sum_lh;
+ for (m = 0; m < nmixture; m++) {
+ lh_cat[m] *= sum_lh;
+ }
+
+ // now compute state frequencies
+ for (state = 0; state < nstates; state++) {
+ double freq = 0;
+ for (m = 0; m < nmixture; m++)
+ freq += models->at(m)->state_freq[state] * lh_cat[m];
+ ptn_freq[state] = freq;
+ }
+
+ // increase the pointers
+ lh_cat += nmixture;
+ ptn_freq += nstates;
}
+ } else if (params->print_site_state_freq == WSF_POSTERIOR_MAX) {
+ cout << "Computing posterior max site frequencies...." << endl;
+ // loop over all site-patterns
+ for (ptn = 0; ptn < nptn; ptn++) {
- // increase the pointers
- lh_cat += nmixture;
- ptn_freq += nstates;
+ // first compute posterior for each mixture component
+ size_t max_comp = 0;
+ for (m = 1; m < nmixture; m++)
+ if (lh_cat[m] > lh_cat[max_comp]) {
+ max_comp = m;
+ }
+
+ // now compute state frequencies
+ memcpy(ptn_freq, models->at(max_comp)->state_freq, nstates*sizeof(double));
+
+ // increase the pointers
+ lh_cat += nmixture;
+ ptn_freq += nstates;
+ }
}
}
@@ -1273,6 +1297,29 @@ void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, doubl
//return score;
}
+void PhyloTree::computePatternProbabilityCategory(double *ptn_prob_cat, SiteLoglType wsl) {
+ /* if (!dad_branch) {
+ dad_branch = (PhyloNeighbor*) root->neighbors[0];
+ dad = (PhyloNode*) root;
+ }*/
+ size_t ptn, nptn = aln->getNPattern();
+ size_t cat, ncat = getNumLhCat(wsl);
+ // Right now only Naive version store _pattern_lh_cat!
+ computePatternLhCat(wsl);
+
+ memcpy(ptn_prob_cat, _pattern_lh_cat, sizeof(double)*nptn*ncat);
+
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double *lh_cat = ptn_prob_cat + ptn*ncat;
+ double sum = lh_cat[0];
+ for (cat = 1; cat < ncat; cat++)
+ sum += lh_cat[cat];
+ sum = 1.0/sum;
+ for (cat = 0; cat < ncat; cat++)
+ lh_cat[cat] *= sum;
+ }
+}
+
int PhyloTree::computePatternCategories(IntVector *pattern_ncat) {
if (sse != LK_EIGEN) {
// compute _pattern_lh_cat
@@ -2118,6 +2165,30 @@ double PhyloTree::optimizeTreeLengthScaling(double min_scaling, double &scaling,
return computeLikelihood();
}
+void PhyloTree::printTreeLengthScaling(const char *filename) {
+// double treescale = 1.0;
+//
+// cout << "Optimizing tree length scaling ..." << endl;
+//
+// double lh = optimizeTreeLengthScaling(MIN_BRLEN_SCALE, treescale, MAX_BRLEN_SCALE, 0.001);
+//
+// cout << "treescale: " << treescale << " / LogL: " << lh << endl;
+
+ Checkpoint *saved_checkpoint = getModelFactory()->getCheckpoint();
+ Checkpoint *new_checkpoint = new Checkpoint;
+ new_checkpoint->setFileName(filename);
+ new_checkpoint->setCompression(false);
+ new_checkpoint->setHeader("IQ-TREE scaled tree length and model parameters");
+ new_checkpoint->put("treelength", treeLength());
+ saved_checkpoint->put("treelength", treeLength()); // also put treelength into current checkpoint
+
+ getModelFactory()->setCheckpoint(new_checkpoint);
+ getModelFactory()->saveCheckpoint();
+ new_checkpoint->dump();
+
+ getModelFactory()->setCheckpoint(saved_checkpoint);
+}
+
double PhyloTree::computeFunction(double value) {
if (!is_opt_scaling) {
current_it->length = value;
@@ -2736,7 +2807,8 @@ void PhyloTree::doOneRandomNNI(Node *node1, Node *node2) {
node1Nei = (*it);
break;
}
- int randNum = random_int(1);
+
+ int randNum = random_int(2); // randNum is either 0 or 1
if (randNum == 0) {
node1Nei = (*it);
break;
@@ -2750,7 +2822,7 @@ void PhyloTree::doOneRandomNNI(Node *node1, Node *node2) {
node2Nei = (*it);
break;
}
- int randNum = random_int(1);
+ int randNum = random_int(2);
if (randNum == 0) {
node2Nei = (*it);
break;
diff --git a/phylotree.h b/phylotree.h
index 259f649..69187c3 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -622,7 +622,7 @@ public:
double *newPartialLh();
/** get the number of bytes occupied by partial_lh */
- int getPartialLhBytes();
+ size_t getPartialLhBytes();
/**
allocate memory for a scale num vector
@@ -630,7 +630,7 @@ public:
UBYTE *newScaleNum();
/** get the number of bytes occupied by scale_num */
- int getScaleNumBytes();
+ size_t getScaleNumBytes();
/**
* this stores partial_lh for each state at the leaves of the tree because they are the same between leaves
@@ -783,7 +783,7 @@ public:
/**
* @return number of elements per site lhl entry, used in conjunction with computePatternLhCat
*/
- int getNumLhCat(SiteLoglType wsl);
+ virtual int getNumLhCat(SiteLoglType wsl);
/**
* compute _pattern_lh_cat for site-likelihood per category
@@ -809,6 +809,13 @@ public:
virtual void computePatternLikelihood(double *pattern_lh, double *cur_logl = NULL,
double *pattern_lh_cat = NULL, SiteLoglType wsl = WSL_RATECAT);
+ /**
+ compute pattern posterior probabilities per rate/mixture category
+ @param pattern_prob_cat (OUT) all pattern-probabilities per category
+ @param wsl either WSL_RATECAT, WSL_MIXTURE or WSL_MIXTURE_RATECAT
+ */
+ virtual void computePatternProbabilityCategory(double *pattern_prob_cat, SiteLoglType wsl);
+
vector<uint64_t> ptn_cat_mask;
/**
@@ -1080,6 +1087,11 @@ public:
*/
double optimizeTreeLengthScaling(double min_scaling, double &scaling, double max_scaling, double gradient_epsilon);
+ /**
+ print tree length scaling to a file (requested by Rob Lanfear)
+ @param filename output file name written in YAML format
+ */
+ void printTreeLengthScaling(const char *filename);
/****************************************************************************
Branch length optimization by Least Squares
@@ -1616,6 +1628,8 @@ public:
void computeSeqIdentityAlongTree(Split &resp, Node *node = NULL, Node *dad = NULL);
void computeSeqIdentityAlongTree();
+ double *getPatternLhCatPointer() { return _pattern_lh_cat; }
+
protected:
/**
diff --git a/pll/pll.h b/pll/pll.h
index e450d62..efd7dec 100644
--- a/pll/pll.h
+++ b/pll/pll.h
@@ -82,7 +82,7 @@ extern "C" {
#define PLL_VECTOR_WIDTH 2
#else
-#define PLL_BYTE_ALIGNMENT 1
+#define PLL_BYTE_ALIGNMENT 8
#define PLL_VECTOR_WIDTH 1
#endif
diff --git a/pruning.h b/pruning.h
index 8e59dab..7802631 100644
--- a/pruning.h
+++ b/pruning.h
@@ -87,7 +87,7 @@ public:
LeafSet::iterator nearestLeaf();
/**
- mark the node in the inital set to be not PRUNABLE
+ mark the node in the initial set to be not PRUNABLE
*/
void doInitialSet();
diff --git a/tinatree.cpp b/tinatree.cpp
index 794bfa6..c324d98 100644
--- a/tinatree.cpp
+++ b/tinatree.cpp
@@ -89,7 +89,7 @@ int TinaTree::computeParsimonyScore() {
int score = 0;
for (int ptn = 0; ptn < aln->size(); ptn++)
- if (!aln->at(ptn).is_const) {
+ if (!aln->at(ptn).isConst()) {
int states;
int ptn_score = computeParsimonyScore(ptn, states);
score += ptn_score * (*aln)[ptn].frequency;
diff --git a/tools.cpp b/tools.cpp
index c137a81..4613cc1 100644
--- a/tools.cpp
+++ b/tools.cpp
@@ -130,7 +130,7 @@ void outError(const char *error, string msg, bool quit) {
@param error warning message
*/
void outWarning(const char *warn) {
- cerr << "WARNING: " << warn << endl;
+ cout << "WARNING: " << warn << endl;
}
void outWarning(string warn) {
@@ -803,9 +803,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.SSE = LK_EIGEN_SSE;
params.lk_no_avx = false;
params.print_site_lh = WSL_NONE;
- params.print_site_state_freq = 0;
+ params.print_site_prob = WSL_NONE;
+ params.print_site_state_freq = WSF_NONE;
params.print_site_rate = false;
- params.print_site_posterior = 0;
+ params.print_trees_site_posterior = 0;
params.print_tree_lh = false;
params.lambda = 1;
params.speed_conf = 1.0;
@@ -891,6 +892,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.bootlh_test = 0;
params.bootlh_partitions = NULL;
params.site_freq_file = NULL;
+ params.tree_freq_file = NULL;
#ifdef _OPENMP
params.num_threads = 0;
#else
@@ -922,6 +924,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.ignore_checkpoint = false;
params.checkpoint_dump_interval = 20;
params.force_unfinished = false;
+ params.suppress_output_flags = 0;
if (params.nni5) {
@@ -1910,6 +1913,8 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
continue;
}
if (strcmp(argv[cnt], "-fs") == 0) {
+ if (params.tree_freq_file)
+ throw "Specifying both -fs and -ft not allowed";
cnt++;
if (cnt >= argc)
throw "Use -fs <site_freq_file>";
@@ -1917,6 +1922,16 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
// params.SSE = LK_EIGEN;
continue;
}
+ if (strcmp(argv[cnt], "-ft") == 0) {
+ if (params.site_freq_file)
+ throw "Specifying both -fs and -ft not allowed";
+ cnt++;
+ if (cnt >= argc)
+ throw "Use -ft <treefile_to_infer_site_frequency_model>";
+ params.tree_freq_file = argv[cnt];
+ params.print_site_state_freq = WSF_POSTERIOR_MEAN;
+ continue;
+ }
if (strcmp(argv[cnt], "-fconst") == 0) {
cnt++;
@@ -1991,7 +2006,19 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
continue;
}
if (strcmp(argv[cnt], "-fixbr") == 0 || strcmp(argv[cnt], "-blfix") == 0) {
- params.fixed_branch_length = true;
+ params.fixed_branch_length = BRLEN_FIX;
+ params.optimize_alg_gammai = "Brent";
+ params.opt_gammai = false;
+ params.min_iterations = 0;
+ params.stop_condition = SC_FIXED_ITERATION;
+ continue;
+ }
+ if (strcmp(argv[cnt], "-blscale") == 0) {
+ params.fixed_branch_length = BRLEN_SCALE;
+ params.optimize_alg_gammai = "Brent";
+ params.opt_gammai = false;
+ params.min_iterations = 0;
+ params.stop_condition = SC_FIXED_ITERATION;
continue;
}
if (strcmp(argv[cnt], "-blmin") == 0) {
@@ -2209,6 +2236,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.print_site_lh = WSL_RATECAT;
continue;
}
+
if (strcmp(argv[cnt], "-wslm") == 0) {
params.print_site_lh = WSL_MIXTURE;
continue;
@@ -2217,16 +2245,36 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.print_site_lh = WSL_MIXTURE_RATECAT;
continue;
}
+
+ if (strcmp(argv[cnt], "-wspr") == 0) {
+ params.print_site_prob = WSL_RATECAT;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "-wspm") == 0) {
+ params.print_site_prob = WSL_MIXTURE;
+ continue;
+ }
+ if (strcmp(argv[cnt], "-wspmr") == 0 || strcmp(argv[cnt], "-wsprm") == 0) {
+ params.print_site_prob = WSL_MIXTURE_RATECAT;
+ continue;
+ }
+
+
if (strcmp(argv[cnt], "-wsr") == 0) {
params.print_site_rate = true;
continue;
}
- if (strcmp(argv[cnt], "-wsp") == 0) {
- params.print_site_posterior = 1;
+ if (strcmp(argv[cnt], "-wsptrees") == 0) {
+ params.print_trees_site_posterior = 1;
continue;
}
if (strcmp(argv[cnt], "-wsf") == 0) {
- params.print_site_state_freq = 1;
+ params.print_site_state_freq = WSF_POSTERIOR_MEAN;
+ continue;
+ }
+ if (strcmp(argv[cnt], "-wsfm") == 0) {
+ params.print_site_state_freq = WSF_POSTERIOR_MAX;
continue;
}
if (strcmp(argv[cnt], "-wba") == 0) {
@@ -2573,11 +2621,16 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
continue;
}
- if (strcmp(argv[cnt], "-opt_gammai") == 0 || strcmp(argv[cnt], "--opt-gamma-inv") == 0) {
+ if (strcmp(argv[cnt], "--opt-gamma-inv") == 0) {
params.opt_gammai = true;
continue;
}
+ if (strcmp(argv[cnt], "--no-opt-gamma-inv") == 0) {
+ params.opt_gammai = false;
+ continue;
+ }
+
if (strcmp(argv[cnt], "--opt-gammai-fast") == 0) {
params.opt_gammai_fast = true;
params.opt_gammai = true;
@@ -2940,6 +2993,25 @@ void parseArg(int argc, char *argv[], Params ¶ms) {
params.checkpoint_dump_interval = convert_int(argv[cnt]);
continue;
}
+
+ if (strcmp(argv[cnt], "--no-log") == 0) {
+ params.suppress_output_flags |= OUT_LOG;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "--no-treefile") == 0) {
+ params.suppress_output_flags |= OUT_TREEFILE;
+ continue;
+ }
+ if (strcmp(argv[cnt], "--no-iqtree") == 0) {
+ params.suppress_output_flags |= OUT_IQTREE;
+ continue;
+ }
+ if (strcmp(argv[cnt], "--no-outfiles") == 0) {
+ params.suppress_output_flags |= OUT_LOG + OUT_TREEFILE + OUT_IQTREE;
+ continue;
+ }
+
if (argv[cnt][0] == '-') {
string err = "Invalid \"";
@@ -3087,6 +3159,7 @@ void usage_iqtree(char* argv[], bool full_command) {
#endif
<< " -seed <number> Random seed number, normally used for debugging purpose" << endl
<< " -v, -vv, -vvv Verbose mode, printing more messages to screen" << endl
+ << " -quiet Silent mode, suppress printing to screen (stdout)" << endl
<< " -keep-ident Keep identical sequences (default: remove & finally add)" << endl
<< endl << "CHECKPOINTING TO RESUME STOPPED RUN:" << endl
<< " -redo Redo analysis even for successful runs (default: resume)" << endl
@@ -3129,7 +3202,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< endl << "AUTOMATIC MODEL SELECTION:" << endl
<< " -m TESTONLY Standard model selection (like jModelTest, ProtTest)" << endl
<< " -m TEST Like -m TESTONLY but followed by tree reconstruction" << endl
- << " -m TESTNEWONLY New model selection including FreeRate (+R) heterogeneity" << endl
+ << " -m TESTNEWONLY Extended model selection incl. FreeRate (+R) heterogeneity" << endl
<< " -m TESTNEW Like -m TESTNEWONLY but followed by tree reconstruction" << endl
<< " -m TESTMERGEONLY Select best-fit partition scheme (like PartitionFinder)" << endl
<< " -m TESTMERGE Like -m TESTMERGEONLY but followed by tree reconstruction" << endl
@@ -3160,9 +3233,9 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " DNA: HKY (default), JC, F81, K2P, K3P, K81uf, TN/TrN, TNef," << endl
<< " TIM, TIMef, TVM, TVMef, SYM, GTR, or 6-digit model" << endl
<< " specification (e.g., 010010 = HKY)" << endl
- << " Protein: WAG (default), Poisson, cpREV, mtREV, Dayhoff, mtMAM," << endl
- << " JTT, LG, mtART, mtZOA, VT, rtREV, DCMut, PMB, HIVb," << endl
- << " HIVw, JTTDCMut, FLU, Blosum62" << endl
+ << " Protein: LG (default), Poisson, cpREV, mtREV, Dayhoff, mtMAM," << endl
+ << " JTT, WAG, mtART, mtZOA, VT, rtREV, DCMut, PMB, HIVb," << endl
+ << " HIVw, JTTDCMut, FLU, Blosum62, GTR20" << endl
<< " Protein mixture: C10,...,C60, EX2, EX3, EHO, UL2, UL3, EX_EHO, LG4M, LG4X," << endl
<< " JTTCF4G" << endl
<< " Binary: JC2 (default), GTR2" << endl
@@ -3181,7 +3254,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -m \"MIX{m1,...mK}\" Mixture model with K components" << endl
<< " -m \"FMIX{f1,...fK}\" Frequency mixture model with K components" << endl
<< " -mwopt Turn on optimizing mixture weights (default: none)" << endl
- << endl << "RATE HETEROGENEITY:" << endl
+ << endl << "RATE HETEROGENEITY AMONG SITES:" << endl
<< " -m <model_name>+I or +G[n] or +I+G[n] or +R[n]" << endl
<< " Invar, Gamma, Invar+Gamma, or FreeRate model where 'n' is" << endl
<< " number of categories (default: n=4)" << endl
@@ -3192,11 +3265,15 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -wsr Write site rates to .rate file" << endl
<< " -mh Computing site-specific rates to .mhrate file using" << endl
<< " Meyer & von Haeseler (2003) method" << endl
+ << endl << "SITE-SPECIFIC FREQUENCY MODEL:" << endl
+ << " -ft <tree_file> Input tree to infer site frequency model" << endl
+ << " -fs <in_freq_file> Input site frequency model file" << endl
+ << " -wsf Write site frequency model to .sitefreq file" << endl
//<< " -c <#categories> Number of Gamma rate categories (default: 4)" << endl
- << endl << "TEST OF MODEL HOMOGENEITY:" << endl
- << " -m WHTEST Testing model (GTR+G) homogeneity assumption using" << endl
- << " Weiss & von Haeseler (2003) method" << endl
- << " -ns <#simulations> #Simulations to obtain null-distribution (default: 1000)" << endl
+// << endl << "TEST OF MODEL HOMOGENEITY:" << endl
+// << " -m WHTEST Testing model (GTR+G) homogeneity assumption using" << endl
+// << " Weiss & von Haeseler (2003) method" << endl
+// << " -ns <#simulations> #Simulations to obtain null-distribution (default: 1000)" << endl
// << endl << "TREE INFERENCE:" << endl
// << " -p <probability> IQP: Probability of deleting leaves (default: auto)" << endl
// << " -k <#representative> IQP: Size of representative leaf set (default: 4)" << endl
@@ -3234,17 +3311,18 @@ void usage_iqtree(char* argv[], bool full_command) {
<< endl;
cout << "GENERATING RANDOM TREES:" << endl;
- cout << " -r <num_taxa> Create a random tree under Yule-Harding model." << endl;
- cout << " -ru <num_taxa> Create a random tree under Uniform model." << endl;
- cout << " -rcat <num_taxa> Create a random caterpillar tree." << endl;
- cout << " -rbal <num_taxa> Create a random balanced tree." << endl;
- cout << " -rcsg <num_taxa> Create a random circular split network." << endl;
+ cout << " -r <num_taxa> Create a random tree under Yule-Harding model" << endl;
+ cout << " -ru <num_taxa> Create a random tree under Uniform model" << endl;
+ cout << " -rcat <num_taxa> Create a random caterpillar tree" << endl;
+ cout << " -rbal <num_taxa> Create a random balanced tree" << endl;
+ cout << " -rcsg <num_taxa> Create a random circular split network" << endl;
cout << " -rlen <min_len> <mean_len> <max_len> " << endl;
- cout << " min, mean, and max branch lengths of random trees." << endl;
+ cout << " min, mean, and max branch lengths of random trees" << endl;
cout << endl << "MISCELLANEOUS:" << endl
<< " -wt Write locally optimal trees into .treels file" << endl
<< " -blfix Fix branch lengths of user tree passed via -te" << endl
+ << " -blscale Scale branch lengths of user tree passed via -t" << endl
<< " -blmin Min branch length for optimization (default 0.000001)" << endl
<< " -blmax Max branch length for optimization (default 100)" << endl
<< " -wsr Write site rates and categories to .rate file" << endl
@@ -3252,7 +3330,12 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -wslr Write site log-likelihoods per rate category" << endl
<< " -wslm Write site log-likelihoods per mixture class" << endl
<< " -wslmr Write site log-likelihoods per mixture+rate class" << endl
- << " -fconst f1,...,fN Add constant patterns into alignment (N=#nstates)" << endl;
+ << " -wspr Write site probabilities per rate category" << endl
+ << " -wspm Write site probabilities per mixture class" << endl
+ << " -wspmr Write site probabilities per mixture+rate class" << endl
+ << " -fconst f1,...,fN Add constant patterns into alignment (N=#nstates)" << endl
+ << " -me <epsilon> Logl epsilon for model parameter optimization (default 0.01)" << endl
+ << " --no-outfiles Suppress printing output files" << endl;
// << " -d <file> Reading genetic distances from file (default: JC)" << endl
// << " -d <outfile> Calculate the distance matrix inferred from tree" << endl
// << " -stats <outfile> Output some statistics about branch lengths" << endl
diff --git a/tools.h b/tools.h
index 6426a56..480648d 100644
--- a/tools.h
+++ b/tools.h
@@ -43,6 +43,21 @@
#define SPRNG
#include "sprng/sprng.h"
+// redefine assertion
+inline void _my_assert(const char* expression, const char *func, const char* file, int line)
+{
+ char *sfile = (char*)strrchr(file, '/');
+ if (!sfile) sfile = (char*)file; else sfile++;
+ cerr << "Assertion (" << expression << ") failed, function " << func << ", file " << sfile << ", line " << line << endl;
+ abort();
+}
+
+#ifdef NDEBUG
+#define ASSERT(EXPRESSION) ((void)0)
+#else
+#define ASSERT(EXPRESSION) ((EXPRESSION) ? (void)0 : _my_assert(#EXPRESSION, __func__, __FILE__, __LINE__))
+#endif
+
#define USE_HASH_MAP
@@ -406,6 +421,18 @@ enum SiteLoglType {
WSL_NONE, WSL_SITE, WSL_RATECAT, WSL_MIXTURE, WSL_MIXTURE_RATECAT
};
+enum SiteFreqType {
+ WSF_NONE, WSF_POSTERIOR_MEAN, WSF_POSTERIOR_MAX
+};
+
+const int BRLEN_OPTIMIZE = 0; // optimize branch lengths
+const int BRLEN_FIX = 1; // fix branch lengths
+const int BRLEN_SCALE = 2; // scale branch lengths
+
+const int OUT_LOG = 1; // .log file written or not
+const int OUT_TREEFILE = 2; // .treefile file written or not
+const int OUT_IQTREE = 4; // .iqtree file written or not
+
/** maximum number of newton-raphson steps for NNI branch evaluation */
extern int NNI_MAX_NR_STEP;
@@ -1226,9 +1253,11 @@ public:
string optimize_alg_gammai;
/**
- TRUE if you want to fix branch lengths during model optimization
+ BRLEN_OPTIMIZE optimize branch lengths during model optimization
+ BRLEN_FIX fix branch lengths during model optimization
+ BRLEN_SCALE scale all branch lengths by the same factor during model optimization
*/
- bool fixed_branch_length;
+ int fixed_branch_length;
/** minimum branch length for optimization, default 0.000001 */
double min_branch_length;
@@ -1351,16 +1380,25 @@ public:
SiteLoglType print_site_lh;
/**
+ control printing posterior probability of each site belonging to a rate/mixture categories
+ same meaning as print_site_lh, but results are printed to .siteprob file
+ WSL_RATECAT: print site probability per rate category
+ WSL_MIXTURE: print site probability per mixture class
+ WSL_MIXTURE_RATECAT: print site probability per mixture class per rate category
+ */
+ SiteLoglType print_site_prob;
+
+ /**
0: print nothing
1: print site state frequency vectors
*/
- int print_site_state_freq;
+ SiteFreqType print_site_state_freq;
/** TRUE to print site-specific rates, default: FALSE */
bool print_site_rate;
- /* 1: print site posterior probability */
- int print_site_posterior;
+ /* 1: print site posterior probability for many trees during tree search */
+ int print_trees_site_posterior;
/**
TRUE to print tree log-likelihood
@@ -1639,6 +1677,11 @@ public:
*/
char *site_freq_file;
+ /**
+ user tree file used to estimate site-specific state frequency model
+ */
+ char *tree_freq_file;
+
/** number of threads for OpenMP version */
int num_threads;
@@ -1715,6 +1758,13 @@ public:
/** true if ignoring the "finished" flag in checkpoint file */
bool force_unfinished;
+ /** control output files to be written
+ * OUT_LOG
+ * OUT_TREEFILE
+ * OUT_IQTREE
+ */
+ int suppress_output_flags;
+
};
/**
@@ -1863,8 +1913,7 @@ const char ERR_WRITE_OUTPUT[] = "Cannot write to file ";
const char ERR_NO_K[] = "You must specify the number of taxa in the PD set.";
const char ERR_TOO_SMALL_K[] = "Size of PD-set must be at least the size of initial set.";
const char ERR_NO_BUDGET[] = "Total budget is not specified or less than zero.";
-const char ERR_TOO_SMALL_BUDGET[] = "Not enough budget to conserve the inital set of taxa.";
-
+const char ERR_TOO_SMALL_BUDGET[] = "Not enough budget to conserve the initial set of taxa.";
const char ERR_INTERNAL[] = "Internal error, pls contact authors!";
/*--------------------------------------------------------------*/
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git
More information about the debian-med-commit
mailing list