[med-svn] [iqtree] 01/01: Imported Upstream version 1.4.0+dfsg
Andreas Tille
tille at debian.org
Wed Mar 9 19:21:29 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch upstream
in repository iqtree.
commit 35c46f5bd63fceb6b6a6fab38b553dab34fc4f33
Author: Andreas Tille <tille at debian.org>
Date: Wed Mar 9 19:55:53 2016 +0100
Imported Upstream version 1.4.0+dfsg
---
CMakeLists.txt | 17 +-
alignment.cpp | 90 ++-
alignment.h | 5 +-
candidateset.cpp | 61 +-
candidateset.h | 13 +-
checkpoint.cpp | 274 ++++---
checkpoint.h | 348 ++++++++-
gsl/CMakeLists.txt | 8 +
gsl/binomial_tpe.cpp | 381 ++++++++++
gsl/gauss.cpp | 352 +++++++++
gsl/gaussinv.cpp | 205 +++++
gsl/gausspdf.cpp | 40 +
gsl/gsl_nan.h | 45 ++
gsl/multinomial.cpp | 82 ++
gsl/mygsl.h | 57 ++
gsl/pow_int.cpp | 56 ++
gsl/rat_eval.h | 25 +
hashsplitset.cpp | 8 +-
iqtree.cpp | 736 ++++++++----------
iqtree.h | 34 +-
model/modelcodon.cpp | 41 +
model/modelcodon.h | 10 +
model/modeldna.cpp | 17 +
model/modeldna.h | 10 +
model/modelfactory.cpp | 103 ++-
model/modelfactory.h | 22 +-
model/modelgtr.cpp | 17 +-
model/modelgtr.h | 10 +
model/modelmixture.cpp | 63 +-
model/modelmixture.h | 16 +
model/modelsubst.cpp | 34 +-
model/modelsubst.h | 17 +-
model/partitionmodel.cpp | 38 +
model/partitionmodel.h | 18 +
model/ratefree.cpp | 24 +
model/ratefree.h | 10 +
model/ratefreeinvar.cpp | 14 +
model/ratefreeinvar.h | 10 +
model/rategamma.cpp | 22 +
model/rategamma.h | 10 +
model/rategammainvar.cpp | 17 +
model/rategammainvar.h | 10 +
model/rateheterogeneity.cpp | 17 +-
model/rateheterogeneity.h | 13 +-
model/rateinvar.cpp | 18 +
model/rateinvar.h | 10 +
mtree.cpp | 10 +-
mtree.h | 2 +-
optimization.cpp | 2 +-
optimization.h | 3 +-
pda.cpp | 105 ++-
phyloanalysis.cpp | 298 +++++---
phyloanalysis.h | 2 +-
phylokernel.h | 7 +
phylokernelsitemodel.cpp | 715 ++++++++++++++++++
phylokernelsitemodel.h | 801 ++++++++++++++++++++
phylosupertree.cpp | 53 +-
phylosupertree.h | 16 +
phylosupertreeplen.cpp | 43 ++
phylosupertreeplen.h | 20 +
phylotesting.cpp | 369 ++++++++-
phylotesting.h | 8 +
phylotree.cpp | 181 ++++-
phylotree.h | 152 +++-
phylotreeavx.cpp | 26 +
phylotreesse.cpp | 464 +++++++++---
quartet.cpp | 1756 +++++++++++++++++++++++++++++++++++++++++++
splitgraph.cpp | 65 ++
splitgraph.h | 19 +-
stoprule.cpp | 22 +-
stoprule.h | 14 +-
superalignment.cpp | 61 +-
superalignment.h | 11 +-
tools.cpp | 135 +++-
tools.h | 40 +-
75 files changed, 7814 insertions(+), 1014 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04b998b..3eea0ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,15 +37,15 @@
#NOTE: Static linking with clang windows: make a symlink libgcc_eh.a to libgcc.a (administrator required)
# C:\TDM-GCC-64\lib\gcc\x86_64-w64-mingw32\5.1.0>mklink libgcc_eh.a libgcc.a
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 2.8.10 FATAL_ERROR)
set(CMAKE_LEGACY_CYGWIN_WIN32 0)
project(iqtree)
add_definitions(-DIQ_TREE)
# The version number.
set (iqtree_VERSION_MAJOR 1)
-set (iqtree_VERSION_MINOR 3)
-set (iqtree_VERSION_PATCH "14")
+set (iqtree_VERSION_MINOR 4)
+set (iqtree_VERSION_PATCH "0")
set(BUILD_SHARED_LIBS OFF)
@@ -113,6 +113,10 @@ if (CMAKE_COMPILER_IS_GNUCXX)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g")
set(CMAKE_C_FLAGS_RELEASE "-O3 -g")
+ # require at least gcc 4.6
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.6)
+ message(FATAL_ERROR "GCC version must be at least 4.6!")
+ endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
message("Compiler : Clang")
set(CLANG "TRUE")
@@ -347,6 +351,7 @@ if(EIGEN3_FOUND)
include_directories(${EIGEN3_INCLUDE_DIR})
endif(EIGEN3_FOUND)
add_subdirectory(model)
+add_subdirectory(gsl)
##################################################################
# the main executable
@@ -393,8 +398,10 @@ phylosupertree.cpp
phylotree.cpp
phylotreesse.cpp
phylotreepars.cpp
+phylokernelsitemodel.cpp
#phylotreeavx.cpp
pruning.cpp
+quartet.cpp
split.cpp
splitgraph.cpp
splitset.cpp
@@ -478,9 +485,9 @@ if (IQTREE_FLAGS MATCHES "omp")
endif()
if (BINARY32 OR IQTREE_FLAGS MATCHES "novx")
- target_link_libraries(iqtree pll ncl lbfgsb whtest sprng vectorclass model ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})
+ target_link_libraries(iqtree pll ncl lbfgsb whtest sprng vectorclass model gsl ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})
else()
- target_link_libraries(iqtree pll pllavx ncl lbfgsb whtest sprng vectorclass model avxkernel ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})
+ target_link_libraries(iqtree pll pllavx ncl lbfgsb whtest sprng vectorclass model avxkernel gsl ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})
endif()
##################################################################
diff --git a/alignment.cpp b/alignment.cpp
index aaa8f54..bf6d195 100644
--- a/alignment.cpp
+++ b/alignment.cpp
@@ -14,6 +14,9 @@
#include <numeric>
#include <sstream>
#include "model/rategamma.h"
+#include "gsl/mygsl.h"
+
+
using namespace std;
char symbols_protein[] = "ARNDCQEGHILKMFPSTWYVX"; // X for unknown AA
@@ -1810,20 +1813,23 @@ int Alignment::buildRetainingSites(const char *aln_site_list, IntVector &kept_si
}
void Alignment::printPhylip(ostream &out, bool append, const char *aln_site_list,
- bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name) {
+ bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name, bool print_taxid) {
IntVector kept_sites;
int final_length = buildRetainingSites(aln_site_list, kept_sites, exclude_gaps, exclude_const_sites, ref_seq_name);
if (seq_type == SEQ_CODON)
final_length *= 3;
out << getNSeq() << " " << final_length << endl;
- StrVector::iterator it;
int max_len = getMaxSeqNameLength();
+ if (print_taxid) max_len = 10;
if (max_len < 10) max_len = 10;
- int seq_id = 0;
- for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
+ int seq_id;
+ for (seq_id = 0; seq_id < seq_names.size(); seq_id++) {
out.width(max_len);
- out << left << (*it) << " ";
+ if (print_taxid)
+ out << left << seq_id << " ";
+ else
+ out << left << seq_names[seq_id] << " ";
int j = 0;
for (IntVector::iterator i = site_pattern.begin(); i != site_pattern.end(); i++, j++)
if (kept_sites[j])
@@ -1834,11 +1840,6 @@ void Alignment::printPhylip(ostream &out, bool append, const char *aln_site_list
void Alignment::printPhylip(const char *file_name, bool append, const char *aln_site_list,
bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name) {
- IntVector kept_sites;
- int final_length = buildRetainingSites(aln_site_list, kept_sites, exclude_gaps, exclude_const_sites, ref_seq_name);
- if (seq_type == SEQ_CODON)
- final_length *= 3;
-
try {
ofstream out;
out.exceptions(ios::failbit | ios::badbit);
@@ -1847,20 +1848,9 @@ void Alignment::printPhylip(const char *file_name, bool append, const char *aln_
out.open(file_name, ios_base::out | ios_base::app);
else
out.open(file_name);
- out << getNSeq() << " " << final_length << endl;
- StrVector::iterator it;
- int max_len = getMaxSeqNameLength();
- if (max_len < 10) max_len = 10;
- int seq_id = 0;
- for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
- out.width(max_len);
- out << left << (*it) << " ";
- int j = 0;
- for (IntVector::iterator i = site_pattern.begin(); i != site_pattern.end(); i++, j++)
- if (kept_sites[j])
- out << convertStateBackStr(at(*i)[seq_id]);
- out << endl;
- }
+
+ printPhylip(out, append, aln_site_list, exclude_gaps, exclude_const_sites, ref_seq_name);
+
out.close();
if (verbose_mode >= VB_MED)
cout << "Alignment was printed to " << file_name << endl;
@@ -2324,16 +2314,38 @@ void Alignment::createBootstrapAlignment(IntVector &pattern_freq, const char *sp
delete [] internal_freq;
}
-void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
+void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec, int *rstream) {
int site, nsite = getNSite();
memset(pattern_freq, 0, getNPattern()*sizeof(int));
IntVector site_vec;
- if (!spec) {
- for (site = 0; site < nsite; site++) {
- int site_id = random_int(nsite);
- int ptn_id = getPatternID(site_id);
- pattern_freq[ptn_id]++;
- }
+ if (!spec || strncmp(spec, "SCALE=", 6) == 0) {
+
+ if (spec) {
+ double scale = convert_double(spec+6);
+ nsite = (int)round(scale * nsite);
+ }
+ int nptn = getNPattern();
+
+ if (nsite/8 < nptn) {
+ int orig_nsite = getNSite();
+ for (site = 0; site < nsite; site++) {
+ int site_id = random_int(orig_nsite, rstream);
+ int ptn_id = getPatternID(site_id);
+ pattern_freq[ptn_id]++;
+ }
+ } else {
+ // BQM 2015-12-27: use multinomial sampling for faster generation if #sites is much larger than #patterns
+ int ptn;
+ double *prob = new double[nptn];
+ for (ptn = 0; ptn < nptn; ptn++)
+ prob[ptn] = at(ptn).frequency;
+ gsl_ran_multinomial(nptn, nsite, prob, (unsigned int*)pattern_freq, rstream);
+ int sum = 0;
+ for (ptn = 0; ptn < nptn; ptn++)
+ sum += pattern_freq[ptn];
+ assert(sum == nsite);
+ delete [] prob;
+ }
} else if (strncmp(spec, "GENESITE,", 9) == 0) {
// resampling genes, then resampling sites within resampled genes
convert_int_vec(spec+9, site_vec);
@@ -2348,9 +2360,9 @@ void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
outError("Sum of lengths exceeded alignment length");
for (i = 0; i < site_vec.size(); i++) {
- int part = random_int(site_vec.size());
+ int part = random_int(site_vec.size(), rstream);
for (int j = 0; j < site_vec[part]; j++) {
- site = random_int(site_vec[part]) + begin_site[part];
+ site = random_int(site_vec[part], rstream) + begin_site[part];
int ptn = getPatternID(site);
pattern_freq[ptn]++;
}
@@ -2369,7 +2381,7 @@ void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
outError("Sum of lengths exceeded alignment length");
for (i = 0; i < site_vec.size(); i++) {
- int part = random_int(site_vec.size());
+ int part = random_int(site_vec.size(), rstream);
for (site = begin_site[part]; site < begin_site[part] + site_vec[part]; site++) {
int ptn = getPatternID(site);
pattern_freq[ptn]++;
@@ -2385,7 +2397,7 @@ void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
if (begin_site + site_vec[part] > getNSite())
outError("Sum of lengths exceeded alignment length");
for (site = 0; site < site_vec[part+1]; site++) {
- int site_id = random_int(site_vec[part]) + begin_site;
+ int site_id = random_int(site_vec[part], rstream) + begin_site;
int ptn_id = getPatternID(site_id);
pattern_freq[ptn_id]++;
}
@@ -2544,8 +2556,11 @@ double Alignment::computeObsDist(int seq1, int seq2) {
if ((*it)[seq1] != (*it)[seq2] )
diff_pos += (*it).frequency;
}
- if (!total_pos)
+ if (!total_pos) {
+ if (verbose_mode >= VB_MED)
+ outWarning("No overlapping characters between " + getSeqName(seq1) + " and " + getSeqName(seq2));
return MAX_GENETIC_DIST; // return +INF if no overlap between two sequences
+ }
return ((double)diff_pos) / total_pos;
}
@@ -2668,6 +2683,9 @@ double Alignment::readDist(istream &in, double *dist_mat) {
string dist_file = params.out_prefix;
dist_file += ".userdist";
printDist(dist_file.c_str(), dist_mat);*/
+
+ delete [] tmp_dist_mat;
+
return longest_dist;
}
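
The createBootstrapAlignment() hunk above (BQM 2015-12-27) resamples bootstrap replicates at the pattern level: when the alignment has far more sites than patterns, all pattern frequencies are drawn in one multinomial pass via the bundled gsl_ran_multinomial instead of drawing every site independently. A self-contained sketch of the same idea using only the C++ standard library; the helper name, std::mt19937, and the toy frequencies are illustrative and not part of the IQ-TREE code:

#include <algorithm>
#include <cassert>
#include <random>
#include <vector>

// Draw pattern counts summing to nsite from Multinomial(nsite, prob) using
// conditional binomial draws: O(#patterns) work instead of O(#sites) draws.
std::vector<unsigned> multinomialBootstrap(const std::vector<double> &prob,
                                           unsigned nsite, std::mt19937 &rng) {
    std::vector<unsigned> count(prob.size(), 0);
    double norm = 0.0;
    for (double p : prob) norm += p;            // frequencies need not be normalized
    unsigned remaining = nsite;
    for (size_t k = 0; k + 1 < prob.size() && remaining > 0 && norm > 0; k++) {
        double pk = std::min(1.0, prob[k] / norm);
        std::binomial_distribution<unsigned> bin(remaining, pk);
        count[k] = bin(rng);                     // conditional binomial draw for pattern k
        remaining -= count[k];
        norm -= prob[k];
    }
    count.back() += remaining;                   // last pattern takes whatever is left
    return count;
}

int main() {
    std::mt19937 rng(42);
    std::vector<double> pattern_freq = {10, 5, 85};   // observed pattern frequencies
    std::vector<unsigned> boot = multinomialBootstrap(pattern_freq, 100, rng);
    unsigned total = 0;
    for (unsigned c : boot) total += c;
    assert(total == 100);                             // the replicate keeps all 100 sites
    return 0;
}
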
diff --git a/alignment.h b/alignment.h
index b75b073..a9dba09 100644
--- a/alignment.h
+++ b/alignment.h
@@ -223,7 +223,7 @@ public:
bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
void printPhylip(ostream &out, bool append = false, const char *aln_site_list = NULL,
- bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
+ bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL, bool print_taxid = false);
void printFasta(const char *filename, bool append = false, const char *aln_site_list = NULL,
bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
@@ -391,8 +391,9 @@ public:
resampling pattern frequency by a non-parametric bootstrap
@param pattern_freq (OUT) resampled pattern frequencies
@param spec bootstrap specification, see above
+ @param rstream random generator stream, NULL to use the global randstream
*/
- virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL);
+ virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL);
/**
create a gap masked alignment from an input alignment. Gap patterns of masked_aln
diff --git a/candidateset.cpp b/candidateset.cpp
index 0962161..fa5ea81 100644
--- a/candidateset.cpp
+++ b/candidateset.cpp
@@ -16,11 +16,54 @@ void CandidateSet::init(Alignment* aln, Params *params) {
CandidateSet::~CandidateSet() {
}
-CandidateSet::CandidateSet() {
+CandidateSet::CandidateSet() : CheckpointFactory() {
aln = NULL;
params = NULL;
}
+
+void CandidateSet::saveCheckpoint() {
+ checkpoint->startStruct("CandidateSet");
+ int ntrees = min(params->numNNITrees, (int)size());
+ checkpoint->startList(params->numNNITrees);
+ for (reverse_iterator it = rbegin(); it != rend() && ntrees > 0; it++, ntrees--) {
+ checkpoint->addListElement();
+ stringstream ss;
+ ss.precision(12);
+ ss << it->second.score << " " << it->second.tree;
+// double score = it->second.score;
+// CKP_SAVE(score);
+// checkpoint->put("tree", it->second.tree);
+ checkpoint->put("", ss.str());
+ }
+ checkpoint->endList();
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void CandidateSet::restoreCheckpoint() {
+ CheckpointFactory::restoreCheckpoint();
+ checkpoint->startStruct("CandidateSet");
+ double score;
+ string tree;
+ checkpoint->startList(params->numNNITrees);
+ for (int i = 0; i < params->numNNITrees; i++) {
+ checkpoint->addListElement();
+ string str;
+ if (!checkpoint->getString("", str)) {
+ break;
+ }
+ stringstream ss(str);
+ ss >> score >> tree;
+// CKP_RESTORE(tree);
+ update(tree, score);
+
+ }
+ checkpoint->endList();
+ checkpoint->endStruct();
+}
+
+
vector<string> CandidateSet::getBestTrees() {
vector<string> res;
double bestScore = rbegin()->first;
@@ -179,18 +222,22 @@ double CandidateSet::getWorstScore() {
}
string CandidateSet::getTopology(string tree) {
- PhyloTree mtree;
+// PhyloTree mtree;
// mtree.rooted = params->is_rooted;
- mtree.aln = this->aln;
- mtree.setParams(params);
-
+// mtree.aln = this->aln;
+// mtree.setParams(params);
+ MTree mtree;
+
stringstream str;
str << tree;
str.seekg(0, ios::beg);
// freeNode();
mtree.readTree(str, params->is_rooted);
- mtree.setAlignment(aln);
- mtree.setRootNode(params->root);
+// mtree.setAlignment(aln);
+// mtree.setRootNode(params->root);
+ mtree.assignLeafID();
+ string x = "0";
+ mtree.root = mtree.findLeafName(x);
// mtree.readTreeString(tree);
// mtree.setRootNode(params->root);
diff --git a/candidateset.h b/candidateset.h
index aaa3392..48e9784 100644
--- a/candidateset.h
+++ b/candidateset.h
@@ -11,6 +11,7 @@
#include "alignment.h"
#include "mtreeset.h"
#include <stack>
+#include "checkpoint.h"
struct CandidateTree {
@@ -47,7 +48,7 @@ struct CandidateTree {
/**
* Candidate tree set, sorted in ascending order of scores, i.e. the last element is the highest scoring tree
*/
-class CandidateSet : public multimap<double, CandidateTree> {
+class CandidateSet : public multimap<double, CandidateTree>, public CheckpointFactory {
public:
/**
@@ -58,6 +59,16 @@ public:
CandidateSet();
/**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+ /**
* return randomly one candidate tree from max_candidate
*/
string getRandCandTree();
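
With CandidateSet now inheriting from CheckpointFactory, the new saveCheckpoint()/restoreCheckpoint() overrides above are driven through a shared Checkpoint object. A hypothetical driver showing how the pieces are meant to be wired together (the function names below are illustrative; the surrounding aln/params setup comes from the normal IQ-TREE pipeline and is not shown):

#include "candidateset.h"
#include "checkpoint.h"

void checkpointCandidates(CandidateSet &candidates, const char *ckp_file) {
    Checkpoint *ckp = new Checkpoint();
    ckp->setFileName(ckp_file);
    candidates.setCheckpoint(ckp);   // inherited from CheckpointFactory
    candidates.saveCheckpoint();     // stores the best "score tree" strings as a list
    ckp->dump(true);                 // force a write regardless of the dump interval
}

void resumeCandidates(CandidateSet &candidates, const char *ckp_file) {
    Checkpoint *ckp = new Checkpoint();
    ckp->setFileName(ckp_file);
    ckp->load();                     // silently returns if the file does not exist yet
    candidates.setCheckpoint(ckp);
    candidates.restoreCheckpoint();  // re-inserts each saved tree via update(tree, score)
}
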
diff --git a/checkpoint.cpp b/checkpoint.cpp
index a85a479..19312a6 100644
--- a/checkpoint.cpp
+++ b/checkpoint.cpp
@@ -6,75 +6,79 @@
*/
#include "checkpoint.h"
+#include "tools.h"
+#include "timeutil.h"
+#include "gzstream.h"
-/*
- * The following parameters have been saved for checkpoint in IQPNNI
- *
-Number iterations: 200
-Maximum number iterations: 2000
-Current number iterations: 139
-Probability of deleting a sequence: 0.5
-Number representatives: 4
-Stopping rule (0: YES, 1: YES_MIN_ITER, 2: YES_MAX_ITER, 3: NO): 3
-Type of data (0:NUCLEOTIDE, 1:AMINO_ACID): 0
-Substitution model (0:HKY85, 1: TN93, 2:GTR, 3:WAG, 4:JTT, 5:VT, 6:MtREV24, 7:Blosum62, 8:Dayhoff, 9:rtREV, 10: User-defined): 0
-Frequency of Base A: 0.248672
-Frequency of Base C: 0.261687
-Frequency of Base G: 0.250996
-Frequency of Base T: 0.238645
-Type of parameters (0:ESTIMATE, 1:USER_DEFINED, 2: EQUAL): 0
-Transition/transversion ratito: 0.766912
-Type of parameters (0:ESTIMATE, 1:USER_DEFINED): 0
-Pyridimine/purine ratito: 1
-Type of parameters (0:ESTIMATE, 1:USER_DEFINED): 0
-Transition rate from A to G: -1
-Transition rate from C to T: -1
-Transversion rate from A to C: -1
-Transversion rate from A to T: -1
-Transversion rate from C to G: -1
-Transversion rate from G to T: -1
-Type of parameters (0:ESTIMATE, 1:USER_DEFINED): 0
-Type of rate heterogeneity (0:UNIFORM, 1:SITE_SPECIFIC, 2:GAMMA): 0
-Number rates: 1
-Gamma distribution parameter alpha: 1
-Type of parameters (0:ESTIMATE, 1:USER_DEFINED): 0
-Invariant type (0: NONE, 1:ESTIMATE, 2: USER_DEFINED): 0
-Proportion of invariable sites: 0
-Out group sequence: 0
-Bootstrap sample: 0
-Current bootstrap sample: 0
-Build consensus: 0
-Current best log-likelihood: -11833.35062
-Elapsed time: 23
-Finished: 0
- */
Checkpoint::Checkpoint() {
filename = "";
+ prev_dump_time = 0;
+ dump_interval = 30; // dumping at most once per 30 seconds
+ struct_name = "";
}
+
+Checkpoint::~Checkpoint() {
+}
+
+
+const char* CKP_HEADER = "--- # IQ-TREE Checkpoint";
+
void Checkpoint::setFileName(string filename) {
this->filename = filename;
}
void Checkpoint::load() {
assert(filename != "");
+ if (!fileExists(filename)) return;
try {
- ifstream in;
+ igzstream in;
// set the failbit and badbit
in.exceptions(ios::failbit | ios::badbit);
in.open(filename.c_str());
- string line;
- getline(in, line);
- if (line != "Checkpoint file for IQ-TREE")
- throw ("Invalid checkpoint file");
// remove the failbit
in.exceptions(ios::badbit);
+ string line;
+ if (!getline(in, line)) {
+ in.close();
+ return;
+ }
+ if (line != CKP_HEADER)
+ throw ("Invalid checkpoint file " + filename);
+ string struct_name;
+ size_t pos;
+ int listid = 0;
while (!in.eof()) {
getline(in, line);
- size_t pos = line.find(" := ");
- if (pos == string::npos)
- throw "':=' is expected between key and value";
- (*this)[line.substr(0, pos)] = line.substr(pos+3);
+ pos = line.find('#');
+ if (pos != string::npos)
+ line.erase(pos);
+ line.erase(line.find_last_not_of("\n\r\t")+1);
+// trimString(line);
+ if (line.empty()) continue;
+ if (line[0] != ' ') {
+ struct_name = "";
+ }
+// trimString(line);
+ line.erase(0, line.find_first_not_of(" \n\r\t"));
+ if (line.empty()) continue;
+ pos = line.find(": ");
+ if (pos != string::npos) {
+ // mapping
+ (*this)[struct_name + line.substr(0, pos)] = line.substr(pos+2);
+ } else if (line[line.length()-1] == ':') {
+ // start a new struct
+ line.erase(line.length()-1);
+ trimString(line);
+ struct_name = line + '.';
+ listid = 0;
+ continue;
+ } else {
+ // collection
+ (*this)[struct_name + convertIntToString(listid)] = line;
+ listid++;
+// throw "':' is expected between key and value";
+ }
}
in.clear();
// set the failbit again
@@ -89,81 +93,165 @@ void Checkpoint::load() {
}
}
-void Checkpoint::commit() {
+void Checkpoint::setDumpInterval(double interval) {
+ dump_interval = interval;
+}
+
+
+void Checkpoint::dump(bool force) {
assert(filename != "");
+ if (!force && getRealTime() < prev_dump_time + dump_interval) {
+ return;
+ }
+ prev_dump_time = getRealTime();
try {
- ofstream out;
+ ogzstream out;
out.exceptions(ios::failbit | ios::badbit);
out.open(filename.c_str());
- out << "Checkpoint file for IQ-TREE" << endl;
- for (iterator i = begin(); i != end(); i++)
- out << i->first << " := " << i->second << endl;
+ out << CKP_HEADER << endl;
+ string struct_name;
+ size_t pos;
+ int listid = 0;
+ for (iterator i = begin(); i != end(); i++) {
+ if ((pos = i->first.find('.')) != string::npos) {
+ if (struct_name != i->first.substr(0, pos)) {
+ struct_name = i->first.substr(0, pos);
+ out << struct_name << ":" << endl;
+ listid = 0;
+ }
+ // check if key is a collection
+ out << " " << i->first.substr(pos+1) << ": " << i->second << endl;
+ } else
+ out << i->first << ": " << i->second << endl;
+ }
out.close();
+// cout << "Checkpoint dumped" << endl;
} catch (ios::failure &) {
outError(ERR_WRITE_OUTPUT, filename.c_str());
}
}
-bool Checkpoint::containsKey(string key) {
+bool Checkpoint::hasKey(string key) {
return (find(key) != end());
}
-/**
- * series of get functions
- */
+/*-------------------------------------------------------------
+ * series of get function to get value of a key
+ *-------------------------------------------------------------*/
-template<class T>
-void Checkpoint::get(string key, T& value) {
- assert(containsKey(key));
- stringstream ss((*this)[key]);
- ss >> value;
+bool Checkpoint::getBool(string key, bool &ret) {
+ string value;
+ if (!get(key, value)) return false;
+ if (value == "true")
+ ret = true;
+ else if (value == "false")
+ ret = false;
+ else
+ outError("Invalid boolean value " + value + " for key " + key);
+ return true;
}
bool Checkpoint::getBool(string key) {
- assert(containsKey(key));
- if ((*this)[key] == "1") return true;
- return false;
+ bool ret;
+ if (!getBool(key, ret))
+ return false;
+ return ret;
}
-char Checkpoint::getChar(string key) {
- assert(containsKey(key));
- return (*this)[key][0];
+/*-------------------------------------------------------------
+ * series of put function to put pair of (key,value)
+ *-------------------------------------------------------------*/
+
+void Checkpoint::putBool(string key, bool value) {
+ if (value)
+ put(key, "true");
+ else
+ put(key, "false");
}
-double Checkpoint::getDouble(string key) {
- assert(containsKey(key));
- return convert_double((*this)[key].c_str());
+/*-------------------------------------------------------------
+ * nested structures
+ *-------------------------------------------------------------*/
+void Checkpoint::startStruct(string name) {
+ struct_name = struct_name + name + '.';
}
-int Checkpoint::getInt(string key) {
- assert(containsKey(key));
- return convert_int((*this)[key].c_str());
+/**
+ end the current struct
+*/
+void Checkpoint::endStruct() {
+ size_t pos = struct_name.find_last_of('.', struct_name.length()-2);
+ if (pos == string::npos)
+ struct_name = "";
+ else
+ struct_name.erase(pos+1);
+}
+void Checkpoint::startList(int nelem) {
+ list_element.push_back(-1);
+ if (nelem > 0)
+ list_element_precision.push_back((int)ceil(log10(nelem)));
+ else
+ list_element_precision.push_back(0);
+}
+
+void Checkpoint::addListElement() {
+ list_element.back()++;
+ if (list_element.back() > 0) {
+ size_t pos = struct_name.find_last_of('.', struct_name.length()-2);
+ assert(pos != string::npos);
+ struct_name.erase(pos+1);
+ }
+ stringstream ss;
+ ss << setw(list_element_precision.back()) << setfill('0') << list_element.back();
+// ss << list_element.back();
+ struct_name += ss.str() + ".";
+}
+
+void Checkpoint::endList() {
+ assert(!list_element.empty());
+
+ if (list_element.back() >= 0) {
+ size_t pos = struct_name.find_last_of('.', struct_name.length()-2);
+ assert(pos != string::npos);
+ struct_name.erase(pos+1);
+ }
+
+ list_element.pop_back();
+ list_element_precision.pop_back();
+
+}
+
+void Checkpoint::getSubCheckpoint(Checkpoint *target, string partial_key) {
+ for (iterator it = begin(); it != end(); it++) {
+ if (it->first.find(partial_key) != string::npos)
+ (*target)[it->first] = it->second;
+ }
}
-/**
- * series of put functions
- */
-template<class T>
-void Checkpoint::put(string key, T value) {
- stringstream ss;
- ss << value;
- (*this)[key] = ss.str();
+/*-------------------------------------------------------------
+ * CheckpointFactory
+ *-------------------------------------------------------------*/
+
+CheckpointFactory::CheckpointFactory() {
+ checkpoint = NULL;
}
-template<class T>
-void Checkpoint::putArray(string key, int num, T* value) {
- stringstream ss;
- for (int i = 0; i < num; i++) {
- if (i > 0) ss << ',';
- ss << value[i];
- }
- (*this)[key] = ss.str();
+void CheckpointFactory::setCheckpoint(Checkpoint *checkpoint) {
+ this->checkpoint = checkpoint;
}
+Checkpoint *CheckpointFactory::getCheckpoint() {
+ return checkpoint;
+}
-Checkpoint::~Checkpoint() {
+void CheckpointFactory::saveCheckpoint() {
+ // do nothing
+}
+
+void CheckpointFactory::restoreCheckpoint() {
+ // do nothing
}
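
The rewritten dump()/load() pair above replaces the old "key := value" file with a minimal YAML-like layout: top-level keys as "key: value", one level of structs introduced by "name:" with members indented by two spaces, bare indented lines treated as list elements, and "#" starting a comment. A hypothetical checkpoint file illustrating the format (the keys and values are made up for illustration):

--- # IQ-TREE Checkpoint
CandidateSet:
  00: -11833.3506 (A,(B,C),D);
  01: -11840.1200 (A,(B,D),C);
finished: false

On load(), the indented lines become the flat map keys "CandidateSet.00" and "CandidateSet.01", which is exactly the dotted-key form that startStruct()/startList()/addListElement() build up on the write side.
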
diff --git a/checkpoint.h b/checkpoint.h
index f92c3a4..20ac08c 100644
--- a/checkpoint.h
+++ b/checkpoint.h
@@ -8,58 +8,376 @@
#ifndef CHECKPOINT_H_
#define CHECKPOINT_H_
-#include "tools.h"
+#include <stdio.h>
+#include <map>
+#include <string>
+#include <sstream>
+#include <cassert>
+#include <vector>
+#include <typeinfo>
+
+using namespace std;
+
+// several useful declaration to save to or restore from a checkpoint
+#define CKP_SAVE(var) checkpoint->put(#var, var)
+#define CKP_ARRAY_SAVE(num, arr) checkpoint->putArray(#arr, num, arr)
+#define CKP_VECTOR_SAVE(arr) checkpoint->putVector(#arr, arr)
+
+#define CKP_RESTORE(var) checkpoint->get(#var, var)
+#define CKP_RESTORE_STRING(var) checkpoint->getString(#var, var)
+#define CKP_ARRAY_RESTORE(num, arr) checkpoint->getArray(#arr, num, arr)
+#define CKP_VECTOR_RESTORE(arr) checkpoint->getVector(#arr, arr)
+
+/** checkpoint stream */
+class CkpStream : public stringstream {
+public:
+ explicit CkpStream (ios_base::openmode which = ios_base::in | ios_base::out) : stringstream(which) {}
+
+ explicit CkpStream (const string& str, ios_base::openmode which = ios_base::in | ios_base::out) :
+ stringstream(str, which) {}
+
+};
+
+/* overload operators */
+//ostream& operator<<(ostream& os, const T& obj) {
+// return os;
+//}
+//
+//std::istream& operator>>(std::istream& is, T& obj) {
+// return is;
+//}
/**
* Checkpoint as map from key strings to value strings
*/
class Checkpoint : public map<string, string> {
public:
+
+ /** constructor */
Checkpoint();
+
+ /** destructor */
+ virtual ~Checkpoint();
+
/**
* @param filename file name
*/
void setFileName(string filename);
+
/**
* load checkpoint information from file
*/
void load();
/**
- * commit checkpoint information into file
+ * dump checkpoint information into file
+ * @param force TRUE to dump no matter if time interval exceeded or not
*/
- void commit();
+ void dump(bool force = false);
+
+ /**
+ set dumping interval in seconds
+ @param interval dumping interval
+ */
+ void setDumpInterval(double interval);
/**
* @return true if checkpoint contains the key
* @param key key to search for
*/
- bool containsKey(string key);
+ bool hasKey(string key);
+
+ /*-------------------------------------------------------------
+ * series of get function to get value of a key
+ *-------------------------------------------------------------*/
+
+ /**
+ @param key key name
+ @param[out] value value for key
+ @return true if key exists, false otherwise
+ */
+ template<class T>
+ bool get(string key, T& value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ iterator it = find(key);
+ if (it == end())
+ return false;
+ CkpStream ss(it->second);
+ ss >> value;
+ return true;
+ }
/**
- * series of get functions
+ @param key key name
+ @param[out] value entire string
+ @return true if key exists, false otherwise
*/
+ bool getString(string key, string &value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ iterator it = find(key);
+ if (it == end())
+ return false;
+ value = it->second;
+ return true;
+ }
+
+ /**
+ get an array from checkpoint
+ @param key key name
+ @param num number of elements
+ @param[out] value value
+ */
+ template<class T>
+ bool getArray(string key, int maxnum, T* value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ iterator it = find(key);
+ if (it == end())
+ return false;
+ size_t pos = 0, next_pos;
+ for (int i = 0; i < maxnum; i++) {
+ next_pos = it->second.find(", ", pos);
+ CkpStream ss(it->second.substr(pos, next_pos-pos));
+ if (!(ss >> value[i]))
+ break;
+ if (next_pos == string::npos) break;
+ pos = next_pos+2;
+ }
+ return true;
+ }
+
+ /**
+ get an array from checkpoint
+ @param key key name
+ @param num number of elements
+ @param[out] value value
+ */
template<class T>
- void get(string key, T& value);
+ bool getVector(string key, vector<T> &value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ iterator it = find(key);
+ if (it == end())
+ return false;
+ size_t pos = 0, next_pos;
+ value.clear();
+ for (int i = 0; ; i++) {
+ next_pos = it->second.find(", ", pos);
+ CkpStream ss(it->second.substr(pos, next_pos-pos));
+ T val;
+ if (ss >> val)
+ value.push_back(val);
+ else
+ break;
+ if (next_pos == string::npos) break;
+ pos = next_pos+2;
+ }
+ return true;
+ }
+ /**
+ @param key key name
+ @return bool value for key
+ */
+ bool getBool(string key, bool &ret);
bool getBool(string key);
- char getChar(string key);
- double getDouble(string key);
- int getInt(string key);
+// /**
+// @param key key name
+// @return double value for key
+// */
+// double getDouble(string key);
+//
+// /**
+// @param key key name
+// @return int value for key
+// */
+// int getInt(string key);
- /**
- * series of put functions
- */
+
+ /*-------------------------------------------------------------
+ * series of put function to put pair of (key,value)
+ *-------------------------------------------------------------*/
+
+ /**
+ put pair of (key,value) to checkpoint
+ @param key key name
+ @param value value
+ */
template<class T>
- void put(string key, T value);
+ void put(string key, T value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ CkpStream ss;
+ if (typeid(T) == typeid(double))
+ ss.precision(10);
+ ss << value;
+ (*this)[key] = ss.str();
+ }
+
+ /**
+ @param key key name
+ @param value
+ */
+ void putBool(string key, bool value);
+ /**
+ put an array to checkpoint
+ @param key key name
+ @param num number of elements
+ @param value value
+ */
template<class T>
- void putArray(string key, int num, T* value);
+ void putArray(string key, int num, T* value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ CkpStream ss;
+ if (typeid(T) == typeid(double))
+ ss.precision(10);
+ for (int i = 0; i < num; i++) {
+ if (i > 0) ss << ", ";
+ ss << value[i];
+ }
+ (*this)[key] = ss.str();
+ }
- virtual ~Checkpoint();
+ /**
+ put an STL vector to checkpoint
+ @param key key name
+ @param num number of elements
+ @param value value
+ */
+ template<class T>
+ void putVector(string key, vector<T> &value) {
+ if (key.empty())
+ key = struct_name.substr(0, struct_name.length()-1);
+ else
+ key = struct_name + key;
+ CkpStream ss;
+ if (typeid(T) == typeid(double))
+ ss.precision(10);
+ for (int i = 0; i < value.size(); i++) {
+ if (i > 0) ss << ", ";
+ ss << value[i];
+ }
+ (*this)[key] = ss.str();
+ }
+
+ /*-------------------------------------------------------------
+ * helper functions
+ *-------------------------------------------------------------*/
+
+ /**
+ start a new struct
+ */
+ void startStruct(string name);
+
+ /**
+ end the current struct
+ */
+ void endStruct();
+
+ /**
+ start a new list in the current scope
+ @param nelem number of elements
+ */
+ void startList(int nelem);
+
+ /**
+ add an element to the current list
+ */
+ void addListElement();
+
+ /**
+ end the current list
+ */
+ void endList();
+
+ /**
+ get a subset of checkpoint where the key string contains a given substring
+ @param[out] target checkpoint
+ @param sub_key key substring to search for
+ */
+ void getSubCheckpoint(Checkpoint *target, string sub_key);
+protected:
+
+ /** filename to write checkpoint */
string filename;
+
+ /** previous dump time in seconds */
+ double prev_dump_time;
+
+ /** dumping time interval */
+ double dump_interval;
+
+private:
+
+ /** name of the current nested key */
+ string struct_name;
+
+ /** current list element ID */
+ vector<int> list_element;
+
+ /** width to element ID for prefixing with '0' */
+ vector<int> list_element_precision;
+
+};
+
+
+
+/**
+ Root class handling all checkpoint facilities. Inherit this class
+ if you want a class to be checkpointed.
+*/
+class CheckpointFactory {
+public:
+
+ /** constructor */
+ CheckpointFactory();
+
+ virtual ~CheckpointFactory() {}
+
+ /**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ get checkpoint object
+ @return checkpoint
+ */
+ Checkpoint *getCheckpoint();
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+protected:
+
+ Checkpoint *checkpoint;
+
};
#endif /* CHECKPOINT_H_ */
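
The CKP_* macros and the put/get templates above are intended to be called from saveCheckpoint()/restoreCheckpoint() overrides, bracketed by startStruct()/endStruct(), in the same way CandidateSet does earlier in this commit. A hypothetical subclass (class and member names are illustrative only) showing the pattern:

#include "checkpoint.h"

class RateModelStub : public CheckpointFactory {
public:
    double alpha;
    int ncategory;

    virtual void saveCheckpoint() {
        checkpoint->startStruct("RateModelStub");
        CKP_SAVE(alpha);              // expands to checkpoint->put("alpha", alpha)
        CKP_SAVE(ncategory);
        checkpoint->endStruct();
        CheckpointFactory::saveCheckpoint();
    }

    virtual void restoreCheckpoint() {
        CheckpointFactory::restoreCheckpoint();
        checkpoint->startStruct("RateModelStub");
        CKP_RESTORE(alpha);           // leaves alpha untouched if the key is absent
        CKP_RESTORE(ncategory);
        checkpoint->endStruct();
    }
};
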
diff --git a/gsl/CMakeLists.txt b/gsl/CMakeLists.txt
new file mode 100644
index 0000000..06a9cef
--- /dev/null
+++ b/gsl/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_library(gsl
+binomial_tpe.cpp
+multinomial.cpp
+pow_int.cpp
+gauss.cpp
+gaussinv.cpp
+gausspdf.cpp
+)
diff --git a/gsl/binomial_tpe.cpp b/gsl/binomial_tpe.cpp
new file mode 100644
index 0000000..e68c721
--- /dev/null
+++ b/gsl/binomial_tpe.cpp
@@ -0,0 +1,381 @@
+/* randist/binomial_tpe.c
+ *
+ * Copyright (C) 1996, 2003, 2007 James Theiler, Brian Gough
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+//#include <config.h>
+#include <stdlib.h>
+#include <math.h>
+//#include "../tools.h"
+#include "mygsl.h"
+//#include <gsl/gsl_rng.h>
+//#include <gsl/gsl_randist.h>
+//#include <gsl/gsl_pow_int.h>
+//#include <gsl/gsl_sf_gamma.h>
+
+extern double random_double(int *rstream);
+
+
+/* The binomial distribution has the form,
+
+ f(x) = n!/(x!(n-x)!) * p^x (1-p)^(n-x) for integer 0 <= x <= n
+ = 0 otherwise
+
+ This implementation follows the public domain ranlib function
+ "ignbin", the bulk of which is the BTPE (Binomial Triangle
+ Parallelogram Exponential) algorithm introduced in
+ Kachitvichyanukul and Schmeiser[1]. It has been translated to use
+ modern C coding standards.
+
+ If n is small and/or p is near 0 or near 1 (specifically, if
+ n*min(p,1-p) < SMALL_MEAN), then a different algorithm, called
+ BINV, is used which has an average runtime that scales linearly
+ with n*min(p,1-p).
+
+ But for larger problems, the BTPE algorithm takes the form of two
+ functions b(x) and t(x) -- "bottom" and "top" -- for which b(x) <
+ f(x)/f(M) < t(x), with M = floor(n*p+p). b(x) defines a triangular
+ region, and t(x) includes a parallelogram and two tails. Details
+ (including a nice drawing) are in the paper.
+
+ [1] Kachitvichyanukul, V. and Schmeiser, B. W. Binomial Random
+ Variate Generation. Communications of the ACM, 31, 2 (February,
+ 1988) 216.
+
+ Note, Bruce Schmeiser (personal communication) points out that if
+ you want very fast binomial deviates, and you are happy with
+ approximate results, and/or n and n*p are both large, then you can
+ just use gaussian estimates: mean=n*p, variance=n*p*(1-p).
+
+ This implementation by James Theiler, April 2003, after obtaining
+ permission -- and some good advice -- from Drs. Kachitvichyanukul
+ and Schmeiser to use their code as a starting point, and then doing
+ a little bit of tweaking.
+
+ Additional polishing for GSL coding standards by Brian Gough. */
+
+#define SMALL_MEAN 14 /* If n*p < SMALL_MEAN then use BINV
+ algorithm. The ranlib
+ implementation used cutoff=30; but
+ on my computer 14 works better */
+
+#define BINV_CUTOFF 110 /* In BINV, do not permit ix too large */
+
+#define FAR_FROM_MEAN 20 /* If ix-n*p is larger than this, then
+ use the "squeeze" algorithm.
+ Ranlib used 20, and this seems to
+ be the best choice on my machine as
+ well */
+
+#define LNFACT(x) gsl_sf_lnfact(x)
+
+inline static double
+Stirling (double y1)
+{
+ double y2 = y1 * y1;
+ double s =
+ (13860.0 -
+ (462.0 - (132.0 - (99.0 - 140.0 / y2) / y2) / y2) / y2) / y1 / 166320.0;
+ return s;
+}
+
+unsigned int
+gsl_ran_binomial (double p, unsigned int n, int *rstream)
+{
+ int ix; /* return value */
+ int flipped = 0;
+ double q, s, np;
+
+ if (n == 0)
+ return 0;
+
+ if (p > 0.5)
+ {
+ p = 1.0 - p; /* work with small p */
+ flipped = 1;
+ }
+
+ q = 1 - p;
+ s = p / q;
+ np = n * p;
+
+ /* Inverse cdf logic for small mean (BINV in K+S) */
+
+ if (np < SMALL_MEAN)
+ {
+ double f0 = gsl_pow_uint (q, n); /* f(x), starting with x=0 */
+
+ while (1)
+ {
+ /* This while(1) loop will almost certainly only loop once; but
+ * if u=1 to within a few epsilons of machine precision, then it
+ * is possible for roundoff to prevent the main loop over ix to
+ * achieve its proper value. following the ranlib implementation,
+ * we introduce a check for that situation, and when it occurs,
+ * we just try again.
+ */
+
+ double f = f0;
+ double u = random_double(rstream);
+
+ for (ix = 0; ix <= BINV_CUTOFF; ++ix)
+ {
+ if (u < f)
+ goto Finish;
+ u -= f;
+ /* Use recursion f(x+1) = f(x)*[(n-x)/(x+1)]*[p/(1-p)] */
+ f *= s * (n - ix) / (ix + 1);
+ }
+
+ /* It should be the case that the 'goto Finish' was encountered
+ * before this point was ever reached. But if we have reached
+ * this point, then roundoff has prevented u from decreasing
+ * all the way to zero. This can happen only if the initial u
+ * was very nearly equal to 1, which is a rare situation. In
+ * that rare situation, we just try again.
+ *
+ * Note, following the ranlib implementation, we loop ix only to
+ * a hardcoded value of SMALL_MEAN_LARGE_N=110; we could have
+ * looped to n, and 99.99...% of the time it won't matter. This
+ * choice, I think is a little more robust against the rare
+ * roundoff error. If n>LARGE_N, then it is technically
+ * possible for ix>LARGE_N, but it is astronomically rare, and
+ * if ix is that large, it is more likely due to roundoff than
+ * probability, so better to nip it at LARGE_N than to take a
+ * chance that roundoff will somehow conspire to produce an even
+ * larger (and more improbable) ix. If n<LARGE_N, then once
+ * ix=n, f=0, and the loop will continue until ix=LARGE_N.
+ */
+ }
+ }
+ else
+ {
+ /* For n >= SMALL_MEAN, we invoke the BTPE algorithm */
+
+ int k;
+
+ double ffm = np + p; /* ffm = n*p+p */
+ int m = (int) ffm; /* m = int floor[n*p+p] */
+ double fm = m; /* fm = double m; */
+ double xm = fm + 0.5; /* xm = half integer mean (tip of triangle) */
+ double npq = np * q; /* npq = n*p*q */
+
+ /* Compute cumulative area of tri, para, exp tails */
+
+ /* p1: radius of triangle region; since height=1, also: area of region */
+ /* p2: p1 + area of parallelogram region */
+ /* p3: p2 + area of left tail */
+ /* p4: p3 + area of right tail */
+ /* pi/p4: probability of i'th area (i=1,2,3,4) */
+
+ /* Note: magic numbers 2.195, 4.6, 0.134, 20.5, 15.3 */
+ /* These magic numbers are not adjustable...at least not easily! */
+
+ double p1 = floor (2.195 * sqrt (npq) - 4.6 * q) + 0.5;
+
+ /* xl, xr: left and right edges of triangle */
+ double xl = xm - p1;
+ double xr = xm + p1;
+
+ /* Parameter of exponential tails */
+ /* Left tail: t(x) = c*exp(-lambda_l*[xl - (x+0.5)]) */
+ /* Right tail: t(x) = c*exp(-lambda_r*[(x+0.5) - xr]) */
+
+ double c = 0.134 + 20.5 / (15.3 + fm);
+ double p2 = p1 * (1.0 + c + c);
+
+ double al = (ffm - xl) / (ffm - xl * p);
+ double lambda_l = al * (1.0 + 0.5 * al);
+ double ar = (xr - ffm) / (xr * q);
+ double lambda_r = ar * (1.0 + 0.5 * ar);
+ double p3 = p2 + c / lambda_l;
+ double p4 = p3 + c / lambda_r;
+
+ double var, accept;
+ double u, v; /* random variates */
+
+ TryAgain:
+
+ /* generate random variates, u specifies which region: Tri, Par, Tail */
+ u = random_double(rstream) * p4;
+ v = random_double(rstream);
+
+ if (u <= p1)
+ {
+ /* Triangular region */
+ ix = (int) (xm - p1 * v + u);
+ goto Finish;
+ }
+ else if (u <= p2)
+ {
+ /* Parallelogram region */
+ double x = xl + (u - p1) / c;
+ v = v * c + 1.0 - fabs (x - xm) / p1;
+ if (v > 1.0 || v <= 0.0)
+ goto TryAgain;
+ ix = (int) x;
+ }
+ else if (u <= p3)
+ {
+ /* Left tail */
+ ix = (int) (xl + log (v) / lambda_l);
+ if (ix < 0)
+ goto TryAgain;
+ v *= ((u - p2) * lambda_l);
+ }
+ else
+ {
+ /* Right tail */
+ ix = (int) (xr - log (v) / lambda_r);
+ if (ix > (double) n)
+ goto TryAgain;
+ v *= ((u - p3) * lambda_r);
+ }
+
+ /* At this point, the goal is to test whether v <= f(x)/f(m)
+ *
+ * v <= f(x)/f(m) = (m!(n-m)! / (x!(n-x)!)) * (p/q)^{x-m}
+ *
+ */
+
+ /* Here is a direct test using logarithms. It is a little
+ * slower than the various "squeezing" computations below, but
+ * if things are working, it should give exactly the same answer
+ * (given the same random number seed). */
+
+#ifdef DIRECT
+ var = log (v);
+
+ accept =
+ LNFACT (m) + LNFACT (n - m) - LNFACT (ix) - LNFACT (n - ix)
+ + (ix - m) * log (p / q);
+
+#else /* SQUEEZE METHOD */
+
+ /* More efficient determination of whether v < f(x)/f(M) */
+
+ k = abs (ix - m);
+
+ if (k <= FAR_FROM_MEAN)
+ {
+ /*
+ * If ix near m (ie, |ix-m|<FAR_FROM_MEAN), then do
+ * explicit evaluation using recursion relation for f(x)
+ */
+ double g = (n + 1) * s;
+ double f = 1.0;
+
+ var = v;
+
+ if (m < ix)
+ {
+ int i;
+ for (i = m + 1; i <= ix; i++)
+ {
+ f *= (g / i - s);
+ }
+ }
+ else if (m > ix)
+ {
+ int i;
+ for (i = ix + 1; i <= m; i++)
+ {
+ f /= (g / i - s);
+ }
+ }
+
+ accept = f;
+ }
+ else
+ {
+ /* If ix is far from the mean m: k=ABS(ix-m) large */
+
+ var = log (v);
+
+ if (k < npq / 2 - 1)
+ {
+ /* "Squeeze" using upper and lower bounds on
+ * log(f(x)) The squeeze condition was derived
+ * under the condition k < npq/2-1 */
+ double amaxp =
+ k / npq * ((k * (k / 3.0 + 0.625) + (1.0 / 6.0)) / npq + 0.5);
+ double ynorm = -(k * k / (2.0 * npq));
+ if (var < ynorm - amaxp)
+ goto Finish;
+ if (var > ynorm + amaxp)
+ goto TryAgain;
+ }
+
+ /* Now, again: do the test log(v) vs. log f(x)/f(M) */
+
+#if USE_EXACT
+ /* This is equivalent to the above, but is a little (~20%) slower */
+ /* There are five log's vs three above, maybe that's it? */
+
+ accept = LNFACT (m) + LNFACT (n - m)
+ - LNFACT (ix) - LNFACT (n - ix) + (ix - m) * log (p / q);
+
+#else /* USE STIRLING */
+ /* The "#define Stirling" above corresponds to the first five
+ * terms in the asymptotic formula for
REPLACED_BY_GR_REPLACE
+ * log Gamma (y) - (y-0.5)log(y) + y - 0.5 log(2*pi);
+ * See Abramowitz and Stegun, eq 6.1.40
+ */
+
+ /* Note below: two Stirling's are added, and two are
+ * subtracted. In both K+S, and in the ranlib
+ * implementation, all four are added. I (jt) believe that
+ * is a mistake -- this has been confirmed by personal
+ * correspondence w/ Dr. Kachitvichyanukul. Note, however,
+ * the corrections are so small, that I couldn't find an
+ * example where it made a difference that could be
+ * observed, let alone tested. In fact, define'ing Stirling
+ * to be zero gave identical results!! In practice, alv is
+ * O(1), ranging 0 to -10 or so, while the Stirling
+ * correction is typically O(10^{-5}) ...setting the
+ * correction to zero gives about a 2% performance boost;
+ * might as well keep it just to be pedantic. */
+
+ {
+ double x1 = ix + 1.0;
+ double w1 = n - ix + 1.0;
+ double f1 = fm + 1.0;
+ double z1 = n + 1.0 - fm;
+
+ accept = xm * log (f1 / x1) + (n - m + 0.5) * log (z1 / w1)
+ + (ix - m) * log (w1 * p / (x1 * q))
+ + Stirling (f1) + Stirling (z1) - Stirling (x1) - Stirling (w1);
+ }
+#endif
+#endif
+ }
+
+
+ if (var <= accept)
+ {
+ goto Finish;
+ }
+ else
+ {
+ goto TryAgain;
+ }
+ }
+
+Finish:
+
+ return (flipped) ? (n - ix) : (unsigned int)ix;
+}
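
gsl_ran_binomial() above selects BINV for small n*min(p,1-p) and BTPE otherwise, but callers see a single entry point taking the success probability, the number of trials, and an optional random stream (per the alignment.h documentation in this commit, NULL means the global randstream). A small sanity-check sketch, assuming mygsl.h declares the function with the signature defined here and that the binary is linked against the project's random_double():

#include <cstdio>
#include "mygsl.h"

int main() {
    const unsigned int n = 1000;
    const double p = 0.3;
    const int samples = 100000;
    double sum = 0;
    for (int i = 0; i < samples; i++)
        sum += gsl_ran_binomial(p, n, NULL);   // NULL = use the global random stream
    // The empirical mean of Binomial(n, p) draws should be close to n*p = 300.
    printf("empirical mean %.2f vs expected %.2f\n", sum / samples, n * p);
    return 0;
}
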
diff --git a/gsl/gauss.cpp b/gsl/gauss.cpp
new file mode 100644
index 0000000..b6c2d0e
--- /dev/null
+++ b/gsl/gauss.cpp
@@ -0,0 +1,352 @@
+/* cdf/gauss.c
+ *
+ * Copyright (C) 2002, 2004 Jason H. Stover.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * Computes the cumulative distribution function for the Gaussian
+ * distribution using a rational function approximation. The
+ * computation is for the standard Normal distribution, i.e., mean 0
+ * and standard deviation 1. If you want to compute Pr(X < t) for a
+ * Gaussian random variable X with non-zero mean m and standard
+ * deviation sd not equal to 1, find gsl_cdf_ugaussian ((t-m)/sd).
+ * This approximation is accurate to at least double precision. The
+ * accuracy was verified with a pari-gp script. The largest error
+ * found was about 1.4E-20. The coefficients were derived by Cody.
+ *
+ * References:
+ *
+ * W.J. Cody. "Rational Chebyshev Approximations for the Error
+ * Function," Mathematics of Computation, v23 n107 1969, 631-637.
+ *
+ * W. Fraser, J.F Hart. "On the Computation of Rational Approximations
+ * to Continuous Functions," Communications of the ACM, v5 1962.
+ *
+ * W.J. Kennedy Jr., J.E. Gentle. "Statistical Computing." Marcel Dekker. 1980.
+ *
+ *
+ */
+
+//#include <config.h>
+#include <math.h>
+//#include <gsl/gsl_math.h>
+//#include <gsl/gsl_cdf.h>
+#define GSL_DBL_EPSILON 2.2204460492503131e-16
+
+#ifndef M_1_SQRT2PI
+#define M_1_SQRT2PI (M_2_SQRTPI * M_SQRT1_2 / 2.0)
+#endif
+
+#define SQRT32 (4.0 * M_SQRT2)
+
+/*
+ * IEEE double precision dependent constants.
+ *
+ * GAUSS_EPSILON: Smallest positive value such that
+ * gsl_cdf_gaussian(x) > 0.5.
+ * GAUSS_XUPPER: Largest value x such that gsl_cdf_gaussian(x) < 1.0.
+ * GAUSS_XLOWER: Smallest value x such that gsl_cdf_gaussian(x) > 0.0.
+ */
+
+#define GAUSS_EPSILON (GSL_DBL_EPSILON / 2)
+#define GAUSS_XUPPER (8.572)
+#define GAUSS_XLOWER (-37.519)
+
+#define GAUSS_SCALE (16.0)
+
+static double
+get_del (double x, double rational)
+{
+ double xsq = 0.0;
+ double del = 0.0;
+ double result = 0.0;
+
+ xsq = floor (x * GAUSS_SCALE) / GAUSS_SCALE;
+ del = (x - xsq) * (x + xsq);
+ del *= 0.5;
+
+ result = exp (-0.5 * xsq * xsq) * exp (-1.0 * del) * rational;
+
+ return result;
+}
+
+/*
+ * Normal cdf for fabs(x) < 0.66291
+ */
+static double
+gauss_small (const double x)
+{
+ unsigned int i;
+ double result = 0.0;
+ double xsq;
+ double xnum;
+ double xden;
+
+ const double a[5] = {
+ 2.2352520354606839287,
+ 161.02823106855587881,
+ 1067.6894854603709582,
+ 18154.981253343561249,
+ 0.065682337918207449113
+ };
+ const double b[4] = {
+ 47.20258190468824187,
+ 976.09855173777669322,
+ 10260.932208618978205,
+ 45507.789335026729956
+ };
+
+ xsq = x * x;
+ xnum = a[4] * xsq;
+ xden = xsq;
+
+ for (i = 0; i < 3; i++)
+ {
+ xnum = (xnum + a[i]) * xsq;
+ xden = (xden + b[i]) * xsq;
+ }
+
+ result = x * (xnum + a[3]) / (xden + b[3]);
+
+ return result;
+}
+
+/*
+ * Normal cdf for 0.66291 < fabs(x) < sqrt(32).
+ */
+static double
+gauss_medium (const double x)
+{
+ unsigned int i;
+ double temp = 0.0;
+ double result = 0.0;
+ double xnum;
+ double xden;
+ double absx;
+
+ const double c[9] = {
+ 0.39894151208813466764,
+ 8.8831497943883759412,
+ 93.506656132177855979,
+ 597.27027639480026226,
+ 2494.5375852903726711,
+ 6848.1904505362823326,
+ 11602.651437647350124,
+ 9842.7148383839780218,
+ 1.0765576773720192317e-8
+ };
+ const double d[8] = {
+ 22.266688044328115691,
+ 235.38790178262499861,
+ 1519.377599407554805,
+ 6485.558298266760755,
+ 18615.571640885098091,
+ 34900.952721145977266,
+ 38912.003286093271411,
+ 19685.429676859990727
+ };
+
+ absx = fabs (x);
+
+ xnum = c[8] * absx;
+ xden = absx;
+
+ for (i = 0; i < 7; i++)
+ {
+ xnum = (xnum + c[i]) * absx;
+ xden = (xden + d[i]) * absx;
+ }
+
+ temp = (xnum + c[7]) / (xden + d[7]);
+
+ result = get_del (x, temp);
+
+ return result;
+}
+
+/*
+ * Normal cdf for
+ * {sqrt(32) < x < GAUSS_XUPPER} union { GAUSS_XLOWER < x < -sqrt(32) }.
+ */
+static double
+gauss_large (const double x)
+{
+ int i;
+ double result;
+ double xsq;
+ double temp;
+ double xnum;
+ double xden;
+ double absx;
+
+ const double p[6] = {
+ 0.21589853405795699,
+ 0.1274011611602473639,
+ 0.022235277870649807,
+ 0.001421619193227893466,
+ 2.9112874951168792e-5,
+ 0.02307344176494017303
+ };
+ const double q[5] = {
+ 1.28426009614491121,
+ 0.468238212480865118,
+ 0.0659881378689285515,
+ 0.00378239633202758244,
+ 7.29751555083966205e-5
+ };
+
+ absx = fabs (x);
+ xsq = 1.0 / (x * x);
+ xnum = p[5] * xsq;
+ xden = xsq;
+
+ for (i = 0; i < 4; i++)
+ {
+ xnum = (xnum + p[i]) * xsq;
+ xden = (xden + q[i]) * xsq;
+ }
+
+ temp = xsq * (xnum + p[4]) / (xden + q[4]);
+ temp = (M_1_SQRT2PI - temp) / absx;
+
+ result = get_del (x, temp);
+
+ return result;
+}
+
+double
+gsl_cdf_ugaussian_P (const double x)
+{
+ double result;
+ double absx = fabs (x);
+
+ if (absx < GAUSS_EPSILON)
+ {
+ result = 0.5;
+ return result;
+ }
+ else if (absx < 0.66291)
+ {
+ result = 0.5 + gauss_small (x);
+ return result;
+ }
+ else if (absx < SQRT32)
+ {
+ result = gauss_medium (x);
+
+ if (x > 0.0)
+ {
+ result = 1.0 - result;
+ }
+
+ return result;
+ }
+ else if (x > GAUSS_XUPPER)
+ {
+ result = 1.0;
+ return result;
+ }
+ else if (x < GAUSS_XLOWER)
+ {
+ result = 0.0;
+ return result;
+ }
+ else
+ {
+ result = gauss_large (x);
+
+ if (x > 0.0)
+ {
+ result = 1.0 - result;
+ }
+ }
+
+ return result;
+}
+
+double
+gsl_cdf_ugaussian_Q (const double x)
+{
+ double result;
+ double absx = fabs (x);
+
+ if (absx < GAUSS_EPSILON)
+ {
+ result = 0.5;
+ return result;
+ }
+ else if (absx < 0.66291)
+ {
+ result = gauss_small (x);
+
+ if (x < 0.0)
+ {
+ result = fabs (result) + 0.5;
+ }
+ else
+ {
+ result = 0.5 - result;
+ }
+
+ return result;
+ }
+ else if (absx < SQRT32)
+ {
+ result = gauss_medium (x);
+
+ if (x < 0.0)
+ {
+ result = 1.0 - result;
+ }
+
+ return result;
+ }
+ else if (x > -(GAUSS_XLOWER))
+ {
+ result = 0.0;
+ return result;
+ }
+ else if (x < -(GAUSS_XUPPER))
+ {
+ result = 1.0;
+ return result;
+ }
+ else
+ {
+ result = gauss_large (x);
+
+ if (x < 0.0)
+ {
+ result = 1.0 - result;
+ }
+
+ }
+
+ return result;
+}
+
+double
+gsl_cdf_gaussian_P (const double x, const double sigma)
+{
+ return gsl_cdf_ugaussian_P (x / sigma);
+}
+
+double
+gsl_cdf_gaussian_Q (const double x, const double sigma)
+{
+ return gsl_cdf_ugaussian_Q (x / sigma);
+}
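
gsl_cdf_ugaussian_P() and gsl_cdf_ugaussian_Q() above are the lower and upper tails of the standard normal, so over the supported range they should sum to 1, and the GAUSS_EPSILON branch makes the CDF exactly 0.5 at zero. A quick check, assuming mygsl.h exposes both declarations:

#include <cassert>
#include <cmath>
#include "mygsl.h"

int main() {
    // Lower and upper tails of the standard normal must sum to 1.
    for (double x = -8.0; x <= 8.0; x += 0.25)
        assert(std::fabs(gsl_cdf_ugaussian_P(x) + gsl_cdf_ugaussian_Q(x) - 1.0) < 1e-12);
    // At x = 0 the GAUSS_EPSILON branch returns exactly 0.5.
    assert(gsl_cdf_ugaussian_P(0.0) == 0.5);
    return 0;
}
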
diff --git a/gsl/gaussinv.cpp b/gsl/gaussinv.cpp
new file mode 100644
index 0000000..0dadf5d
--- /dev/null
+++ b/gsl/gaussinv.cpp
@@ -0,0 +1,205 @@
+/* cdf/inverse_normal.c
+ *
+ * Copyright (C) 2002 Przemyslaw Sliwa and Jason H. Stover.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * Computes the inverse normal cumulative distribution function
+ * according to the algorithm shown in
+ *
+ * Wichura, M.J. (1988).
+ * Algorithm AS 241: The Percentage Points of the Normal Distribution.
+ * Applied Statistics, 37, 477-484.
+ */
+
+//#include <config.h>
+//#include <gsl/gsl_errno.h>
+//#include <gsl/gsl_math.h>
+//#include <gsl/gsl_cdf.h>
+
+#include <stdlib.h>
+#include <math.h>
+#include "rat_eval.h"
+#include "gsl_nan.h"
+
+static double
+small (double q)
+{
+ const double a[8] = { 3.387132872796366608, 133.14166789178437745,
+ 1971.5909503065514427, 13731.693765509461125,
+ 45921.953931549871457, 67265.770927008700853,
+ 33430.575583588128105, 2509.0809287301226727
+ };
+
+ const double b[8] = { 1.0, 42.313330701600911252,
+ 687.1870074920579083, 5394.1960214247511077,
+ 21213.794301586595867, 39307.89580009271061,
+ 28729.085735721942674, 5226.495278852854561
+ };
+
+ double r = 0.180625 - q * q;
+
+ double x = q * rat_eval (a, 8, b, 8, r);
+
+ return x;
+}
+
+static double
+intermediate (double r)
+{
+ const double a[] = { 1.42343711074968357734, 4.6303378461565452959,
+ 5.7694972214606914055, 3.64784832476320460504,
+ 1.27045825245236838258, 0.24178072517745061177,
+ 0.0227238449892691845833, 7.7454501427834140764e-4
+ };
+
+ const double b[] = { 1.0, 2.05319162663775882187,
+ 1.6763848301838038494, 0.68976733498510000455,
+ 0.14810397642748007459, 0.0151986665636164571966,
+ 5.475938084995344946e-4, 1.05075007164441684324e-9
+ };
+
+ double x = rat_eval (a, 8, b, 8, (r - 1.6));
+
+ return x;
+}
+
+static double
+tail (double r)
+{
+ const double a[] = { 6.6579046435011037772, 5.4637849111641143699,
+ 1.7848265399172913358, 0.29656057182850489123,
+ 0.026532189526576123093, 0.0012426609473880784386,
+ 2.71155556874348757815e-5, 2.01033439929228813265e-7
+ };
+
+ const double b[] = { 1.0, 0.59983220655588793769,
+ 0.13692988092273580531, 0.0148753612908506148525,
+ 7.868691311456132591e-4, 1.8463183175100546818e-5,
+ 1.4215117583164458887e-7, 2.04426310338993978564e-15
+ };
+
+ double x = rat_eval (a, 8, b, 8, (r - 5.0));
+
+ return x;
+}
+
+double
+gsl_cdf_ugaussian_Pinv (const double P)
+{
+ double r, x, pp;
+
+ double dP = P - 0.5;
+
+ if (P == 1.0)
+ {
+ return GSL_POSINF;
+ }
+ else if (P == 0.0)
+ {
+ return GSL_NEGINF;
+ }
+
+ if (fabs (dP) <= 0.425)
+ {
+ x = small (dP);
+
+ return x;
+ }
+
+ pp = (P < 0.5) ? P : 1.0 - P;
+
+ r = sqrt (-log (pp));
+
+ if (r <= 5.0)
+ {
+ x = intermediate (r);
+ }
+ else
+ {
+ x = tail (r);
+ }
+
+ if (P < 0.5)
+ {
+ return -x;
+ }
+ else
+ {
+ return x;
+ }
+
+}
+
+double
+gsl_cdf_ugaussian_Qinv (const double Q)
+{
+ double r, x, pp;
+
+ double dQ = Q - 0.5;
+
+ if (Q == 1.0)
+ {
+ return GSL_NEGINF;
+ }
+ else if (Q == 0.0)
+ {
+ return GSL_POSINF;
+ }
+
+ if (fabs (dQ) <= 0.425)
+ {
+ x = small (dQ);
+
+ return -x;
+ }
+
+ pp = (Q < 0.5) ? Q : 1.0 - Q;
+
+ r = sqrt (-log (pp));
+
+ if (r <= 5.0)
+ {
+ x = intermediate (r);
+ }
+ else
+ {
+ x = tail (r);
+ }
+
+ if (Q < 0.5)
+ {
+ return x;
+ }
+ else
+ {
+ return -x;
+ }
+}
+
+
+double
+gsl_cdf_gaussian_Pinv (const double P, const double sigma)
+{
+ return sigma * gsl_cdf_ugaussian_Pinv (P);
+}
+
+double
+gsl_cdf_gaussian_Qinv (const double Q, const double sigma)
+{
+ return sigma * gsl_cdf_ugaussian_Qinv (Q);
+}
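
gsl_cdf_ugaussian_Pinv() above implements Wichura's AS 241 percentage points, so composing it with the CDF from gauss.cpp should recover the argument to near double precision. A small round-trip check, again assuming both functions are declared in mygsl.h:

#include <cassert>
#include <cmath>
#include "mygsl.h"

int main() {
    // x -> P(x) -> Pinv(P(x)) should come back within a tiny relative error.
    for (double x = -5.0; x <= 5.0; x += 0.5) {
        double p = gsl_cdf_ugaussian_P(x);
        double back = gsl_cdf_ugaussian_Pinv(p);
        assert(std::fabs(back - x) < 1e-8 * (1.0 + std::fabs(x)));
    }
    return 0;
}
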
diff --git a/gsl/gausspdf.cpp b/gsl/gausspdf.cpp
new file mode 100644
index 0000000..d80eaaf
--- /dev/null
+++ b/gsl/gausspdf.cpp
@@ -0,0 +1,40 @@
+/* randist/gauss.c
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000, 2006, 2007 James Theiler, Brian Gough
+ * Copyright (C) 2006 Charles Karney
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+//#include <config.h>
+#include <math.h>
+//#include <gsl/gsl_math.h>
+//#include <gsl/gsl_rng.h>
+//#include <gsl/gsl_randist.h>
+
+double
+gsl_ran_gaussian_pdf (const double x, const double sigma)
+{
+ double u = x / fabs (sigma);
+ double p = (1 / (sqrt (2 * 3.14159265358979323846264338327950288) * fabs (sigma))) * exp (-u * u / 2);
+ return p;
+}
+
+double
+gsl_ran_ugaussian_pdf (const double x)
+{
+ return gsl_ran_gaussian_pdf (x, 1.0);
+}
+
diff --git a/gsl/gsl_nan.h b/gsl/gsl_nan.h
new file mode 100644
index 0000000..5cb52ef
--- /dev/null
+++ b/gsl/gsl_nan.h
@@ -0,0 +1,45 @@
+/* gsl_nan.h
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000, 2007 Gerard Jungman, Brian Gough
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GSL_NAN_H__
+#define __GSL_NAN_H__
+
+#ifdef INFINITY
+# define GSL_POSINF INFINITY
+# define GSL_NEGINF (-INFINITY)
+#elif defined(HUGE_VAL)
+# define GSL_POSINF HUGE_VAL
+# define GSL_NEGINF (-HUGE_VAL)
+#else
+# define GSL_POSINF (gsl_posinf())
+# define GSL_NEGINF (gsl_neginf())
+#endif
+
+#ifdef NAN
+# define GSL_NAN NAN
+#elif defined(INFINITY)
+# define GSL_NAN (INFINITY/INFINITY)
+#else
+# define GSL_NAN (gsl_nan())
+#endif
+
+#define GSL_POSZERO (+0.0)
+#define GSL_NEGZERO (-0.0)
+
+#endif /* __GSL_NAN_H__ */
diff --git a/gsl/multinomial.cpp b/gsl/multinomial.cpp
new file mode 100644
index 0000000..1e98dfa
--- /dev/null
+++ b/gsl/multinomial.cpp
@@ -0,0 +1,82 @@
+/* randist/multinomial.c
+ *
+ * Copyright (C) 2002 Gavin E. Crooks <gec at compbio.berkeley.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+//#include <config.h>
+#include <stdlib.h>
+#include <math.h>
+//#include "../tools.h"
+//#include <gsl/gsl_rng.h>
+//#include <gsl/gsl_randist.h>
+//#include <gsl/gsl_sf_gamma.h>
+#include "mygsl.h"
+
+/* The multinomial distribution has the form
+
+   prob(n_1, n_2, ... n_K)
+      = ( N! / (n_1! n_2! ... n_K!) )
+        * p_1^n_1 * p_2^n_2 * ... * p_K^n_K
+
+ where n_1, n_2, ... n_K are nonnegative integers, sum_{k=1,K} n_k = N,
+ and p = (p_1, p_2, ..., p_K) is a probability distribution.
+
+ Random variates are generated using the conditional binomial method.
+ This scales well with N and does not require a setup step.
+
+ Ref:
+ C.S. David, The computer generation of multinomial random variates,
+ Comp. Stat. Data Anal. 16 (1993) 205-217
+*/
+
+void
+gsl_ran_multinomial (const size_t K,
+ const unsigned int N, const double p[], unsigned int n[], int *rstream)
+{
+ size_t k;
+ double norm = 0.0;
+ double sum_p = 0.0;
+
+ unsigned int sum_n = 0;
+
+ /* p[k] may contain non-negative weights that do not sum to 1.0.
+ * Even a probability distribution will not exactly sum to 1.0
+ * due to rounding errors.
+ */
+
+ for (k = 0; k < K; k++)
+ {
+ norm += p[k];
+ }
+
+ for (k = 0; k < K; k++)
+ {
+ if (p[k] > 0.0)
+ {
+ n[k] = gsl_ran_binomial (p[k] / (norm - sum_p), N - sum_n, rstream);
+ }
+ else
+ {
+ n[k] = 0;
+ }
+
+ sum_p += p[k];
+ sum_n += n[k];
+ }
+
+}
+
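
A minimal sketch of drawing one sample with this conditional-binomial routine; the rstream value used here is an assumption made purely for illustration (the argument is forwarded unchanged to gsl_ran_binomial):

    #include <cstdio>
    #include "mygsl.h"

    int main() {
        const double p[3] = { 0.2, 0.3, 0.5 };  // weights, need not sum to 1
        unsigned int n[3];
        int *rstream = 0;  // assumed stream handle for this stripped build
        gsl_ran_multinomial(3, 100, p, n, rstream);
        std::printf("%u %u %u\n", n[0], n[1], n[2]);  // entries sum to 100
        return 0;
    }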
diff --git a/gsl/mygsl.h b/gsl/mygsl.h
new file mode 100644
index 0000000..2a66247
--- /dev/null
+++ b/gsl/mygsl.h
@@ -0,0 +1,57 @@
+
+/**
+ stripped-down GSL (GNU Scientific Library) routines used by the IQ-TREE code
+*/
+
+#ifndef _MYGSL_H
+#define _MYGSL_H
+
+#include <stdio.h>
+
+/*
+ x to the power n (x^n)
+ @return x^n
+*/
+double gsl_pow_uint(double x, unsigned int n);
+
+/*
+ binomial sampling
+ @param p probability
+ @param n sample size
+ @return random value drawn from the binomial distribution
+*/
+unsigned int gsl_ran_binomial (double p, unsigned int n, int *rstream);
+
+/*
+ multinomial sampling
+ @param K number of categories
+ @param N sample size
+ @param p probability vector of length K; normalized internally if it does not sum to 1
+ @param[out] n output vector of length K drawn from the multinomial distribution; entries sum to N
+*/
+void gsl_ran_multinomial (const size_t K, const unsigned int N, const double p[], unsigned int n[], int *rstream);
+
+
+/*
+ probability density function for standard normal distribution
+ @param x x-value
+ @return probability density p(x)
+*/
+double gsl_ran_ugaussian_pdf (const double x);
+
+/*
+ cumulative distribution function for standard normal distribution
+ @param x x-value
+ @return CDF at x
+*/
+double gsl_cdf_ugaussian_P (const double x);
+
+/*
+ quantile function for standard normal distribution (or CDF-inverse function)
+ @param P probability value
+ @return x-value
+*/
+double gsl_cdf_ugaussian_Pinv (const double P);
+
+#endif
+
diff --git a/gsl/pow_int.cpp b/gsl/pow_int.cpp
new file mode 100644
index 0000000..f878880
--- /dev/null
+++ b/gsl/pow_int.cpp
@@ -0,0 +1,56 @@
+/* sys/pow_int.c
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000 Gerard Jungman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include <math.h>
+
+/* Compile all the inline functions */
+
+#define COMPILE_INLINE_STATIC
+//#include "build.h"
+//#include <gsl/gsl_pow_int.h>
+#include "mygsl.h"
+
+double gsl_pow_int(double x, int n)
+{
+ unsigned int un;
+
+ if(n < 0) {
+ x = 1.0/x;
+ un = -n;
+ } else {
+ un = n;
+ }
+
+ return gsl_pow_uint(x, un);
+}
+
+double gsl_pow_uint(double x, unsigned int n)
+{
+ double value = 1.0;
+
+ /* repeated squaring method
+ * returns 0.0^0 = 1.0, so continuous in x
+ */
+ do {
+ if(n & 1) value *= x; /* for n odd */
+ n >>= 1;
+ x *= x;
+ } while (n);
+
+ return value;
+}
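
The repeated-squaring loop above needs only O(log n) multiplications. A quick sanity check, assuming mygsl.h (which declares gsl_pow_uint) is available:

    #include <cstdio>
    #include "mygsl.h"

    int main() {
        std::printf("%f\n", gsl_pow_uint(2.0, 10));  // prints 1024.000000
        return 0;
    }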
diff --git a/gsl/rat_eval.h b/gsl/rat_eval.h
new file mode 100644
index 0000000..5e6fc19
--- /dev/null
+++ b/gsl/rat_eval.h
@@ -0,0 +1,25 @@
+static double
+rat_eval (const double a[], const size_t na,
+ const double b[], const size_t nb, const double x)
+{
+ size_t i, j;
+ double u, v, r;
+
+ u = a[na - 1];
+
+ for (i = na - 1; i > 0; i--)
+ {
+ u = x * u + a[i - 1];
+ }
+
+ v = b[nb - 1];
+
+ for (j = nb - 1; j > 0; j--)
+ {
+ v = x * v + b[j - 1];
+ }
+
+ r = u / v;
+
+ return r;
+}
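
rat_eval applies Horner's rule to the numerator and denominator separately, with a[0] and b[0] as the constant terms; it is the workhorse behind the small/intermediate/tail approximations in gaussinv.cpp. A tiny self-contained check (the function is static and the header has no include guard, so including it into a single translation unit works):

    #include <stddef.h>
    #include <cstdio>
    #include "rat_eval.h"

    int main() {
        const double a[2] = { 1.0, 2.0 };  // numerator:   1 + 2x
        const double b[2] = { 1.0, 1.0 };  // denominator: 1 + x
        // (1 + 2*0.5) / (1 + 0.5) = 1.333333
        std::printf("%f\n", rat_eval(a, 2, b, 2, 0.5));
        return 0;
    }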
diff --git a/hashsplitset.cpp b/hashsplitset.cpp
index 93a2399..81f84a7 100644
--- a/hashsplitset.cpp
+++ b/hashsplitset.cpp
@@ -41,22 +41,22 @@ Split *SplitIntMap::findSplit(Split *sp, int &value) {
int SplitIntMap::getValue(Split *sp) {
int value;
- if (!findSplit(sp, value)) outError(__func__);
+ assert(findSplit(sp, value));
return value;
}
void SplitIntMap::setValue(Split *sp, int value) {
- if (!findSplit(sp)) outError(__func__);
+ assert(findSplit(sp));
(*this)[sp] = value;
}
void SplitIntMap::eraseSplit(Split *sp) {
- if (!findSplit(sp)) outError(__func__);
+ assert(findSplit(sp));
erase(sp);
}
void SplitIntMap::insertSplit(Split *sp, int value) {
- if (findSplit(sp)) outError(__func__);
+ assert(!findSplit(sp));
if (verbose_mode >= VB_MAX) sp->report(cout);
(*this)[sp] = value;
}
diff --git a/iqtree.cpp b/iqtree.cpp
index 19ca598..e38a1ed 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -57,7 +57,7 @@ void IQTree::init() {
testNNI = false;
print_tree_lh = false;
write_intermediate_trees = 0;
- max_candidate_trees = 0;
+// max_candidate_trees = 0;
logl_cutoff = 0.0;
len_scale = 10000;
// save_all_br_lens = false;
@@ -71,6 +71,112 @@ IQTree::IQTree(Alignment *aln) : PhyloTree(aln) {
IQTree::init();
}
+void IQTree::setCheckpoint(Checkpoint *checkpoint) {
+ PhyloTree::setCheckpoint(checkpoint);
+ stop_rule.setCheckpoint(checkpoint);
+ candidateTrees.setCheckpoint(checkpoint);
+}
+
+void IQTree::saveCheckpoint() {
+ stop_rule.saveCheckpoint();
+ candidateTrees.saveCheckpoint();
+
+ if (boot_samples.size() > 0 && !boot_trees.front().empty()) {
+ checkpoint->startStruct("UFBoot");
+// CKP_SAVE(max_candidate_trees);
+ CKP_SAVE(logl_cutoff);
+ // save boot_samples and boot_trees
+ int id = 0;
+ checkpoint->startList(boot_samples.size());
+ // TODO: save boot_trees_brlen
+ for (vector<BootValType* >::iterator it = boot_samples.begin(); it != boot_samples.end(); it++, id++) {
+ checkpoint->addListElement();
+ stringstream ss;
+ ss.precision(10);
+ ss << boot_counts[id] << " " << boot_logl[id] << " " << boot_orig_logl[id] << " " << boot_trees[id];
+ checkpoint->put("", ss.str());
+// string &bt = boot_trees[id];
+// CKP_SAVE(bt);
+// double bl = boot_logl[id];
+// CKP_SAVE(bl);
+// double bol=boot_orig_logl[id];
+// CKP_SAVE(bol);
+// int bc = boot_counts[id];
+// CKP_SAVE(bc);
+ }
+ checkpoint->endList();
+ CKP_SAVE(boot_consense_logl);
+ int boot_splits_size = boot_splits.size();
+ CKP_SAVE(boot_splits_size);
+ checkpoint->endStruct();
+
+ // boot_splits
+ id = 0;
+ for (vector<SplitGraph*>::iterator sit = boot_splits.begin(); sit != boot_splits.end(); sit++, id++) {
+ checkpoint->startStruct("UFBootSplit" + convertIntToString(id));
+ (*sit)->saveCheckpoint();
+ checkpoint->endStruct();
+ }
+ }
+
+ PhyloTree::saveCheckpoint();
+}
+
+void IQTree::restoreCheckpoint() {
+ PhyloTree::restoreCheckpoint();
+ stop_rule.restoreCheckpoint();
+ candidateTrees.restoreCheckpoint();
+
+ if (params->gbo_replicates > 0 && checkpoint->hasKey("UFBoot.logl_cutoff")) {
+ checkpoint->startStruct("UFBoot");
+// CKP_RESTORE(max_candidate_trees);
+ CKP_RESTORE(logl_cutoff);
+        // restore boot_samples and boot_trees
+ int id = 0;
+ checkpoint->startList(params->gbo_replicates);
+ boot_trees.resize(params->gbo_replicates);
+ boot_logl.resize(params->gbo_replicates);
+ boot_orig_logl.resize(params->gbo_replicates);
+ boot_counts.resize(params->gbo_replicates);
+ for (id = 0; id < params->gbo_replicates; id++) {
+ checkpoint->addListElement();
+ string str;
+ checkpoint->getString("", str);
+ stringstream ss(str);
+ ss >> boot_counts[id] >> boot_logl[id] >> boot_orig_logl[id] >> boot_trees[id];
+// string bt;
+// CKP_RESTORE(bt);
+// boot_trees[id] = bt;
+// double bl;
+// CKP_RESTORE(bl);
+// boot_logl[id] = bl;
+// double bol;
+// CKP_RESTORE(bol);
+// boot_orig_logl[id] = bol;
+// int bc;
+// CKP_RESTORE(bc);
+// boot_counts[id] = bc;
+ }
+ checkpoint->endList();
+ CKP_RESTORE(boot_consense_logl);
+ int boot_splits_size = 0;
+ CKP_RESTORE(boot_splits_size);
+ checkpoint->endStruct();
+
+ // boot_splits
+ for (id = 0; id < boot_splits_size; id++) {
+ checkpoint->startStruct("UFBootSplit" + convertIntToString(id));
+ SplitGraph *sg = new SplitGraph;
+ sg->setCheckpoint(checkpoint);
+ sg->restoreCheckpoint();
+ boot_splits.push_back(sg);
+ checkpoint->endStruct();
+ }
+ }
+
+
+}
+
void IQTree::initSettings(Params &params) {
searchinfo.speednni = params.speednni;
searchinfo.nni_type = params.nni_type;
@@ -163,9 +269,9 @@ void IQTree::initSettings(Params &params) {
// if (params.gbo_replicates > 0 && params.do_compression)
// save_all_br_lens = true;
print_tree_lh = params.print_tree_lh;
- max_candidate_trees = params.max_candidate_trees;
- if (max_candidate_trees == 0)
- max_candidate_trees = aln->getNSeq() * params.step_iterations;
+// max_candidate_trees = params.max_candidate_trees;
+// if (max_candidate_trees == 0)
+// max_candidate_trees = aln->getNSeq() * params.step_iterations;
setRootNode(params.root);
string bootaln_name = params.out_prefix;
@@ -178,7 +284,14 @@ void IQTree::initSettings(Params &params) {
size_t i;
if (params.online_bootstrap && params.gbo_replicates > 0) {
- cout << "Generating " << params.gbo_replicates << " samples for ultrafast bootstrap..." << endl;
+ if (aln->getNSeq() < 4)
+            outError("It makes no sense to perform bootstrap with fewer than 4 sequences.");
+ // 2015-12-17: initialize random stream for creating bootstrap samples
+ // mainly so that checkpointing does not need to save bootstrap samples
+ int *saved_randstream = randstream;
+ init_random(params.ran_seed);
+
+ cout << "Generating " << params.gbo_replicates << " samples for ultrafast bootstrap (seed: " << params.ran_seed << ")..." << endl;
// allocate memory for boot_samples
boot_samples.resize(params.gbo_replicates);
size_t orig_nptn = getAlnNPattern();
@@ -192,11 +305,16 @@ void IQTree::initSettings(Params &params) {
for (i = 0; i < params.gbo_replicates; i++)
boot_samples[i] = mem + i*nptn;
- boot_logl.resize(params.gbo_replicates, -DBL_MAX);
- boot_trees.resize(params.gbo_replicates, "");
- if (params.print_ufboot_trees == 2)
- boot_trees_brlen.resize(params.gbo_replicates);
- boot_counts.resize(params.gbo_replicates, 0);
+ if (boot_trees.empty()) {
+ boot_logl.resize(params.gbo_replicates, -DBL_MAX);
+ boot_orig_logl.resize(params.gbo_replicates, -DBL_MAX);
+ boot_trees.resize(params.gbo_replicates, "");
+ boot_counts.resize(params.gbo_replicates, 0);
+ if (params.print_ufboot_trees == 2)
+ boot_trees_brlen.resize(params.gbo_replicates);
+ } else {
+ cout << "CHECKPOINT: " << boot_trees.size() << " UFBoot trees and " << boot_splits.size() << " UFBootSplits restored" << endl;
+ }
VerboseMode saved_mode = verbose_mode;
verbose_mode = VB_QUIET;
for (i = 0; i < params.gbo_replicates; i++) {
@@ -224,7 +342,11 @@ void IQTree::initSettings(Params &params) {
cout << "Bootstrap alignments printed to " << bootaln_name << endl;
}
- cout << "Max candidate trees (tau): " << max_candidate_trees << endl;
+// cout << "Max candidate trees (tau): " << max_candidate_trees << endl;
+
+ // restore randstream
+ finish_random();
+ randstream = saved_randstream;
}
if (params.root_state) {
@@ -241,9 +363,6 @@ IQTree::~IQTree() {
//delete bonus_values;
//bonus_values = NULL;
-// for (vector<double*>::reverse_iterator it = treels_ptnlh.rbegin(); it != treels_ptnlh.rend(); it++)
-// delete[] (*it);
-// treels_ptnlh.clear();
for (vector<SplitGraph*>::reverse_iterator it2 = boot_splits.rbegin(); it2 != boot_splits.rend(); it2++)
delete (*it2);
boot_splits.clear();
@@ -357,6 +476,16 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
int fixed_number = 0;
setParsimonyKernel(kernel);
+ candidateTrees.init(aln, params);
+// restoreCheckpoint();
+// if (leafNum != 0) {
+// if (!candidateTrees.empty()) {
+// readTreeString(candidateTrees.getTopTrees(1)[0]);
+// cout << endl << "CHECKPOINT: Current best tree restored, LogL: " << candidateTrees.getBestScore() << endl;
+// } else
+// cout << endl << "CHECKPOINT: Initial tree restored" << endl;
+// return;
+// } else
if (params->user_file) {
// start the search with user-defined tree
cout << "Reading input tree file " << params->user_file << " ..." << endl;
@@ -370,61 +499,61 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
params->numInitTrees = 1;
params->numNNITrees = 1;
// change to old kernel if tree is multifurcating
- if ((params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) && !isBifurcating()) {
- cout << "NOTE: Changing to old kernel as input tree is multifurcating" << endl;
- params->SSE = LK_SSE;
- }
+// if ((params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) && !isBifurcating()) {
+// cout << "NOTE: Changing to old kernel as input tree is multifurcating" << endl;
+// params->SSE = LK_SSE;
+// }
if (params->pll)
pllReadNewick(getTreeString());
- } else switch (params->start_tree) {
- case STT_PARSIMONY:
- // Create parsimony tree using IQ-Tree kernel
- if (kernel == LK_EIGEN_SSE)
- cout << "Creating fast SIMD initial parsimony tree by random order stepwise addition..." << endl;
- else if (kernel == LK_EIGEN)
- cout << "Creating fast initial parsimony tree by random order stepwise addition..." << endl;
- else
- cout << "Creating initial parsimony tree by random order stepwise addition..." << endl;
-// aln->orderPatternByNumChars();
- start = getRealTime();
- score = computeParsimonyTree(params->out_prefix, aln);
- cout << getRealTime() - start << " seconds, parsimony score: " << score
- << " (based on " << aln->num_informative_sites << " informative sites)"<< endl;
-// if (params->pll)
-// pllReadNewick(getTreeString());
- wrapperFixNegativeBranch(false);
-
- break;
- case STT_RANDOM_TREE:
- case STT_PLL_PARSIMONY:
- cout << endl;
- cout << "Create initial parsimony tree by phylogenetic likelihood library (PLL)... ";
- pllInst->randomNumberSeed = params->ran_seed;
- pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
- resetBranches(pllInst);
- pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back,
- PLL_FALSE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
- PhyloTree::readTreeString(string(pllInst->tree_string));
- cout << getRealTime() - start << " seconds" << endl;
- wrapperFixNegativeBranch(true);
- break;
- case STT_BIONJ:
- // This is the old default option: using BIONJ as starting tree
- computeBioNJ(*params, aln, dist_file);
- cout << getRealTime() - start << " seconds" << endl;
- params->numInitTrees = 1;
-// if (params->pll)
-// pllReadNewick(getTreeString());
- if (isSuperTree())
- wrapperFixNegativeBranch(true);
- else
- fixed_number = wrapperFixNegativeBranch(false);
- break;
-// case STT_RANDOM_TREE:
-// cout << "Generate random initial Yule-Harding tree..." << endl;
-// generateRandomTree(YULE_HARDING);
-// wrapperFixNegativeBranch(true);
-// break;
+ } else if (CKP_RESTORE(initTree)) {
+ readTreeString(initTree);
+ cout << endl << "CHECKPOINT: Initial tree restored" << endl;
+ } else {
+ switch (params->start_tree) {
+ case STT_PARSIMONY:
+ // Create parsimony tree using IQ-Tree kernel
+ if (kernel == LK_EIGEN_SSE)
+ cout << "Creating fast SIMD initial parsimony tree by random order stepwise addition..." << endl;
+ else if (kernel == LK_EIGEN)
+ cout << "Creating fast initial parsimony tree by random order stepwise addition..." << endl;
+ else
+ cout << "Creating initial parsimony tree by random order stepwise addition..." << endl;
+// aln->orderPatternByNumChars();
+ start = getRealTime();
+ score = computeParsimonyTree(params->out_prefix, aln);
+ cout << getRealTime() - start << " seconds, parsimony score: " << score
+ << " (based on " << aln->num_informative_sites << " informative sites)"<< endl;
+ wrapperFixNegativeBranch(false);
+
+ break;
+ case STT_RANDOM_TREE:
+ case STT_PLL_PARSIMONY:
+ cout << endl;
+ cout << "Create initial parsimony tree by phylogenetic likelihood library (PLL)... ";
+ pllInst->randomNumberSeed = params->ran_seed;
+ pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
+ resetBranches(pllInst);
+ pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back,
+ PLL_FALSE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+ PhyloTree::readTreeStringSeqName(string(pllInst->tree_string));
+ cout << getRealTime() - start << " seconds" << endl;
+ wrapperFixNegativeBranch(true);
+ break;
+ case STT_BIONJ:
+ // This is the old default option: using BIONJ as starting tree
+ computeBioNJ(*params, aln, dist_file);
+ cout << getRealTime() - start << " seconds" << endl;
+ params->numInitTrees = 1;
+ if (isSuperTree())
+ wrapperFixNegativeBranch(true);
+ else
+ fixed_number = wrapperFixNegativeBranch(false);
+ break;
+ }
+ initTree = getTreeString();
+ CKP_SAVE(initTree);
+ saveCheckpoint();
+ checkpoint->dump(true);
}
if (fixed_number) {
@@ -445,8 +574,7 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
}
}
-void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
-
+void IQTree::createInitTrees(int nParTrees) {
if (nParTrees > 0) {
if (params->start_tree == STT_RANDOM_TREE)
cout << "Generating " << nParTrees << " random trees... ";
@@ -485,7 +613,7 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
pllInst->start->back, PLL_FALSE, PLL_TRUE, PLL_FALSE,
PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
curParsTree = string(pllInst->tree_string);
- PhyloTree::readTreeString(curParsTree);
+ PhyloTree::readTreeStringSeqName(curParsTree);
wrapperFixNegativeBranch(true);
curParsTree = getTreeString();
} else if (params->start_tree == STT_RANDOM_TREE) {
@@ -562,17 +690,40 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
double loglTime = getRealTime() - startTime;
cout << loglTime << " seconds" << endl;
+}
+
+void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
+
+ bool finishedInitTree = checkpoint->getBool("finishedInitTree");
+
+ if (finishedInitTree) {
+ cout << "CHECKPOINT: " << min(nParTrees, (int)candidateTrees.size()) << " initial trees restored" << endl;
+ } else {
+ createInitTrees(nParTrees);
+ checkpoint->putBool("finishedInitTree", true);
+ saveCheckpoint();
+ checkpoint->dump();
+ }
// Only select the best nNNITrees for doing NNI search
CandidateSet initParsimonyTrees = candidateTrees.getBestCandidateTrees(nNNITrees);
candidateTrees.clear();
cout << "Optimizing top " << initParsimonyTrees.size() << " initial trees with NNI..." << endl;
- startTime = getCPUTime();
+ double startTime = getCPUTime();
/*********** START: Do NNI on the best parsimony trees ************************************/
- CandidateSet::reverse_iterator rit;
- stop_rule.setCurIt(0);
- for (rit = initParsimonyTrees.rbegin(); rit != initParsimonyTrees.rend(); ++rit) {
+ CandidateSet::reverse_iterator rit = initParsimonyTrees.rbegin();
+
+// stop_rule.setCurIt(0);
+ if (stop_rule.getCurIt() > 0) {
+ int step = stop_rule.getCurIt();
+ for (; rit != initParsimonyTrees.rend() && step > 0; ++rit, step--) {
+            // re-add trees already evaluated before the checkpoint and advance the iterator past them
+ candidateTrees.update(rit->second.tree, rit->first);
+ }
+ cout << "CHECKPOINT: " << stop_rule.getCurIt() << " initial iterations restored" << endl;
+ }
+ for (; rit != initParsimonyTrees.rend(); ++rit) {
stop_rule.setCurIt(stop_rule.getCurIt() + 1);
int nniCount, nniStep;
double initLogl, nniLogl;
@@ -600,6 +751,7 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
if (getCurScore() > candidateTrees.getBestScore() + params->modeps) {
// Re-optimize model parameters (the sNNI algorithm)
tree = optimizeModelParameters(false, params->modeps * 10);
+ getModelFactory()->saveCheckpoint();
betterScore = true;
}
bool newTree = candidateTrees.update(tree, getCurScore());
@@ -611,6 +763,8 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
cout << "BETTER SCORE FOUND at iteration " << stop_rule.getCurIt() << ": "
<< getCurScore() << endl;
}
+ saveCheckpoint();
+ checkpoint->dump();
// if (params.partition_type)
// ((PhyloSuperTreePlen*)&iqtree)->printNNIcasesNUM();
}
@@ -687,6 +841,7 @@ void IQTree::initializeModel(Params &params, ModelsBlock *models_block) {
}
setModel(getModelFactory()->model);
setRate(getModelFactory()->site_rate);
+ getModelFactory()->setCheckpoint(checkpoint);
if (params.pll) {
if (getRate()->getNDiscreteRate() == 1) {
@@ -1183,43 +1338,7 @@ void IQTree::doRandomNNIs(int numNNI) {
resetCurScore();
}
-/*
-void IQTree::doRandomNNIs(int numNNI) {
- map<int, Node*> usedNodes;
- NodeVector nodeList1, nodeList2;
- getInternalBranches(nodeList1, nodeList2);
- int numInBran = nodeList1.size();
- assert(numInBran == aln->getNSeq() - 3);
- for (int i = 0; i < numNNI; i++) {
- int index = random_int(numInBran);
- if (usedNodes.find(nodeList1[index]->id) == usedNodes.end()
- && usedNodes.find(nodeList2[index]->id) == usedNodes.end()) {
- doOneRandomNNI(nodeList1[index], nodeList2[index]);
- usedNodes.insert(map<int, Node*>::value_type(nodeList1[index]->id, nodeList1[index]));
- usedNodes.insert(map<int, Node*>::value_type(nodeList2[index]->id, nodeList2[index]));
- } else {
- usedNodes.clear();
- nodeList1.clear();
- nodeList2.clear();
- getInternalBranches(nodeList1, nodeList2);
- doOneRandomNNI(nodeList1[index], nodeList2[index]);
- usedNodes.insert(map<int, Node*>::value_type(nodeList1[index]->id, nodeList1[index]));
- usedNodes.insert(map<int, Node*>::value_type(nodeList2[index]->id, nodeList2[index]));
- }
- }
- setAlignment(aln);
- setRootNode(params->root);
-
- if (isSuperTree()) {
- ((PhyloSuperTree*) this)->mapTrees();
-}
- if (params->pll) {
- pllReadNewick(getTreeString());
- }
- lhComputed = false;
-}
-*/
void IQTree::doIQP() {
if (verbose_mode >= VB_DEBUG)
@@ -1737,7 +1856,11 @@ double IQTree::doTreeSearch() {
//printTree(bestTopoStream, WT_TAXON_ID + WT_SORT_TAXA);
//string best_tree_topo = bestTopoStream.str();
- stop_rule.addImprovedIteration(1);
+    // if non-zero, the search state was already recovered from a checkpoint
+ if (stop_rule.getLastImprovedIteration() == 0)
+ stop_rule.addImprovedIteration(1);
+ else
+ cout << "CHECKPOINT: " << stop_rule.getCurIt() << " search iterations restored" << endl;
searchinfo.curPerStrength = params->initPS;
double cur_correlation = 0.0;
@@ -1749,26 +1872,25 @@ double IQTree::doTreeSearch() {
stop_rule.setCurIt(stop_rule.getCurIt() + 1);
searchinfo.curIter = stop_rule.getCurIt();
// estimate logl_cutoff for bootstrap
- if (/*params->avoid_duplicated_trees &&*/ max_candidate_trees > 0 && treels_logl.size() > 1000) {
- int predicted_iteration = ((stop_rule.getCurIt()+params->step_iterations-1)/params->step_iterations)*params->step_iterations;
- int num_entries = floor(max_candidate_trees * ((double) stop_rule.getCurIt() / predicted_iteration));
- if (num_entries < treels_logl.size() * 0.9) {
- DoubleVector logl = treels_logl;
- nth_element(logl.begin(), logl.begin() + (treels_logl.size() - num_entries), logl.end());
- logl_cutoff = logl[treels_logl.size() - num_entries] - 1.0;
- } else
- logl_cutoff = 0.0;
- if (verbose_mode >= VB_MED) {
- if (stop_rule.getCurIt() % 10 == 0) {
- cout << treels_logl.size() << " logls, logl_cutoff= " << logl_cutoff;
-// if (params->store_candidate_trees)
-// cout << " duplicates= " << duplication_counter << " ("
-// << (int) round(100 * ((double) duplication_counter / treels_logl.size())) << "%)" << endl;
-// else
- cout << endl;
- }
- }
- }
+ if (!boot_orig_logl.empty())
+ logl_cutoff = *min_element(boot_orig_logl.begin(), boot_orig_logl.end());
+
+// if (/*params->avoid_duplicated_trees && max_candidate_trees > 0 &&*/ stop_rule.getCurIt() > 2 /* && treels_logl.size() > 1000*/) {
+// int predicted_iteration = ((stop_rule.getCurIt()+params->step_iterations-1)/params->step_iterations)*params->step_iterations;
+// int num_entries = floor(max_candidate_trees * ((double) stop_rule.getCurIt() / predicted_iteration));
+// if (num_entries < treels_logl.size() * 0.9) {
+// DoubleVector logl = treels_logl;
+// nth_element(logl.begin(), logl.begin() + (treels_logl.size() - num_entries), logl.end());
+// logl_cutoff = logl[treels_logl.size() - num_entries] - 1.0;
+// } else
+// logl_cutoff = 0.0;
+// if (verbose_mode >= VB_MED) {
+// if (stop_rule.getCurIt() % 10 == 0) {
+// cout << treels_logl.size() << " logls, logl_cutoff= " << logl_cutoff;
+// cout << endl;
+// }
+// }
+// }
if (estimate_nni_cutoff && nni_info.size() >= 500) {
estimate_nni_cutoff = false;
@@ -1879,6 +2001,7 @@ double IQTree::doTreeSearch() {
if (curScore > candidateTrees.getBestScore() + params->modeps) {
if (params->snni) {
imd_tree = optimizeModelParameters();
+ getModelFactory()->saveCheckpoint();
}
if (!candidateTrees.treeExist(imd_tree)) {
stop_rule.addImprovedIteration(stop_rule.getCurIt());
@@ -1907,21 +2030,24 @@ double IQTree::doTreeSearch() {
// compute split support every half step
SplitGraph *sg = new SplitGraph;
summarizeBootstrap(*sg);
+ sg->removeTrivialSplits();
+ sg->setCheckpoint(checkpoint);
boot_splits.push_back(sg);
- if (params->max_candidate_trees == 0)
- max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + (params->step_iterations / 2)) /
- stop_rule.getCurIt();
- cout << "NOTE: " << treels_logl.size() << " bootstrap candidate trees evaluated (logl-cutoff: " << logl_cutoff << ")" << endl;
+// if (params->max_candidate_trees == 0)
+// max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + (params->step_iterations / 2)) /
+// stop_rule.getCurIt();
+// cout << "NOTE: " << treels_logl.size() << " bootstrap candidate trees evaluated (logl-cutoff: " << logl_cutoff << ")" << endl;
+ cout << "Log-likelihood cutoff on original alignment: " << logl_cutoff << endl;
// check convergence every full step
if (stop_rule.getCurIt() % params->step_iterations == 0) {
cur_correlation = computeBootstrapCorrelation();
cout << "NOTE: Bootstrap correlation coefficient of split occurrence frequencies: " << cur_correlation << endl;
if (!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
- if (params->max_candidate_trees == 0) {
- max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + params->step_iterations) /
- stop_rule.getCurIt();
- }
+// if (params->max_candidate_trees == 0) {
+// max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + params->step_iterations) /
+// stop_rule.getCurIt();
+// }
// cout << "INFO: UFBoot does not converge, continue " << params->step_iterations << " more iterations" << endl;
}
}
@@ -1932,6 +2058,9 @@ double IQTree::doTreeSearch() {
stop_rule.getCurIt() % 10 == 0)
writeUFBootTrees(*params);
+ saveCheckpoint();
+ checkpoint->dump();
+
//if (params->partition_type)
// ((PhyloSuperTreePlen*)this)->printNNIcasesNUM();
@@ -2228,24 +2357,19 @@ void IQTree::pllInitUFBootData(){
if(params->online_bootstrap && params->gbo_replicates > 0){
if(!pll2iqtree_pattern_index) pllBuildIQTreePatternIndex();
- pllUFBootDataPtr->treels = pllHashInit(max_candidate_trees);
- pllUFBootDataPtr->treels_size = max_candidate_trees; // track size of treels_logl, treels_newick, treels_ptnlh
+// pllUFBootDataPtr->treels = pllHashInit(max_candidate_trees);
+// pllUFBootDataPtr->treels_size = max_candidate_trees; // track size of treels_logl, treels_newick, treels_ptnlh
- pllUFBootDataPtr->treels_logl =
- (double *) malloc(max_candidate_trees * (sizeof(double)));
- if(!pllUFBootDataPtr->treels_logl) outError("Not enough dynamic memory!");
- //memset(pllUFBootDataPtr->treels_logl, 0, max_candidate_trees * (sizeof(double)));
+// pllUFBootDataPtr->treels_logl =
+// (double *) malloc(max_candidate_trees * (sizeof(double)));
+// if(!pllUFBootDataPtr->treels_logl) outError("Not enough dynamic memory!");
-// pllUFBootDataPtr->treels_newick =
-// (char **) malloc(max_candidate_trees * (sizeof(char *)));
-// if(!pllUFBootDataPtr->treels_newick) outError("Not enough dynamic memory!");
-// memset(pllUFBootDataPtr->treels_newick, 0, max_candidate_trees * (sizeof(char *)));
- pllUFBootDataPtr->treels_ptnlh =
- (double **) malloc(max_candidate_trees * (sizeof(double *)));
- if(!pllUFBootDataPtr->treels_ptnlh) outError("Not enough dynamic memory!");
- memset(pllUFBootDataPtr->treels_ptnlh, 0, max_candidate_trees * (sizeof(double *)));
+// pllUFBootDataPtr->treels_ptnlh =
+// (double **) malloc(max_candidate_trees * (sizeof(double *)));
+// if(!pllUFBootDataPtr->treels_ptnlh) outError("Not enough dynamic memory!");
+// memset(pllUFBootDataPtr->treels_ptnlh, 0, max_candidate_trees * (sizeof(double *)));
// aln->createBootstrapAlignment() must be called before this fragment
pllUFBootDataPtr->boot_samples =
@@ -2261,8 +2385,6 @@ void IQTree::pllInitUFBootData(){
}
}
-// pllLogBootSamples(pllUFBootDataPtr->boot_samples,
-// params->gbo_replicates, pllAlignment->sequenceLength);
pllUFBootDataPtr->boot_logl =
(double *) malloc(params->gbo_replicates * (sizeof(double)));
@@ -2275,16 +2397,12 @@ void IQTree::pllInitUFBootData(){
if(!pllUFBootDataPtr->boot_counts) outError("Not enough dynamic memory!");
memset(pllUFBootDataPtr->boot_counts, 0, params->gbo_replicates * (sizeof(int)));
-// pllUFBootDataPtr->boot_trees =
-// (int *) malloc(params->gbo_replicates * (sizeof(int)));
-// if(!pllUFBootDataPtr->boot_trees) outError("Not enough dynamic memory!");
pllUFBootDataPtr->boot_trees.resize(params->gbo_replicates, "");
pllUFBootDataPtr->duplication_counter = 0;
}
}
- pllUFBootDataPtr->max_candidate_trees = max_candidate_trees;
+// pllUFBootDataPtr->max_candidate_trees = max_candidate_trees;
pllUFBootDataPtr->save_all_trees = save_all_trees;
-// pllUFBootDataPtr->save_all_br_lens = save_all_br_lens;
pllUFBootDataPtr->logl_cutoff = logl_cutoff;
pllUFBootDataPtr->n_patterns = pllAlignment->sequenceLength;
}
@@ -2300,10 +2418,6 @@ void IQTree::pllDestroyUFBootData(){
free(pllUFBootDataPtr->treels_logl);
-// for(int i = 0; i < pllUFBootDataPtr->candidate_trees_count; i++)
-// if(pllUFBootDataPtr->treels_newick[i])
-// free(pllUFBootDataPtr->treels_newick[i]);
-// free(pllUFBootDataPtr->treels_newick);
for(int i = 0; i < pllUFBootDataPtr->treels_size; i++)
if(pllUFBootDataPtr->treels_ptnlh[i])
@@ -2318,7 +2432,6 @@ void IQTree::pllDestroyUFBootData(){
free(pllUFBootDataPtr->boot_counts);
-// free(pllUFBootDataPtr->boot_trees);
}
free(pllUFBootDataPtr);
pllUFBootDataPtr = NULL;
@@ -2337,9 +2450,6 @@ void IQTree::doNNIs(int nni2apply, bool changeBran) {
// 2015-10-14: has to reset this pointer when read in
current_it = current_it_back = NULL;
-// if (params->lh_mem_save == LM_PER_NODE) {
-// initializeAllPartialLh();
-// }
}
@@ -2552,48 +2662,11 @@ void IQTree::estimateNNICutoff(Params* params) {
}
void IQTree::saveCurrentTree(double cur_logl) {
-// StringIntMap::iterator it = treels.end();
-// if (params->store_candidate_trees) {
-// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
-// tree_str = ostr.str();
-// it = treels.find(tree_str);
-// }
-// int tree_index = -1;
-// if (it != treels.end()) { // already in treels
-// duplication_counter++;
-// tree_index = it->second;
-// if (cur_logl <= treels_logl[it->second] + 1e-4) {
-// if (cur_logl < treels_logl[it->second] - 5.0)
-// if (verbose_mode >= VB_MED)
-// cout << "Current lh " << cur_logl << " is much worse than expected " << treels_logl[it->second]
-// << endl;
-// return;
-// }
-// if (verbose_mode >= VB_MAX)
-// cout << "Updated logl " << treels_logl[it->second] << " to " << cur_logl << endl;
-// treels_logl[it->second] = cur_logl;
-//// if (save_all_br_lens) {
-//// ostr.seekp(ios::beg);
-//// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
-//// treels_newick[it->second] = ostr.str();
-//// }
-// if (boot_samples.empty()) {
-//// computePatternLikelihood(treels_ptnlh[it->second], &cur_logl);
-// return;
-// }
-// if (verbose_mode >= VB_MAX)
-// cout << "Update treels_logl[" << tree_index << "] := " << cur_logl << endl;
-// } else
- {
- if (logl_cutoff != 0.0 && cur_logl <= logl_cutoff + 1e-4)
- return;
-// tree_index = treels_logl.size();
-// if (params->store_candidate_trees)
-// treels[tree_str] = tree_index;
- treels_logl.push_back(cur_logl);
-// if (verbose_mode >= VB_MAX)
-// cout << "Add treels_logl[" << tree_index << "] := " << cur_logl << endl;
- }
+
+ if (logl_cutoff != 0.0 && cur_logl < logl_cutoff - 1.0)
+ return;
+// treels_logl.push_back(cur_logl);
+// num_trees_for_rell++;
if (write_intermediate_trees)
printTree(out_treels, WT_NEWLINE | WT_BR_LEN);
@@ -2618,11 +2691,6 @@ void IQTree::saveCurrentTree(double cur_logl) {
if (boot_samples.empty()) {
// for runGuidedBootstrap
-//#ifdef BOOT_VAL_FLOAT
-// treels_ptnlh.push_back(pattern_lh_orig);
-//#else
-// treels_ptnlh.push_back(pattern_lh);
-//#endif
} else {
// online bootstrap
// int ptn;
@@ -2630,7 +2698,8 @@ void IQTree::saveCurrentTree(double cur_logl) {
int nsamples = boot_samples.size();
ostringstream ostr;
string tree_str, tree_str_brlen;
- printTree(ostr, WT_TAXON_ID);
+ setRootNode(params->root);
+ printTree(ostr, WT_TAXON_ID + WT_SORT_TAXA);
tree_str = ostr.str();
if (params->print_ufboot_trees == 2) {
ostringstream ostr_brlen;
@@ -2645,13 +2714,6 @@ void IQTree::saveCurrentTree(double cur_logl) {
for (int sample = 0; sample < nsamples; sample++) {
double rell = 0.0;
-// if (false) {
-// BootValType *boot_sample = boot_samples[sample];
-// BootValType rellll = 0.0;
-// for (ptn = 0; ptn < nptn; ptn++)
-// rellll += pattern_lh[ptn] * boot_sample[ptn];
-// rell = (double)rellll;
-// } else
{
// SSE optimized version of the above loop
BootValType *boot_sample = boot_samples[sample];
@@ -2663,56 +2725,23 @@ void IQTree::saveCurrentTree(double cur_logl) {
bool better = rell > boot_logl[sample] + params->ufboot_epsilon;
if (!better && rell > boot_logl[sample] - params->ufboot_epsilon) {
-// #ifdef _OPENMP
-// #pragma omp critical
-// #endif
better = (rand_double <= 1.0 / (boot_counts[sample] + 1));
}
if (better) {
-// if (tree_str == "")
-// #ifdef _OPENMP
-// #pragma omp critical
-// #endif
-// {
-// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
-// tree_str = ostr.str();
-// it = treels.find(tree_str);
-// if (it != treels.end()) {
-// tree_index = it->second;
-// } else {
-// tree_index = treels.size();
-// treels[tree_str] = tree_index;
-// }
-// }
if (rell <= boot_logl[sample] + params->ufboot_epsilon) {
boot_counts[sample]++;
} else {
boot_counts[sample] = 1;
}
boot_logl[sample] = max(boot_logl[sample], rell);
+ boot_orig_logl[sample] = cur_logl;
boot_trees[sample] = tree_str;
if (params->print_ufboot_trees == 2) {
boot_trees_brlen[sample] = tree_str_brlen;
}
-// updated++;
- } /*else if (verbose_mode >= VB_MED && rell > boot_logl[sample] - 0.01) {
- cout << "Info: multiple RELL score trees detected" << endl;
- }*/
- }
-// if (updated && verbose_mode >= VB_MAX)
-// cout << updated << " boot trees updated" << endl;
- /*
- if (tree_index >= max_candidate_trees/2 && boot_splits->empty()) {
- // summarize split support half way for stopping criterion
- cout << "Summarizing current bootstrap supports..." << endl;
- summarizeBootstrap(*boot_splits);
- }*/
+ }
+ }
}
-// if (save_all_br_lens) {
-// ostr.seekp(ios::beg);
-// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
-// treels_newick.push_back(ostr.str());
-// }
if (print_tree_lh) {
out_treelh << cur_logl;
double prob;
@@ -2859,24 +2888,7 @@ void IQTree::summarizeBootstrap(Params &params, MTreeSet &trees) {
cout << "Support values written to " << out_file << endl;
*/
-// if (params.print_ufboot_trees) {
-// string filename = params.out_prefix;
-// filename += ".ufboot";
-// ofstream out(filename.c_str());
-// for (i = 0; i < trees.size(); i++) {
-// NodeVector taxa;
-// // change the taxa name from ID to real name
-// trees[i]->getOrderedTaxa(taxa);
-// for (j = 0; j < taxa.size(); j++)
-// taxa[j]->name = aln->getSeqName(taxa[j]->id);
-// // now print to file
-// for (j = 0; j < trees.tree_weights[i]; j++)
-// trees[i]->printTree(out, WT_NEWLINE);
-// }
-// out.close();
-// cout << "UFBoot trees printed to " << filename << endl;
-// }
-//
+
}
void IQTree::writeUFBootTrees(Params &params) {
@@ -2889,10 +2901,6 @@ void IQTree::writeUFBootTrees(Params &params) {
if (params.print_ufboot_trees == 1) {
// print trees without branch lengths
-// tree_weights.resize(treels_logl.size(), 0);
-// for (sample = 0; sample < boot_trees.size(); sample++)
-// tree_weights[boot_trees[sample]]++;
-// trees.init(treels, rooted, tree_weights);
trees.init(boot_trees, rooted);
for (i = 0; i < trees.size(); i++) {
NodeVector taxa;
@@ -2923,26 +2931,13 @@ void IQTree::writeUFBootTrees(Params ¶ms) {
void IQTree::summarizeBootstrap(Params &params) {
setRootNode(params.root);
-// if (verbose_mode >= VB_MED)
-// cout << "Summarizing from " << treels.size() << " candidate trees..." << endl;
MTreeSet trees;
-// IntVector tree_weights;
-// int sample;
-// tree_weights.resize(treels_logl.size(), 0);
-// for (sample = 0; sample < boot_trees.size(); sample++)
-// tree_weights[boot_trees[sample]]++;
-// trees.init(treels, rooted, tree_weights);
trees.init(boot_trees, rooted);
summarizeBootstrap(params, trees);
}
void IQTree::summarizeBootstrap(SplitGraph &sg) {
MTreeSet trees;
-// IntVector tree_weights;
-// tree_weights.resize(treels_logl.size(), 0);
-// for (int sample = 0; sample < boot_trees.size(); sample++)
-// tree_weights[boot_trees[sample]]++;
-// trees.init(treels, rooted, tree_weights);
//SplitGraph sg;
trees.init(boot_trees, rooted);
SplitIntMap hash_ss;
@@ -2963,29 +2958,15 @@ void IQTree::pllConvertUFBootData2IQTree(){
// duplication_counter
duplication_counter = pllUFBootDataPtr->duplication_counter;
//treels_logl
- treels_logl.clear();
- for(int i = 0; i < pllUFBootDataPtr->candidate_trees_count; i++)
- treels_logl.push_back(pllUFBootDataPtr->treels_logl[i]);
+// treels_logl.clear();
+// for(int i = 0; i < pllUFBootDataPtr->candidate_trees_count; i++)
+// treels_logl.push_back(pllUFBootDataPtr->treels_logl[i]);
//boot_trees
boot_trees.clear();
for(int i = 0; i < params->gbo_replicates; i++)
boot_trees.push_back(pllUFBootDataPtr->boot_trees[i]);
- //treels
-// treels.clear();
-// if(pllUFBootDataPtr->candidate_trees_count > 0){
-// struct pllHashItem * hItem;
-// struct pllHashTable * hTable = pllUFBootDataPtr->treels;
-// for (int i = 0; i < hTable->size; ++ i){
-// hItem = hTable->Items[i];
-// while (hItem){
-// string k(hItem->str);
-// treels[k] = *((int *)hItem->data);
-// hItem = hItem->next;
-// }
-// }
-// }
}
double computeCorrelation(IntVector &ix, IntVector &iy) {
@@ -3059,24 +3040,7 @@ double IQTree::computeBootstrapCorrelation() {
// now compute correlation coefficient
double corr = computeCorrelation(split_supports, split_supports_new);
- // printing supports into file
- /*
- string outfile = params->out_prefix;
- outfile += ".splitsup";
- try {
- ofstream out;
- out.exceptions(ios::failbit | ios::badbit);
- out.open(outfile.c_str());
- out << "tau=" << max_candidate_trees / 2 << "\ttau="
- << treels_logl.size() << endl;
- for (int i = 0; i < split_supports.size(); i++)
- out << split_supports[i] << "\t" << split_supports_new[i] << endl;
- out.close();
- cout << "Split support values printed to " << outfile << endl;
- } catch (ios::failure) {
- outError(ERR_WRITE_OUTPUT, outfile);
- }
- */
+
return corr;
}
@@ -3102,33 +3066,6 @@ void IQTree::printResultTree(ostream &out) {
printTree(out, WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
}
-/*
-void IQTree::printPhylolibModelParams(const char* suffix) {
- char phyloliModelFile[1024];
- strcpy(phyloliModelFile, params->out_prefix);
- strcat(phyloliModelFile, suffix);
- ofstream modelfile;
- modelfile.open(phyloliModelFile);
- for (int model = 0; model < pllInst->NumberOfModels; model++) {
- cout << "Rate parameters: ";
- for (int i = 0; i < 6; i++) {
- cout << pllInst->partitionData[model].substRates[i] << " ";
- modelfile << pllInst->partitionData[model].substRates[i] << " ";
- }
- cout << endl;
- modelfile << endl;
- cout << "Base frequencies: ";
- for (int i = 0; i < aln->num_states; i++) {
- cout << pll_tree->partitionData[model].frequencies[i] << " ";
- modelfile << pll_tree->partitionData[model].frequencies[i] << " ";
- }
- cout << endl;
- modelfile << endl;
- cout << "Gamma shape :" << pll_tree->partitionData[model].alpha << endl;
- modelfile << pll_tree->partitionData[model].alpha << endl;
- }
-}
-*/
void IQTree::printPhylolibTree(const char* suffix) {
pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_TRUE, 1, 0, 0, 0,
@@ -3143,74 +3080,29 @@ void IQTree::printPhylolibTree(const char* suffix) {
void IQTree::printIntermediateTree(int brtype) {
setRootNode(params->root);
- bool duplicated_tree = false;
double *pattern_lh = NULL;
double logl = curScore;
-// if (params->avoid_duplicated_trees) {
-// // estimate logl_cutoff
-// stringstream ostr;
-// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
-// string tree_str = ostr.str();
-// StringIntMap::iterator it = treels.find(tree_str);
-// if (it != treels.end()) { // already in treels
-// duplicated_tree = true;
-// if (curScore > treels_logl[it->second] + 1e-4) {
-// if (verbose_mode >= VB_MAX)
-// cout << "Updated logl " << treels_logl[it->second] << " to " << curScore << endl;
-// treels_logl[it->second] = curScore;
-//// computeLikelihood(treels_ptnlh[it->second]);
-//// if (save_all_br_lens) {
-//// ostr.seekp(ios::beg);
-//// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
-//// treels_newick[it->second] = ostr.str();
-//// }
-// }
-// //pattern_lh = treels_ptnlh[treels[tree_str]];
-// } else {
-// //cout << __func__ << ": new tree" << endl;
-// if (logl_cutoff != 0.0 && curScore <= logl_cutoff + 1e-4)
-// duplicated_tree = true;
-// else {
-//// treels[tree_str] = treels_ptnlh.size();
-// pattern_lh = new double[getAlnNPattern()];
-//// computePatternLikelihood(pattern_lh, &logl);
-// computePatternLikelihood(pattern_lh);
-//// treels_ptnlh.push_back(pattern_lh);
-// treels_logl.push_back(logl);
-//// if (save_all_br_lens) {
-//// ostr.seekp(ios::beg);
-//// printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
-//// treels_newick.push_back(ostr.str());
-//// }
-// }
-// }
-// //cout << tree_str << endl;
-// } else
- {
- if (params->print_tree_lh) {
- pattern_lh = new double[getAlnNPattern()];
- computePatternLikelihood(pattern_lh, &logl);
- }
- }
-
- if (!duplicated_tree) {
- if (write_intermediate_trees)
- printTree(out_treels, brtype);
- if (params->print_tree_lh) {
- out_treelh.precision(10);
- out_treelh << logl;
- double prob;
- aln->multinomialProb(pattern_lh, prob);
- out_treelh << "\t" << prob << endl;
- if (!(brtype & WT_APPEND))
- out_sitelh << aln->getNSite() << endl;
- out_sitelh << "Site_Lh ";
- for (int i = 0; i < aln->getNSite(); i++)
- out_sitelh << "\t" << pattern_lh[aln->getPatternID(i)];
- out_sitelh << endl;
-// if (!params->avoid_duplicated_trees)
- delete[] pattern_lh;
- }
+
+ if (params->print_tree_lh) {
+ pattern_lh = new double[getAlnNPattern()];
+ computePatternLikelihood(pattern_lh, &logl);
+ }
+
+ if (write_intermediate_trees)
+ printTree(out_treels, brtype);
+ if (params->print_tree_lh) {
+ out_treelh.precision(10);
+ out_treelh << logl;
+ double prob;
+ aln->multinomialProb(pattern_lh, prob);
+ out_treelh << "\t" << prob << endl;
+ if (!(brtype & WT_APPEND))
+ out_sitelh << aln->getNSite() << endl;
+ out_sitelh << "Site_Lh ";
+ for (int i = 0; i < aln->getNSite(); i++)
+ out_sitelh << "\t" << pattern_lh[aln->getPatternID(i)];
+ out_sitelh << endl;
+ delete[] pattern_lh;
}
if (params->write_intermediate_trees == 1 && save_all_trees != 1) {
return;
diff --git a/iqtree.h b/iqtree.h
index fd3ed13..b3b6939 100644
--- a/iqtree.h
+++ b/iqtree.h
@@ -97,6 +97,23 @@ public:
void init();
/**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+
+ /**
* setup all necessary parameters (declared as virtual needed for phylosupertree)
*/
virtual void initSettings(Params& params);
@@ -587,6 +604,12 @@ protected:
public:
/**
+ * Generate the initial parsimony/random trees, called by initCandidateTreeSet
+ * @param nParTrees number of parsimony/random trees to generate
+ */
+ void createInitTrees(int nParTrees);
+
+ /**
* Generate the initial candidate tree set
* @param nParTrees number of parsimony trees to generate
* @param nNNITrees number of NNI locally optimal trees to generate
@@ -631,14 +654,14 @@ public:
/** pattern log-likelihood vector for each treels */
// vector<double* > treels_ptnlh;
- /** tree log-likelihood for each treels */
- DoubleVector treels_logl;
+ /** OBSOLETE: tree log-likelihood for each treels */
+// DoubleVector treels_logl;
/** NEWICK string for each treels */
// StrVector treels_newick;
- /** maximum number of distinct candidate trees (tau parameter) */
- int max_candidate_trees;
+ /** OBSOLETE: maximum number of distinct candidate trees (tau parameter) */
+// int max_candidate_trees;
/** log-likelihood threshold (l_min) */
double logl_cutoff;
@@ -658,6 +681,9 @@ public:
/** corresponding RELL log-likelihood */
DoubleVector boot_logl;
+ /** corresponding log-likelihood on original alignment */
+ DoubleVector boot_orig_logl;
+
 /** Set of splits occurring in bootstrap trees */
vector<SplitGraph*> boot_splits;
diff --git a/model/modelcodon.cpp b/model/modelcodon.cpp
index 27b034b..205b38a 100644
--- a/model/modelcodon.cpp
+++ b/model/modelcodon.cpp
@@ -259,6 +259,47 @@ ModelCodon::~ModelCodon() {
}
}
+void ModelCodon::saveCheckpoint() {
+ checkpoint->startStruct("ModelCodon");
+// CKP_ARRAY_SAVE(12, ntfreq);
+ CKP_SAVE(omega);
+// CKP_SAVE(fix_omega);
+// int codon_kappa_style = this->codon_kappa_style;
+// CKP_SAVE(codon_kappa_style);
+ CKP_SAVE(kappa);
+// CKP_SAVE(fix_kappa);
+ CKP_SAVE(kappa2);
+// CKP_SAVE(fix_kappa2);
+// int codon_freq_style = this->codon_freq_style;
+// CKP_SAVE(codon_freq_style);
+ checkpoint->endStruct();
+ ModelGTR::saveCheckpoint();
+}
+
+void ModelCodon::restoreCheckpoint() {
+ ModelGTR::restoreCheckpoint();
+ checkpoint->startStruct("ModelCodon");
+// CKP_ARRAY_RESTORE(12, ntfreq);
+ CKP_RESTORE(omega);
+// CKP_RESTORE(fix_omega);
+// int codon_kappa_style;
+// CKP_RESTORE(codon_kappa_style);
+// this->codon_kappa_style = (CodonKappaStyle)codon_kappa_style;
+ CKP_RESTORE(kappa);
+// CKP_RESTORE(fix_kappa);
+ CKP_RESTORE(kappa2);
+// CKP_RESTORE(fix_kappa2);
+// int codon_freq_style;
+// CKP_RESTORE(codon_freq_style);
+// this->codon_freq_style = (CodonFreqStyle)codon_freq_style;
+ checkpoint->endStruct();
+
+ decomposeRateMatrix();
+ if (phylo_tree)
+ phylo_tree->clearAllPartialLH();
+
+}
+
StateFreqType ModelCodon::initCodon(const char *model_name, StateFreqType freq, bool reset_params) {
string name_upper = model_name;
for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
diff --git a/model/modelcodon.h b/model/modelcodon.h
index 3d617fd..134d72a 100644
--- a/model/modelcodon.h
+++ b/model/modelcodon.h
@@ -49,6 +49,16 @@ public:
*/
virtual ~ModelCodon();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
@return the number of rate entries, equal to the number of non-diagonal elements of the rate matrix
since we store full matrix here
diff --git a/model/modeldna.cpp b/model/modeldna.cpp
index 8fd890c..f3ce873 100644
--- a/model/modeldna.cpp
+++ b/model/modeldna.cpp
@@ -188,6 +188,23 @@ void ModelDNA::init(const char *model_name, string model_params, StateFreqType f
ModelGTR::init(freq);
}
+void ModelDNA::saveCheckpoint() {
+ checkpoint->startStruct("ModelDNA");
+ CKP_ARRAY_SAVE(6, rates);
+ checkpoint->endStruct();
+ ModelGTR::saveCheckpoint();
+}
+
+void ModelDNA::restoreCheckpoint() {
+ ModelGTR::restoreCheckpoint();
+ checkpoint->startStruct("ModelDNA");
+ CKP_ARRAY_RESTORE(6, rates);
+ checkpoint->endStruct();
+
+ decomposeRateMatrix();
+ if (phylo_tree)
+ phylo_tree->clearAllPartialLH();
+}
void ModelDNA::readRates(string str) throw(const char*) {
int nrates = *max_element(param_spec.begin(), param_spec.end());
diff --git a/model/modeldna.h b/model/modeldna.h
index 9f91d29..a37ff73 100644
--- a/model/modeldna.h
+++ b/model/modeldna.h
@@ -62,6 +62,16 @@ public:
*/
virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
* @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
*/
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
index a14cf00..fc4de9a 100644
--- a/model/modelfactory.cpp
+++ b/model/modelfactory.cpp
@@ -76,7 +76,7 @@ ModelsBlock *readModelsDefinition(Params &params) {
return models_block;
}
-ModelFactory::ModelFactory() {
+ModelFactory::ModelFactory() : CheckpointFactory() {
model = NULL;
site_rate = NULL;
store_trans_matrix = false;
@@ -97,7 +97,7 @@ size_t findCloseBracket(string &str, size_t start_pos) {
return string::npos;
}
-ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_block) {
+ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_block) : CheckpointFactory() {
store_trans_matrix = params.store_trans_matrix;
is_storing = false;
joint_optimize = params.optimize_model_rate_joint;
@@ -303,13 +303,16 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
outError("JC is not suitable for site-specific model");
model = new ModelSet(model_str.c_str(), tree);
ModelSet *models = (ModelSet*)model; // assign pointer for convenience
- models->init(params.freq_type);
+ models->init((params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL);
IntVector site_model;
vector<double*> freq_vec;
- readSiteFreq(tree->aln, params.site_freq_file, site_model, freq_vec);
- tree->aln->regroupSitePattern(freq_vec.size(), site_model);
- //tree->aln->ungroupSitePattern();
- tree->setAlignment(tree->aln);
+ bool aln_changed = readSiteFreq(tree->aln, params.site_freq_file, site_model, freq_vec);
+ if (aln_changed) {
+ cout << "Regrouping alignment sites..." << endl;
+ tree->aln->regroupSitePattern(freq_vec.size(), site_model);
+ //tree->aln->ungroupSitePattern();
+ tree->setAlignment(tree->aln);
+ }
int i;
models->pattern_model_map.resize(tree->aln->getNPattern(), -1);
for (i = 0; i < tree->aln->getNSite(); i++) {
@@ -321,7 +324,7 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
for (i = 0; i < freq_vec.size(); i++) {
ModelGTR *modeli;
if (i == 0) {
- modeli = (ModelGTR*)createModel(model_str, models_block, params.freq_type, "", tree, true);
+ modeli = (ModelGTR*)createModel(model_str, models_block, (params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL, "", tree, true);
modeli->getStateFrequency(state_freq);
modeli->getRateMatrix(rates);
} else {
@@ -340,8 +343,12 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
cout << "Alignment is divided into " << models->size() << " partitions with " << tree->aln->getNPattern() << " patterns" << endl;
for (vector<double*>::reverse_iterator it = freq_vec.rbegin(); it != freq_vec.rend(); it++)
if (*it) delete [] (*it);
+
+ // delete information of the old alignment
+ tree->aln->ordered_pattern.clear();
+ tree->deleteAllPartialLh();
}
-
+
// if (model->isMixture())
// cout << "Mixture model with " << model->getNMixtures() << " components!" << endl;
@@ -556,14 +563,50 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
}
+void ModelFactory::setCheckpoint(Checkpoint *checkpoint) {
+ CheckpointFactory::setCheckpoint(checkpoint);
+ model->setCheckpoint(checkpoint);
+ site_rate->setCheckpoint(checkpoint);
+}
+
+void ModelFactory::saveCheckpoint() {
+ model->saveCheckpoint();
+ site_rate->saveCheckpoint();
+ checkpoint->startStruct("ModelFactory");
+// CKP_SAVE(fused_mix_rate);
+// CKP_SAVE(unobserved_ptns);
+// CKP_SAVE(joint_optimize);
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void ModelFactory::restoreCheckpoint() {
+ model->restoreCheckpoint();
+ site_rate->restoreCheckpoint();
+ checkpoint->startStruct("ModelFactory");
+// CKP_RESTORE(fused_mix_rate);
+// CKP_RESTORE(unobserved_ptns);
+// CKP_RESTORE(joint_optimize);
+ checkpoint->endStruct();
+}
+
int ModelFactory::getNParameters() {
int df = model->getNDim() + model->getNDimFreq() + site_rate->getNDim() + site_rate->phylo_tree->branchNum;
return df;
}
-void ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec)
+bool ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec)
{
cout << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl;
site_model.resize(aln->getNSite(), -1);
+ int i;
+ IntVector pattern_to_site; // vector from pattern to the first site
+ pattern_to_site.resize(aln->getNPattern(), -1);
+ for (i = 0; i < aln->getNSite(); i++)
+ if (pattern_to_site[aln->getPatternID(i)] == -1)
+ pattern_to_site[aln->getPatternID(i)] = i;
+
+ bool aln_changed = false;
+
try {
ifstream in;
in.exceptions(ios::failbit | ios::badbit);
@@ -582,24 +625,53 @@ void ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector
if (site_id.size() == 0) throw "No site ID specified";
for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) {
if (site_model[*it] != -1) throw "Duplicated site ID";
- site_model[*it] = model_id;
+ site_model[*it] = freq_vec.size();
}
double *site_freq_entry = new double[aln->num_states];
double sum = 0;
- for (int i = 0; i < aln->num_states; i++) {
+ for (i = 0; i < aln->num_states; i++) {
in >> freq;
if (freq <= 0.0 || freq >= 1.0) throw "Invalid frequency entry";
site_freq_entry[i] = freq;
sum += freq;
}
- if (fabs(sum-1.0) > 1e-4) throw "Frequencies do not sum up to 1";
+ if (fabs(sum-1.0) > 1e-4) {
+ if (fabs(sum-1.0) > 1e-3)
+ outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized");
+ sum = 1.0/sum;
+ for (i = 0; i < aln->num_states; i++)
+ site_freq_entry[i] *= sum;
+ }
aln->convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0)
- freq_vec.push_back(site_freq_entry);
+
+ // 2016-02-01: now check for equality of sites with same site-pattern and same freq
+ int prev_site = pattern_to_site[aln->getPatternID(site_id[0])];
+ if (site_id.size() == 1 && prev_site < site_id[0] && site_model[prev_site] != -1) {
+ // compare freq with prev_site
+ bool matched_freq = true;
+ double *prev_freq = freq_vec[site_model[prev_site]];
+ for (i = 0; i < aln->num_states; i++) {
+ if (site_freq_entry[i] != prev_freq[i]) {
+ matched_freq = false;
+ break;
+ }
+ }
+ if (matched_freq) {
+ site_model[site_id[0]] = site_model[prev_site];
+ } else
+ aln_changed = true;
+ }
+
+ if (site_model[site_id[0]] == freq_vec.size())
+ freq_vec.push_back(site_freq_entry);
+ else
+ delete [] site_freq_entry;
}
if (specified_sites < site_model.size()) {
+ aln_changed = true;
// there are some unspecified sites
cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl;
- for (int i = 0; i < site_model.size(); i++)
+ for (i = 0; i < site_model.size(); i++)
if (site_model[i] == -1)
site_model[i] = freq_vec.size();
freq_vec.push_back(NULL);
@@ -615,6 +687,7 @@ void ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector
} catch(ios::failure) {
outError(ERR_READ_INPUT);
}
+ return aln_changed;
}
double ModelFactory::initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst *model, double initAlpha,
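
The new readSiteFreq() above rescales per-site frequency vectors that do not sum exactly to 1 and, when a later site carries the same site-pattern and the same frequencies as an earlier one, reuses the previously stored vector instead of adding a new model. A minimal standalone sketch of the rescaling step only (not the IQ-TREE code itself; the 1e-4 and 1e-3 tolerances are taken from the hunk above):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Rescale a frequency vector whose entries do not sum exactly to 1;
    // warn only when the deviation exceeds 1e-3, silently fix below that.
    static void normalizeFreq(std::vector<double> &freq) {
        double sum = 0.0;
        for (double f : freq) sum += f;
        if (std::fabs(sum - 1.0) > 1e-4) {
            if (std::fabs(sum - 1.0) > 1e-3)
                std::fprintf(stderr, "Warning: frequencies rescaled to sum to 1\n");
            for (double &f : freq) f /= sum;
        }
    }

    int main() {
        std::vector<double> freq = {0.30, 0.30, 0.20, 0.21};  // sums to 1.01
        normalizeFreq(freq);
        for (double f : freq) std::printf("%.4f ", f);        // prints rescaled entries
        std::printf("\n");
        return 0;
    }
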
diff --git a/model/modelfactory.h b/model/modelfactory.h
index 6baf427..a4334d5 100644
--- a/model/modelfactory.h
+++ b/model/modelfactory.h
@@ -24,6 +24,7 @@
#include "modelsubst.h"
#include "rateheterogeneity.h"
#include "modelsblock.h"
+#include "checkpoint.h"
ModelsBlock *readModelsDefinition(Params &params);
@@ -35,7 +36,7 @@ The values of the map contain 3 matrices consecutively: transition matrix, 1st,
@author BUI Quang Minh <minh.bui at univie.ac.at>
*/
-class ModelFactory : public unordered_map<int, double*>, public Optimization
+class ModelFactory : public unordered_map<int, double*>, public Optimization, public CheckpointFactory
{
public:
@@ -55,14 +56,31 @@ public:
ModelFactory();
+ /**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
* read site specific state frequency vectors from a file to create corresponding model (Ingo's idea)
* @param aln input alignment
* @param site_freq_file file name
* @param site_model (OUT) site to model ID map
* @param freq_vec (OUT) vector of frequency vectors
+ * @return TRUE if alignment needs to be changed, FALSE otherwise
*/
- void readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec);
+ bool readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec);
/**
get the name of the model
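
ModelFactory, ModelSubst and the rate classes added below all follow the same checkpoint pattern: each class opens a struct named after itself, writes its own parameters, closes the struct, and then delegates to its base class. Checkpoint, CKP_SAVE and the related macros come from checkpoint.h (also touched in this patch); the sketch below only illustrates the nesting idea with a simplified stand-in, it is not the real API:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-in for Checkpoint: keys are prefixed with the names of
    // the currently open structs, so nested objects cannot clash.
    struct ToyCheckpoint {
        std::map<std::string, std::string> data;
        std::vector<std::string> path;
        void startStruct(const std::string &s) { path.push_back(s); }
        void endStruct() { path.pop_back(); }
        void put(const std::string &key, const std::string &val) {
            std::string full;
            for (const std::string &p : path) full += p + ".";
            data[full + key] = val;
        }
    };

    struct ToyModelSubst {
        virtual void saveCheckpoint(ToyCheckpoint &ckp) {
            ckp.startStruct("ModelSubst");
            ckp.put("name", "JC");
            ckp.endStruct();
        }
        virtual ~ToyModelSubst() {}
    };

    struct ToyModelGTR : ToyModelSubst {
        virtual void saveCheckpoint(ToyCheckpoint &ckp) {
            ckp.startStruct("ModelGTR");
            ckp.put("rates", "1,2,1,1,4,1");
            ckp.endStruct();
            ToyModelSubst::saveCheckpoint(ckp);   // chain to the base class
        }
    };

    int main() {
        ToyCheckpoint ckp;
        ToyModelGTR gtr;
        gtr.saveCheckpoint(ckp);
        for (const auto &kv : ckp.data)            // prints ModelGTR.rates, ModelSubst.name
            std::cout << kv.first << " = " << kv.second << "\n";
        return 0;
    }
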
diff --git a/model/modelgtr.cpp b/model/modelgtr.cpp
index 1728f8a..39e18d8 100644
--- a/model/modelgtr.cpp
+++ b/model/modelgtr.cpp
@@ -64,6 +64,19 @@ ModelGTR::ModelGTR(PhyloTree *tree, bool count_rates)
num_params = getNumRateEntries() - 1;
}
+void ModelGTR::saveCheckpoint() {
+ checkpoint->startStruct("ModelGTR");
+ checkpoint->endStruct();
+ ModelSubst::saveCheckpoint();
+}
+
+void ModelGTR::restoreCheckpoint() {
+ ModelSubst::restoreCheckpoint();
+ checkpoint->startStruct("ModelGTR");
+ checkpoint->endStruct();
+}
+
+
void ModelGTR::setTree(PhyloTree *tree) {
phylo_tree = tree;
}
@@ -606,8 +619,8 @@ void ModelGTR::decomposeRateMatrix(){
for (i = 1; i < num_states; i++)
eigenvalues[i] = -mu;
- double *f = new double[num_states];
- for (i = 0; i < num_states; i++) f[i] = sqrt(state_freq[i]);
+// double *f = new double[num_states];
+// for (i = 0; i < num_states; i++) f[i] = sqrt(state_freq[i]);
// compute eigenvectors
memset(eigenvectors, 0, num_states*num_states*sizeof(double));
memset(inv_eigenvectors, 0, num_states*num_states*sizeof(double));
diff --git a/model/modelgtr.h b/model/modelgtr.h
index b73007a..c26bc52 100644
--- a/model/modelgtr.h
+++ b/model/modelgtr.h
@@ -70,6 +70,16 @@ public:
*/
virtual ~ModelGTR();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
* @return model name
*/
diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp
index b4239de..2923b3a 100644
--- a/model/modelmixture.cpp
+++ b/model/modelmixture.cpp
@@ -1003,16 +1003,17 @@ ModelSubst* createModel(string model_str, ModelsBlock *models_block, StateFreqTy
{
model = new ModelSubst(tree->aln->num_states);
} else */
- if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) ||
- (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) ||
- (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) {
- model = new ModelGTR(tree, count_rates);
- if (freq_params != "")
- ((ModelGTR*)model)->readStateFreq(freq_params);
- if (model_params != "")
- ((ModelGTR*)model)->readRates(model_params);
- ((ModelGTR*)model)->init(freq_type);
- } else if (model_str == "UNREST") {
+// if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) ||
+// (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) ||
+// (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) {
+// model = new ModelGTR(tree, count_rates);
+// if (freq_params != "")
+// ((ModelGTR*)model)->readStateFreq(freq_params);
+// if (model_params != "")
+// ((ModelGTR*)model)->readRates(model_params);
+// ((ModelGTR*)model)->init(freq_type);
+// } else
+ if (model_str == "UNREST") {
freq_type = FREQ_EQUAL;
//params.optimize_by_newton = false;
tree->optimize_by_newton = false;
@@ -1276,6 +1277,48 @@ ModelMixture::~ModelMixture() {
}
}
+void ModelMixture::setCheckpoint(Checkpoint *checkpoint) {
+ CheckpointFactory::setCheckpoint(checkpoint);
+ for (iterator it = begin(); it != end(); it++)
+ (*it)->setCheckpoint(checkpoint);
+}
+
+void ModelMixture::saveCheckpoint() {
+ checkpoint->startStruct("ModelMixture");
+// CKP_SAVE(fix_prop);
+ int nmix = getNMixtures();
+ CKP_ARRAY_SAVE(nmix, prop);
+ int part = 1;
+ for (iterator it = begin(); it != end(); it++, part++) {
+ checkpoint->startStruct("Component" + convertIntToString(part));
+ (*it)->saveCheckpoint();
+ checkpoint->endStruct();
+ }
+ checkpoint->endStruct();
+
+ ModelGTR::saveCheckpoint();
+}
+
+void ModelMixture::restoreCheckpoint() {
+ ModelGTR::restoreCheckpoint();
+
+ checkpoint->startStruct("ModelMixture");
+// CKP_RESTORE(fix_prop);
+ int nmix = getNMixtures();
+ CKP_ARRAY_RESTORE(nmix, prop);
+ int part = 1;
+ for (iterator it = begin(); it != end(); it++, part++) {
+ checkpoint->startStruct("Component" + convertIntToString(part));
+ (*it)->restoreCheckpoint();
+ checkpoint->endStruct();
+ }
+ checkpoint->endStruct();
+
+ decomposeRateMatrix();
+ if (phylo_tree)
+ phylo_tree->clearAllPartialLH();
+}
+
int ModelMixture::getNDim() {
// int dim = (fix_prop) ? 0: (size()-1);
int dim = 0;
diff --git a/model/modelmixture.h b/model/modelmixture.h
index 45abec9..f38be63 100644
--- a/model/modelmixture.h
+++ b/model/modelmixture.h
@@ -58,6 +58,22 @@ public:
virtual ~ModelMixture();
+ /**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
* @return TRUE if this is a mixture model, FALSE otherwise
diff --git a/model/modelsubst.cpp b/model/modelsubst.cpp
index 1b79b65..15ccea6 100644
--- a/model/modelsubst.cpp
+++ b/model/modelsubst.cpp
@@ -12,7 +12,7 @@
#include "modelsubst.h"
#include "tools.h"
-ModelSubst::ModelSubst(int nstates) : Optimization()
+ModelSubst::ModelSubst(int nstates) : Optimization(), CheckpointFactory()
{
num_states = nstates;
name = "JC";
@@ -23,6 +23,38 @@ ModelSubst::ModelSubst(int nstates) : Optimization()
freq_type = FREQ_EQUAL;
}
+void ModelSubst::saveCheckpoint() {
+ checkpoint->startStruct("ModelSubst");
+// CKP_SAVE(num_states);
+ CKP_SAVE(name);
+// CKP_SAVE(full_name);
+// CKP_SAVE(freq_type);
+ if (freq_type == FREQ_EMPIRICAL || freq_type == FREQ_ESTIMATE)
+ CKP_ARRAY_SAVE(num_states, state_freq);
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+ /**
+ restore object from the checkpoint
+ @param ckp checkpoint to restore from
+ */
+void ModelSubst::restoreCheckpoint() {
+ CheckpointFactory::restoreCheckpoint();
+ checkpoint->startStruct("ModelSubst");
+// CKP_RESTORE(num_states);
+ CKP_RESTORE(name);
+// CKP_RESTORE(full_name);
+// int freq_type = this->freq_type;
+// CKP_RESTORE(freq_type);
+// this->freq_type = (StateFreqType)freq_type;
+ if (freq_type == FREQ_EMPIRICAL || freq_type == FREQ_ESTIMATE)
+ CKP_ARRAY_RESTORE(num_states, state_freq);
+ checkpoint->endStruct();
+
+ decomposeRateMatrix();
+}
+
// here the simplest Juke-Cantor model is implemented, valid for all kind of data (DNA, AA,...)
void ModelSubst::computeTransMatrix(double time, double *trans_matrix) {
double non_diagonal = (1.0 - exp(-time*num_states/(num_states - 1))) / num_states;
diff --git a/model/modelsubst.h b/model/modelsubst.h
index e7a5442..45444f2 100644
--- a/model/modelsubst.h
+++ b/model/modelsubst.h
@@ -15,6 +15,7 @@
#include <string>
#include "tools.h"
#include "optimization.h"
+#include "checkpoint.h"
using namespace std;
@@ -23,7 +24,7 @@ Substitution model abstract class
@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
*/
-class ModelSubst: public Optimization
+class ModelSubst: public Optimization, public CheckpointFactory
{
friend class ModelFactory;
@@ -256,6 +257,20 @@ public:
return NULL;
}
+ /*****************************************************
+ Checkpointing facility
+ *****************************************************/
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
number of states
*/
diff --git a/model/partitionmodel.cpp b/model/partitionmodel.cpp
index c1a8be0..cb853c1 100644
--- a/model/partitionmodel.cpp
+++ b/model/partitionmodel.cpp
@@ -69,6 +69,44 @@ PartitionModel::PartitionModel(Params &params, PhyloSuperTree *tree, ModelsBlock
}
}
+void PartitionModel::setCheckpoint(Checkpoint *checkpoint) {
+ ModelFactory::setCheckpoint(checkpoint);
+ PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
+ for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++)
+ (*it)->getModelFactory()->setCheckpoint(checkpoint);
+}
+
+void PartitionModel::saveCheckpoint() {
+ checkpoint->startStruct("PartitionModel");
+ CKP_SAVE(linked_alpha);
+ PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
+ int part = 0;
+ for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
+ checkpoint->startStruct(tree->part_info[part].name);
+ (*it)->getModelFactory()->saveCheckpoint();
+ checkpoint->endStruct();
+ }
+ checkpoint->endStruct();
+
+ CheckpointFactory::saveCheckpoint();
+}
+
+void PartitionModel::restoreCheckpoint() {
+ CheckpointFactory::restoreCheckpoint();
+ checkpoint->startStruct("PartitionModel");
+ CKP_RESTORE(linked_alpha);
+
+ PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
+ int part = 0;
+ for (PhyloSuperTree::iterator it = tree->begin(); it != tree->end(); it++, part++) {
+ checkpoint->startStruct(tree->part_info[part].name);
+ (*it)->getModelFactory()->restoreCheckpoint();
+ checkpoint->endStruct();
+ }
+
+ checkpoint->endStruct();
+}
+
int PartitionModel::getNParameters() {
PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
int df = 0;
diff --git a/model/partitionmodel.h b/model/partitionmodel.h
index b3a72be..8fe7fda 100644
--- a/model/partitionmodel.h
+++ b/model/partitionmodel.h
@@ -45,6 +45,22 @@ public:
~PartitionModel();
/**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+ /**
* @return #parameters of the model + # branches
*/
virtual int getNParameters();
@@ -72,6 +88,8 @@ public:
protected:
+
+ /** linked Gamma shape alpha between partitions */
double linked_alpha;
};
diff --git a/model/ratefree.cpp b/model/ratefree.cpp
index 91fc0fd..ba5bcb6 100644
--- a/model/ratefree.cpp
+++ b/model/ratefree.cpp
@@ -54,6 +54,30 @@ RateFree::RateFree(int ncat, double start_alpha, string params, bool sorted_rate
}
}
+void RateFree::saveCheckpoint() {
+ checkpoint->startStruct("RateFree");
+// CKP_SAVE(fix_params);
+// CKP_SAVE(sorted_rates);
+// CKP_SAVE(optimize_alg);
+ CKP_ARRAY_SAVE(ncategory, prop);
+ CKP_ARRAY_SAVE(ncategory, rates);
+ checkpoint->endStruct();
+ RateGamma::saveCheckpoint();
+}
+
+void RateFree::restoreCheckpoint() {
+ RateGamma::restoreCheckpoint();
+ checkpoint->startStruct("RateFree");
+// CKP_RESTORE(fix_params);
+// CKP_RESTORE(sorted_rates);
+// CKP_RESTORE(optimize_alg);
+ CKP_ARRAY_RESTORE(ncategory, prop);
+ CKP_ARRAY_RESTORE(ncategory, rates);
+ checkpoint->endStruct();
+
+// setNCategory(ncategory);
+}
+
void RateFree::setNCategory(int ncat) {
// initialize with gamma rates
diff --git a/model/ratefree.h b/model/ratefree.h
index 6a9b341..327012b 100644
--- a/model/ratefree.h
+++ b/model/ratefree.h
@@ -22,6 +22,16 @@ public:
virtual ~RateFree();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
@return true if this is a Gamma model (default: false)
*/
diff --git a/model/ratefreeinvar.cpp b/model/ratefreeinvar.cpp
index 2b0213a..b3b8c01 100644
--- a/model/ratefreeinvar.cpp
+++ b/model/ratefreeinvar.cpp
@@ -15,6 +15,20 @@ RateFreeInvar::RateFreeInvar(int ncat, double start_alpha, string params, bool s
full_name = "Invar+" + full_name;
}
+void RateFreeInvar::saveCheckpoint() {
+ checkpoint->startStruct("RateFreeInvar");
+ checkpoint->endStruct();
+ RateInvar::saveCheckpoint();
+ RateFree::saveCheckpoint();
+}
+
+void RateFreeInvar::restoreCheckpoint() {
+ RateInvar::restoreCheckpoint();
+ RateFree::restoreCheckpoint();
+ checkpoint->startStruct("RateFreeInvar");
+ checkpoint->endStruct();
+}
+
void RateFreeInvar::setNCategory(int ncat) {
RateFree::setNCategory(ncat);
name = "+I" + name;
diff --git a/model/ratefreeinvar.h b/model/ratefreeinvar.h
index 458e9d7..cb1422c 100644
--- a/model/ratefreeinvar.h
+++ b/model/ratefreeinvar.h
@@ -21,6 +21,16 @@ public:
*/
RateFreeInvar(int ncat, double start_alpha, string params, bool sorted_rates, double p_invar_sites, string opt_alg, PhyloTree *tree);
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
return the number of dimensions
diff --git a/model/rategamma.cpp b/model/rategamma.cpp
index c0ddd47..6957c3f 100644
--- a/model/rategamma.cpp
+++ b/model/rategamma.cpp
@@ -42,6 +42,28 @@ RateGamma::RateGamma(int ncat, double shape, bool median, PhyloTree *tree) : Rat
setNCategory(ncat);
}
+void RateGamma::saveCheckpoint() {
+ checkpoint->startStruct("RateGamma");
+ CKP_SAVE(gamma_shape);
+// CKP_SAVE(fix_gamma_shape);
+// CKP_SAVE(cut_median);
+// CKP_SAVE(ncategory);
+ checkpoint->endStruct();
+ RateHeterogeneity::saveCheckpoint();
+}
+
+void RateGamma::restoreCheckpoint() {
+ RateHeterogeneity::restoreCheckpoint();
+ checkpoint->startStruct("RateGamma");
+ CKP_RESTORE(gamma_shape);
+// CKP_RESTORE(fix_gamma_shape);
+// CKP_RESTORE(cut_median);
+// CKP_RESTORE(ncategory);
+ checkpoint->endStruct();
+ // necessary to compute rates after restoring gamma_shape
+ computeRates();
+}
+
void RateGamma::setNCategory(int ncat) {
ncategory = ncat;
if (rates) delete [] rates;
diff --git a/model/rategamma.h b/model/rategamma.h
index df55149..8be01b6 100644
--- a/model/rategamma.h
+++ b/model/rategamma.h
@@ -56,6 +56,16 @@ public:
*/
virtual ~RateGamma();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
@return true if this is a Gamma model (default: false)
*/
diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp
index cd36744..dc293bd 100644
--- a/model/rategammainvar.cpp
+++ b/model/rategammainvar.cpp
@@ -29,6 +29,23 @@ RateGammaInvar::RateGammaInvar(int ncat, double shape, bool median,
computeRates();
}
+void RateGammaInvar::saveCheckpoint() {
+ checkpoint->startStruct("RateGammaInvar");
+// CKP_SAVE(joint_optimize);
+ checkpoint->endStruct();
+ RateInvar::saveCheckpoint();
+ RateGamma::saveCheckpoint();
+}
+
+void RateGammaInvar::restoreCheckpoint() {
+ // should restore p_invar first before gamma, because RateGamma will call computeRates()
+ RateInvar::restoreCheckpoint();
+ RateGamma::restoreCheckpoint();
+ checkpoint->startStruct("RateGammaInvar");
+// CKP_RESTORE(joint_optimize);
+ checkpoint->endStruct();
+}
+
void RateGammaInvar::setNCategory(int ncat) {
RateGamma::setNCategory(ncat);
name = "+I" + name;
diff --git a/model/rategammainvar.h b/model/rategammainvar.h
index 16b6901..c4092c8 100644
--- a/model/rategammainvar.h
+++ b/model/rategammainvar.h
@@ -39,6 +39,16 @@ public:
*/
RateGammaInvar(int ncat, double shape, bool median, double p_invar_sites, bool simultaneous, PhyloTree *tree);
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
get the proportion of sites under a specified category.
@param category category ID from 0 to #category-1
diff --git a/model/rateheterogeneity.cpp b/model/rateheterogeneity.cpp
index 3bc02d3..fe20164 100644
--- a/model/rateheterogeneity.cpp
+++ b/model/rateheterogeneity.cpp
@@ -23,7 +23,7 @@
RateHeterogeneity::RateHeterogeneity()
- : Optimization()
+ : Optimization(), CheckpointFactory()
{
name = "";
full_name = "Uniform";
@@ -38,6 +38,21 @@ RateHeterogeneity::~RateHeterogeneity()
{
}
+void RateHeterogeneity::saveCheckpoint() {
+ checkpoint->startStruct("RateHeterogeneity");
+// CKP_SAVE(name);
+// CKP_SAVE(full_name);
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void RateHeterogeneity::restoreCheckpoint() {
+ checkpoint->startStruct("RateHeterogeneity");
+// CKP_RESTORE(name);
+// CKP_RESTORE(full_name);
+ checkpoint->endStruct();
+}
+
void RateHeterogeneity::writeSiteRates(ostream &out, DoubleVector &pattern_rates, IntVector &pattern_cat, int ncategory) {
int nsite = phylo_tree->aln->getNSite();
int i;
diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h
index 596038e..85e14c6 100644
--- a/model/rateheterogeneity.h
+++ b/model/rateheterogeneity.h
@@ -24,6 +24,7 @@
#include "optimization.h"
#include <string>
#include "tools.h"
+#include "checkpoint.h"
using namespace std;
@@ -40,7 +41,7 @@ class for among-site rate heterogeneity, the default is homogeneous (equal) rate
@author BUI Quang Minh <minh.bui at univie.ac.at>
*/
-class RateHeterogeneity : public Optimization
+class RateHeterogeneity : public Optimization, public CheckpointFactory
{
friend class ModelFactory;
@@ -55,6 +56,16 @@ public:
*/
virtual ~RateHeterogeneity();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
set phylogenetic tree
@param tree associated phyogenetic tree
diff --git a/model/rateinvar.cpp b/model/rateinvar.cpp
index d0fea17..58b7d52 100644
--- a/model/rateinvar.cpp
+++ b/model/rateinvar.cpp
@@ -38,6 +38,24 @@ RateInvar::RateInvar(double p_invar_sites, PhyloTree *tree)
}
}
+void RateInvar::saveCheckpoint() {
+ checkpoint->startStruct("RateInvar");
+ CKP_SAVE(p_invar);
+// CKP_SAVE(fix_p_invar);
+// CKP_SAVE(optimize_p_invar);
+ checkpoint->endStruct();
+ RateHeterogeneity::saveCheckpoint();
+}
+
+void RateInvar::restoreCheckpoint() {
+ RateHeterogeneity::restoreCheckpoint();
+ checkpoint->startStruct("RateInvar");
+ CKP_RESTORE(p_invar);
+// CKP_RESTORE(fix_p_invar);
+// CKP_RESTORE(optimize_p_invar);
+ checkpoint->endStruct();
+}
+
string RateInvar::getNameParams() {
ostringstream str;
str << "+I{" << p_invar << '}';
diff --git a/model/rateinvar.h b/model/rateinvar.h
index b20a515..6ae98ed 100644
--- a/model/rateinvar.h
+++ b/model/rateinvar.h
@@ -43,6 +43,16 @@ public:
*/
RateInvar(double p_invar_sites, PhyloTree *tree);
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
* @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
*/
diff --git a/mtree.cpp b/mtree.cpp
index 0a74fa2..9a39486 100644
--- a/mtree.cpp
+++ b/mtree.cpp
@@ -236,11 +236,11 @@ void MTree::printTree(const char *ofile, int brtype)
}
}
-string MTree::getTreeString() {
- stringstream tree_stream;
- printTree(tree_stream);
- return tree_stream.str();
-}
+//string MTree::getTreeString() {
+// stringstream tree_stream;
+// printTree(tree_stream);
+// return tree_stream.str();
+//}
void MTree::printTree(ostream &out, int brtype) {
if (root->isLeaf()) {
diff --git a/mtree.h b/mtree.h
index 6cad508..44e7c42 100644
--- a/mtree.h
+++ b/mtree.h
@@ -154,7 +154,7 @@ public:
void printTree(ostream & out, int brtype = WT_BR_LEN);
- string getTreeString();
+// string getTreeString();
/**
print the tree to the output file in newick format
diff --git a/optimization.cpp b/optimization.cpp
index 22c21e9..92a8093 100644
--- a/optimization.cpp
+++ b/optimization.cpp
@@ -555,7 +555,7 @@ void Optimization::lnsrch(int n, double xold[], double fold, double g[], double
const int MAX_ITER = 3;
-extern double random_double();
+extern double random_double(int *rstream);
double Optimization::minimizeMultiDimen(double guess[], int ndim, double lower[], double upper[], bool bound_check[], double gtol) {
int i, iter;
diff --git a/optimization.h b/optimization.h
index e3c8fb2..f34c24d 100644
--- a/optimization.h
+++ b/optimization.h
@@ -19,8 +19,9 @@ Optimization class, implement some methods like Brent, Newton-Raphson (for 1 var
*/
class Optimization{
public:
- Optimization();
+ /** constructor */
+ Optimization();
/*****************************************************
One dimensional optimization with Brent method
diff --git a/pda.cpp b/pda.cpp
index cbb0aa0..9a17049 100644
--- a/pda.cpp
+++ b/pda.cpp
@@ -1765,12 +1765,11 @@ string _log_file;
int _exit_wait_optn = FALSE;
-extern "C" void startLogFile() {
- _out_buf.open(_log_file.c_str());
-}
-
-extern "C" void appendLogFile() {
- _out_buf.open(_log_file.c_str(), ios::app);
+extern "C" void startLogFile(bool append_log) {
+ if (append_log)
+ _out_buf.open(_log_file.c_str(), ios::app);
+ else
+ _out_buf.open(_log_file.c_str());
}
extern "C" void endLogFile() {
@@ -2183,9 +2182,44 @@ int main(int argc, char *argv[])
//Params params;
parseArg(argc, argv, Params::getInstance());
+ // 2015-12-05
+ Checkpoint *checkpoint = new Checkpoint;
+ string filename = (string)Params::getInstance().out_prefix + ".ckp.gz";
+ checkpoint->setFileName(filename);
+
+ bool append_log = false;
+
+ if (!Params::getInstance().ignore_checkpoint) {
+ checkpoint->load();
+ if (checkpoint->hasKey("finished")) {
+ if (checkpoint->getBool("finished")) {
+ if (Params::getInstance().force_unfinished) {
+ cout << "NOTE: Continue analysis although a previous run already finished" << endl;
+ } else {
+ outError("Checkpoint (" + filename + ") indicates that a previous run successfully finished\n" +
+ "Use `-redo` option if you really want to redo the analysis and overwrite all output files.");
+ delete checkpoint;
+ return EXIT_FAILURE;
+ }
+ } else {
+ append_log = true;
+ }
+ } else {
+ outWarning("Ignore invalid checkpoint file " + filename);
+ checkpoint->clear();
+ }
+ }
+
+
_log_file = Params::getInstance().out_prefix;
_log_file += ".log";
- startLogFile();
+ startLogFile(append_log);
+
+ if (append_log) {
+ cout << endl << "******************************************************"
+ << endl << "CHECKPOINT: Resuming analysis from " << filename << endl << endl;
+ }
+
atexit(funcExit);
signal(SIGABRT, &funcAbort);
signal(SIGFPE, &funcAbort);
@@ -2245,16 +2279,18 @@ int main(int argc, char *argv[])
//#endif
cout << "Command:";
- for (int i = 0; i < argc; i++)
+ int i;
+ for (i = 0; i < argc; i++)
cout << " " << argv[i];
cout << endl;
+ checkpoint->get("iqtree.seed", Params::getInstance().ran_seed);
cout << "Seed: " << Params::getInstance().ran_seed << " ";
- init_random(Params::getInstance().ran_seed);
+ init_random(Params::getInstance().ran_seed, true);
- time_t cur_time;
- time(&cur_time);
- cout << "Time: " << ctime(&cur_time);
+ time_t start_time;
+ time(&start_time);
+ cout << "Time: " << ctime(&start_time);
if (Params::getInstance().lk_no_avx)
instruction_set = min(instruction_set, 6);
@@ -2317,6 +2353,44 @@ int main(int argc, char *argv[])
cout.precision(3);
cout.setf(ios::fixed);
+
+ // checkpoint general run information
+ checkpoint->startStruct("iqtree");
+ string command;
+
+ if (CKP_RESTORE_STRING(command)) {
+ // compare command between saved and current commands
+ stringstream ss(command);
+ string str;
+ bool mismatch = false;
+ for (i = 1; i < argc; i++) {
+ if (!(ss >> str)) {
+ outWarning("Number of command-line arguments differs from checkpoint");
+ mismatch = true;
+ break;
+ }
+ if (str != argv[i]) {
+ outWarning((string)"Command-line argument `" + argv[i] + "` differs from checkpoint `" + str + "`");
+ mismatch = true;
+ }
+ }
+ if (mismatch) {
+ outWarning("Command-line differs from checkpoint!");
+ }
+ command = "";
+ }
+
+ for (i = 1; i < argc; i++)
+ command += string(" ") + argv[i];
+ CKP_SAVE(command);
+ int seed = Params::getInstance().ran_seed;
+ CKP_SAVE(seed);
+ CKP_SAVE(start_time);
+ stringstream sversion;
+ sversion << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR << "." << iqtree_VERSION_PATCH;
+ string version = sversion.str();
+ CKP_SAVE(version);
+ checkpoint->endStruct();
// call the main function
if (Params::getInstance().tree_gen != NONE) {
@@ -2352,7 +2426,7 @@ int main(int argc, char *argv[])
if (Params::getInstance().second_align)
computeMulProb(Params::getInstance());
} else {
- runPhyloAnalysis(Params::getInstance());
+ runPhyloAnalysis(Params::getInstance(), checkpoint);
}
} else if (Params::getInstance().ngs_file || Params::getInstance().ngs_mapped_reads) {
runNGSAnalysis(Params::getInstance());
@@ -2413,8 +2487,9 @@ int main(int argc, char *argv[])
}
}
- time(&cur_time);
- cout << "Date and Time: " << ctime(&cur_time);
+ delete checkpoint;
+ time(&start_time);
+ cout << "Date and Time: " << ctime(&start_time);
finish_random();
return EXIT_SUCCESS;
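
main() now writes a checkpoint to <prefix>.ckp.gz and, unless the checkpoint is ignored, decides at startup whether to refuse a finished run, resume an unfinished one (appending to the existing .log file), or start fresh. A sketch of that decision only, with a plain map standing in for the gzip-backed Checkpoint; the flag names mirror the hunk above:

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        // pretend this was loaded from <prefix>.ckp.gz
        std::map<std::string, std::string> ckp = {{"finished", "false"}};
        bool ignore_checkpoint = false;   // set by -redo
        bool force_unfinished  = false;
        bool append_log = false;

        if (!ignore_checkpoint && ckp.count("finished")) {
            if (ckp["finished"] == "true" && !force_unfinished) {
                std::cerr << "A previous run already finished; use -redo to overwrite it\n";
                return 1;
            }
            append_log = true;            // resume: keep appending to the old .log
        }
        std::cout << (append_log ? "resuming from checkpoint\n" : "starting fresh\n");
        return 0;
    }
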
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
index 6b93d76..d321ac3 100644
--- a/phyloanalysis.cpp
+++ b/phyloanalysis.cpp
@@ -681,6 +681,11 @@ void reportPhyloAnalysis(Params &params, string &original_model,
reportRate(out, tree);
}
+ if (params.lmap_num_quartets) {
+ tree.reportLikelihoodMapping(out);
+ }
+
+
/*
out << "RATE HETEROGENEITY" << endl << "------------------" << endl
<< endl;
@@ -738,11 +743,11 @@ void reportPhyloAnalysis(Params &params, string &original_model,
<< "NNI log-likelihood cutoff: " << tree.getNNICutoff() << endl
<< endl;
*/
- if (params.compute_ml_tree) {
+ if (params.compute_ml_tree && (params.min_iterations > 0 || original_model.find("ONLY") != string::npos)) {
if (original_model.find("ONLY") != string::npos)
out << "TREE USED FOR MODEL SELECTION" << endl
<< "-----------------------------" << endl << endl;
- else
+ else
out << "MAXIMUM LIKELIHOOD TREE" << endl
<< "-----------------------" << endl << endl;
@@ -860,18 +865,24 @@ void reportPhyloAnalysis(Params &params, string &original_model,
if (params.treeset_file) {
evaluateTrees(params, &tree, info, distinct_trees);
out.precision(4);
+ out.setf(ios_base::fixed);
out << endl << "USER TREES" << endl << "----------" << endl << endl;
- out << "See " << params.treeset_file << ".trees for trees with branch lengths." << endl << endl;
+ out << "See " << params.out_prefix << ".trees for trees with branch lengths." << endl << endl;
if (params.topotest_replicates && info.size() > 1) {
- if (params.do_weighted_test) {
- out << "Tree logL deltaL bp-RELL p-KH p-SH p-WKH p-WSH c-ELW" << endl;
- out << "-------------------------------------------------------------------------------" << endl;
- } else {
- out << "Tree logL deltaL bp-RELL p-KH p-SH c-ELW" << endl;
- out << "-------------------------------------------------------------" << endl;
-
- }
+ out << "Tree logL deltaL bp-RELL p-KH p-SH ";
+ if (params.do_weighted_test)
+ out << "p-WKH p-WSH ";
+ out << "c-ELW";
+ if (params.do_au_test)
+ out << " p-AU";
+
+ out << endl << "------------------------------------------------------------------";
+ if (params.do_weighted_test)
+ out << "------------------";
+ if (params.do_au_test)
+ out << "-------";
+ out << endl;
} else {
out << "Tree logL deltaL" << endl;
out << "-------------------------" << endl;
@@ -918,6 +929,7 @@ void reportPhyloAnalysis(Params &params, string &original_model,
out << " - ";
else
out << " + ";
+
if (params.do_weighted_test) {
out.width(6);
out << right << info[tid].wkh_pvalue;
@@ -935,9 +947,19 @@ void reportPhyloAnalysis(Params &params, string &original_model,
out.width(6);
out << info[tid].elw_value;
if (info[tid].elw_confident)
- out << " +";
+ out << " + ";
else
- out << " -";
+ out << " - ";
+
+ if (params.do_au_test) {
+ out.width(6);
+ out << right << info[tid].au_pvalue;
+ if (info[tid].au_pvalue < 0.05)
+ out << " - ";
+ else
+ out << " + ";
+ }
+
out << endl;
tid++;
}
@@ -952,7 +974,11 @@ void reportPhyloAnalysis(Params &params, string &original_model,
out << "p-WKH : p-value of weighted KH test." << endl
<< "p-WSH : p-value of weighted SH test." << endl;
}
- out << "c-ELW : Expected Likelihood Weight (Strimmer & Rambaut 2002)." << endl << endl
+ out << "c-ELW : Expected Likelihood Weight (Strimmer & Rambaut 2002)." << endl;
+ if (params.do_au_test) {
+ out << "p-AU : p-value of approximately unbiased (AU) test (Shimodaira, 2002)." << endl;
+ }
+ out << endl
<< "Plus signs denote the 95% confidence sets." << endl
<< "Minus signs denote significant exclusion." << endl
<< "All tests performed "
@@ -1054,6 +1080,10 @@ void reportPhyloAnalysis(Params &params, string &original_model,
cout << " Site log-likelihoods: " << params.out_prefix << ".sitelh" << endl;
}
}
+ if (params.lmap_num_quartets) {
+ cout << " Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
+ cout << " Likelihood mapping plot (EPS): " << params.out_prefix << ".lmap.eps" << endl;
+ }
cout << " Screen log file: " << params.out_prefix << ".log" << endl;
/* if (original_model == "WHTEST")
cout <<" WH-TEST report: " << params.out_prefix << ".whtest" << endl;*/
@@ -1164,8 +1194,8 @@ void printAnalysisInfo(int model_df, IQTree& iqtree, Params& params) {
void computeMLDist(Params& params, IQTree& iqtree, string &dist_file, double begin_time) {
double longest_dist;
- stringstream best_tree_string;
- iqtree.printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
+// stringstream best_tree_string;
+// iqtree.printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
cout << "Computing ML distances based on estimated model parameters...";
double *ml_dist = NULL;
double *ml_var = NULL;
@@ -1216,8 +1246,9 @@ void initializeParams(Params &params, IQTree &iqtree, vector<ModelInfo> &model_i
bool test_only = params.model_name.find("ONLY") != string::npos;
/* initialize substitution model */
if (params.model_name.substr(0, 4) == "TEST") {
- if (iqtree.isSuperTree())
- ((PhyloSuperTree*) &iqtree)->mapTrees();
+ // TODO: check if necessary
+// if (iqtree.isSuperTree())
+// ((PhyloSuperTree*) &iqtree)->mapTrees();
double start_cpu_time = getCPUTime();
double start_real_time = getRealTime();
ofstream fmodel;
@@ -1276,8 +1307,9 @@ void initializeParams(Params &params, IQTree &iqtree, vector<ModelInfo> &model_i
if (params.gbo_replicates)
params.speed_conf = 1.0;
- if (iqtree.isSuperTree())
- ((PhyloSuperTree*) &iqtree)->mapTrees();
+ // TODO: check if necessary
+// if (iqtree.isSuperTree())
+// ((PhyloSuperTree*) &iqtree)->mapTrees();
// set parameter for the current tree
// iqtree.setParams(params);
@@ -1424,6 +1456,12 @@ void printMiscInfo(Params &params, IQTree &iqtree, double *pattern_lh) {
else
printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh);
}
+
+ if (params.print_site_state_freq) {
+ string site_freq_file = params.out_prefix;
+ site_freq_file += ".sitesf";
+ printSiteStateFreq(site_freq_file.c_str(), &iqtree);
+ }
if (params.print_site_posterior) {
cout << "Computing mixture posterior probabilities" << endl;
@@ -1630,6 +1668,8 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
ModelsBlock *models_block = readModelsDefinition(params);
initializeParams(params, iqtree, model_info, models_block);
+
+ iqtree.restoreCheckpoint();
iqtree.initSettings(params);
/*********************** INITIAL MODEL OPTIMIZATION *****************/
@@ -1711,10 +1751,34 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
// Optimize model parameters and branch lengths using ML for the initial tree
iqtree.clearAllPartialLH();
- initTree = iqtree.optimizeModelParameters(true, initEpsilon);
+ iqtree.getModelFactory()->restoreCheckpoint();
+ if (iqtree.getCheckpoint()->getBool("finishedModelInit")) {
+ // model optimization already done: ignore this step
+ if (!iqtree.candidateTrees.empty())
+ iqtree.readTreeString(iqtree.candidateTrees.getTopTrees(1)[0]);
+ iqtree.setCurScore(iqtree.computeLikelihood());
+ initTree = iqtree.getTreeString();
+ cout << "CHECKPOINT: Model parameters restored, LogL: " << iqtree.getCurScore() << endl;
+ } else {
+ initTree = iqtree.optimizeModelParameters(true, initEpsilon);
+ iqtree.saveCheckpoint();
+ iqtree.getModelFactory()->saveCheckpoint();
+ iqtree.getCheckpoint()->putBool("finishedModelInit", true);
+ iqtree.getCheckpoint()->dump();
+ }
+
+ if (params.lmap_num_quartets) {
+ cout << "Performing likelihood mapping with " << params.lmap_num_quartets << " quartets..." << endl;
+ double lkmap_time = getRealTime();
+ iqtree.doLikelihoodMapping();
+ cout << getRealTime()-lkmap_time << " seconds" << endl;
+ }
+
+ bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
+ bool finishedInitTree = iqtree.getCheckpoint()->getBool("finishedInitTree");
// now overwrite with random tree
- if (params.start_tree == STT_RANDOM_TREE) {
+ if (params.start_tree == STT_RANDOM_TREE && !finishedInitTree) {
cout << "Generate random initial Yule-Harding tree..." << endl;
iqtree.generateRandomTree(YULE_HARDING);
iqtree.wrapperFixNegativeBranch(true);
@@ -1726,9 +1790,12 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
/****************** NOW PERFORM MAXIMUM LIKELIHOOD TREE RECONSTRUCTION ******************/
// Update best tree
- iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
+ if (!finishedInitTree)
+ iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
if (params.min_iterations > 0) {
+ if (!iqtree.isBifurcating())
+ outError("Tree search does not work with initial multifurcating tree. Please specify `-n 0` to avoid this.");
cout << "--------------------------------------------------------------------" << endl;
cout << "| INITIALIZING CANDIDATE TREE SET |" << endl;
cout << "--------------------------------------------------------------------" << endl;
@@ -1749,7 +1816,7 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
// params.compute_ml_dist = false;
// }
- if ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch) {
+ if (!finishedInitTree && ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch)) {
computeMLDist(params, iqtree, dist_file, getCPUTime());
if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
// NEW 2015-08-10: always compute BIONJ tree into the candidate set
@@ -1771,10 +1838,12 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
}
}
+// iqtree.saveCheckpoint();
+
double cputime_search_start = getCPUTime();
double realtime_search_start = getRealTime();
- if (params.min_iterations > 0) {
+ if (params.min_iterations > 0 && !finishedCandidateSet) {
double initTime = getCPUTime();
// if (!params.user_file && (params.start_tree == STT_PARSIMONY || params.start_tree == STT_PLL_PARSIMONY))
@@ -1791,6 +1860,13 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
<< getCPUTime() - initTime << endl;
}
+ if (finishedCandidateSet) {
+ cout << "CHECKPOINT: Candidate tree set restored, best LogL: " << iqtree.candidateTrees.getBestScore() << endl;
+ } else {
+ iqtree.saveCheckpoint();
+ iqtree.getCheckpoint()->putBool("finishedCandidateSet", true);
+ iqtree.getCheckpoint()->dump(true);
+ }
if (params.leastSquareNNI) {
iqtree.computeSubtreeDists();
@@ -1859,14 +1935,22 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
cout << "--------------------------------------------------------------------" << endl;
cout << "| FINALIZING TREE SEARCH |" << endl;
cout << "--------------------------------------------------------------------" << endl;
- cout << "Performs final model parameters optimization" << endl;
- string tree;
- if (params.testAlpha)
- tree = iqtree.optimizeModelParameters(true, 0.001);
- else
- tree = iqtree.optimizeModelParameters(true);
- iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
+ if (iqtree.getCheckpoint()->getBool("finishedModelFinal")) {
+ iqtree.setCurScore(iqtree.computeLikelihood());
+ cout << "CHECKPOINT: Final model parameters restored" << endl;
+ } else {
+ cout << "Performs final model parameters optimization" << endl;
+ string tree;
+ if (params.testAlpha)
+ tree = iqtree.optimizeModelParameters(true, 0.001);
+ else
+ tree = iqtree.optimizeModelParameters(true);
+ iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
+ iqtree.getCheckpoint()->putBool("finishedModelFinal", true);
+ iqtree.saveCheckpoint();
+ }
+
}
if (iqtree.isSuperTree())
@@ -2150,41 +2234,58 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
bootaln_name += ".bootaln";
string bootlh_name = params.out_prefix;
bootlh_name += ".bootlh";
- // first empty the boottrees file
- try {
- ofstream tree_out;
- tree_out.exceptions(ios::failbit | ios::badbit);
- tree_out.open(boottrees_name.c_str());
- tree_out.close();
- } catch (ios::failure) {
- outError(ERR_WRITE_OUTPUT, boottrees_name);
- }
-
- // empty the bootaln file
- if (params.print_bootaln)
- try {
- ofstream tree_out;
- tree_out.exceptions(ios::failbit | ios::badbit);
- tree_out.open(bootaln_name.c_str());
- tree_out.close();
- } catch (ios::failure) {
- outError(ERR_WRITE_OUTPUT, bootaln_name);
- }
+ int bootSample = 0;
+ if (tree->getCheckpoint()->get("bootSample", bootSample)) {
+ cout << "CHECKPOINT: " << bootSample << " bootstrap analyses restored" << endl;
+ } else {
+ // first empty the boottrees file
+ try {
+ ofstream tree_out;
+ tree_out.exceptions(ios::failbit | ios::badbit);
+ tree_out.open(boottrees_name.c_str());
+ tree_out.close();
+ } catch (ios::failure) {
+ outError(ERR_WRITE_OUTPUT, boottrees_name);
+ }
+ // empty the bootaln file
+ if (params.print_bootaln)
+ try {
+ ofstream tree_out;
+ tree_out.exceptions(ios::failbit | ios::badbit);
+ tree_out.open(bootaln_name.c_str());
+ tree_out.close();
+ } catch (ios::failure) {
+ outError(ERR_WRITE_OUTPUT, bootaln_name);
+ }
+ }
+
double start_time = getCPUTime();
+
+
// do bootstrap analysis
- for (int sample = 0; sample < params.num_bootstrap_samples; sample++) {
+ for (int sample = bootSample; sample < params.num_bootstrap_samples; sample++) {
cout << endl << "===> START BOOTSTRAP REPLICATE NUMBER "
<< sample + 1 << endl << endl;
+ // 2015-12-17: initialize random stream for creating bootstrap samples
+ // mainly so that checkpointing does not need to save bootstrap samples
+ int *saved_randstream = randstream;
+ init_random(params.ran_seed + sample);
+
Alignment* bootstrap_alignment;
- cout << "Creating bootstrap alignment..." << endl;
+ cout << "Creating bootstrap alignment (seed: " << params.ran_seed+sample << ")..." << endl;
if (alignment->isSuperAlignment())
bootstrap_alignment = new SuperAlignment;
else
bootstrap_alignment = new Alignment;
bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
+
+ // restore randstream
+ finish_random();
+ randstream = saved_randstream;
+
if (params.print_tree_lh) {
double prob;
bootstrap_alignment->multinomialProb(*alignment, prob);
@@ -2207,24 +2308,30 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
boot_tree = new IQTree(bootstrap_alignment);
if (params.print_bootaln)
bootstrap_alignment->printPhylip(bootaln_name.c_str(), true);
+
+ // set checkpoint
+ boot_tree->setCheckpoint(tree->getCheckpoint());
+ boot_tree->num_precision = tree->num_precision;
+
runTreeReconstruction(params, original_model, *boot_tree, *model_info);
// read in the output tree file
- string tree_str;
- try {
- ifstream tree_in;
- tree_in.exceptions(ios::failbit | ios::badbit);
- tree_in.open(treefile_name.c_str());
- tree_in >> tree_str;
- tree_in.close();
- } catch (ios::failure) {
- outError(ERR_READ_INPUT, treefile_name);
- }
+ stringstream ss;
+ boot_tree->printTree(ss);
+// try {
+// ifstream tree_in;
+// tree_in.exceptions(ios::failbit | ios::badbit);
+// tree_in.open(treefile_name.c_str());
+// tree_in >> tree_str;
+// tree_in.close();
+// } catch (ios::failure) {
+// outError(ERR_READ_INPUT, treefile_name);
+// }
// write the tree into .boottrees file
try {
ofstream tree_out;
tree_out.exceptions(ios::failbit | ios::badbit);
tree_out.open(boottrees_name.c_str(), ios_base::out | ios_base::app);
- tree_out << tree_str << endl;
+ tree_out << ss.str() << endl;
tree_out.close();
} catch (ios::failure) {
outError(ERR_WRITE_OUTPUT, boottrees_name);
@@ -2243,8 +2350,20 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
delete boot_tree;
// fix bug: bootstrap_alignment might be changed
delete bootstrap_alignment;
+
+ // clear all checkpointed information
+ Checkpoint *newCheckpoint = new Checkpoint;
+ tree->getCheckpoint()->getSubCheckpoint(newCheckpoint, "iqtree");
+ tree->getCheckpoint()->clear();
+ tree->getCheckpoint()->insert(newCheckpoint->begin(), newCheckpoint->end());
+ tree->getCheckpoint()->put("bootSample", sample+1);
+ tree->getCheckpoint()->putBool("finished", false);
+ tree->getCheckpoint()->dump(true);
+ delete newCheckpoint;
+
}
+
if (params.consensus_type == CT_CONSENSUS_TREE) {
cout << endl << "===> COMPUTE CONSENSUS TREE FROM "
@@ -2331,9 +2450,12 @@ void convertAlignment(Params &params, IQTree *iqtree) {
/**********************************************************
* TOP-LEVEL FUNCTION
***********************************************************/
-void runPhyloAnalysis(Params &params) {
+void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
Alignment *alignment;
IQTree *tree;
+
+ checkpoint->putBool("finished", false);
+ checkpoint->setDumpInterval(params.checkpoint_dump_interval);
/****************** read in alignment **********************/
if (params.partition_file) {
@@ -2362,6 +2484,7 @@ void runPhyloAnalysis(Params &params) {
tree = new IQTree(alignment);
}
+ tree->setCheckpoint(checkpoint);
if (params.min_branch_length <= 0.0) {
params.min_branch_length = 1e-6;
if (tree->getAlnNSite() >= 100000) {
@@ -2440,33 +2563,16 @@ void runPhyloAnalysis(Params &params) {
splitsfile += ".contree";
tree->readTreeFile(splitsfile);
// bug fix
- if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
- cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
- if (tree->sse == LK_EIGEN)
- tree->changeLikelihoodKernel(LK_NORMAL);
- else
- tree->changeLikelihoodKernel(LK_SSE);
- }
+// if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
+// cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
+// if (tree->sse == LK_EIGEN)
+// tree->changeLikelihoodKernel(LK_NORMAL);
+// else
+// tree->changeLikelihoodKernel(LK_SSE);
+// }
tree->initializeAllPartialLh();
tree->fixNegativeBranch(true);
-// if (tree->isSuperTree()) {
-// if (params.partition_type == 0) {
-// PhyloSuperTree *stree = (PhyloSuperTree*) tree;
-// tree->clearAllPartialLH();
-// // full partition model
-// for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++) {
-// (*it)->fixNegativeBranch(true);
-// }
-// tree->clearAllPartialLH();
-// } else {
-// // joint/prop. partition model
-// tree->assignRandomBranchLengths(true);
-// ((PhyloSuperTree*)tree)->mapTrees();
-// }
-// } else {
-// tree->fixNegativeBranch(true);
-// }
tree->boot_consense_logl = tree->optimizeAllBranches();
cout << "Log-likelihood of consensus tree: " << tree->boot_consense_logl << endl;
@@ -2475,16 +2581,9 @@ void runPhyloAnalysis(Params &params) {
tree->printTree(splitsfile.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
// revert the best tree
tree->readTreeString(current_tree);
-// if (tree->isSuperTree()) {
-// tree->optimizeAllBranches();
-// ((PhyloSuperTree*)tree)->computeBranchLengths();
-// }
}
// reinsert identical sequences
if (tree->removed_seqs.size() > 0) {
- // BUG HERE!
-// delete tree->aln;
-// tree->reinsertIdenticalSeqs(alignment);
// BUG FIX: dont use reinsertIdenticalSeqs anymore
tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
tree->printResultTree();
@@ -2495,6 +2594,8 @@ void runPhyloAnalysis(Params &params) {
// the classical non-parameter bootstrap (SBS)
if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos)
outError("-m TESTMERGE is not allowed when doing standard bootstrap. Please first\nfind partition scheme on the original alignment and use it for bootstrap analysis");
+ if (alignment->getNSeq() < 4)
+ outError("It makes no sense to perform bootstrap with less than 4 sequences.");
runStandardBootstrap(params, original_model, alignment, tree);
}
@@ -2514,6 +2615,9 @@ void runPhyloAnalysis(Params &params) {
// 2015-09-22: THIS IS STUPID: after deleting tree, one cannot access tree->aln anymore
// alignment = tree->aln;
delete alignment;
+
+ checkpoint->putBool("finished", true);
+ checkpoint->dump(true);
}
void assignBranchSupportNew(Params &params) {
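
runStandardBootstrap() above now seeds a private random stream with ran_seed + sample before drawing each bootstrap alignment, so a resumed run can skip the first bootSample replicates and still regenerate identical samples without storing them in the checkpoint. A small sketch of why that works; std::mt19937 here is only a stand-in for IQ-TREE's own random stream:

    #include <cstdio>
    #include <random>
    #include <vector>

    // Resample site indices with replacement; seeding with base_seed + sample
    // makes replicate `sample` reproducible in isolation.
    std::vector<int> bootstrapSites(int nsite, unsigned base_seed, int sample) {
        std::mt19937 rng(base_seed + sample);
        std::uniform_int_distribution<int> pick(0, nsite - 1);
        std::vector<int> sites(nsite);
        for (int &s : sites)
            s = pick(rng);
        return sites;
    }

    int main() {
        std::vector<int> first  = bootstrapSites(8, 42, 3);
        std::vector<int> second = bootstrapSites(8, 42, 3);   // e.g. after a restart
        std::printf("replicate 3 reproducible: %s\n", first == second ? "yes" : "no");
        return 0;
    }
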
diff --git a/phyloanalysis.h b/phyloanalysis.h
index 069f17f..afcfede 100644
--- a/phyloanalysis.h
+++ b/phyloanalysis.h
@@ -34,7 +34,7 @@ class IQTree;
main function to carry out phylogenetic inference
@param params program parameters
*/
-void runPhyloAnalysis(Params &params);
+void runPhyloAnalysis(Params &params, Checkpoint *checkpoint);
void runTreeReconstruction(Params ¶ms, string &original_model,
IQTree &tree, vector<ModelInfo> &model_info);
diff --git a/phylokernel.h b/phylokernel.h
index 9b2799c..678393e 100644
--- a/phylokernel.h
+++ b/phylokernel.h
@@ -69,6 +69,13 @@ Numeric PhyloTree::dotProductSIMD(Numeric *x, Numeric *y, int size) {
template <class VectorClass, const int VCSIZE, const int nstates>
void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+
+ if (dad_branch->node->degree() > 3) {
+ // TODO: SIMD version for multifurcating node
+ computePartialLikelihoodEigen(dad_branch, dad);
+ return;
+ }
+
// don't recompute the likelihood
assert(dad);
if (dad_branch->partial_lh_computed & 1)
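
The new phylokernelsitemodel.cpp below evaluates partial likelihoods for the site-specific frequency model, pulling a separate eigen-decomposition (eval, evec, inv_evec) from the ModelSet entry of every pattern. The decomposition enters in the usual way, P(t) = V diag(exp(eval*t)) V^-1; the self-contained check below uses a 2-state symmetric toy model and is not the kernel's actual indexing:

    #include <cmath>
    #include <cstdio>

    int main() {
        // 2-state symmetric rate matrix Q = [[-1,1],[1,-1]]:
        // eigenvalues 0 and -2, eigenvector matrix V and its inverse below.
        const int n = 2;
        double eval[n]        = {0.0, -2.0};
        double evec[n][n]     = {{1.0, 1.0}, {1.0, -1.0}};   // columns = eigenvectors
        double inv_evec[n][n] = {{0.5, 0.5}, {0.5, -0.5}};
        double t = 0.3;

        double P[n][n] = {{0.0}};
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
                for (int k = 0; k < n; k++)
                    P[i][j] += evec[i][k] * std::exp(eval[k] * t) * inv_evec[k][j];

        // closed form for this model: P00(t) = (1 + exp(-2t)) / 2
        std::printf("P00 = %.6f, closed form = %.6f\n",
                    P[0][0], 0.5 * (1.0 + std::exp(-2.0 * t)));
        return 0;
    }
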
diff --git a/phylokernelsitemodel.cpp b/phylokernelsitemodel.cpp
new file mode 100644
index 0000000..a79c178
--- /dev/null
+++ b/phylokernelsitemodel.cpp
@@ -0,0 +1,715 @@
+/*
+ * phylokernelsitemodel.cpp
+ * likelihood kernel site-specific frequency model
+ *
+ * Created on: Jan 9, 2016
+ * Author: minh
+ */
+
+
+
+#include "phylotree.h"
+#include "model/modelset.h"
+
+void PhyloTree::computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+
+ // don't recompute the likelihood
+ assert(dad);
+ if (dad_branch->partial_lh_computed & 1)
+ return;
+ dad_branch->partial_lh_computed |= 1;
+ PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+
+ size_t nstates = aln->num_states;
+ size_t nptn = aln->size(), tip_block_size = get_safe_upper_limit(nptn)*nstates;
+ size_t ptn, c;
+ size_t ncat = site_rate->getNRate();
+ size_t i, x;
+ size_t block = nstates * ncat;
+ ModelSet *models = (ModelSet*) model;
+ assert(models->size() == nptn);
+
+
+ if (node->isLeaf()) {
+ dad_branch->lh_scale_factor = 0.0;
+ // scale number must be ZERO
+// memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
+ return;
+ }
+
+ dad_branch->lh_scale_factor = 0.0;
+
+ // internal node
+ PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *nei = (PhyloNeighbor*)*it;
+ if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+ if ((nei->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigen(nei, node);
+ dad_branch->lh_scale_factor += nei->lh_scale_factor;
+ }
+
+ if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+ // re-orient partial_lh
+ bool done = false;
+ FOR_NEIGHBOR_IT(node, dad, it2) {
+ PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+ if (backnei->partial_lh) {
+ dad_branch->partial_lh = backnei->partial_lh;
+ dad_branch->scale_num = backnei->scale_num;
+ backnei->partial_lh = NULL;
+ backnei->scale_num = NULL;
+ backnei->partial_lh_computed &= ~1; // clear bit
+ done = true;
+ break;
+ }
+ }
+ assert(done && "partial_lh is not re-oriented");
+ }
+
+
+ double sum_scale = 0.0;
+
+ if (!left->node->isLeaf() && right->node->isLeaf()) {
+ PhyloNeighbor *tmp = left;
+ left = right;
+ right = tmp;
+ }
+
+ assert(node->degree() == 3); // does not work with multifurcating tree yet
+
+// if (node->degree() > 3) {
+//
+// /*--------------------- multifurcating node ------------------*/
+//
+// // now for-loop computing partial_lh over all site-patterns
+//#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+//#endif
+// for (ptn = 0; ptn < nptn; ptn++) {
+// double partial_lh_all[block];
+// for (i = 0; i < block; i++)
+// partial_lh_all[i] = 1.0;
+// dad_branch->scale_num[ptn] = 0;
+//
+// double *partial_lh_leaf = partial_lh_leaves;
+// double *echild = echildren;
+//
+// FOR_NEIGHBOR_IT(node, dad, it) {
+// PhyloNeighbor *child = (PhyloNeighbor*)*it;
+// if (child->node->isLeaf()) {
+// // external node
+// int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+// double *child_lh = partial_lh_leaf + state_child*block;
+// for (c = 0; c < block; c++) {
+// // compute real partial likelihood vector
+// partial_lh_all[c] *= child_lh[c];
+// }
+// partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+// } else {
+// // internal node
+// double *partial_lh = partial_lh_all;
+// double *partial_lh_child = child->partial_lh + ptn*block;
+// dad_branch->scale_num[ptn] += child->scale_num[ptn];
+//
+// double *echild_ptr = echild;
+// for (c = 0; c < ncat; c++) {
+// // compute real partial likelihood vector
+// for (x = 0; x < nstates; x++) {
+// double vchild = 0.0;
+//// double *echild_ptr = echild + (c*nstatesqr+x*nstates);
+// for (i = 0; i < nstates; i++) {
+// vchild += echild_ptr[i] * partial_lh_child[i];
+// }
+// echild_ptr += nstates;
+// partial_lh[x] *= vchild;
+// }
+// partial_lh += nstates;
+// partial_lh_child += nstates;
+// }
+// } // if
+// echild += block*nstates;
+// } // FOR_NEIGHBOR
+//
+//
+// // compute dot-product with inv_eigenvector
+// double lh_max = 0.0;
+// double *partial_lh_tmp = partial_lh_all;
+// double *partial_lh = dad_branch->partial_lh + ptn*block;
+// for (c = 0; c < ncat; c++) {
+// double *inv_evec_ptr = inv_evec;
+// for (i = 0; i < nstates; i++) {
+// double res = 0.0;
+// for (x = 0; x < nstates; x++) {
+// res += partial_lh_tmp[x]*inv_evec_ptr[x];
+// }
+// inv_evec_ptr += nstates;
+// partial_lh[i] = res;
+// lh_max = max(lh_max, fabs(res));
+// }
+// partial_lh += nstates;
+// partial_lh_tmp += nstates;
+// }
+// // check if one should scale partial likelihoods
+// if (lh_max < SCALING_THRESHOLD) {
+// partial_lh = dad_branch->partial_lh + ptn*block;
+// if (lh_max == 0.0) {
+// // for very shitty data
+// for (c = 0; c < ncat; c++)
+// memcpy(&partial_lh[c*nstates], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+// sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+// //sum_scale += log(lh_max) * ptn_freq[ptn];
+// dad_branch->scale_num[ptn] += 4;
+// int nsite = aln->getNSite();
+// for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+// if (aln->getPatternID(i) == ptn) {
+// outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+// x++;
+// }
+// } else {
+// // now do the likelihood scaling
+// for (i = 0; i < block; i++) {
+// partial_lh[i] *= SCALING_THRESHOLD_INVER;
+// //partial_lh[i] /= lh_max;
+// }
+// // unobserved const pattern will never have underflow
+// sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+// //sum_scale += log(lh_max) * ptn_freq[ptn];
+// dad_branch->scale_num[ptn] += 1;
+// }
+// }
+//
+// } // for ptn
+// dad_branch->lh_scale_factor += sum_scale;
+//
+// // end multifurcating treatment
+// } else
+ if (left->node->isLeaf() && right->node->isLeaf()) {
+
+ /*--------------------- TIP-TIP (cherry) case ------------------*/
+
+ double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
+ double *tip_partial_lh_right = tip_partial_lh + (right->node->id * tip_block_size);
+
+ // scale number must be ZERO
+ memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_tmp[nstates];
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_left = tip_partial_lh_left + ptn*nstates;
+ double *partial_lh_right = tip_partial_lh_right + ptn*nstates;
+
+ double expleft[nstates];
+ double expright[nstates];
+ double *eval = models->at(ptn)->getEigenvalues();
+ double *evec = models->at(ptn)->getEigenvectors();
+ double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+ for (c = 0; c < ncat; c++) {
+ double len_left = site_rate->getRate(c) * left->length;
+ double len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vleft = 0.0, vright = 0.0;
+ double *this_evec = evec + x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vleft += this_evec[i] * expleft[i];
+ vright += this_evec[i] * expright[i];
+ }
+ partial_lh_tmp[x] = vleft*vright;
+ }
+
+ // do not increase partial_lh_left and right for tips
+
+ // compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
+ for (i = 0; i < nstates; i++) {
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[c*nstates+i] = res;
+ }
+ }
+
+ }
+
+ } else if (left->node->isLeaf() && !right->node->isLeaf()) {
+
+ /*--------------------- TIP-INTERNAL NODE case ------------------*/
+
+ double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
+
+ // only take scale_num from the right subtree
+ memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_tmp[nstates];
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_left = tip_partial_lh_left + ptn*nstates;
+ double *partial_lh_right = right->partial_lh + ptn*block;
+ double lh_max = 0.0;
+
+ double expleft[nstates];
+ double expright[nstates];
+ double *eval = models->at(ptn)->getEigenvalues();
+ double *evec = models->at(ptn)->getEigenvectors();
+ double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+ for (c = 0; c < ncat; c++) {
+ double len_left = site_rate->getRate(c) * left->length;
+ double len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vleft = 0.0, vright = 0.0;
+ double *this_evec = evec + x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vleft += this_evec[i] * expleft[i];
+ vright += this_evec[i] * expright[i];
+ }
+ partial_lh_tmp[x] = vleft*vright;
+ }
+ // do not increase partial_lh_left for left tip
+ partial_lh_right += nstates;
+
+ // compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
+ for (i = 0; i < nstates; i++) {
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[c*nstates+i] = res;
+ lh_max = max(lh_max, fabs(res));
+ }
+ }
+
+ // check if one should scale partial likelihoods
+ if (lh_max < SCALING_THRESHOLD) {
+ if (lh_max == 0.0) {
+ // degenerate case: all partial likelihoods underflowed to zero
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
+
+ }
+ dad_branch->lh_scale_factor += sum_scale;
+
+ } else {
+ /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_tmp[nstates];
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_left = left->partial_lh + ptn*block;
+ double *partial_lh_right = right->partial_lh + ptn*block;
+ double lh_max = 0.0;
+
+ double expleft[nstates];
+ double expright[nstates];
+ double *eval = models->at(ptn)->getEigenvalues();
+ double *evec = models->at(ptn)->getEigenvectors();
+ double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+ dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+
+ for (c = 0; c < ncat; c++) {
+ double len_left = site_rate->getRate(c) * left->length;
+ double len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vleft = 0.0, vright = 0.0;
+ double *this_evec = evec + x*nstates;
+ for (i = 0; i < nstates; i++) {
+ vleft += this_evec[i] * expleft[i];
+ vright += this_evec[i] * expright[i];
+ }
+ partial_lh_tmp[x] = vleft*vright;
+ }
+ partial_lh_left += nstates;
+ partial_lh_right += nstates;
+
+ // compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
+ for (i = 0; i < nstates; i++) {
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[c*nstates+i] = res;
+ lh_max = max(lh_max, fabs(res));
+ }
+ }
+
+ // check if one should scale partial likelihoods
+ if (lh_max < SCALING_THRESHOLD) {
+ if (lh_max == 0.0) {
+ // degenerate case: all partial likelihoods underflowed to zero
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
+
+ }
+ dad_branch->lh_scale_factor += sum_scale;
+
+ }
+
+}
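The per-pattern rescaling above (multiplying the partial likelihoods by SCALING_THRESHOLD_INVER and incrementing scale_num whenever lh_max falls below SCALING_THRESHOLD) only changes the stored representation; the discarded factor is recovered later through lh_scale_factor. A minimal sketch of that bookkeeping, using an illustrative placeholder for the threshold (the real constants are defined elsewhere in the tree code):

    #include <cmath>

    // placeholder constants, for illustration only
    static const double SCALING_THRESHOLD_SKETCH = 1e-100;
    static const double LOG_SCALING_THRESHOLD_SKETCH = log(SCALING_THRESHOLD_SKETCH);

    // after scale_count rescalings the stored pattern likelihood relates to the true one by
    // true = scaled * SCALING_THRESHOLD^scale_count, i.e. in log space:
    double truePatternLogLh(double scaled_lh, int scale_count) {
        return log(scaled_lh) + scale_count * LOG_SCALING_THRESHOLD_SKETCH;
    }

Each rescaling event adds LOG_SCALING_THRESHOLD * ptn_freq[ptn] to sum_scale, so lh_scale_factor carries the total log correction that the branch-likelihood routine below adds back into tree_lh.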
+
+//template <const int nstates>
+void PhyloTree::computeSitemodelLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+ PhyloNode *node = (PhyloNode*) dad_branch->node;
+ PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+ if (!central_partial_lh)
+ initializeAllPartialLh();
+ if (node->isLeaf()) {
+ PhyloNode *tmp_node = dad;
+ dad = node;
+ node = tmp_node;
+ PhyloNeighbor *tmp_nei = dad_branch;
+ dad_branch = node_branch;
+ node_branch = tmp_nei;
+ }
+ if ((dad_branch->partial_lh_computed & 1) == 0)
+ computePartialLikelihood(dad_branch, dad);
+ if ((node_branch->partial_lh_computed & 1) == 0)
+ computePartialLikelihood(node_branch, node);
+
+ size_t nstates = aln->num_states;
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i;
+ size_t nptn = aln->size();
+
+ assert(theta_all);
+ if (!theta_computed) {
+ // precompute theta for fast branch length optimization
+
+ if (dad->isLeaf()) {
+ // special treatment for TIP-INTERNAL NODE case
+
+ double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ double *theta = theta_all + ptn*block;
+ double *lh_tip = tip_partial_lh_node + ptn*nstates;
+ for (c = 0; c < ncat; c++) {
+ for (i = 0; i < nstates; i++) {
+ theta[i] = lh_tip[i] * partial_lh_dad[i];
+ }
+ partial_lh_dad += nstates;
+ theta += nstates;
+ }
+
+ }
+ } else
+ {
+ // both dad and node are internal nodes
+
+// size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double *theta = theta_all + ptn*block;
+ double *partial_lh_node = node_branch->partial_lh + ptn*block;
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ for (i = 0; i < block; i++) {
+ theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+ }
+ }
+ }
+ theta_computed = true;
+ }
+
+ ModelSet *models = (ModelSet*)model;
+ double my_df = 0.0, my_ddf = 0.0;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf) private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0;
+ double *theta = theta_all + ptn*block;
+
+ double *eval = models->at(ptn)->getEigenvalues();
+
+ for (c = 0; c < ncat; c++) {
+ double lh_cat = 0.0, df_cat = 0.0, ddf_cat = 0.0;
+ for (i = 0; i < nstates; i++) {
+ double cof = eval[i]*site_rate->getRate(c);
+ double val = exp(cof*dad_branch->length) * theta[i];
+ double val1 = cof*val;
+ lh_cat += val;
+ df_cat += val1;
+ ddf_cat += cof*val1;
+ }
+ double prop = site_rate->getProp(c);
+ lh_ptn += prop * lh_cat;
+ df_ptn += prop * df_cat;
+ ddf_ptn += prop * ddf_cat;
+ theta += nstates;
+ }
+
+ lh_ptn = 1.0/fabs(lh_ptn);
+
+ double df_frac = df_ptn * lh_ptn;
+ double ddf_frac = ddf_ptn * lh_ptn;
+ double freq = ptn_freq[ptn];
+ double tmp1 = df_frac * freq;
+ double tmp2 = ddf_frac * freq;
+ my_df += tmp1;
+ my_ddf += tmp2 - tmp1 * df_frac;
+ }
+ df = my_df;
+ ddf = my_ddf;
+ if (isnan(df) || isinf(df)) {
+ df = 0.0;
+ ddf = 0.0;
+// outWarning("Numerical instability (some site-likelihood = 0)");
+ }
+
+}
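The derivative kernel above returns df = d logL/dt and ddf = d^2 logL/dt^2 for the current branch length t. One way a caller could use them, shown here only as a hedged sketch (this is not IQ-TREE's actual optimizer), is a guarded Newton-Raphson step:

    // illustrative helper: one Newton-Raphson update of a branch length
    double newtonBranchStep(double t, double df, double ddf, double min_len = 1e-6) {
        if (ddf >= 0.0)
            return t;                        // not near a maximum; fall back to a line search
        double t_new = t - df / ddf;         // t_{k+1} = t_k - f'(t_k) / f''(t_k)
        return t_new > min_len ? t_new : min_len;   // keep the branch length positive
    }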
+
+//template <const int nstates>
+double PhyloTree::computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ PhyloNode *node = (PhyloNode*) dad_branch->node;
+ PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+ if (!central_partial_lh)
+ initializeAllPartialLh();
+ if (node->isLeaf()) {
+ PhyloNode *tmp_node = dad;
+ dad = node;
+ node = tmp_node;
+ PhyloNeighbor *tmp_nei = dad_branch;
+ dad_branch = node_branch;
+ node_branch = tmp_nei;
+ }
+ if ((dad_branch->partial_lh_computed & 1) == 0)
+// computePartialLikelihoodEigen(dad_branch, dad);
+ computePartialLikelihood(dad_branch, dad);
+ if ((node_branch->partial_lh_computed & 1) == 0)
+// computePartialLikelihoodEigen(node_branch, node);
+ computePartialLikelihood(node_branch, node);
+ double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+ size_t nstates = aln->num_states;
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i;
+ size_t nptn = aln->size();
+
+
+ memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double));
+ ModelSet *models = (ModelSet*)model;
+
+ if (dad->isLeaf()) {
+ // special treatment for TIP-INTERNAL NODE case
+ double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh) private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double lh_ptn = ptn_invar[ptn];
+ double *lh_cat = _pattern_lh_cat + ptn*ncat;
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_node = tip_partial_lh_node + ptn*nstates;
+ double *eval = models->at(ptn)->getEigenvalues();
+
+ for (c = 0; c < ncat; c++) {
+ double len = site_rate->getRate(c)*dad_branch->length;
+ double prop = site_rate->getProp(c);
+ for (i = 0; i < nstates; i++) {
+ *lh_cat += exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i];
+ }
+ *lh_cat *= prop;
+ lh_ptn += *lh_cat;
+ // don't increase partial_lh_node pointer
+ partial_lh_dad += nstates;
+ lh_cat++;
+ }
+
+ lh_ptn = log(fabs(lh_ptn));
+ _pattern_lh[ptn] = lh_ptn;
+ tree_lh += lh_ptn * ptn_freq[ptn];
+ }
+ } else
+ {
+ // both dad and node are internal nodes
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh) private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double lh_ptn = ptn_invar[ptn];
+ double *lh_cat = _pattern_lh_cat + ptn*ncat;
+ double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+ double *partial_lh_node = node_branch->partial_lh + ptn*block;
+ double *eval = models->at(ptn)->getEigenvalues();
+
+ for (c = 0; c < ncat; c++) {
+ double len = site_rate->getRate(c)*dad_branch->length;
+ double prop = site_rate->getProp(c);
+ for (i = 0; i < nstates; i++) {
+ *lh_cat += exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i];
+ }
+ *lh_cat *= prop;
+ lh_ptn += *lh_cat;
+ partial_lh_node += nstates;
+ partial_lh_dad += nstates;
+ lh_cat++;
+ }
+
+ lh_ptn = log(fabs(lh_ptn));
+ _pattern_lh[ptn] = lh_ptn;
+ tree_lh += lh_ptn * ptn_freq[ptn];
+ }
+ }
+
+ if (isnan(tree_lh) || isinf(tree_lh)) {
+ cout << "WARNING: Numerical underflow caused by alignment sites";
+ i = aln->getNSite();
+ int j;
+ for (j = 0, c = 0; j < i; j++) {
+ ptn = aln->getPatternID(j);
+ if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+ cout << " " << j+1;
+ c++;
+ if (c >= 10) {
+ cout << " ...";
+ break;
+ }
+ }
+ }
+ cout << endl;
+ tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+ for (ptn = 0; ptn < nptn; ptn++) {
+ if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+ _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+ }
+ tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+ }
+ }
+
+ assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+ return tree_lh;
+}
+
+
+double PhyloTree::computeSitemodelLikelihoodFromBufferEigen() {
+ assert(theta_all && theta_computed);
+
+ size_t nstates = aln->num_states;
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i;
+ size_t nptn = aln->size();
+
+ ModelSet *models = (ModelSet*)model;
+
+ double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh) private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double lh_ptn = ptn_invar[ptn];
+ double *theta = theta_all + ptn*block;
+
+ double *eval = models->at(ptn)->getEigenvalues();
+
+ for (c = 0; c < ncat; c++) {
+ double lh_cat = 0.0;
+ double len = site_rate->getRate(c)*current_it->length;
+ for (i = 0; i < nstates; i++) {
+ lh_cat += exp(eval[i]*len) * theta[i];
+ }
+ lh_ptn += lh_cat * site_rate->getProp(c);
+ theta += nstates;
+ }
+
+ lh_ptn = log(fabs(lh_ptn));
+ _pattern_lh[ptn] = lh_ptn;
+ tree_lh += lh_ptn * ptn_freq[ptn];
+ }
+ return tree_lh;
+}
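The buffer-based kernel above is cheap because theta[i] = partial_lh_dad[i] * partial_lh_node[i] does not depend on the branch length: once theta_all is filled, evaluating a new candidate length only repeats the exponential weighting. A minimal sketch of that inner step for one pattern and one rate category (scaling and the ptn_invar term omitted):

    #include <cmath>

    // L(t) = sum_i exp(eval_i * rate * t) * theta_i, with theta_i cached beforehand
    double patternLhFromTheta(const double *theta, const double *eval,
                              int nstates, double rate, double t) {
        double lh = 0.0;
        for (int i = 0; i < nstates; i++)
            lh += exp(eval[i] * rate * t) * theta[i];
        return lh;
    }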
+
diff --git a/phylokernelsitemodel.h b/phylokernelsitemodel.h
new file mode 100644
index 0000000..2bf4460
--- /dev/null
+++ b/phylokernelsitemodel.h
@@ -0,0 +1,801 @@
+/*
+ * phylokernelsitemodel.h
+ * optimized SIMD likelihood functions for site-specific models
+ * Created on: Jan 12, 2016
+ * Author: minh
+ */
+
+#ifndef PHYLOKERNELSITEMODEL_H_
+#define PHYLOKERNELSITEMODEL_H_
+
+#include "phylotree.h"
+#include "model/modelset.h"
+
+inline double horizontal_add(double x) {
+ return x;
+}
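The scalar overload above is simply the identity, apparently so that the same expressions still compile when the templates are instantiated with a plain double instead of a SIMD type. The kernels below otherwise rely on one idiom from Agner Fog's vectorclass library: accumulate with mul_add (fused multiply-add) and collapse the lanes with horizontal_add. A minimal sketch with Vec4d (VCSIZE = 4), assuming n is a multiple of 4 and both arrays are suitably aligned:

    #include "vectorclass/vectorclass.h"

    // dot product of two aligned double arrays, four lanes at a time
    double dot_vec4d(const double *a, const double *b, int n) {
        Vec4d acc(0.0);
        for (int i = 0; i < n; i += 4) {
            Vec4d va, vb;
            va.load_a(a + i);              // aligned load, as used in the kernels below
            vb.load_a(b + i);
            acc = mul_add(va, vb, acc);    // acc += va * vb, one FMA where available
        }
        return horizontal_add(acc);        // sum the four lanes
    }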
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+
+ // don't recompute the likelihood
+ assert(dad);
+ if (dad_branch->partial_lh_computed & 1)
+ return;
+ dad_branch->partial_lh_computed |= 1;
+ PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+ size_t nptn = aln->size(), tip_block_size = get_safe_upper_limit(nptn)*nstates;
+ size_t ptn, c;
+ size_t ncat = site_rate->getNRate();
+ size_t i, x, j;
+ size_t block = nstates * ncat;
+ ModelSet *models = (ModelSet*) model;
+ assert(models->size() == nptn);
+
+
+ if (node->isLeaf()) {
+ dad_branch->lh_scale_factor = 0.0;
+ // scale number must be ZERO
+// memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+ if (!tip_partial_lh_computed)
+ computeTipPartialLikelihood();
+
+ return;
+ }
+
+ dad_branch->lh_scale_factor = 0.0;
+
+ // internal node
+ PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *nei = (PhyloNeighbor*)*it;
+ if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+ if ((nei->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(nei, node);
+ dad_branch->lh_scale_factor += nei->lh_scale_factor;
+ }
+
+ if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+ // re-orient partial_lh
+ bool done = false;
+ FOR_NEIGHBOR_IT(node, dad, it2) {
+ PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+ if (backnei->partial_lh) {
+ dad_branch->partial_lh = backnei->partial_lh;
+ dad_branch->scale_num = backnei->scale_num;
+ backnei->partial_lh = NULL;
+ backnei->scale_num = NULL;
+ backnei->partial_lh_computed &= ~1; // clear bit
+ done = true;
+ break;
+ }
+ }
+ assert(done && "partial_lh is not re-oriented");
+ }
+
+
+ double sum_scale = 0.0;
+
+ if (!left->node->isLeaf() && right->node->isLeaf()) {
+ PhyloNeighbor *tmp = left;
+ left = right;
+ right = tmp;
+ }
+
+ assert(node->degree() == 3); // does not work with multifurcating trees yet
+
+ if (left->node->isLeaf() && right->node->isLeaf()) {
+
+ /*--------------------- TIP-TIP (cherry) case ------------------*/
+
+ double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
+ double *tip_partial_lh_right = tip_partial_lh + (right->node->id * tip_block_size);
+
+ // scale number must be ZERO
+ memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i, j) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ VectorClass partial_lh_tmp[nstates/VCSIZE];
+ VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
+ VectorClass *partial_lh_right = (VectorClass*)(tip_partial_lh_right + ptn*nstates);
+
+ VectorClass expleft[nstates/VCSIZE];
+ VectorClass expright[nstates/VCSIZE];
+ VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
+ VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
+ VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
+ VectorClass vleft[VCSIZE];
+ VectorClass vright[VCSIZE];
+ VectorClass res[VCSIZE];
+ VectorClass len_left, len_right;
+ VectorClass *this_evec;
+
+ for (c = 0; c < ncat; c++) {
+ len_left = site_rate->getRate(c) * left->length;
+ len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+ // compute real partial likelihood vector
+ this_evec = evec;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ for (j = 0; j < VCSIZE; j++) {
+ vleft[j] = 0.0;
+ vright[j] = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
+ vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
+ }
+
+ // compute dot-product with inv_eigenvector
+ this_evec = inv_evec;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ for (j = 0; j < VCSIZE; j++) {
+ res[j] = 0.0;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ partial_lh[i] = horizontal_add(res);
+ }
+
+// partial_lh_left += nstates/VCSIZE;
+// partial_lh_right += nstates/VCSIZE;
+ partial_lh += nstates/VCSIZE;
+ }
+ }
+
+ } else if (left->node->isLeaf() && !right->node->isLeaf()) {
+
+ /*--------------------- TIP-INTERNAL NODE case ------------------*/
+
+ double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
+
+ // only take scale_num from the right subtree
+ memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ VectorClass partial_lh_tmp[nstates/VCSIZE];
+ VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
+ VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
+ VectorClass lh_max = 0.0;
+
+ VectorClass expleft[nstates/VCSIZE];
+ VectorClass expright[nstates/VCSIZE];
+ VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
+ VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
+ VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
+ VectorClass vleft[VCSIZE];
+ VectorClass vright[VCSIZE];
+ VectorClass res[VCSIZE];
+ VectorClass len_left, len_right;
+ VectorClass *this_evec;
+
+ for (c = 0; c < ncat; c++) {
+ len_left = site_rate->getRate(c) * left->length;
+ len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+ // compute real partial likelihood vector
+ this_evec = evec;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ for (j = 0; j < VCSIZE; j++) {
+ vleft[j] = 0.0;
+ vright[j] = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
+ vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
+ }
+
+ // compute dot-product with inv_eigenvector
+ this_evec = inv_evec;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ for (j = 0; j < VCSIZE; j++) {
+ res[j] = 0.0;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
+ }
+
+// partial_lh_left += nstates/VCSIZE;
+ partial_lh_right += nstates/VCSIZE;
+ partial_lh += nstates/VCSIZE;
+ }
+
+ // check if one should scale partial likelihoods
+ double dmax = horizontal_max(lh_max);
+ if (dmax < SCALING_THRESHOLD) {
+ partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ if (dmax == 0.0) {
+ // degenerate case: all partial likelihoods underflowed to zero
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block/VCSIZE; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
+
+ }
+ dad_branch->lh_scale_factor += sum_scale;
+
+
+ } else {
+ /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ VectorClass partial_lh_tmp[nstates/VCSIZE];
+ VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block);
+ VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
+ VectorClass lh_max = 0.0;
+
+ VectorClass expleft[nstates/VCSIZE];
+ VectorClass expright[nstates/VCSIZE];
+ VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
+ VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
+ VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
+ VectorClass vleft[VCSIZE];
+ VectorClass vright[VCSIZE];
+ VectorClass res[VCSIZE];
+ VectorClass len_left, len_right;
+ VectorClass *this_evec;
+
+ dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+
+ for (c = 0; c < ncat; c++) {
+ len_left = site_rate->getRate(c) * left->length;
+ len_right = site_rate->getRate(c) * right->length;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
+ expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
+ }
+ // compute real partial likelihood vector
+ this_evec = evec;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ for (j = 0; j < VCSIZE; j++) {
+ vleft[j] = 0.0;
+ vright[j] = 0.0;
+// this_evec = evec + (x*VCSIZE+j)*nstates/VCSIZE;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
+ vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
+ }
+
+ // compute dot-product with inv_eigenvector
+ this_evec = inv_evec;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ for (j = 0; j < VCSIZE; j++) {
+ res[j] = 0.0;
+ for (x = 0; x < nstates/VCSIZE; x++) {
+ res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
+ }
+ this_evec += nstates/VCSIZE;
+ }
+ lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
+ }
+
+ partial_lh_left += nstates/VCSIZE;
+ partial_lh_right += nstates/VCSIZE;
+ partial_lh += nstates/VCSIZE;
+ }
+
+ // check if one should scale partial likelihoods
+ double dmax = horizontal_max(lh_max);
+ if (dmax < SCALING_THRESHOLD) {
+ partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ if (dmax == 0.0) {
+ // for very shitty data
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block/VCSIZE; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
+
+ }
+ dad_branch->lh_scale_factor += sum_scale;
+
+ }
+
+}
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+ PhyloNode *node = (PhyloNode*) dad_branch->node;
+ PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+ if (!central_partial_lh)
+ initializeAllPartialLh();
+ if (node->isLeaf()) {
+ PhyloNode *tmp_node = dad;
+ dad = node;
+ node = tmp_node;
+ PhyloNeighbor *tmp_nei = dad_branch;
+ dad_branch = node_branch;
+ node_branch = tmp_nei;
+ }
+ if ((dad_branch->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(dad_branch, dad);
+ if ((node_branch->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(node_branch, node);
+
+// size_t nstates = aln->num_states;
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i, j;
+ size_t nptn = aln->size();
+ size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+
+ assert(theta_all);
+ if (!theta_computed) {
+ // precompute theta for fast branch length optimization
+
+ if (dad->isLeaf()) {
+ // special treatment for TIP-INTERNAL NODE case
+
+ double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, c) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+ VectorClass *lh_tip = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
+ for (c = 0; c < ncat; c++) {
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ theta[i] = lh_tip[i] * partial_lh_dad[i];
+ }
+ partial_lh_dad += nstates/VCSIZE;
+ theta += nstates/VCSIZE;
+ }
+
+ }
+ } else
+ {
+ // both dad and node are internal nodes
+
+ size_t block_VCSIZE = block/VCSIZE;
+
+// size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+ VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
+ for (i = 0; i < block_VCSIZE; i++) {
+ theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+ }
+ }
+ }
+ if (nptn < maxptn) {
+ // copy dummy values
+ for (ptn = nptn; ptn < maxptn; ptn++)
+ memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
+ }
+ theta_computed = true;
+ }
+
+ ModelSet *models = (ModelSet*)model;
+ VectorClass my_df = 0.0, my_ddf = 0.0;
+ VectorClass dad_length = dad_branch->length;
+ VectorClass unit = 1.0;
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, c, j)
+{
+ VectorClass my_df_thread = 0.0;
+ VectorClass my_ddf_thread = 0.0;
+#pragma omp for nowait schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
+ VectorClass lh_ptn[VCSIZE];
+ VectorClass df_ptn[VCSIZE];
+ VectorClass ddf_ptn[VCSIZE];
+ VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+ VectorClass* eval;
+
+ for (j = 0; j < VCSIZE; j++) {
+ lh_ptn[j] = 0.0;
+ df_ptn[j] = 0.0;
+ ddf_ptn[j] = 0.0;
+ if (ptn+j < nptn) {
+ eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
+ } else {
+ eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
+ }
+ for (c = 0; c < ncat; c++) {
+ VectorClass cat_rate = site_rate->getRate(c);
+ VectorClass lh_cat = 0.0, df_cat = 0.0, ddf_cat = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ VectorClass cof = eval[i]*cat_rate;
+ VectorClass val = exp(cof*dad_length) * theta[i];
+ VectorClass val1 = cof*val;
+ lh_cat += val;
+ df_cat += val1;
+ ddf_cat = mul_add(cof, val1, ddf_cat);
+ }
+ VectorClass prop = site_rate->getProp(c);
+ lh_ptn[j] = mul_add(prop, lh_cat, lh_ptn[j]);
+ df_ptn[j] = mul_add(prop, df_cat, df_ptn[j]);
+ ddf_ptn[j] = mul_add(prop, ddf_cat, ddf_ptn[j]);
+ theta += nstates/VCSIZE;
+ }
+ }
+
+ VectorClass inv_lh_ptn = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+ inv_lh_ptn = unit / abs(inv_lh_ptn);
+ VectorClass freq;
+ freq.load_a(&ptn_freq[ptn]);
+
+ VectorClass df_ptn_sum = horizontal_add(df_ptn) * inv_lh_ptn;
+ VectorClass ddf_ptn_sum = horizontal_add(ddf_ptn) * inv_lh_ptn;
+ ddf_ptn_sum = nmul_add(df_ptn_sum, df_ptn_sum, ddf_ptn_sum);
+
+#ifdef _OPENMP
+ my_df_thread = mul_add(df_ptn_sum, freq, my_df_thread);
+ my_ddf_thread = mul_add(ddf_ptn_sum, freq, my_ddf_thread);
+#else
+ my_df = mul_add(df_ptn_sum, freq, my_df);
+ my_ddf = mul_add(ddf_ptn_sum, freq, my_ddf);
+#endif
+ } // for loop
+
+#ifdef _OPENMP
+#pragma omp critical
+ {
+ my_df += my_df_thread;
+ my_ddf += my_ddf_thread;
+ }
+}
+#endif
+
+ df = horizontal_add(my_df);
+ ddf = horizontal_add(my_ddf);
+ if (isnan(df) || isinf(df)) {
+ df = 0.0;
+ ddf = 0.0;
+ outWarning("Numerical instability (some site-likelihood = 0)");
+ }
+
+}
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ PhyloNode *node = (PhyloNode*) dad_branch->node;
+ PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+ if (!central_partial_lh)
+ initializeAllPartialLh();
+ if (node->isLeaf()) {
+ PhyloNode *tmp_node = dad;
+ dad = node;
+ node = tmp_node;
+ PhyloNeighbor *tmp_nei = dad_branch;
+ dad_branch = node_branch;
+ node_branch = tmp_nei;
+ }
+ if ((dad_branch->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+ if ((node_branch->partial_lh_computed & 1) == 0)
+ computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i, j;
+ size_t nptn = aln->size();
+ size_t maxptn = get_safe_upper_limit(nptn);
+
+ ModelSet *models = (ModelSet*)model;
+ VectorClass tree_lh = 0.0;
+ VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
+ VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
+ for (c = 0; c < ncat; c++) {
+ cat_length[c] = site_rate->getRate(c) * dad_branch->length;
+ cat_prop[c] = site_rate->getProp(c);
+ }
+
+ if (dad->isLeaf()) {
+ // copy dummy values because VectorClass will access beyond nptn
+ for (ptn = nptn; ptn < maxptn; ptn++)
+ memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
+
+ // special treatment for TIP-INTERNAL NODE case
+ double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, c, j)
+{
+ VectorClass tree_lh_thread = 0.0;
+#pragma omp for nowait schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
+ VectorClass lh_ptn[VCSIZE];
+ VectorClass* eval;
+ VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *partial_lh_node = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
+
+ for (j = 0; j < VCSIZE; j++) {
+ lh_ptn[j] = 0.0;
+ if (ptn+j < nptn) {
+ eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
+ } else {
+ eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
+ }
+ for (c = 0; c < ncat; c++) {
+ VectorClass lh_cat = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ lh_cat=mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
+ }
+ lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
+ partial_lh_dad += nstates/VCSIZE;
+// partial_lh_node += nstates/VCSIZE;
+ }
+ partial_lh_node += nstates/VCSIZE;
+ }
+
+ VectorClass freq;
+ freq.load_a(&ptn_freq[ptn]);
+ VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+ lh_ptn_sum = log(abs(lh_ptn_sum));
+ lh_ptn_sum.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+ tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
+#else
+ tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
+#endif
+ } // for loop
+
+#ifdef _OPENMP
+#pragma omp critical
+ {
+ tree_lh += tree_lh_thread;
+ }
+}
+#endif
+
+ } else
+ {
+ // both dad and node are internal nodes
+ // copy dummy values because VectorClass will access beyond nptn
+ for (ptn = nptn; ptn < maxptn; ptn++) {
+ memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
+ memcpy(&node_branch->partial_lh[ptn*block], &node_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
+ }
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, c, j)
+{
+ VectorClass tree_lh_thread = 0.0;
+#pragma omp for nowait schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
+ VectorClass lh_ptn[VCSIZE];
+ VectorClass* eval;
+ VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+ VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
+
+ for (j = 0; j < VCSIZE; j++) {
+ lh_ptn[j] = 0.0;
+ if (ptn+j < nptn) {
+ eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
+ } else {
+ eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
+ }
+ for (c = 0; c < ncat; c++) {
+ VectorClass lh_cat = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ lh_cat = mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
+ }
+ lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
+ partial_lh_dad += nstates/VCSIZE;
+ partial_lh_node += nstates/VCSIZE;
+ }
+ }
+
+ VectorClass freq;
+ freq.load_a(&ptn_freq[ptn]);
+ VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+ lh_ptn_sum = log(abs(lh_ptn_sum));
+ lh_ptn_sum.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+ tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
+#else
+ tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
+#endif
+ } // for loop
+
+#ifdef _OPENMP
+#pragma omp critical
+ {
+ tree_lh += tree_lh_thread;
+ }
+}
+#endif
+
+
+ }
+
+ double tree_lh_final = horizontal_add(tree_lh) + node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+
+ if (isnan(tree_lh_final) || isinf(tree_lh_final)) {
+ cout << "WARNING: Numerical underflow caused by alignment sites";
+ i = aln->getNSite();
+ int j;
+ for (j = 0, c = 0; j < i; j++) {
+ ptn = aln->getPatternID(j);
+ if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+ cout << " " << j+1;
+ c++;
+ if (c >= 10) {
+ cout << " ...";
+ break;
+ }
+ }
+ }
+ cout << endl;
+ tree_lh_final = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+ for (ptn = 0; ptn < nptn; ptn++) {
+ if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+ _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+ }
+ tree_lh_final += _pattern_lh[ptn] * ptn_freq[ptn];
+ }
+ }
+
+ assert(!isnan(tree_lh_final) && !isinf(tree_lh_final));
+
+ aligned_free(cat_prop);
+ aligned_free(cat_length);
+
+ return tree_lh_final;
+}
+
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD() {
+ assert(theta_all && theta_computed);
+
+// size_t nstates = aln->num_states;
+ size_t ncat = site_rate->getNRate();
+
+ size_t block = ncat * nstates;
+ size_t ptn; // use size_t: pattern indexing may exceed the 32-bit range for very large data (> 4 GB memory)
+ size_t c, i, j;
+ size_t nptn = aln->size();
+
+ ModelSet *models = (ModelSet*)model;
+
+ VectorClass tree_lh = 0.0;
+ VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
+ VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
+ for (c = 0; c < ncat; c++) {
+ cat_length[c] = site_rate->getRate(c) * current_it->length;
+ cat_prop[c] = site_rate->getProp(c);
+ }
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, c, j)
+{
+ VectorClass tree_lh_thread = 0.0;
+#pragma omp for nowait schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
+ VectorClass lh_ptn[VCSIZE];
+ VectorClass* eval;
+ VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+
+ for (j = 0; j < VCSIZE; j++) {
+ lh_ptn[j] = 0.0;
+ if (ptn+j < nptn) {
+ eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
+ } else {
+ eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
+ }
+ for (c = 0; c < ncat; c++) {
+ VectorClass lh_cat = 0.0;
+ for (i = 0; i < nstates/VCSIZE; i++) {
+ lh_cat = mul_add(exp(eval[i]*cat_length[c]), theta[i], lh_cat);
+ }
+ lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
+ theta += nstates/VCSIZE;
+ }
+ }
+
+ VectorClass freq;
+ freq.load_a(&ptn_freq[ptn]);
+ VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+ lh_ptn_sum = log(abs(lh_ptn_sum));
+ lh_ptn_sum.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+ tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
+#else
+ tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
+#endif
+ } // for loop
+
+#ifdef _OPENMP
+#pragma omp critical
+ {
+ tree_lh += tree_lh_thread;
+ }
+}
+#endif
+
+ double tree_lh_final = horizontal_add(tree_lh) + current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+ aligned_free(cat_prop);
+ aligned_free(cat_length);
+
+ return tree_lh_final;
+}
+
+
+#endif /* PHYLOKERNELSITEMODEL_H_ */
diff --git a/phylosupertree.cpp b/phylosupertree.cpp
index 5df6ba6..6ed385b 100644
--- a/phylosupertree.cpp
+++ b/phylosupertree.cpp
@@ -49,6 +49,51 @@ PhyloSuperTree::PhyloSuperTree(SuperAlignment *alignment, PhyloSuperTree *super_
aln = alignment;
}
+void PhyloSuperTree::setCheckpoint(Checkpoint *checkpoint) {
+ IQTree::setCheckpoint(checkpoint);
+ for (iterator it = begin(); it != end(); it++)
+ (*it)->setCheckpoint(checkpoint);
+}
+
+void PhyloSuperTree::saveCheckpoint() {
+// checkpoint->startStruct("PhyloSuperTree");
+// int part = 0;
+// for (iterator it = begin(); it != end(); it++, part++) {
+// string key = part_info[part].name + ".tree";
+// checkpoint->put(key, (*it)->getTreeString());
+// }
+// checkpoint->endStruct();
+ IQTree::saveCheckpoint();
+}
+
+void PhyloSuperTree::restoreCheckpoint() {
+ IQTree::restoreCheckpoint();
+
+ // first get the newick string of super tree
+// checkpoint->startStruct("PhyloTree");
+// string newick;
+// CKP_RESTORE(newick);
+// checkpoint->endStruct();
+//
+// if (newick.empty()) return;
+//
+// // now get partition tree strings
+// checkpoint->startStruct("PhyloSuperTree");
+// int part = 0;
+// for (iterator it = begin(); it != end(); it++, part++) {
+// string key = part_info[part].name + ".tree";
+// string part_tree;
+// if (!checkpoint->get(key, part_tree))
+// outError("No tree for partition " + part_info[part].name + " found from checkpoint");
+// newick += part_tree;
+// }
+//
+// checkpoint->endStruct();
+//
+// readTreeString(newick);
+
+}
+
void PhyloSuperTree::readPartition(Params ¶ms) {
try {
ifstream in;
@@ -555,9 +600,9 @@ void PhyloSuperTree::changeLikelihoodKernel(LikelihoodKernel lk) {
string PhyloSuperTree::getTreeString() {
stringstream tree_stream;
- printTree(tree_stream, WT_BR_LEN+WT_NEWLINE);
+ printTree(tree_stream, WT_TAXON_ID + WT_BR_LEN + WT_SORT_TAXA);
for (iterator it = begin(); it != end(); it++)
- (*it)->printTree(tree_stream, WT_BR_LEN+WT_NEWLINE);
+ (*it)->printTree(tree_stream, WT_TAXON_ID + WT_BR_LEN + WT_SORT_TAXA);
return tree_stream.str();
}
@@ -567,11 +612,13 @@ void PhyloSuperTree::readTreeString(const string &tree_string) {
str.seekg(0, ios::beg);
freeNode();
readTree(str, rooted);
- setAlignment(aln);
+ assignLeafNames();
+// setAlignment(aln);
setRootNode(params->root);
for (iterator it = begin(); it != end(); it++) {
(*it)->freeNode();
(*it)->readTree(str, rooted);
+ (*it)->assignLeafNames();
// (*it)->setAlignment((*it)->aln);
}
linkTrees();
diff --git a/phylosupertree.h b/phylosupertree.h
index d4b7769..6dc299a 100644
--- a/phylosupertree.h
+++ b/phylosupertree.h
@@ -51,6 +51,22 @@ public:
~PhyloSuperTree();
+ /**
+ set checkpoint object
+ @param checkpoint
+ */
+ virtual void setCheckpoint(Checkpoint *checkpoint);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/** read partition model file */
void readPartition(Params ¶ms);
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
index 75356db..6782e9d 100644
--- a/phylosupertreeplen.cpp
+++ b/phylosupertreeplen.cpp
@@ -71,6 +71,39 @@ PartitionModelPlen::~PartitionModelPlen()
{
}
+void PartitionModelPlen::saveCheckpoint() {
+ checkpoint->startStruct("PartitionModelPlen");
+ PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+ if (!tree->fixed_rates) {
+ int nrates = tree->part_info.size();
+ double *part_rates = new double[nrates];
+ for (int i = 0; i < nrates; i++)
+ part_rates[i] = tree->part_info[i].part_rate;
+ CKP_ARRAY_SAVE(nrates, part_rates);
+ delete [] part_rates;
+ }
+ checkpoint->endStruct();
+ PartitionModel::saveCheckpoint();
+}
+
+void PartitionModelPlen::restoreCheckpoint() {
+ checkpoint->startStruct("PartitionModelPlen");
+ PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+ if (!tree->fixed_rates) {
+ int nrates = tree->part_info.size();
+ double *part_rates = new double[nrates];
+ if (CKP_ARRAY_RESTORE(nrates, part_rates)) {
+ for (int i = 0; i < nrates; i++)
+ tree->part_info[i].part_rate = part_rates[i];
+ tree->mapTrees();
+ }
+ delete [] part_rates;
+ }
+ checkpoint->endStruct();
+ PartitionModel::restoreCheckpoint();
+}
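The pair above illustrates the checkpoint pattern used throughout this release: keys are namespaced with startStruct/endStruct, arrays go through CKP_ARRAY_SAVE, and CKP_ARRAY_RESTORE reports whether the values were present. A sketch of the same call sequence for a made-up class (the class name and its members are hypothetical):

    // hypothetical class following the same conventions as PartitionModelPlen above
    void MyRates::saveCheckpoint() {
        checkpoint->startStruct("MyRates");
        CKP_ARRAY_SAVE(nrates, rates);            // store the array under the struct prefix
        checkpoint->endStruct();
    }

    void MyRates::restoreCheckpoint() {
        checkpoint->startStruct("MyRates");
        if (CKP_ARRAY_RESTORE(nrates, rates)) {   // true when the checkpoint holds the array
            // react to the restored values here, e.g. remap dependent structures
        }
        checkpoint->endStruct();
    }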
+
+
double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
double tree_lh = 0.0, cur_lh = 0.0;
@@ -306,6 +339,16 @@ PhyloSuperTreePlen::~PhyloSuperTreePlen()
}
}
+void PhyloSuperTreePlen::saveCheckpoint() {
+ // bypass PhyloSuperTree
+ IQTree::saveCheckpoint();
+}
+
+void PhyloSuperTreePlen::restoreCheckpoint() {
+ // bypass PhyloSuperTree
+ IQTree::restoreCheckpoint();
+}
+
// -------------------------------------------------------------------------------------------------------------
double PhyloSuperTreePlen::computeDist(int seq1, int seq2, double initial_dist, double &var) {
diff --git a/phylosupertreeplen.h b/phylosupertreeplen.h
index 39a7f67..d54c46f 100644
--- a/phylosupertreeplen.h
+++ b/phylosupertreeplen.h
@@ -101,6 +101,16 @@ public:
~PartitionModelPlen();
/**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+ /**
* @return #parameters of the model + # branches
*/
virtual int getNParameters();
@@ -156,6 +166,16 @@ public:
~PhyloSuperTreePlen();
/**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
+ /**
Read the tree saved with Taxon Names and branch lengths.
@param tree_string tree string to read from
*/
diff --git a/phylotesting.cpp b/phylotesting.cpp
index 1ac2c0a..272b9e9 100644
--- a/phylotesting.cpp
+++ b/phylotesting.cpp
@@ -33,6 +33,8 @@
#include "timeutil.h"
#include "phyloanalysis.h"
+#include "gsl/mygsl.h"
+#include "vectorclass/vectorclass.h"
/******* Binary model set ******/
@@ -269,28 +271,28 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
out.open(filename);
out << "Note : P(D|M) is the probability of site D given the model M (i.e., the site likelihood)" << endl;
if (wsl == WSL_RATECAT) {
- out << "P(D|M,rr[x]) is the probability of site D given the model M and the relative rate" << endl;
- out << "of evolution rr[x], where x is the class of rate to be considered." << endl;
- out << "We have P(D|M) = \\sum_x P(x) x P(D|M,rr[x])." << endl << endl;
+ out << "P(D|M,rr[i]) is the probability of site D given the model M and the relative rate" << endl;
+ out << "of evolution rr[i], where i is the class of rate to be considered." << endl;
+ out << "We have P(D|M) = \\sum_i P(i) x P(D|M,rr[i])." << endl << endl;
out << "Site logP(D|M) ";
for (i = 0; i < ncat; i++)
- out << "logP(D|M,rr[" << i+1 << "]=" << tree->getRate()->getRate(i)<< ") ";
+ out << "log{P(" << i+1 << ")xP(D|M,rr[" << i+1 << "]=" << tree->getRate()->getRate(i)<< ")} ";
} else if (wsl == WSL_MIXTURE) {
- out << "P(D|M[x]) is the probability of site D given the model M[x]," << endl;
- out << "where x is the mixture class to be considered." << endl;
- out << "We have P(D|M) = \\sum_x P(x) x P(D|M[x])." << endl << endl;
+ out << "P(D|M[i]) is the probability of site D given the model M[i]," << endl;
+ out << "where i is the mixture class to be considered." << endl;
+ out << "We have P(D|M) = \\sum_i P(i) x P(D|M[i])." << endl << endl;
out << "Site logP(D|M) ";
for (i = 0; i < ncat; i++)
- out << "logP(D|M[" << i+1 << "]) ";
+ out << "log{P(" << i+1 << ")xP(D|M[" << i+1 << "])} ";
} else {
// WSL_MIXTURE_RATECAT
- out << "P(D|M[x],rr[y]) is the probability of site D given the model M[x] and the relative rate" << endl;
- out << "of evolution rr[y], where x and y are the mixture class and rate class, respectively." << endl;
- out << "We have P(D|M) = \\sum_x \\sum_y P(x) x P(y) x P(D|M[x],rr[y])." << endl << endl;
+ out << "P(D|M[i],rr[j]) is the probability of site D given the model M[i] and the relative rate" << endl;
+ out << "of evolution rr[j], where i and j are the mixture class and rate class, respectively." << endl;
+ out << "We have P(D|M) = \\sum_i \\sum_j P(i) x P(j) x P(D|M[i],rr[j])." << endl << endl;
out << "Site logP(D|M) ";
for (i = 0; i < tree->getModel()->getNMixtures(); i++)
for (int j = 0; j < tree->getRate()->getNRate(); j++) {
- out << "logP(D|M[" << i+1 << "],rr[" << j+1 << "]=" << tree->getRate()->getRate(j) << ") ";
+ out << "log{P(" << i+1 << ")xP(" << j+1 << ")xP(D|M[" << i+1 << "],rr[" << j+1 << "]=" << tree->getRate()->getRate(j) << ")} ";
}
}
out << endl;
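As a worked illustration of the quantities now printed (numbers invented for the example): with two rate classes of weight P(1) = P(2) = 0.5 and site likelihoods P(D|M,rr[1]) = 0.02 and P(D|M,rr[2]) = 0.01, the site line would report logP(D|M) = log(0.5*0.02 + 0.5*0.01) = log(0.015), about -4.20, followed by log{P(1)xP(D|M,rr[1])} = log(0.01), about -4.61, and log{P(2)xP(D|M,rr[2])} = log(0.005), about -5.30.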
@@ -331,6 +333,37 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
}
+void printSiteStateFreq(const char*filename, PhyloTree *tree) {
+
+ int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states;
+ double *ptn_state_freq = new double[tree->getAlnNPattern() * nstates];
+
+ tree->computePatternStateFreq(ptn_state_freq);
+
+ try {
+ ofstream out;
+ out.exceptions(ios::failbit | ios::badbit);
+ out.open(filename);
+ IntVector pattern_index;
+ tree->aln->getSitePatternIndex(pattern_index);
+ for (i = 0; i < nsites; i++) {
+ out.width(6);
+ out << left << i+1 << " ";
+ double *state_freq = &ptn_state_freq[pattern_index[i]*nstates];
+ for (j = 0; j < nstates; j++) {
+ out.width(15);
+ out << state_freq[j] << " ";
+ }
+ out << endl;
+ }
+ out.close();
+ cout << "Site state frequency vectors printed to " << filename << endl;
+ } catch (ios::failure) {
+ outError(ERR_WRITE_OUTPUT, filename);
+ }
+ delete [] ptn_state_freq;
+}
+
bool checkModelFile(ifstream &in, bool is_partitioned, vector<ModelInfo> &infos) {
if (!in.is_open()) return false;
in.exceptions(ios::badbit);
@@ -543,7 +576,7 @@ int getModelList(Params ¶ms, Alignment *aln, StrVector &models, bool separat
// for (i = 0; i < noptions; i++)
// test_options[i] = test_options_codon[i];
// } else
- if (seq_type == SEQ_MORPH || (aln->frac_const_sites == 0.0)) {
+ if (aln->frac_const_sites == 0.0) {
// morphological or SNP data: activate +ASC
if (with_new) {
if (with_asc)
@@ -1149,6 +1182,8 @@ string testModel(Params ¶ms, PhyloTree* in_tree, vector<ModelInfo> &model_in
// select model for each partition
PhyloSuperTree *stree = (PhyloSuperTree*)in_tree;
testPartitionModel(params, stree, model_info, fmodel, models_block);
+// stree->linkTrees();
+ stree->mapTrees();
string res_models = "";
for (vector<PartitionInfo>::iterator it = stree->part_info.begin(); it != stree->part_info.end(); it++) {
if (it != stree->part_info.begin()) res_models += ",";
@@ -1729,6 +1764,252 @@ int countDistinctTrees(const char *filename, bool rooted, IQTree *tree, IntVecto
//const double TOL_RELL_SCORE = 0.01;
+/*
+ Problem: solve the following overdetermined system of linear equations:
+ a_1*x + b_1*y = c_1
+ a_2*x + b_2*y = c_2
+ ....
+ a_n*x + b_n*y = c_n
+
+which we solve by minimizing the weighted least-squares objective:
+
+ sum_k { w_k*[ c_k - (a_k*x + b_k*y) ]^2 }
+
+
+the solution is:
+
+ x = [(sum_k w_k*b_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*a_k*c_k)(sum_k w_k*b_k^2)] /
+ [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w_k*b_k^2) ]
+
+ y = [(sum_k w_k*a_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*b_k*c_k)(sum_k w_k*a_k^2)] /
+ [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w_k*b_k^2) ]
+
+ @param n number of data points
+ @param w weight vector of length n
+ @param a a value vector of length n
+ @param b b value vector of length n
+ @param c c value vector of length n
+ @param[out] x fitted value of x
+ @param[out] y fitted value of y
+*/
+void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, double &x, double &y) {
+ int k;
+ double BC = 0.0, AB = 0.0, AC = 0.0, A2 = 0.0, B2 = 0.0;
+ double denom;
+ for (k = 0; k < n; k++) {
+ double wa = w[k]*a[k];
+ double wb = w[k]*b[k];
+ AB += wa*b[k];
+ BC += wb*c[k];
+ AC += wa*c[k];
+ A2 += wa*a[k];
+ B2 += wb*b[k];
+ }
+ denom = 1.0/(AB*AB - A2*B2);
+ x = (BC*AB - AC*B2) * denom;
+ y = (AC*AB - BC*A2) * denom;
+}
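A hypothetical call, with made-up numbers, just to show how the output parameters are used (in performAUTest below, a and b hold the scale terms sqrt(r_k) and 1/sqrt(r_k)):

    double w[]  = {1.0, 2.0, 1.0};      // weights w_k
    double a[]  = {0.7, 1.0, 1.2};      // a_k
    double b[]  = {1.4, 1.0, 0.8};      // b_k
    double cc[] = {2.1, 2.0, 2.0};      // right-hand sides c_k
    double x, y;
    doWeightedLeastSquare(3, w, a, b, cc, x, y);
    // x and y now minimise sum_k w_k * (c_k - a_k*x - b_k*y)^2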
+
+/**
+ MLE estimates for AU test
+*/
+class OptimizationAUTest : public Optimization {
+
+public:
+
+ OptimizationAUTest(double d, double c, int nscales, double *bp, double *rr, double *rr_inv) {
+ this->d = d;
+ this->c = c;
+ this->bp = bp;
+ this->rr = rr;
+ this->rr_inv = rr_inv;
+ this->nscales = nscales;
+
+ }
+
+ /**
+ return the number of dimensions
+ */
+ virtual int getNDim() { return 2; }
+
+
+ /**
+ the target function which needs to be optimized
+ @param x the input vector x
+ @return the function value at x
+ */
+ virtual double targetFunk(double x[]) {
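+ // note: the Optimization base class apparently passes 1-based arrays (Numerical Recipes convention), hence x[1] and x[2] for the two dimensions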
+ d = x[1];
+ c = x[2];
+ double res = 0.0;
+ for (int k = 0; k < nscales; k++) {
+ double cdf = gsl_cdf_ugaussian_P(d*rr[k] + c*rr_inv[k]);
+ res += bp[k] * log(1.0 - cdf) + (1.0-bp[k])*log(cdf);
+ }
+ return res;
+ }
+
+ void optimizeDC() {
+ double x[3], lower[3], upper[3];
+ bool bound_check[3];
+ x[1] = d;
+ x[2] = c;
+ lower[1] = lower[2] = 1e-4;
+ upper[1] = upper[2] = 100.0;
+ bound_check[1] = bound_check[2] = false;
+ minimizeMultiDimen(x, 2, lower, upper, bound_check, 1e-4);
+ d = x[1];
+ c = x[2];
+ }
+
+ double d, c;
+ int nscales;
+ double *bp;
+ double *rr;
+ double *rr_inv;
+};
+
+/**
+ perform the approximately unbiased (AU) test
+ @param pattern_lhs per-tree site log-likelihood matrix of size #trees x #patterns
+ @param[out] info tree info vector in which au_pvalue is filled
+*/
+void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<TreeInfo> &info) {
+
+ /* STEP 1: specify scale factors */
+ int nscales = 10;
+ double r[] = {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4};
+ double rr[] = {sqrt(0.5), sqrt(0.6), sqrt(0.7), sqrt(0.8), sqrt(0.9), 1.0,
+ sqrt(1.1), sqrt(1.2), sqrt(1.3), sqrt(1.4)};
+ double rr_inv[] = {sqrt(1/0.5), sqrt(1/0.6), sqrt(1/0.7), sqrt(1/0.8), sqrt(1/0.9), 1.0,
+ sqrt(1/1.1), sqrt(1/1.2), sqrt(1/1.3), sqrt(1/1.4)};
+
+ /* STEP 2: compute bootstrap proportion */
+ int ntrees = info.size();
+ size_t nboot = params.topotest_replicates;
+ double nboot_inv = 1.0 / nboot;
+
+ int nptn = tree->getAlnNPattern();
+ int maxnptn = get_safe_upper_limit(nptn);
+
+ double *bp = new double[ntrees*nscales];
+ memset(bp, 0, sizeof(double)*ntrees*nscales);
+
+ int k, tid, ptn;
+#ifdef _OPENMP
+ #pragma omp parallel private(k, tid, ptn)
+ {
+ int *rstream;
+ init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
+#else
+ int *rstream = randstream;
+#endif
+ size_t boot;
+ int *boot_sample = aligned_alloc<int>(maxnptn);
+ memset(boot_sample, 0, maxnptn*sizeof(int));
+
+ double *boot_sample_dbl = aligned_alloc<double>(maxnptn);
+
+#ifdef _OPENMP
+ #pragma omp for schedule(static)
+#endif
+ for (k = 0; k < nscales; k++) {
+ string str = "SCALE=" + convertDoubleToString(r[k]);
+ for (boot = 0; boot < nboot; boot++) {
+ tree->aln->createBootstrapAlignment(boot_sample, str.c_str(), rstream);
+ for (ptn = 0; ptn < maxnptn; ptn++)
+ boot_sample_dbl[ptn] = boot_sample[ptn];
+ double max_lh = -1e20;
+ int max_tid = -1;
+ for (tid = 0; tid < ntrees; tid++) {
+ double *pattern_lh = pattern_lhs + (tid*maxnptn);
+ double tree_lh;
+#ifdef BINARY32
+ tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
+#else
+ if (instruction_set >= 7)
+ tree_lh = tree->dotProductSIMD<double, Vec4d, 4>(pattern_lh, boot_sample_dbl, nptn);
+ else
+ tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
+#endif
+ if (tree_lh > max_lh) {
+ max_lh = tree_lh;
+ max_tid = tid;
+ }
+ }
+ bp[k*ntrees+max_tid] += nboot_inv;
+ }
+ }
+
+ aligned_free(boot_sample_dbl);
+ aligned_free(boot_sample);
+
+#ifdef _OPENMP
+ finish_random(rstream);
+ }
+#endif
+
+ if (verbose_mode >= VB_MED) {
+ cout << "scale";
+ for (k = 0; k < nscales; k++)
+ cout << "\t" << r[k];
+ cout << endl;
+ for (tid = 0; tid < ntrees; tid++) {
+ cout << tid;
+ for (k = 0; k < nscales; k++) {
+ cout << "\t" << bp[tid+k*ntrees];
+ }
+ cout << endl;
+ }
+ }
+
+ /* STEP 3: weighted least square fit */
+
+ double *cc = new double[nscales];
+ double *w = new double[nscales];
+ double *this_bp = new double[nscales];
+ cout << "TreeID\tAU\tRSS\td_WLS\tc_WLS\td_MLE\tc_MLE" << endl;
+ for (tid = 0; tid < ntrees; tid++) {
+ for (k = 0; k < nscales; k++) {
+ this_bp[k] = bp[tid + k*ntrees];
+ double bp_val = min(max(bp[tid + k*ntrees], nboot_inv),1.0-nboot_inv);
+ double bp_cdf = gsl_cdf_ugaussian_Pinv(bp_val);
+ double bp_pdf = gsl_ran_ugaussian_pdf(bp_cdf);
+ cc[k] = gsl_cdf_ugaussian_Pinv(1.0 - bp_val);
+ w[k] = bp_pdf*bp_pdf*nboot / (bp_val*(1.0-bp_val));
+ }
+ double c, d; // c, d in original paper
+ // first obtain d and c by weighted least square
+ doWeightedLeastSquare(nscales, w, rr, rr_inv, cc, d, c);
+
+ // second, perform MLE estimate of d and c
+ OptimizationAUTest mle(d, c, nscales, this_bp, rr, rr_inv);
+ mle.optimizeDC();
+
+ // compute sum of squared difference
+ double rss = 0.0;
+ for (k = 0; k < nscales; k++) {
+ double diff = cc[k] - (rr[k]*d + rr_inv[k]*c);
+ rss += w[k] * diff * diff;
+ }
+
+ double pchi2 = computePValueChiSquare(rss, nscales-2);
+ /* STEP 4: compute p-value according to Eq. 11 */
+ info[tid].au_pvalue = 1.0 - gsl_cdf_ugaussian_P(mle.d-mle.c);
+ cout << tid+1 << "\t" << info[tid].au_pvalue << "\t" << rss << "\t" << d << "\t" << c << "\t" << mle.d << "\t" << mle.c;
+
+ // warning if p-value of chi-square < 0.01 (rss too high)
+ if (pchi2 < 0.01)
+ cout << " !!!";
+ cout << endl;
+ }
+
+ delete [] this_bp;
+ delete [] w;
+ delete [] cc;
+ delete [] bp;
+}
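The routine above follows the approximately unbiased (AU) test of Shimodaira: STEP 2 resamples pattern counts at ten scale factors (the RELL score of a tree under one replicate is the dot product of its per-pattern log-likelihoods with the resampled pattern counts) and records how often each tree wins per scale; STEP 3 fits the signed distance d and curvature c from z_k = Phi^-1(1 - bp_k) against sqrt(r_k) and 1/sqrt(r_k); STEP 4 reports p_AU = 1 - Phi(d - c). Below is a minimal, self-contained sketch of the STEP 3 fit only; the function name is illustrative, and the real code delegates to doWeightedLeastSquare() and OptimizationAUTest.

// Sketch only: closed-form weighted least-squares fit of z[k] ~ d*rr[k] + c*rr_inv[k],
// minimizing sum_k w[k]*(z[k] - d*rr[k] - c*rr_inv[k])^2 via the 2x2 normal equations.
void sketchWeightedLeastSquare(int n, const double *w, const double *rr,
                               const double *rr_inv, const double *z,
                               double &d, double &c) {
    double a11 = 0.0, a12 = 0.0, a22 = 0.0, b1 = 0.0, b2 = 0.0;
    for (int k = 0; k < n; k++) {
        a11 += w[k] * rr[k] * rr[k];
        a12 += w[k] * rr[k] * rr_inv[k];
        a22 += w[k] * rr_inv[k] * rr_inv[k];
        b1  += w[k] * rr[k] * z[k];
        b2  += w[k] * rr_inv[k] * z[k];
    }
    double det = a11 * a22 - a12 * a12;   // non-degenerate for distinct scale factors
    d = (a22 * b1 - a12 * b2) / det;
    c = (a11 * b2 - a12 * b1) / det;
}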
+
+
void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVector &distinct_ids)
{
if (!params.treeset_file)
@@ -1766,18 +2047,20 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
site_lh_out.close();
}
- double time_start = getCPUTime();
+ double time_start = getRealTime();
int *boot_samples = NULL;
int boot;
//double *saved_tree_lhs = NULL;
- double *tree_lhs = NULL;
+ double *tree_lhs = NULL; // RELL score matrix of size #trees x #replicates
double *pattern_lh = NULL;
double *pattern_lhs = NULL;
- double *orig_tree_lh = NULL;
+ double *orig_tree_lh = NULL; // Original tree log-likelihoods
double *max_lh = NULL;
double *lhdiff_weights = NULL;
int nptn = tree->getAlnNPattern();
+ int maxnptn = get_safe_upper_limit(nptn);
+
if (params.topotest_replicates && ntrees > 1) {
size_t mem_size = (size_t)params.topotest_replicates*nptn*sizeof(int) +
ntrees*params.topotest_replicates*sizeof(double) +
@@ -1796,14 +2079,16 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
// outError(ERR_NO_MEMORY);
if (!(tree_lhs = new double [ntrees * params.topotest_replicates]))
outError(ERR_NO_MEMORY);
- if (params.do_weighted_test) {
+ if (params.do_weighted_test || params.do_au_test) {
if (!(lhdiff_weights = new double [ntrees * ntrees]))
outError(ERR_NO_MEMORY);
- if (!(pattern_lhs = new double[ntrees* nptn]))
- outError(ERR_NO_MEMORY);
+ pattern_lhs = aligned_alloc<double>(ntrees*maxnptn);
+// if (!(pattern_lhs = new double[ntrees* nptn]))
+// outError(ERR_NO_MEMORY);
}
- if (!(pattern_lh = new double[nptn]))
- outError(ERR_NO_MEMORY);
+ pattern_lh = aligned_alloc<double>(maxnptn);
+// if (!(pattern_lh = new double[nptn]))
+// outError(ERR_NO_MEMORY);
if (!(orig_tree_lh = new double[ntrees]))
outError(ERR_NO_MEMORY);
if (!(max_lh = new double[params.topotest_replicates]))
@@ -1830,13 +2115,13 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
tree->setRootNode(params.root);
if (tree->isSuperTree())
((PhyloSuperTree*) tree)->mapTrees();
- if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
- cout << "NOTE: Changing to old kernel as user tree is multifurcating" << endl;
- if (tree->sse == LK_EIGEN)
- tree->changeLikelihoodKernel(LK_NORMAL);
- else
- tree->changeLikelihoodKernel(LK_SSE);
- }
+// if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
+// cout << "NOTE: Changing to old kernel as user tree is multifurcating" << endl;
+// if (tree->sse == LK_EIGEN)
+// tree->changeLikelihoodKernel(LK_NORMAL);
+// else
+// tree->changeLikelihoodKernel(LK_SSE);
+// }
tree->initializeAllPartialLh();
tree->fixNegativeBranch(false);
@@ -1855,9 +2140,10 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
if (pattern_lh) {
double curScore = tree->getCurScore();
+ memset(pattern_lh, 0, maxnptn*sizeof(double));
tree->computePatternLikelihood(pattern_lh, &curScore);
- if (params.do_weighted_test)
- memcpy(pattern_lhs + tid*nptn, pattern_lh, nptn*sizeof(double));
+ if (params.do_weighted_test || params.do_au_test)
+ memcpy(pattern_lhs + tid*maxnptn, pattern_lh, maxnptn*sizeof(double));
}
if (params.print_site_lh) {
string tree_name = "Tree" + convertIntToString(tree_index+1);
@@ -1890,7 +2176,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
int *tree_ranks = new int[ntrees];
/* perform RELL BP method */
- cout << "Performing RELL test..." << endl;
+ cout << "Performing RELL-BP test..." << endl;
int *maxtid = new int[params.topotest_replicates];
double *maxL = new double[params.topotest_replicates];
int *maxcount = new int[params.topotest_replicates];
@@ -1999,10 +2285,10 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
cout << "Computing pairwise logl difference variance ..." << endl;
/* computing lhdiff_weights as 1/sqrt(lhdiff_variance) */
for (tid = 0; tid < ntrees; tid++) {
- double *pattern_lh1 = pattern_lhs + (tid * nptn);
+ double *pattern_lh1 = pattern_lhs + (tid * maxnptn);
lhdiff_weights[tid*ntrees+tid] = 0.0;
for (tid2 = tid+1; tid2 < ntrees; tid2++) {
- double lhdiff_variance = tree->computeLogLDiffVariance(pattern_lh1, pattern_lhs + (tid2*nptn));
+ double lhdiff_variance = tree->computeLogLDiffVariance(pattern_lh1, pattern_lhs + (tid2*maxnptn));
lhdiff_weights[tid*ntrees+tid2] = 1.0/sqrt(lhdiff_variance);
lhdiff_weights[tid2*ntrees+tid] = lhdiff_weights[tid*ntrees+tid2];
}
@@ -2042,6 +2328,9 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
info[tid].wkh_pvalue /= params.topotest_replicates;
}
}
+
+ delete [] avg_lh;
+
/* now to ELW - Expected Likelihood Weight method */
cout << "Performing ELW test..." << endl;
@@ -2088,6 +2377,11 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
outError("Internal error: Wrong ", __func__);
delete [] sumL;
+ if (params.do_au_test) {
+ cout << "Performing approximately unbiased (AU) test..." << endl;
+ performAUTest(params, tree, pattern_lhs, info);
+ }
+
delete [] tree_ranks;
delete [] tree_probs;
@@ -2097,9 +2391,9 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
if (orig_tree_lh)
delete [] orig_tree_lh;
if (pattern_lh)
- delete [] pattern_lh;
+ aligned_free(pattern_lh);
if (pattern_lhs)
- delete [] pattern_lhs;
+ aligned_free(pattern_lhs);
if (lhdiff_weights)
delete [] lhdiff_weights;
if (tree_lhs)
@@ -2116,7 +2410,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
treeout.close();
in.close();
- cout << "Time for evaluating all trees: " << getCPUTime() - time_start << " sec." << endl;
+ cout << "Time for evaluating all trees: " << getRealTime() - time_start << " sec." << endl;
}
@@ -2126,3 +2420,6 @@ void evaluateTrees(Params &params, IQTree *tree) {
IntVector distinct_ids;
evaluateTrees(params, tree, info, distinct_ids);
}
+
+
+
diff --git a/phylotesting.h b/phylotesting.h
index 06747c9..413c8da 100644
--- a/phylotesting.h
+++ b/phylotesting.h
@@ -38,6 +38,7 @@ struct TreeInfo {
double wkh_pvalue; // p-value by weighted Kishino-Hasegawa test
double elw_value; // ELW - expected likelihood weights test
bool elw_confident; // to represent confidence set of ELW test
+ double au_pvalue; // p-value by approximately unbiased (AU) test
};
@@ -89,6 +90,13 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL,
void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl);
/**
+ * print site state frequency vectors (for Huaichun)
+ * @param filename output file name
+ * @param tree phylogenetic tree
+*/
+void printSiteStateFreq(const char*filename, PhyloTree *tree);
+
+/**
* Evaluate user-trees with possibility of tree topology tests
* @param params program parameters
* @param tree current tree
diff --git a/phylotree.cpp b/phylotree.cpp
index 82c19b0..bdc9da4 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -21,6 +21,7 @@
#include "phylosupertree.h"
#include "phylosupertreeplen.h"
#include "upperbounds.h"
+#include "model/modelmixture.h"
//const static int BINARY_SCALE = floor(log2(1/SCALING_THRESHOLD));
//const static double LOG_BINARY_SCALE = -(log(2) * BINARY_SCALE);
@@ -51,7 +52,7 @@ void SPRMoves::add(PhyloNode *prune_node, PhyloNode *prune_dad, PhyloNode *regra
PhyloTree class
****************************************************************************/
-PhyloTree::PhyloTree() : MTree() {
+PhyloTree::PhyloTree() : MTree(), CheckpointFactory() {
init();
}
@@ -111,11 +112,45 @@ void PhyloTree::init() {
num_partial_lh_computations = 0;
}
-PhyloTree::PhyloTree(Alignment *aln) : MTree() {
+PhyloTree::PhyloTree(Alignment *aln) : MTree(), CheckpointFactory() {
init();
this->aln = aln;
}
+void PhyloTree::saveCheckpoint() {
+ checkpoint->startStruct("PhyloTree");
+ StrVector leafNames;
+ getTaxaName(leafNames);
+ CKP_VECTOR_SAVE(leafNames);
+// string newick = PhyloTree::getTreeString();
+// CKP_SAVE(newick);
+// CKP_SAVE(curScore);
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void PhyloTree::restoreCheckpoint() {
+ CheckpointFactory::restoreCheckpoint();
+ checkpoint->startStruct("PhyloTree");
+ StrVector leafNames;
+ if (CKP_VECTOR_RESTORE(leafNames)) {
+ if (leafNames.size() != leafNum)
+ outError("Alignment mismatched from checkpoint!");
+
+ StrVector taxname;
+ getTaxaName(taxname);
+ for (int i = 0; i < taxname.size(); i++)
+ if (taxname[i] != leafNames[i])
+ outError("Sequence name " + taxname[i] + " mismatched from checkpoint");
+ }
+// string newick;
+// CKP_RESTORE(curScore);
+// CKP_RESTORE(newick);
+// if (!newick.empty())
+// PhyloTree::readTreeString(newick);
+ checkpoint->endStruct();
+}
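The two hooks above persist only the list of taxon names and use it as a consistency check on restart, so a run cannot silently resume from a checkpoint written for a different alignment. A generic sketch of that verify-on-restore pattern, independent of the CKP_* macros (function name illustrative):

// Sketch: compare the restored name list against the live tree before trusting a checkpoint.
bool sketchVerifyTaxa(const StrVector &fromCheckpoint, const StrVector &fromTree) {
    if (fromCheckpoint.size() != fromTree.size())
        return false;                      // different number of taxa -> wrong alignment
    for (size_t i = 0; i < fromTree.size(); i++)
        if (fromCheckpoint[i] != fromTree[i])
            return false;                  // name mismatch at position i
    return true;
}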
+
void PhyloTree::discardSaturatedSite(bool val) {
discard_saturated_site = val;
}
@@ -249,7 +284,7 @@ void PhyloTree::assignLeafNames(Node *node, Node *dad) {
if (node->isLeaf()) {
node->id = atoi(node->name.c_str());
assert(node->id >= 0 && node->id < leafNum);
- node->name = aln->getSeqName(node->id).c_str();
+ node->name = aln->getSeqName(node->id);
}
FOR_NEIGHBOR_IT(node, dad, it)assignLeafNames((*it)->node, node);
}
@@ -321,11 +356,32 @@ void PhyloTree::setParams(Params* params) {
}
void PhyloTree::readTreeString(const string &tree_string) {
- stringstream str;
- str << tree_string;
- str.seekg(0, ios::beg);
+ stringstream str(tree_string);
+// str(tree_string);
+// str.seekg(0, ios::beg);
freeNode();
readTree(str, rooted);
+ assignLeafNames();
+// setAlignment(aln);
+ setRootNode(params->root);
+
+ if (isSuperTree()) {
+ ((PhyloSuperTree*) this)->mapTrees();
+ }
+ if (params->pll) {
+ pllReadNewick(getTreeString());
+ }
+ resetCurScore();
+// lhComputed = false;
+}
+
+void PhyloTree::readTreeStringSeqName(const string &tree_string) {
+ stringstream str(tree_string);
+// str(tree_string);
+// str.seekg(0, ios::beg);
+ freeNode();
+ readTree(str, rooted);
+// assignLeafNames();
setAlignment(aln);
setRootNode(params->root);
@@ -376,7 +432,8 @@ void PhyloTree::readTreeFile(const string &file_name) {
string PhyloTree::getTreeString() {
stringstream tree_stream;
- printTree(tree_stream);
+ setRootNode(params->root);
+ printTree(tree_stream, WT_TAXON_ID + WT_BR_LEN + WT_SORT_TAXA);
return tree_stream.str();
}
@@ -405,7 +462,7 @@ void PhyloTree::setModel(ModelSubst *amodel) {
void PhyloTree::setModelFactory(ModelFactory *model_fac) {
model_factory = model_fac;
- if (model_factory && model_factory->model->isMixture())
+ if (model_factory && (model_factory->model->isMixture() || model_factory->model->isSiteSpecificModel()))
setLikelihoodKernel(sse);
}
@@ -1188,10 +1245,11 @@ void PhyloTree::initializeAllPartialLh() {
}
assert(index == (nodeNum - 1) * 2);
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
- if (params->lh_mem_save == LM_PER_NODE)
+ if (params->lh_mem_save == LM_PER_NODE) {
assert(indexlh == nodeNum-leafNum);
- else
+ } else {
assert(indexlh == (nodeNum-1)*2-leafNum);
+ }
} else
assert(indexlh == (nodeNum-1)*2);
clearAllPartialLH();
@@ -1320,14 +1378,15 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
size_t block_size;
if (instruction_set >= 7)
// block size must be divisible by 4
- block_size = ((nptn+3)/4)*4;
+ nptn = ((nptn+3)/4)*4;
else
// block size must be divisible by 2
- block_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+ nptn = ((nptn % 2) == 0) ? nptn : (nptn + 1);
size_t scale_block_size = nptn;
+// size_t tip_block_size = nptn * model->num_states;
- block_size = block_size * model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+ block_size = nptn * model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
if (!node) {
node = (PhyloNode*) root;
// allocate the big central partial likelihoods memory
@@ -1344,12 +1403,15 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
if (!central_partial_lh) {
uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
+ if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE))
+ tip_partial_lh_size = get_safe_upper_limit(aln->size()) * model->num_states * leafNum;
uint64_t mem_size = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
- if (params->lh_mem_save == LM_PER_NODE)
+ if (params->lh_mem_save == LM_PER_NODE) {
mem_size -= ((uint64_t)leafNum * 3 - 4) * (uint64_t)block_size;
- else
+ } else {
mem_size -= (uint64_t)leafNum * (uint64_t)block_size;
+ }
}
if (verbose_mode >= VB_MED)
cout << "Allocating " << mem_size * sizeof(double) << " bytes for partial likelihood vectors" << endl;
@@ -1364,20 +1426,22 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
// now always assign tip_partial_lh
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
- if (params->lh_mem_save == LM_PER_NODE)
+ if (params->lh_mem_save == LM_PER_NODE) {
tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
- else
+ } else {
tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2-leafNum)*block_size);
+ }
} else
tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2)*block_size);
if (!central_scale_num) {
uint64_t mem_size = (leafNum - 1) * 4 * scale_block_size;
if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
- if (params->lh_mem_save == LM_PER_NODE)
+ if (params->lh_mem_save == LM_PER_NODE) {
mem_size -= ((uint64_t)leafNum*3 - 2) * (uint64_t) scale_block_size;
- else
+ } else {
mem_size -= (uint64_t)leafNum * (uint64_t) scale_block_size;
+ }
}
if (verbose_mode >= VB_MED)
cout << "Allocating " << mem_size * sizeof(UBYTE) << " bytes for scale num vectors" << endl;
@@ -1449,6 +1513,16 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
indexlh++;
}
}
+
+// if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+// // allocate tip memory for this model
+// if (node->isLeaf()) {
+// nei2->partial_lh = tip_partial_lh + (node->id * tip_block_size);
+// }
+// if (dad->isLeaf()) {
+// nei->partial_lh = tip_partial_lh + (dad->id * tip_block_size);
+// }
+// }
}
FOR_NEIGHBOR_IT(node, dad, it) initializeAllPartialLh(index, indexlh, (PhyloNode*) (*it)->node, node);
}
@@ -1625,6 +1699,43 @@ double PhyloTree::computePatternLhCat(SiteLoglType wsl) {
}
}
+void PhyloTree::computePatternStateFreq(double *ptn_state_freq) {
+ assert(getModel()->isMixture());
+ computePatternLhCat(WSL_MIXTURE);
+ double *lh_cat = _pattern_lh_cat;
+ size_t ptn, nptn = getAlnNPattern(), m, nmixture = getModel()->getNMixtures();
+ double *ptn_freq = ptn_state_freq;
+ size_t state, nstates = aln->num_states;
+ ModelMixture *models = (ModelMixture*)model;
+
+ // loop over all site-patterns
+ for (ptn = 0; ptn < nptn; ptn++) {
+
+ // first compute posterior for each mixture component
+ double sum_lh = 0.0;
+ for (m = 0; m < nmixture; m++) {
+ sum_lh += lh_cat[m];
+ }
+ sum_lh = 1.0/sum_lh;
+ for (m = 0; m < nmixture; m++) {
+ lh_cat[m] *= sum_lh;
+ }
+
+ // now compute state frequencies
+ for (state = 0; state < nstates; state++) {
+ double freq = 0;
+ for (m = 0; m < nmixture; m++)
+ freq += models->at(m)->state_freq[state] * lh_cat[m];
+ ptn_freq[state] = freq;
+ }
+
+ // increase the pointers
+ lh_cat += nmixture;
+ ptn_freq += nstates;
+ }
+}
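computePatternStateFreq() above averages the mixture-class state frequencies with posterior weights P(m | pattern) proportional to the per-class pattern likelihoods delivered by computePatternLhCat(WSL_MIXTURE). A stripped-down sketch of the per-pattern step, with illustrative names only:

// Sketch: posterior-weighted state frequencies for one site pattern.
void sketchPatternStateFreq(int nmixture, int nstates, const double *lh_cat,
                            const double *class_freq /* [nmixture*nstates] */,
                            double *out_freq /* [nstates] */) {
    double sum_lh = 0.0;
    for (int m = 0; m < nmixture; m++)
        sum_lh += lh_cat[m];               // normalizer of the class posteriors
    for (int s = 0; s < nstates; s++) {
        double freq = 0.0;
        for (int m = 0; m < nmixture; m++)
            freq += (lh_cat[m] / sum_lh) * class_freq[m * nstates + s];
        out_freq[s] = freq;
    }
}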
+
+
void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) {
/* if (!dad_branch) {
@@ -1847,8 +1958,10 @@ double PhyloTree::computeLogLDiffVariance(double *pattern_lh_other, double *ptn_
double PhyloTree::computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh) {
double *pattern_lh_other = new double[getAlnNPattern()];
other_tree->computePatternLikelihood(pattern_lh_other);
- delete[] pattern_lh_other;
+ // BUG FIX found by Xcode analyze (use of memory after it is freed)
+// delete[] pattern_lh_other;
double res = computeLogLDiffVariance(pattern_lh_other, pattern_lh);
+ delete[] pattern_lh_other;
return res;
}
@@ -3403,7 +3516,7 @@ double PhyloTree::computeDist(int seq1, int seq2, double initial_dist, double &d
// if no model or site rate is specified, return JC distance
if (initial_dist == 0.0) {
if (params->compute_obs_dist)
- initial_dist = aln->computeObsDist(seq1, seq2);
+ return (initial_dist = aln->computeObsDist(seq1, seq2));
else
initial_dist = aln->computeDist(seq1, seq2);
}
@@ -5045,14 +5158,20 @@ int PhyloTree::testAllBranches(int threshold, double best_score, double *pattern
double lbp_support, aLRT_support, aBayes_support;
double SH_aLRT_support = (testOneBranch(best_score, pattern_lh, reps, lbp_reps,
node, dad, lbp_support, aLRT_support, aBayes_support) * 100);
+ ostringstream ss;
+ ss.precision(3);
+ ss << node->name;
+ if (!node->name.empty())
+ ss << "/";
if (reps)
- node->name = convertDoubleToString(SH_aLRT_support);
+ ss << SH_aLRT_support;
if (lbp_reps)
- node->name += "/" + convertDoubleToString(lbp_support * 100);
+ ss << "/" << lbp_support * 100;
if (aLRT_test)
- node->name += "/" + convertDoubleToString(aLRT_support);
+ ss << "/" << aLRT_support;
if (aBayes_test)
- node->name += "/" + convertDoubleToString(aBayes_support);
+ ss << "/" << aBayes_support;
+ node->name = ss.str();
if (SH_aLRT_support < threshold)
num_low_support = 1;
if (((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars) {
@@ -5295,12 +5414,10 @@ void PhyloTree::removeIdenticalSeqs(Params &params) {
else
new_aln = aln->removeIdenticalSeq("", params.gbo_replicates > 0, removed_seqs, twin_seqs);
if (removed_seqs.size() > 0) {
- cout << "NOTE: " << removed_seqs.size() << " identical sequences will be ignored during tree search" << endl;
- if (verbose_mode >= VB_MED) {
- for (int i = 0; i < removed_seqs.size(); i++) {
- cout << removed_seqs[i] << " is identical to " << twin_seqs[i] << endl;
- }
- }
+ cout << "NOTE: " << removed_seqs.size() << " identical sequences (see below) will be ignored for subsequent analysis" << endl;
+ for (int i = 0; i < removed_seqs.size(); i++) {
+ cout << "NOTE: " << removed_seqs[i] << " (identical to " << twin_seqs[i] << ") is ignored but added at the end" << endl;
+ }
delete aln;
aln = new_aln;
}
@@ -5392,5 +5509,5 @@ void PhyloTree::generateRandomTree(TreeGenType tree_type) {
(*it)->name = aln->getSeqName((*it)->id);
stringstream str;
ext_tree.printTree(str);
- PhyloTree::readTreeString(str.str());
+ PhyloTree::readTreeStringSeqName(str.str());
}
diff --git a/phylotree.h b/phylotree.h
index a2c2480..2ed8388 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -31,6 +31,7 @@
#include "optimization.h"
#include "model/rateheterogeneity.h"
#include "pll/pll.h"
+#include "checkpoint.h"
#define BOOT_VAL_FLOAT
#define BootValType float
@@ -241,12 +242,69 @@ struct LeafFreq {
}
};
+
+// **********************************************
+// BEGIN definitions for likelihood mapping (HAS)
+// **********************************************
+
+/* maximum exp difference, such that 1.0+exp(-TP_MAX_EXP_DIFF) == 1.0 */
+const double TP_MAX_EXP_DIFF = 40.0;
+
+/* Index definition for counter array needed in likelihood mapping analysis (HAS) */
+#define LM_REG1 0
+#define LM_REG2 1
+#define LM_REG3 2
+#define LM_REG4 3
+#define LM_REG5 4
+#define LM_REG6 5
+#define LM_REG7 6
+#define LM_AR1 7
+#define LM_AR2 8
+#define LM_AR3 9
+#define LM_MAX 10
+
+struct QuartetGroups{
+ int numGroups; // number of clusters:
+ // 0: not initialized, default -> 1
+ // 1: no clusters - any (a,b)|(c,d)
+ // 2: 2 clusters - (a,a')|(b,b')
+ // 3: 3 clusters - (a,a')|(b,c) [rare]
+ // 4: 4 clusters - (a,b)|(c,d)
+ int numSeqs; // number of seqs in alignment (should be #A+#B+#C+#D+#X)
+ int numQuartSeqs; // number of seqs in analysis (should be #A+#B+#C+#D)
+ int numGrpSeqs[5]; // number of seqs in cluster A, B, C, D, and X (exclude)
+ int uniqueQuarts; // number of existing unique quartets for this grouping
+ string Name[5]; // names of the clusters A, B, C, D, and X (exclude)
+ vector<int> GroupA; // seqIDs of cluster A
+ vector<int> GroupB; // seqIDs of cluster B
+ vector<int> GroupC; // seqIDs of cluster C
+ vector<int> GroupD; // seqIDs of cluster D
+ vector<int> GroupX; // seqIDs of cluster X
+};
+
+struct QuartetInfo {
+ int seqID[4];
+ double logl[3]; // log-lh for {0,1}|{2,3} {0,2}|{1,3} {0,3}|{1,2}
+ double qweight[3]; // weight for {0,1}|{2,3} {0,2}|{1,3} {0,3}|{1,2}
+ int corner; // for the 3 corners of the simplex triangle (0:top, 1:right, 2:left)
+ int area; // for the 7 areas of the simplex triangle
+ // corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center
+};
+
+struct SeqQuartetInfo {
+ unsigned long countarr[LM_MAX]; // the 7 areas of the simplex triangle [0-6; corners (0:top, 1:right, 2:left), rectangles (3:right, 4:left, 5:bottom), 6:center] and the 3 corners [7-9; 7:top, 8:right, 9:left]
+};
+
+// ********************************************
+// END definitions for likelihood mapping (HAS)
+// ********************************************
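The structures above store, per quartet, the three log-likelihoods and their normalized weights; the weights are the barycentric coordinates plotted in the simplex triangle. A hedged sketch of the usual conversion, using the TP_MAX_EXP_DIFF cutoff declared above (function name illustrative):

// Sketch: map the three quartet log-likelihoods onto simplex weights (qweight).
void sketchQuartetWeights(const double logl[3], double qweight[3]) {
    double maxl = logl[0];
    if (logl[1] > maxl) maxl = logl[1];
    if (logl[2] > maxl) maxl = logl[2];
    double sum = 0.0;
    for (int i = 0; i < 3; i++) {
        double diff = maxl - logl[i];
        // beyond the cutoff, exp(-diff) is numerically zero next to 1.0
        qweight[i] = (diff > TP_MAX_EXP_DIFF) ? 0.0 : exp(-diff);
        sum += qweight[i];
    }
    for (int i = 0; i < 3; i++)
        qweight[i] /= sum;                 // barycentric coordinates in the triangle
}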
+
/**
Phylogenetic Tree class
@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
*/
-class PhyloTree : public MTree, public Optimization {
+class PhyloTree : public MTree, public Optimization, public CheckpointFactory {
friend class PhyloSuperTree;
friend class PhyloSuperTreePlen;
@@ -277,6 +335,17 @@ public:
*/
virtual ~PhyloTree();
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
read the tree from the input file in newick format
@param infile the input file file.
@@ -420,6 +489,9 @@ public:
typedef BootValType (PhyloTree::*DotProductType)(BootValType *x, BootValType *y, int size);
DotProductType dotProduct;
+ typedef double (PhyloTree::*DotProductDoubleType)(double *x, double *y, int size);
+ DotProductDoubleType dotProductDouble;
+
#if defined(BINARY32) || defined(__NOAVX__)
void setDotProductAVX() {}
#else
@@ -616,13 +688,15 @@ public:
//template <const int nstates>
void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
-
+
//template <const int nstates>
void computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
//template <const int nstates>
void computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+ void computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
template <class VectorClass, const int VCSIZE, const int nstates>
void computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
@@ -632,6 +706,9 @@ public:
template <class VectorClass, const int VCSIZE, const int nstates>
void computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+ template <class VectorClass, const int VCSIZE, const int nstates>
+ void computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
/****************************************************************************
computing likelihood on a branch
****************************************************************************/
@@ -669,6 +746,8 @@ public:
//template <const int nstates>
double computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+ double computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
template <class VectorClass, const int VCSIZE, const int nstates>
double computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
@@ -678,6 +757,9 @@ public:
template <class VectorClass, const int VCSIZE, const int nstates>
double computeMixtureLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+ template <class VectorClass, const int VCSIZE, const int nstates>
+ double computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
double computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
/****************************************************************************
@@ -704,6 +786,11 @@ public:
template <class VectorClass, const int VCSIZE, const int nstates>
double computeMixtureLikelihoodFromBufferEigenSIMD();
+ template <class VectorClass, const int VCSIZE, const int nstates>
+ double computeSitemodelLikelihoodFromBufferEigenSIMD();
+
+ double computeSitemodelLikelihoodFromBufferEigen();
+
/**
compute tree likelihood when a branch length collapses to zero
@param dad_branch the branch leading to the subtree
@@ -740,6 +827,13 @@ public:
virtual double computePatternLhCat(SiteLoglType wsl);
/**
+ compute state frequency for each pattern (for Huaichun)
+ @param[out] ptn_state_freq state frequency vector per pattern,
+ should be pre-allocated with size of num_patterns * num_states
+ */
+ void computePatternStateFreq(double *ptn_state_freq);
+
+ /**
compute pattern likelihoods only if the accumulated scaling factor is non-zero.
Otherwise, copy the pattern_lh attribute
@param pattern_lh (OUT) pattern log-likelihoods,
@@ -830,20 +924,29 @@ public:
void rollBack(istream &best_tree_string);
/**
- Read the tree saved with Taxon Names and branch lengths.
+ refactored 2015-12-22: Taxon IDs instead of Taxon names to save space!
+ Read the tree saved with Taxon IDs and branch lengths.
@param tree_string tree string to read from
@param updatePLL if true, tree is read into PLL
*/
virtual void readTreeString(const string &tree_string);
/**
+ Read the tree saved with Taxon names and branch lengths.
+ @param tree_string tree string to read from
+ @param updatePLL if true, tree is read into PLL
+ */
+ virtual void readTreeStringSeqName(const string &tree_string);
+
+ /**
Read the tree saved with Taxon Names and branch lengths.
@param tree_string tree string to read from
*/
void readTreeFile(const string &file_name);
/**
- * Return the tree string contining taxon names and branch lengths
+ refactored 2015-12-22: Taxon IDs instead of Taxon names to save space!
+ * Return the tree string containing taxon IDs and branch lengths
* @return
*/
virtual string getTreeString();
@@ -892,6 +995,8 @@ public:
//template <const int nstates>
void computeMixrateLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+ void computeSitemodelLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
template <class VectorClass, const int VCSIZE, const int nstates>
void computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
@@ -901,6 +1006,9 @@ public:
template <class VectorClass, const int VCSIZE, const int nstates>
void computeMixtureLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+ template <class VectorClass, const int VCSIZE, const int nstates>
+ void computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
/**
compute tree likelihood and derivatives on a branch. used to optimize branch length
@param dad_branch the branch leading to the subtree
@@ -1034,7 +1142,7 @@ public:
/**
inherited from Optimization class, to return to likelihood of the tree
- when the current branch length is set to value
+ when the current branch length is set to value
@param value current branch length
@return negative of likelihood (for minimization)
*/
@@ -1362,6 +1470,40 @@ public:
PhyloNode *node = NULL, PhyloNode *dad = NULL);
/****************************************************************************
+ Quartet functions
+ ****************************************************************************/
+
+ QuartetGroups LMGroups;
+ /**
+ * for doLikelihoodMapping reportLikelihoodMapping: likelihood mapping information by region
+ */
+ vector<QuartetInfo> lmap_quartet_info;
+ int areacount[8];
+ int cornercount[4];
+ // int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ // int cornercount[4] = {0, 0, 0, 0};
+
+ /**
+ * for doLikelihoodMapping, reportLikelihoodMapping: likelihood mapping information by sequence
+ */
+ vector<SeqQuartetInfo> lmap_seq_quartet_info;
+
+ /** generate a bunch of quartets and compute likelihood for 3 quartet trees for each replicate
+ @param lmap_num_quartets number of quartets
+ @param lmap_quartet_info (OUT) vector of quartet information
+ */
+ void computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info, QuartetGroups &LMGroups);
+
+ /** main function that performs likelihood mapping analysis (Strimmer & von Haeseler 1997) */
+ void doLikelihoodMapping();
+
+ /** output results of likelihood mapping analysis */
+ void reportLikelihoodMapping(ofstream &out);
+
+ /** read clusters for likelihood mapping analysis */
+ void readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGroups);
+
+ /****************************************************************************
Collapse stable (highly supported) clades by one representative
****************************************************************************/
diff --git a/phylotreeavx.cpp b/phylotreeavx.cpp
index 697e370..b4d0245 100644
--- a/phylotreeavx.cpp
+++ b/phylotreeavx.cpp
@@ -9,6 +9,7 @@
#include "phylokernel.h"
#include "phylokernelmixture.h"
#include "phylokernelmixrate.h"
+#include "phylokernelsitemodel.h"
#include "vectorclass/vectorclass.h"
#ifndef __AVX__
@@ -27,10 +28,35 @@ void PhyloTree::setDotProductAVX() {
dotProduct = &PhyloTree::dotProductSIMD<double, Vec4d, 4>;
#endif
+ dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec4d, 4>;
}
void PhyloTree::setLikelihoodKernelAVX() {
setParsimonyKernelAVX();
+ if (model_factory && model_factory->model->isSiteSpecificModel()) {
+ switch (aln->num_states) {
+ case 4:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
+ break;
+ case 20:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
+ break;
+ default:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
+ break;
+ }
+ return;
+ }
+
switch(aln->num_states) {
case 4:
if (model_factory && model_factory->model->isMixture()) {
diff --git a/phylotreesse.cpp b/phylotreesse.cpp
index 885878d..3bc4054 100644
--- a/phylotreesse.cpp
+++ b/phylotreesse.cpp
@@ -21,7 +21,9 @@
#include "phylokernel.h"
#include "phylokernelmixture.h"
#include "phylokernelmixrate.h"
+#include "phylokernelsitemodel.h"
#include "model/modelgtr.h"
+#include "model/modelset.h"
/* BQM: to ignore all-gapp subtree at an alignment site */
@@ -66,6 +68,7 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
#else
dotProduct = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
#endif
+ dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
}
sse = lk;
if (!aln || lk == LK_NORMAL) {
@@ -77,6 +80,43 @@ void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
return;
}
+ if (model_factory && model_factory->model->isSiteSpecificModel()) {
+ if (sse == LK_EIGEN) {
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
+ return;
+ }
+ // LK_EIGEN_SSE
+ if (instruction_set >= 7) {
+ // CPU supports AVX
+ setLikelihoodKernelAVX();
+ return;
+ }
+ switch (aln->num_states) {
+ case 4:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
+ break;
+ case 20:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
+ break;
+ default:
+ computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
+ computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
+ computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
+ computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
+ break;
+ }
+ return;
+ }
+
if (sse == LK_EIGEN) {
if (model_factory && model_factory->model->isMixture()) {
if (model_factory->fused_mix_rate) {
@@ -320,6 +360,102 @@ void PhyloTree::computeTipPartialLikelihood() {
if (tip_partial_lh_computed)
return;
tip_partial_lh_computed = true;
+
+
+ //-------------------------------------------------------
+ // initialize ptn_freq and ptn_invar
+ //-------------------------------------------------------
+
+ computePtnFreq();
+ // for +I model
+ computePtnInvar();
+
+ if (getModel()->isSiteSpecificModel()) {
+ ModelSet *models = (ModelSet*)model;
+ size_t nptn = aln->getNPattern(), max_nptn = get_safe_upper_limit(nptn), tip_block_size = max_nptn * aln->num_states;
+ int nstates = aln->num_states;
+ int nseq = aln->getNSeq();
+#ifdef _OPENMP
+ #pragma omp parallel for schedule(static)
+#endif
+ for (int nodeid = 0; nodeid < nseq; nodeid++) {
+ int i, x;
+ double *partial_lh = tip_partial_lh + tip_block_size*nodeid;
+ size_t ptn;
+ for (ptn = 0; ptn < nptn; ptn++, partial_lh += nstates) {
+ int state = aln->at(ptn)[nodeid];
+// double *partial_lh = node_partial_lh + ptn*nstates;
+ double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+ if (state < nstates) {
+ for (i = 0; i < nstates; i++)
+ partial_lh[i] = inv_evec[i*nstates+state];
+ } else if (state == aln->STATE_UNKNOWN) {
+ // special treatment for unknown char
+ for (i = 0; i < nstates; i++) {
+ double lh_unknown = 0.0;
+ double *this_inv_evec = inv_evec + i*nstates;
+ for (x = 0; x < nstates; x++)
+ lh_unknown += this_inv_evec[x];
+ partial_lh[i] = lh_unknown;
+ }
+ } else {
+ double lh_ambiguous;
+ // ambiguous characters
+ int ambi_aa[] = {
+ 4+8, // B = N or D
+ 32+64, // Z = Q or E
+ 512+1024 // U = I or L
+ };
+ switch (aln->seq_type) {
+ case SEQ_DNA:
+ {
+ int cstate = state-nstates+1;
+ for (i = 0; i < nstates; i++) {
+ lh_ambiguous = 0.0;
+ for (x = 0; x < nstates; x++)
+ if ((cstate) & (1 << x))
+ lh_ambiguous += inv_evec[i*nstates+x];
+ partial_lh[i] = lh_ambiguous;
+ }
+ }
+ break;
+ case SEQ_PROTEIN:
+ //map[(unsigned char)'B'] = 4+8+19; // N or D
+ //map[(unsigned char)'Z'] = 32+64+19; // Q or E
+ {
+ int cstate = state-nstates;
+ for (i = 0; i < nstates; i++) {
+ lh_ambiguous = 0.0;
+ for (x = 0; x < 11; x++)
+ if (ambi_aa[cstate] & (1 << x))
+ lh_ambiguous += inv_evec[i*nstates+x];
+ partial_lh[i] = lh_ambiguous;
+ }
+ }
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ }
+ // sanity check
+// bool all_zero = true;
+// for (i = 0; i < nstates; i++)
+// if (partial_lh[i] != 0) {
+// all_zero = false;
+// break;
+// }
+// assert(!all_zero && "some tip_partial_lh are all zeros");
+
+ }
+ // dummy values
+ for (ptn = nptn; ptn < max_nptn; ptn++, partial_lh += nstates)
+ memcpy(partial_lh, partial_lh-nstates, nstates*sizeof(double));
+ }
+ return;
+ }
+
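In the site-specific branch above, a tip state >= nstates encodes an ambiguity set: for DNA the bitmask is state - nstates + 1, and the tip partial likelihood of such a symbol is the sum of the inverse-eigenvector entries over the compatible states. A small illustration of that lookup (assuming the usual one-bit-per-nucleotide encoding; name illustrative):

// Sketch: tip partial likelihood of an ambiguous symbol given one row of inv_evec.
double sketchAmbiguousTipLh(const double *inv_evec_row, int nstates, int bitmask) {
    double lh = 0.0;
    for (int x = 0; x < nstates; x++)
        if (bitmask & (1 << x))
            lh += inv_evec_row[x];         // sum over states compatible with the symbol
    return lh;
}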
int m, i, x, state, nstates = aln->num_states, nmixtures = model->getNMixtures();
double *all_inv_evec = model->getInverseEigenvectors();
assert(all_inv_evec);
@@ -390,14 +526,6 @@ void PhyloTree::computeTipPartialLikelihood() {
break;
}
-
- //-------------------------------------------------------
- // initialize ptn_freq and ptn_invar
- //-------------------------------------------------------
-
- computePtnFreq();
- // for +I model
- computePtnInvar();
}
void PhyloTree::computePtnFreq() {
@@ -451,14 +579,17 @@ void PhyloTree::computePtnInvar() {
//template <const int nstates>
void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+
// don't recompute the likelihood
assert(dad);
if (dad_branch->partial_lh_computed & 1)
return;
dad_branch->partial_lh_computed |= 1;
+ PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+
size_t nstates = aln->num_states;
size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
- PhyloNode *node = (PhyloNode*)(dad_branch->node);
if (node->isLeaf()) {
dad_branch->lh_scale_factor = 0.0;
@@ -483,22 +614,15 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
dad_branch->lh_scale_factor = 0.0;
// internal node
- assert(node->degree() == 3); // it works only for strictly bifurcating tree
PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *nei = (PhyloNeighbor*)*it;
if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+ if ((nei->partial_lh_computed & 1) == 0)
+ computePartialLikelihood(nei, node);
+ dad_branch->lh_scale_factor += nei->lh_scale_factor;
}
- if (!left->node->isLeaf() && right->node->isLeaf()) {
- PhyloNeighbor *tmp = left;
- left = right;
- right = tmp;
- }
- if ((left->partial_lh_computed & 1) == 0)
- computePartialLikelihoodEigen(left, node);
- if ((right->partial_lh_computed & 1) == 0)
- computePartialLikelihoodEigen(right, node);
-
if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
// re-orient partial_lh
bool done = false;
@@ -517,71 +641,179 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
assert(done && "partial_lh is not re-oriented");
}
-
-
- dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
- double *eleft = new double[block*nstates], *eright = new double[block*nstates];
+ // precompute buffer to save times
+ double *echildren = new double[block*nstates*(node->degree()-1)];
+ double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)];
+ double *echild = echildren;
+ double *partial_lh_leaf = partial_lh_leaves;
+
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ double expchild[nstates];
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ // precompute information buffer
+ for (c = 0; c < ncat; c++) {
+ double len_child = site_rate->getRate(c) * child->length;
+ for (i = 0; i < nstates; i++) {
+ expchild[i] = exp(eval[i]*len_child);
+ }
+ for (x = 0; x < nstates; x++)
+ for (i = 0; i < nstates; i++) {
+ echild[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expchild[i];
+ }
+ }
- // precompute information buffer
- for (c = 0; c < ncat; c++) {
- double *expleft = new double[nstates];
- double *expright = new double[nstates];
- double len_left = site_rate->getRate(c) * left->length;
- double len_right = site_rate->getRate(c) * right->length;
- for (i = 0; i < nstates; i++) {
- expleft[i] = exp(eval[i]*len_left);
- expright[i] = exp(eval[i]*len_right);
- }
- for (x = 0; x < nstates; x++)
- for (i = 0; i < nstates; i++) {
- eleft[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expleft[i];
- eright[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expright[i];
- }
- delete [] expright;
- delete [] expleft;
+ // pre compute information for tip
+ if (child->node->isLeaf()) {
+ vector<int>::iterator it;
+ for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+ int state = (*it);
+ for (x = 0; x < block; x++) {
+ double vchild = 0.0;
+ for (i = 0; i < nstates; i++) {
+ vchild += echild[x*nstates+i] * tip_partial_lh[state*nstates+i];
+ }
+ partial_lh_leaf[state*block+x] = vchild;
+ }
+ }
+ for (x = 0; x < block; x++) {
+ size_t addr = aln->STATE_UNKNOWN * block;
+ partial_lh_leaf[addr+x] = 1.0;
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ }
+ echild += block*nstates;
+ }
+
+
+ double sum_scale = 0.0;
+
+
+ double *eleft = echildren, *eright = echildren + block*nstates;
+
+ if (!left->node->isLeaf() && right->node->isLeaf()) {
+ PhyloNeighbor *tmp = left;
+ left = right;
+ right = tmp;
+ double *etmp = eleft;
+ eleft = eright;
+ eright = etmp;
}
+
+ if (node->degree() > 3) {
- if (left->node->isLeaf() && right->node->isLeaf()) {
- // special treatment for TIP-TIP (cherry) case
-
- // pre compute information for both tips
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
- double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
-
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (x = 0; x < block; x++) {
- double vleft = 0.0;
- for (i = 0; i < nstates; i++) {
- vleft += eleft[x*nstates+i] * tip_partial_lh[state*nstates+i];
- }
- partial_lh_left[state*block+x] = vleft;
- }
- }
+ /*--------------------- multifurcating node ------------------*/
+
+ // now for-loop computing partial_lh over all site-patterns
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+ for (ptn = 0; ptn < nptn; ptn++) {
+ double partial_lh_all[block];
+ for (i = 0; i < block; i++)
+ partial_lh_all[i] = 1.0;
+ dad_branch->scale_num[ptn] = 0;
+
+ double *partial_lh_leaf = partial_lh_leaves;
+ double *echild = echildren;
+
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ PhyloNeighbor *child = (PhyloNeighbor*)*it;
+ if (child->node->isLeaf()) {
+ // external node
+ int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+ double *child_lh = partial_lh_leaf + state_child*block;
+ for (c = 0; c < block; c++) {
+ // compute real partial likelihood vector
+ partial_lh_all[c] *= child_lh[c];
+ }
+ partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+ } else {
+ // internal node
+ double *partial_lh = partial_lh_all;
+ double *partial_lh_child = child->partial_lh + ptn*block;
+ dad_branch->scale_num[ptn] += child->scale_num[ptn];
+
+ double *echild_ptr = echild;
+ for (c = 0; c < ncat; c++) {
+ // compute real partial likelihood vector
+ for (x = 0; x < nstates; x++) {
+ double vchild = 0.0;
+// double *echild_ptr = echild + (c*nstatesqr+x*nstates);
+ for (i = 0; i < nstates; i++) {
+ vchild += echild_ptr[i] * partial_lh_child[i];
+ }
+ echild_ptr += nstates;
+ partial_lh[x] *= vchild;
+ }
+ partial_lh += nstates;
+ partial_lh_child += nstates;
+ }
+ } // if
+ echild += block*nstates;
+ } // FOR_NEIGHBOR
+
+
+ // compute dot-product with inv_eigenvector
+ double lh_max = 0.0;
+ double *partial_lh_tmp = partial_lh_all;
+ double *partial_lh = dad_branch->partial_lh + ptn*block;
+ for (c = 0; c < ncat; c++) {
+ double *inv_evec_ptr = inv_evec;
+ for (i = 0; i < nstates; i++) {
+ double res = 0.0;
+ for (x = 0; x < nstates; x++) {
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
+ }
+ inv_evec_ptr += nstates;
+ partial_lh[i] = res;
+ lh_max = max(lh_max, fabs(res));
+ }
+ partial_lh += nstates;
+ partial_lh_tmp += nstates;
+ }
+ // check if one should scale partial likelihoods
+ if (lh_max < SCALING_THRESHOLD) {
+ partial_lh = dad_branch->partial_lh + ptn*block;
+ if (lh_max == 0.0) {
+ // for degenerate data where all partial likelihoods underflow to zero
+ for (c = 0; c < ncat; c++)
+ memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
+ sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 4;
+ int nsite = aln->getNSite();
+ for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+ if (aln->getPatternID(i) == ptn) {
+ outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+ x++;
+ }
+ } else {
+ // now do the likelihood scaling
+ for (i = 0; i < block; i++) {
+ partial_lh[i] *= SCALING_THRESHOLD_INVER;
+ //partial_lh[i] /= lh_max;
+ }
+ // unobserved const pattern will never have underflow
+ sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+ //sum_scale += log(lh_max) * ptn_freq[ptn];
+ dad_branch->scale_num[ptn] += 1;
+ }
+ }
- for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
- int state = (*it);
- for (x = 0; x < block; x++) {
- double vright = 0.0;
- for (i = 0; i < nstates; i++) {
- vright += eright[x*nstates+i] * tip_partial_lh[state*nstates+i];
- }
- partial_lh_right[state*block+x] = vright;
- }
- }
+ } // for ptn
+ dad_branch->lh_scale_factor += sum_scale;
+
+ // end multifurcating treatment
+ } else if (left->node->isLeaf() && right->node->isLeaf()) {
- for (x = 0; x < block; x++) {
- size_t addr = aln->STATE_UNKNOWN * block;
- partial_lh_left[addr+x] = 1.0;
- partial_lh_right[addr+x] = 1.0;
- }
+ /*--------------------- TIP-TIP (cherry) case ------------------*/
+ double *partial_lh_left = partial_lh_leaves;
+ double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
// scale number must be ZERO
memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
#ifdef _OPENMP
-//#pragma omp parallel for private(ptn, c, x, i, partial_lh_tmp)
#pragma omp parallel for private(ptn, c, x, i) schedule(static)
#endif
for (ptn = 0; ptn < nptn; ptn++) {
@@ -598,45 +830,28 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
}
// compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
for (i = 0; i < nstates; i++) {
double res = 0.0;
for (x = 0; x < nstates; x++) {
- res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
}
+ inv_evec_ptr += nstates;
partial_lh[c*nstates+i] = res;
}
}
}
- delete [] partial_lh_right;
- delete [] partial_lh_left;
} else if (left->node->isLeaf() && !right->node->isLeaf()) {
- // special treatment to TIP-INTERNAL NODE case
+
+ /*--------------------- TIP-INTERNAL NODE case ------------------*/
+
// only take scale_num from the right subtree
memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
- // pre compute information for left tip
- double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
-
- vector<int>::iterator it;
- for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
- int state = (*it);
- for (x = 0; x < block; x++) {
- double vleft = 0.0;
- for (i = 0; i < nstates; i++) {
- vleft += eleft[x*nstates+i] * tip_partial_lh[state*nstates+i];
- }
- partial_lh_left[state*block+x] = vleft;
- }
- }
- for (x = 0; x < block; x++) {
- size_t addr = aln->STATE_UNKNOWN * block;
- partial_lh_left[addr+x] = 1.0;
- }
+ double *partial_lh_left = partial_lh_leaves;
- double sum_scale = 0.0;
#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
#endif
for (ptn = 0; ptn < nptn; ptn++) {
@@ -644,25 +859,33 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
double *partial_lh = dad_branch->partial_lh + ptn*block;
double *partial_lh_right = right->partial_lh + ptn*block;
int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+ double *vleft = partial_lh_left + state_left*block;
double lh_max = 0.0;
+ double *eright_ptr = eright;
for (c = 0; c < ncat; c++) {
// compute real partial likelihood vector
for (x = 0; x < nstates; x++) {
- double vleft = 0.0, vright = 0.0;
- size_t addr = c*nstatesqr+x*nstates;
- vleft = partial_lh_left[state_left*block+c*nstates+x];
+ double vright = 0.0;
+// size_t addr = c*nstatesqr+x*nstates;
+// vleft = partial_lh_left[state_left*block+c*nstates+x];
for (i = 0; i < nstates; i++) {
- vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+ vright += eright_ptr[i] * partial_lh_right[i];
}
- partial_lh_tmp[x] = vleft * (vright);
+ eright_ptr += nstates;
+ partial_lh_tmp[x] = vleft[x] * (vright);
}
+ vleft += nstates;
+ partial_lh_right += nstates;
+
// compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
for (i = 0; i < nstates; i++) {
double res = 0.0;
for (x = 0; x < nstates; x++) {
- res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
}
+ inv_evec_ptr += nstates;
partial_lh[c*nstates+i] = res;
lh_max = max(fabs(res), lh_max);
}
@@ -698,14 +921,13 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
}
dad_branch->lh_scale_factor += sum_scale;
- delete [] partial_lh_left;
+// delete [] partial_lh_left;
} else {
- // both left and right are internal node
- double sum_scale = 0.0;
+ /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
+
#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
#endif
for (ptn = 0; ptn < nptn; ptn++) {
@@ -716,24 +938,34 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
double lh_max = 0.0;
dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+ double *eleft_ptr = eleft;
+ double *eright_ptr = eright;
+
for (c = 0; c < ncat; c++) {
// compute real partial likelihood vector
for (x = 0; x < nstates; x++) {
double vleft = 0.0, vright = 0.0;
- size_t addr = c*nstatesqr+x*nstates;
+// size_t addr = c*nstatesqr+x*nstates;
for (i = 0; i < nstates; i++) {
- vleft += eleft[addr+i] * partial_lh_left[c*nstates+i];
- vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+ vleft += eleft_ptr[i] * partial_lh_left[i];
+ vright += eright_ptr[i] * partial_lh_right[i];
}
+ eleft_ptr += nstates;
+ eright_ptr += nstates;
partial_lh_tmp[x] = vleft*vright;
// assert(partial_lh_tmp[x] != 0.0);
}
+ partial_lh_left += nstates;
+ partial_lh_right += nstates;
+
// compute dot-product with inv_eigenvector
+ double *inv_evec_ptr = inv_evec;
for (i = 0; i < nstates; i++) {
double res = 0.0;
for (x = 0; x < nstates; x++) {
- res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+ res += partial_lh_tmp[x]*inv_evec_ptr[x];
}
+ inv_evec_ptr += nstates;
partial_lh[c*nstates+i] = res;
lh_max = max(lh_max, fabs(res));
}
@@ -772,8 +1004,8 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
}
- delete [] eright;
- delete [] eleft;
+ delete [] partial_lh_leaves;
+ delete [] echildren;
}
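The refactored computePartialLikelihoodEigen() keeps the same underflow protection in every case (multifurcating, tip-tip, tip-internal, internal-internal): when the largest absolute entry of a pattern's block falls below SCALING_THRESHOLD, the block is multiplied by SCALING_THRESHOLD_INVER, the per-pattern scale counter is incremented, and LOG_SCALING_THRESHOLD * ptn_freq is accumulated so the final log-likelihood can be corrected. A minimal sketch of that step with illustrative names (not the kernel code):

// Sketch only: per-pattern underflow rescaling as used by the likelihood kernels above.
void sketchRescalePattern(double *block, int size, unsigned char &scale_num,
                          double &sum_scale, double ptn_freq) {
    double lh_max = 0.0;
    for (int i = 0; i < size; i++)
        if (fabs(block[i]) > lh_max) lh_max = fabs(block[i]);
    if (lh_max != 0.0 && lh_max < SCALING_THRESHOLD) {
        for (int i = 0; i < size; i++)
            block[i] *= SCALING_THRESHOLD_INVER;        // bring the block back into range
        scale_num += 1;                                 // remember the rescaling ...
        sum_scale += LOG_SCALING_THRESHOLD * ptn_freq;  // ... and its log-likelihood correction
    }
}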
//template <const int nstates>
@@ -813,7 +1045,7 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
if (dad->isLeaf()) {
// special treatment for TIP-INTERNAL NODE case
#ifdef _OPENMP
-#pragma omp parallel for private(ptn, i) schedule(static)
+#pragma omp parallel for private(ptn, i, c) schedule(static)
#endif
for (ptn = 0; ptn < nptn; ptn++) {
double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
diff --git a/quartet.cpp b/quartet.cpp
new file mode 100644
index 0000000..2fe668d
--- /dev/null
+++ b/quartet.cpp
@@ -0,0 +1,1756 @@
+//
+// quartet.cpp
+// iqtree
+//
+// Created by Minh Bui on 24/07/15.
+//
+//
+
+#include <stdio.h>
+#include <string.h>
+
+#include "phylotree.h"
+#include "phylosupertree.h"
+#include "model/partitionmodel.h"
+#include "alignment.h"
+#if 0 // (HAS-bla)
+#include "tools.h"
+#endif
+#include "ncl/ncl.h"
+#include "msetsblock.h"
+#include "myreader.h"
+// #include "lmap.c"
+
+#if 0 /*** moved to phylotree.h ***/
+/* Index definition for counter array needed in likelihood mapping analysis (HAS) */
+#define LM_REG1 0 /* top corner */
+#define LM_REG2 1 /* bottom-right corner */
+#define LM_REG3 2 /* bottom-left corner */
+#define LM_REG4 3 /* right rectangle */
+#define LM_REG5 4 /* bottom rectangle */
+#define LM_REG6 5 /* left rectangle */
+#define LM_REG7 6 /* center */
+#define LM_AR1 7 /* top third */
+#define LM_AR2 8 /* bottom-right third */
+#define LM_AR3 9 /* bottom-left third */
+#define LM_MAX 10
+#endif
+
+//*** likelihood mapping stuff (imported from TREE-PUZZLE's lmap.c) (HAS)
+
+// #include <time.h>
+
+/**********************************************************/
+/* Likelihood mapping routines (TODO: move to lmap.c/h) */
+/**********************************************************/
+
+/*
+ (a,b)-(c,d) => numclust == 4
+ (a,b)-(c,c) => numclust == 3
+ (a,a)-(b,b) => numclust == 2
+
+ 1l/\1r
+ / \
+ / 1 \
+ 6u / \ / \ 4u
+ / \/ \
+ 6d / /\ \ 4d
+ / 6 / \ 4 \
+ /\ / 7 \ /\
+ 3u/ \ /______\ / \2u
+ / 3 | 5 | 2 \
+ /_____|________|_____\
+ 3d 5l 5r 2d
+ (a,d)-(b,c) (a,c)-(b,d) => numclust == 4
+ (a,c)-(b,c) (a,c)-(b,c) => numclust == 3
+ (a,b)-(a,b) (a,b)-(a,b) => numclust == 2
+
+*/
+
+/***********************************
+* Likelihood mapping to SVG file *
+***********************************/
+
+/* first lines of SVG likelihood mapping file */
+void initsvg(FILE *ofp, QuartetGroups &LMGroups)
+{
+ /* SVG preamble */
+ fprintf(ofp,"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n");
+ fprintf(ofp,"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n");
+ fprintf(ofp,"<svg\n");
+ fprintf(ofp," xmlns:svg=\"http://www.w3.org/2000/svg\"\n");
+ fprintf(ofp," xmlns=\"http://www.w3.org/2000/svg\"\n");
+ fprintf(ofp," xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n");
+ fprintf(ofp," version=\"1.1\"\n");
+ fprintf(ofp," baseProfile=\"full\"\n");
+ fprintf(ofp," id=\"body\"\n");
+ fprintf(ofp," width=\"800px\"\n");
+ fprintf(ofp," height=\"800px\"\n");
+ fprintf(ofp," viewBox=\"0 0 1000 1000\"\n");
+ fprintf(ofp," preserveAspectRatio=\"none\">\n");
+ fprintf(ofp," <defs>\n");
+ fprintf(ofp," <style type=\"text/css\"><![CDATA[\n");
+ fprintf(ofp," circle{ stroke: none; }\n");
+ fprintf(ofp," polygon{ stroke: black; stroke-width: 2px; fill: none; }\n");
+ fprintf(ofp," line{ stroke: black; stroke-width: 2px; }\n");
+ fprintf(ofp," text{ font-size:50px; }\n");
+ fprintf(ofp," ]]></style>\n");
+ fprintf(ofp," </defs>\n");
+ fprintf(ofp," <title\n");
+ fprintf(ofp," id=\"title1\">SVG drawing</title>\n");
+ /* end SVG preamble */
+
+ /* triangle 1 (top) */
+ fprintf(ofp,"<g transform=\"scale(0.45)\"><g transform=\"translate(600,1050)\">\n");
+ fprintf(ofp," <g id=\"fig1\">\n");
+ fprintf(ofp," <polygon points=\"0.0,-0.0 1000.0,-0.0 500,-866.0254038\" />\n");
+
+
+#if LMAP_CLUSTER
+#endif /* LMAP_CLUSTER */
+ if (LMGroups.numGroups == 2) { /* two cluster analysis */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500.0\"\n");
+ fprintf(ofp," y=\"-896.0254038\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_top_1\">(a,a)-(b,b)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_top_1\">(%s,%s)-(%s,%s)</text> <!-- (a,a|b,b) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[0]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[1]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"-30.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_left_1\">(a,b)-(a,b)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_left_1\">(%s,%s)-(%s,%s)</text> <!-- (a,b|a,b) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"1030.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_right_1\">(a,b)-(a,b)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_right_1\">(%s,%s)-(%s,%s)</text> <!-- (a,b|a,b) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str());
+ }
+ if (LMGroups.numGroups == 3) { /* three cluster analysis */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500.0\"\n");
+ fprintf(ofp," y=\"-896.0254038\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_top_1\">(a,b)-(c,c)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_top_1\">(%s,%s)-(%s,%s)</text> <!-- (a,b|c,c) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[2]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"-30.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_left_1\">(a,c)-(b,c)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_left_1\">(%s,%s)-(%s,%s)</text> <!-- (a,c|b,c) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"1030.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_right_1\">(a,c)-(b,c)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_right_1\">(%s,%s)-(%s,%s)</text> <!-- (a,c|b,c) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ }
+ if (LMGroups.numGroups == 4) { /* four cluster analysis */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500.0\"\n");
+ fprintf(ofp," y=\"-896.0254038\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_top_1\">(a,b)-(c,d)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_top_1\">(%s,%s)-(%s,%s)</text> <!-- (a,b|c,d) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[2]).c_str(),(LMGroups.Name[3]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"-30.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_left_1\">(a,d)-(b,c)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_left_1\">(%s,%s)-(%s,%s)</text> <!-- (a,d|b,c) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[3]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"1030.0\"\n");
+ fprintf(ofp," y=\"60.0\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ // fprintf(ofp," id=\"label_right_1\">(a,c)-(b,d)</text> <!-- CHANGE HERE IF NECESSARY -->\n");
+ fprintf(ofp," id=\"label_right_1\">(%s,%s)-(%s,%s)</text> <!-- (a,c|b,d) - CHANGE HERE IF NECESSARY -->\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[3]).c_str());
+ }
+
+} /* initsvg */
+
+
+
+
+void plotlmpointsvg(FILE *ofp, double w1, double w2)
+{
+ /* plot dots into triangle 1 (top) */
+ fprintf(ofp," <circle cx=\"%.10f\" cy=\"%.10f\" r=\"2\" />\n", (0.5*w1 + w2)*1000, -(w1*866.0254038));
+} /* plotlmpointsvg */
+
+
+
+// void finishsvg(FILE *ofp, unsigned long **countarr)
+void finishsvg(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, unsigned long Numquartets)
+{
+ fprintf(ofp," </g>\n");
+ /* end triangle 1 (top) */
+
+ /* triangle 2 (bottom left) */
+ fprintf(ofp," <g id=\"fig2\" transform=\"translate(-550.0,1000)\">\n");
+ fprintf(ofp," <polygon points=\"0.0,-0.0 1000.0,-0.0 500.0,-866.0254038\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line2-1\"\n");
+ fprintf(ofp," y2=\"-0.0\"\n");
+ fprintf(ofp," x2=\"500\"\n");
+ fprintf(ofp," y1=\"-288.6751346\"\n");
+ fprintf(ofp," x1=\"500\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line2-2\"\n");
+ fprintf(ofp," y2=\"-433.0127019\"\n");
+ fprintf(ofp," x2=\"250\"\n");
+ fprintf(ofp," y1=\"-288.6751346\"\n");
+ fprintf(ofp," x1=\"500\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line2-3\"\n");
+ fprintf(ofp," y2=\"-433.0127019\"\n");
+ fprintf(ofp," x2=\"750\"\n");
+ fprintf(ofp," y1=\"-288.6751346\"\n");
+ fprintf(ofp," x1=\"500\" />\n");
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"440\"\n");
+ fprintf(ofp," y=\"-500\"\n");
+ fprintf(ofp," id=\"up_2\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_AR1]*100.0/Numquartets);
+ // fprintf(ofp," id=\"up_2\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_AR1]*100.0/Numquartets);
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"250\"\n");
+ fprintf(ofp," y=\"-150\"\n");
+ fprintf(ofp," id=\"down_left_2\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_AR3]*100.0/Numquartets);
+ // fprintf(ofp," id=\"down_left_2\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_AR3]*100.0/Numquartets);
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"630\"\n");
+ fprintf(ofp," y=\"-150\"\n");
+ fprintf(ofp," id=\"down_right_2\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_AR2]*100.0/Numquartets);
+ // fprintf(ofp," id=\"down_right_2\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_AR2]*100.0/Numquartets);
+ fprintf(ofp," </g>\n");
+ /* end triangle 2 (bottom left) */
+
+ /* triangle 3 (bottom right) */
+ fprintf(ofp," <g id=\"fig3\" transform=\"translate(550,1000)\">\n");
+ fprintf(ofp," <polygon points=\"0.0,-0.0 1000.0,-0.0 500.0,-866.0254038\" />\n");
+ fprintf(ofp," <polygon id=\"triangle3b\" points=\"250,-144.3375673 750,-144.3375673 500,-577.3502692\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-1\"\n");
+ fprintf(ofp," x1=\"125\"\n");
+ fprintf(ofp," y1=\"-216.5063509\"\n");
+ fprintf(ofp," x2=\"250\"\n");
+ fprintf(ofp," y2=\"-144.3375673\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-2\"\n");
+ fprintf(ofp," x1=\"375\"\n");
+ fprintf(ofp," y1=\"-649.5190528\"\n");
+ fprintf(ofp," x2=\"500\"\n");
+ fprintf(ofp," y2=\"-577.3502692\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-3\"\n");
+ fprintf(ofp," x1=\"625\"\n");
+ fprintf(ofp," y1=\"-649.5190528\"\n");
+ fprintf(ofp," x2=\"500\"\n");
+ fprintf(ofp," y2=\"-577.3502692\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-4\"\n");
+ fprintf(ofp," x1=\"875\"\n");
+ fprintf(ofp," y1=\"-216.5063509\"\n");
+ fprintf(ofp," x2=\"750\"\n");
+ fprintf(ofp," y2=\"-144.3375673\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-5\"\n");
+ fprintf(ofp," x1=\"750\"\n");
+ fprintf(ofp," y1=\"-0.0\"\n");
+ fprintf(ofp," x2=\"750\"\n");
+ fprintf(ofp," y2=\"-144.3375673\" />\n");
+ fprintf(ofp," <line\n");
+ fprintf(ofp," id=\"line3-6\"\n");
+ fprintf(ofp," x1=\"250\"\n");
+ fprintf(ofp," y1=\"-0.0\"\n");
+ fprintf(ofp," x2=\"250\"\n");
+ fprintf(ofp," y2=\"-144.3375673\" />\n");
+
+ /* number of resolved quartets, top */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500\"\n");
+ fprintf(ofp," y=\"-660\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ fprintf(ofp," id=\"up_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG1]*100.0/Numquartets);
+ // fprintf(ofp," id=\"up_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG1]*100.0/Numquartets);
+
+ /* number of resolved quartets, bottom left */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," y=\"-50\"\n");
+ fprintf(ofp," x=\"70\"\n");
+ fprintf(ofp," id=\"down_left_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG3]*100.0/Numquartets);
+ // fprintf(ofp," id=\"down_left_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG3]*100.0/Numquartets);
+
+ /* number of resolved quartets, bottom right */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," y=\"-50\"\n");
+ fprintf(ofp," x=\"770\"\n");
+ fprintf(ofp," id=\"down_right_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG2]*100.0/Numquartets);
+ // fprintf(ofp," id=\"down_right_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG2]*100.0/Numquartets);
+
+ /* number of partly resolved quartets, bottom */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500\"\n");
+ fprintf(ofp," y=\"-50\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ fprintf(ofp," id=\"down_side_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG5]*100.0/Numquartets);
+ // fprintf(ofp," id=\"down_side_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG5]*100.0/Numquartets);
+
+ /* number of unresolved quartets, center */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"500\"\n");
+ fprintf(ofp," y=\"-280\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ fprintf(ofp," id=\"center_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG7]*100.0/Numquartets);
+ // fprintf(ofp," id=\"center_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG7]*100.0/Numquartets);
+
+ /* number of partly resolved quartets, top right */
+ /* fprintf(ofp,"<circle cx=\"685.0\" cy=\"-390.8439\" r=\"20\" />\n"); */ /* ro */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"685.0\"\n");
+ fprintf(ofp," y=\"-390.8439\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ fprintf(ofp," transform=\"rotate(60,665.0,-380.8439)\"\n");
+ fprintf(ofp," id=\"right_side_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG4]*100.0/Numquartets);
+ // fprintf(ofp," id=\"right_side_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG4]*100.0/Numquartets);
+
+ /* number of partly resolved quartets, top left */
+ /* fprintf(ofp,"<circle cx=\"315.0\" cy=\"-390.8439\" r=\"20\" />\n"); */ /* lo */
+ fprintf(ofp," <text\n");
+ fprintf(ofp," x=\"315.0\"\n");
+ fprintf(ofp," y=\"-390.8439\"\n");
+ fprintf(ofp," text-anchor=\"middle\"\n");
+ fprintf(ofp," transform=\"rotate(-60,335.0,-380.8439)\"\n");
+ fprintf(ofp," id=\"left_side_3\">%.1f%%</text>\n", (double)lmap_seq_quartet_info[leafNum].countarr[LM_REG6]*100.0/Numquartets);
+ // fprintf(ofp," id=\"left_side_3\">%.1f%%</text>\n", (double)countarr[Maxspc][LM_REG6]*100.0/Numquartets);
+
+ fprintf(ofp," </g>\n");
+ /* end triangle 3 (bottom right) */
+
+ fprintf(ofp,"</g></g>\n");
+ fprintf(ofp,"</svg>\n");
+} /* finishsvg */
+
+/* end - Likelihood mapping to SVG file */
+
+
+/***********************************
+* Likelihood mapping to EPS file *
+***********************************/
+
+/* first lines of EPSF likelihood mapping file */
+void initeps(FILE *ofp, QuartetGroups &LMGroups)
+{
+ time_t Starttime;
+ time(&Starttime);
+
+ fprintf(ofp, "%%!PS-Adobe-3.0 EPSF-3.0\n");
+ fprintf(ofp, "%%%%BoundingBox: 60 210 550 650\n");
+ fprintf(ofp, "%%%%Pages: 1\n");
+ fprintf(ofp, "%%%%Creator: IQ-TREE/TREE-PUZZLE\n");
+#if 0
+# ifndef ALPHA
+ fprintf(ofp, "%%%%Creator: %s (version %s)\n", PACKAGE, VERSION);
+# else
+ fprintf(ofp, "%%%%Creator: %s (version %s%s)\n", PACKAGE, VERSION, ALPHA);
+# endif
+#endif
+ fprintf(ofp, "%%%%Title: Likelihood Mapping Analysis\n");
+ fprintf(ofp, "%%%%CreationDate: %s", asctime(localtime(&Starttime)) );
+ fprintf(ofp, "%%%%DocumentFonts: Helvetica\n");
+ fprintf(ofp, "%%%%DocumentNeededFonts: Helvetica\n");
+ fprintf(ofp, "%%%%EndComments\n");
+ fprintf(ofp, "%% use inch as unit\n");
+ fprintf(ofp, "/inch {72 mul} def\n");
+ fprintf(ofp, "%% triangle side length (3 inch)\n");
+ fprintf(ofp, "/tl {3 inch mul} def\n");
+ fprintf(ofp, "%% plot one dot (x-y coordinates on stack)\n");
+ fprintf(ofp, "/dot {\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, "0.002 tl 0 360 arc %% radius is 0.002 of the triangle length\n");
+ fprintf(ofp, "closepath\n");
+ fprintf(ofp, "fill\n");
+ fprintf(ofp, "} def\n");
+
+ /* PS definition of a flush right print */
+ fprintf(ofp, "\n%% flush right show\n");
+	fprintf(ofp, "/flushshow {\n");
+ fprintf(ofp, " dup stringwidth pop %% get length of string\n");
+ fprintf(ofp, " neg 0 rmoveto %% move width to left\n");
+ fprintf(ofp, " show\n");
+ fprintf(ofp, "} def\n");
+ fprintf(ofp, "\n%% centered show\n");
+
+ /* PS definition of a centered print */
+ fprintf(ofp, "/centershow {\n");
+ fprintf(ofp, " dup stringwidth pop %% get length of string\n");
+	fprintf(ofp, "  -2 div              %% divide length by -2\n");
+ fprintf(ofp, " 0 rmoveto %% move half width to left\n");
+ fprintf(ofp, " show\n");
+ fprintf(ofp, "} def\n");
+
+
+ fprintf(ofp, "%% preamble\n");
+ fprintf(ofp, "/Helvetica findfont\n");
+ fprintf(ofp, "12 scalefont\n");
+ fprintf(ofp, "setfont\n");
+ fprintf(ofp, "%% 0/0 for triangle of triangles\n");
+ fprintf(ofp, "0.9 inch 3 inch translate\n");
+ fprintf(ofp, "%% first triangle (the one with dots)\n");
+ fprintf(ofp, "0.6 tl 1.2 tl 0.8660254038 mul translate\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.0 tl 0.0 tl moveto\n");
+ fprintf(ofp, " 1.0 tl 0.0 tl lineto\n");
+ fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n");
+ fprintf(ofp, "closepath\n");
+ fprintf(ofp, "stroke\n");
+
+#if LMAP_CLUSTER
+#endif /* LMAP_CLUSTER */
+ if (LMGroups.numGroups == 2) { /* two cluster analysis */
+ fprintf(ofp, "%% label corners\n");
+ fprintf(ofp, "0.5 tl 0.9 tl moveto\n"); /* old: 0.375 0.9 */
+ // fprintf(ofp, "((a,a)-(b,b)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,a|b,b) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[0]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[1]).c_str());
+ fprintf(ofp, "-0.045 tl -0.08 tl moveto\n"); /* old: -0.16 -0.08 */
+ // fprintf(ofp, "((a,b)-(a,b)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,b|a,b) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str());
+ fprintf(ofp, "1.045 tl -0.08 tl moveto\n"); /* old: -0.92 -0.08 */
+ // fprintf(ofp, "((a,b)-(a,b)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,b|a,b) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str());
+ }
+ if (LMGroups.numGroups == 3) { /* three cluster analysis */
+ fprintf(ofp, "%% label corners\n");
+ fprintf(ofp, "0.5 tl 0.9 tl moveto\n"); /* old: 0.375 0.9 */
+ // fprintf(ofp, "((a,b)-(c,c)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,b|c,c) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[2]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp, "-0.045 tl -0.08 tl moveto\n"); /* old: -0.16 -0.08 */
+ // fprintf(ofp, "((a,c)-(b,c)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,c|b,c) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp, "1.045 tl -0.08 tl moveto\n"); /* old: -0.92 -0.08 */
+ // fprintf(ofp, "((a,c)-(b,c)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,c|b,c) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ }
+ if (LMGroups.numGroups == 4) { /* four cluster analysis */
+ fprintf(ofp, "%% label corners\n");
+ fprintf(ofp, "0.5 tl 0.9 tl moveto\n"); /* old: 0.375 0.9 */
+ // fprintf(ofp, "((a,b)-(c,d)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,b|c,d) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[1]).c_str(),
+ (LMGroups.Name[2]).c_str(),(LMGroups.Name[3]).c_str());
+ fprintf(ofp, "-0.045 tl -0.08 tl moveto\n"); /* old: -0.16 -0.08 */
+ // fprintf(ofp, "((a,d)-(b,c)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,d|b,c) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[3]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[2]).c_str());
+ fprintf(ofp, "1.045 tl -0.08 tl moveto\n"); /* old: -0.92 -0.08 */
+ // fprintf(ofp, "((a,c)-(b,d)) centershow %% CHANGE HERE IF NECESSARY\n");
+ fprintf(ofp, "((%s,%s)-(%s,%s)) centershow %% (a,c|b,d) - CHANGE HERE IF NECESSARY\n",
+ (LMGroups.Name[0]).c_str(),(LMGroups.Name[2]).c_str(),
+ (LMGroups.Name[1]).c_str(),(LMGroups.Name[3]).c_str());
+ }
+
+} /* initeps */
+
+/* plot one point of likelihood mapping analysis (EPS) */
+void plotlmpointeps(FILE *epsofp, double w1, double w2)
+{
+ fprintf(epsofp,"%.10f tl %.10f tl dot\n", 0.5*w1 + w2, w1*0.8660254038);
+} /* plotlmpointeps */
+
+
+
+#if 0
+/* plot one point of likelihood mapping analysis */
+void plotlmpoint(FILE *epsofp, FILE *svgofp, double w1, double w2)
+{
+ if (lmapeps_optn) {
+ fprintf(epsofp,"%.10f tl %.10f tl dot\n",
+ 0.5*w1 + w2, w1*0.8660254038);
+ }
+ if (lmapsvg_optn) {
+ //fprintf(svgofp," <use x=\"%.10f\" y=\"%.10f\" xlink:href=\"#result\" />\n", (0.5*w1 + w2), -(w1*0.8660254038));
+ fprintf(svgofp," <circle cx=\"%.10f\" cy=\"%.10f\" r=\"2\" />\n",
+ (0.5*w1 + w2)*1000, -(w1*866.0254038));
+ }
+} /* plotlmpoint */
+#endif
+
+
+
+#if 0
+/* plot one point of likelihood mapping analysis */
+void plotlmpointcolor(FILE *epsofp, FILE *svgofp, double w1, double w2, int red, int green, int blue)
+{
+ if (lmapeps_optn) {
+ fprintf(epsofp,"currentrgbcolor %d %d %d setrgbcolor\n", red, green, blue);
+ fprintf(epsofp,"%.10f tl %.10f tl dot\n",
+ 0.5*w1 + w2, w1*0.8660254038);
+ fprintf(epsofp,"setrgbcolor\n");
+ }
+ if (lmapsvg_optn) {
+/*
+ stijn imbrechts:
+ Adding colour to elements is pretty easy, if you are familiar with
+ CSS, it works almost exactly the same.
+ The dots are represented by a <circle> element, if you want all of
+ them to be, for example, red, add this to the <style> area:
+ circle{ fill: red; stroke: red }
+
+ If you just want a certain group of dots coloured, you can group them
+ by adding a "class"-attribute like this:
+ <circle cx="500" cy="100" r="2" class="reddot" />
+ And add the following rule to the <style> area:
+ circle.reddot{ fill: red; stroke: red; }
+ Only the circles who belong to the "reddot" class will turn red
+
+ you can use rgb values as well: fill: rgb(255,0,0);
+*/
+ fprintf(svgofp," <circle cx=\"%.10f\" cy=\"%.10f\" r=\"2\" ",
+ (0.5*w1 + w2)*1000, -(w1*866.0254038));
+ fprintf(svgofp,"fill=\"rgb(%d%%, %d%%, %d%%)\" />\n", (int)(100*red), (int)(100*green), (int)(100*blue));
+ }
+} /* plotlmpointcolor */
+#endif
+
+
+
+
+/* last lines of EPSF likelihood mapping file */
+//void finisheps(FILE *ofp, unsigned long **countarr)
+void finisheps(FILE *ofp, vector<SeqQuartetInfo> lmap_seq_quartet_info, int leafNum, unsigned long Numquartets)
+{
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "%% second triangle (the one with 3 basins)\n");
+ fprintf(ofp, "/secondtriangle {\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.0 tl 0.0 tl moveto\n");
+ fprintf(ofp, " 1.0 tl 0.0 tl lineto\n");
+ fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n");
+ fprintf(ofp, "closepath\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
+ fprintf(ofp, " 0.50 tl 0.0000000000 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
+ fprintf(ofp, " 0.25 tl 0.4330127019 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
+ fprintf(ofp, " 0.75 tl 0.4330127019 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "0.44 tl 0.5 tl moveto %% up\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_AR1]*100.0/Numquartets);
+ fprintf(ofp, "0.25 tl 0.15 tl moveto %% down left\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_AR3]*100.0/Numquartets);
+ fprintf(ofp, "0.63 tl 0.15 tl moveto %% down right\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_AR2]*100.0/Numquartets);
+ fprintf(ofp, "} def\n");
+ fprintf(ofp, "%% third triangle (the one with 7 basins)\n");
+ fprintf(ofp, "/thirdtriangle {\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.0 tl 0.0 tl moveto\n");
+ fprintf(ofp, " 1.0 tl 0.0 tl lineto\n");
+ fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n");
+ fprintf(ofp, "closepath\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.25 tl 0.1443375673 tl moveto\n");
+ fprintf(ofp, " 0.75 tl 0.1443375673 tl lineto\n");
+ fprintf(ofp, " 0.50 tl 0.5773502692 tl lineto\n");
+ fprintf(ofp, "closepath\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.125 tl 0.2165063509 tl moveto\n");
+ fprintf(ofp, " 0.250 tl 0.1443375673 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.375 tl 0.6495190528 tl moveto\n");
+ fprintf(ofp, " 0.500 tl 0.5773502692 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.625 tl 0.6495190528 tl moveto\n");
+ fprintf(ofp, " 0.500 tl 0.5773502692 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.875 tl 0.2165063509 tl moveto\n");
+ fprintf(ofp, " 0.750 tl 0.1443375673 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.750 tl 0.00 tl moveto\n");
+ fprintf(ofp, " 0.750 tl 0.1443375673 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ fprintf(ofp, "newpath\n");
+ fprintf(ofp, " 0.250 tl 0.00 tl moveto\n");
+ fprintf(ofp, " 0.250 tl 0.1443375673 tl lineto\n");
+ fprintf(ofp, "stroke\n");
+ /* resolved quartets, top */
+ fprintf(ofp, "0.42 tl 0.66 tl moveto %% up\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG1]*100.0/Numquartets);
+ /* resolved quartets, bottom left */
+ fprintf(ofp, "0.07 tl 0.05 tl moveto %% down left\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG3]*100.0/Numquartets);
+ /* resolved quartets, bottom right */
+ fprintf(ofp, "0.77 tl 0.05 tl moveto %% down right\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG2]*100.0/Numquartets);
+ /* partly resolved quartets, bottom */
+ fprintf(ofp, "0.43 tl 0.05 tl moveto %% down side\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG5]*100.0/Numquartets);
+ /* unresolved quartets */
+ fprintf(ofp, "0.43 tl 0.28 tl moveto %% center\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG7]*100.0/Numquartets);
+ /* partly resolved quartets, top right */
+ fprintf(ofp, "gsave\n");
+ fprintf(ofp, "-60 rotate\n");
+ fprintf(ofp, "-0.07 tl 0.77 tl moveto %% right side\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG4]*100.0/Numquartets);
+ fprintf(ofp, "grestore\n");
+ /* partly resolved quartets, top left */
+ fprintf(ofp, "gsave\n");
+ fprintf(ofp, "60 rotate\n");
+ fprintf(ofp, "0.4 tl -0.09 tl moveto %% left side\n");
+ fprintf(ofp, "(%.1f%%) show\n", (double) lmap_seq_quartet_info[leafNum].countarr[LM_REG6]*100.0/Numquartets);
+ fprintf(ofp, "grestore\n");
+ fprintf(ofp, "} def\n");
+ fprintf(ofp, "%% print the other two triangles\n");
+ fprintf(ofp, "-0.6 tl -1.2 tl 0.8660254038 mul translate\n");
+ fprintf(ofp, "secondtriangle\n");
+ fprintf(ofp, "1.2 tl 0 translate\n");
+ fprintf(ofp, "thirdtriangle\n");
+ fprintf(ofp, "showpage\n");
+ fprintf(ofp, "%%%%EOF\n");
+} /* finisheps */
+
+/* Likelihood mapping to EPS file */
+
+
+/****************************************/
+/* end of Likelihood mapping routines */
+/****************************************/
+
+
+/***************************************************************/
+
+
+//*** end of likelihood mapping stuff (imported from TREE-PUZZLE's lmap.c) (HAS)
+
+
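+/*
+ * Compute quartet log-likelihoods for params->lmap_num_quartets randomly
+ * drawn quartets.  For each quartet a 4-taxon sub-alignment is extracted,
+ * the three possible unrooted topologies are evaluated with the already
+ * estimated substitution model and rate parameters (only branch lengths are
+ * re-optimized), and the log-likelihoods are turned into normalized weights
+ * that serve as barycentric coordinates in the likelihood mapping triangle.
+ */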
+void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info, QuartetGroups &LMGroups) {
+
+ if (leafNum < 4)
+ outError("Tree must have 4 or more taxa with unique sequences!");
+
+ lmap_quartet_info.resize(params->lmap_num_quartets);
+
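+    // qc encodes the three quartet topologies as index quadruples
+    // (0,1|2,3), (0,2|1,3) and (0,3|1,2); treebits assigns one bit per
+    // topology, so that bit sums (3, 5, 6, 7) later identify which
+    // topologies are (nearly) tied when a quartet is mapped to the 7 regions.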
+ int qc[] = {0, 1, 2, 3, 0, 2, 1, 3, 0, 3, 1, 2};
+
+ double onethird = 1.0/3.0;
+ unsigned char treebits[] = {1, 2, 4};
+
+ int sizeA, sizeB, sizeC, sizeD, numGroups;
+ int size3, size2, size1, size0;
+
+ // LMGroups.numGroups = 0;
+ if(LMGroups.numGroups == 0) { /* no grouping */
+ LMGroups.numGroups = 1;
+ LMGroups.GroupA.resize(leafNum);
+ for (int s = 0; s<leafNum; s++) LMGroups.GroupA[s] = s;
+ LMGroups.numGrpSeqs[0] = leafNum; /* cluster A */
+ LMGroups.numGrpSeqs[1] = 0; /* cluster B */
+ LMGroups.numGrpSeqs[2] = 0; /* cluster C */
+ LMGroups.numGrpSeqs[3] = 0; /* cluster D */
+ LMGroups.numGrpSeqs[4] = 0; /* excluded */
+ LMGroups.numQuartSeqs = leafNum; /* all sequences in analysis */
+ LMGroups.numSeqs = leafNum; /* all sequences in alignment */
+ }
+
+ numGroups = LMGroups.numGroups;
+ sizeA = LMGroups.numGrpSeqs[0]; /* cluster A */
+ sizeB = LMGroups.numGrpSeqs[1]; /* cluster B */
+ sizeC = LMGroups.numGrpSeqs[2]; /* cluster C */
+ sizeD = LMGroups.numGrpSeqs[3]; /* cluster D */
+
+ switch(LMGroups.numGroups){
+ case 1:
+ if(sizeA < 4)
+ outError("Likelihood Mapping requires 4 or more taxa with unique sequences!");
+ break;
+ case 2:
+ if((sizeA < 2)||(sizeB < 2))
+ outError("2-cluster Likelihood Mapping requires clusters A and B to have >=2 taxa with unique sequences!");
+ break;
+ case 3:
+ if((sizeA < 1)||(sizeB < 1)||(sizeC < 2))
+		outError("3-cluster Likelihood Mapping requires clusters A and B to have >=1 and cluster C >=2 taxa with unique sequences!");
+ break;
+ case 4:
+	    if((sizeA < 1)||(sizeB < 1)||(sizeC < 1)||(sizeD < 1))
+ outError("4-cluster Likelihood Mapping requires all 4 clusters to have >0 taxa with unique sequences!");
+ break;
+ default:
+ outError("Unknown Likelihood Mapping mode! PLEASE report this to the developers!");
+ break;
+ }
+
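+    // Number of distinct quartets available in each mode: for a single group
+    // the telescoped sum below equals choose(sizeA,4); for 2, 3 and 4 clusters
+    // it is C(A,2)*C(B,2), A*B*C(C,2) and A*B*C*D, respectively.  This count
+    // is only reported; the quartets themselves are drawn with replacement.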
+ switch(LMGroups.numGroups){
+ case 1:
+ size3 = sizeA-4;
+ size2 = sizeA-3;
+ size1 = sizeA-2;
+ size0 = sizeA-1;
+ LMGroups.uniqueQuarts = 1 + size3 +
+ size2 * (size2-1) / 2 +
+ size1 * (size1-1) * (size1-2) / 6 +
+ size0 * (size0-1) * (size0-2) * (size0-3) / 24;
+ break;
+ case 2:
+ LMGroups.uniqueQuarts = (sizeA * (sizeA - 1)) / 2 * (sizeB * (sizeB - 1)) / 2; break;
+ case 3:
+ LMGroups.uniqueQuarts = sizeA * sizeB * (sizeC * (sizeC - 1)) / 2; break;
+ case 4:
+ LMGroups.uniqueQuarts = sizeA * sizeB * sizeC * sizeD; break;
+ default:
+ outError("Unknown Likelihood Mapping mode! PLEASE report this to the developers!");
+ break;
+ }
+ // fprintf(stderr,"XXX - #quarts: %d; #groups: %d, A: %d, B:%d, C:%d, D:%d\n", LMGroups.uniqueQuarts, LMGroups.numGroups, sizeA, sizeB, sizeC, sizeD);
+
+
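+    // With OpenMP every thread draws quartets from its own random number
+    // stream, seeded with ran_seed plus the thread number, so the global
+    // generator needs no locking; the serial build uses the global randstream.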
+#ifdef _OPENMP
+ #pragma omp parallel
+ {
+ int *rstream;
+ init_random(params->ran_seed + omp_get_thread_num(), false, &rstream);
+#else
+ int *rstream = randstream;
+#endif
+
+#ifdef _OPENMP
+ #pragma omp for schedule(guided)
+#endif
+ for (int qid = 0; qid < params->lmap_num_quartets; qid++) { /*** draw lmap_num_quartets quartets randomly ***/
+ // fprintf(stderr, "%d\n", qid);
+
+ // uniformly draw 4 taxa
+ // (a) sample taxon 1
+ // was: lmap_quartet_info[qid].seqID[0] = random_int(leafNum);
+ lmap_quartet_info[qid].seqID[0] = LMGroups.GroupA[random_int(sizeA, rstream)];
+
+ do {
+ // (b) sample taxon 2 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[1] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|a3,a4
+ case 2: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,A2|b1,b2
+ case 3: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c1,c2
+ case 4: lmap_quartet_info[qid].seqID[1] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a ,B |c, d
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[1] == lmap_quartet_info[qid].seqID[0]);
+ do {
+ // (c) sample taxon 3 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[2] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|A3,a4
+ case 2: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|B1,b2
+ case 3: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C1,c2
+ case 4: lmap_quartet_info[qid].seqID[2] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |C, d
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[2] == lmap_quartet_info[qid].seqID[1]);
+ do {
+ // (d) sample taxon 4 according to the number of clusters
+ // was: lmap_quartet_info[qid].seqID[3] = random_int(leafNum);
+ switch(numGroups){
+ case 1: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupA[random_int(sizeA, rstream)]; break; // a1,a2|a3,A4
+ case 2: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupB[random_int(sizeB, rstream)]; break; // a1,a2|b1,B2
+ case 3: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupC[random_int(sizeC, rstream)]; break; // a ,b |c1,C2
+ case 4: lmap_quartet_info[qid].seqID[3] = LMGroups.GroupD[random_int(sizeD, rstream)]; break; // a ,b |c, D
+ default: outError("Unknown Likelihood Mapping sampling mode! PLEASE report this to the developers!"); break;
+ }
+ } while (lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[0] || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[1]
+ || lmap_quartet_info[qid].seqID[3] == lmap_quartet_info[qid].seqID[2]);
+
+// fprintf(stderr, "qqq%d: %d, %d, %d, %d\n", qid, lmap_quartet_info[qid].seqID[0], lmap_quartet_info[qid].seqID[1], lmap_quartet_info[qid].seqID[2], lmap_quartet_info[qid].seqID[3]);
+
+ // *** taxa should not be sorted, because that changes the corners a dot is assigned to - removed HAS ;^)
+ // obsolete: sort(lmap_quartet_info[qid].seqID, lmap_quartet_info[qid].seqID+4); // why sort them?!? HAS ;^)
+
+ // initialize sub-alignment and sub-tree
+ Alignment *quartet_aln;
+ PhyloTree *quartet_tree;
+ if (aln->isSuperAlignment()) {
+ quartet_aln = new SuperAlignment;
+ } else {
+ quartet_aln = new Alignment;
+ }
+ IntVector seq_id;
+ seq_id.insert(seq_id.begin(), lmap_quartet_info[qid].seqID, lmap_quartet_info[qid].seqID+4);
+ quartet_aln->extractSubAlignment(aln, seq_id, 0);
+ if (isSuperTree()) {
+ quartet_tree = new PhyloSuperTree((SuperAlignment*)quartet_aln, (PhyloSuperTree*)this);
+ } else {
+ quartet_tree = new PhyloTree(quartet_aln);
+ }
+
+ // set up parameters
+ quartet_tree->setParams(params);
+ quartet_tree->optimize_by_newton = params->optimize_by_newton;
+ quartet_tree->setLikelihoodKernel(params->SSE);
+
+ // set up partition model
+ if (isSuperTree()) {
+ PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
+ PhyloSuperTree *super_tree = (PhyloSuperTree*)this;
+ for (int i = 0; i < super_tree->size(); i++) {
+ quartet_super_tree->at(i)->setModelFactory(super_tree->at(i)->getModelFactory());
+ quartet_super_tree->at(i)->setModel(super_tree->at(i)->getModel());
+ quartet_super_tree->at(i)->setRate(super_tree->at(i)->getRate());
+ }
+ }
+
+ // set model and rate
+ quartet_tree->setModelFactory(model_factory);
+ quartet_tree->setModel(getModel());
+ quartet_tree->setRate(getRate());
+ // NOTE: we don't need to set phylo_tree in model and rate because parameters are not reoptimized
+
+
+
+ // loop over 3 quartets to compute likelihood
+ for (int k = 0; k < 3; k++) {
+ string quartet_tree_str;
+ quartet_tree_str = "(" + quartet_aln->getSeqName(qc[k*4]) + "," + quartet_aln->getSeqName(qc[k*4+1]) + ",(" +
+ quartet_aln->getSeqName(qc[k*4+2]) + "," + quartet_aln->getSeqName(qc[k*4+3]) + "));";
+ quartet_tree->readTreeStringSeqName(quartet_tree_str);
+ quartet_tree->initializeAllPartialLh();
+ quartet_tree->wrapperFixNegativeBranch(true);
+ // optimize branch lengths with logl_epsilon=0.1 accuracy
+ lmap_quartet_info[qid].logl[k] = quartet_tree->optimizeAllBranches(10, 0.1);
+ }
+ // reset model & rate so that they are not deleted
+ quartet_tree->setModel(NULL);
+ quartet_tree->setModelFactory(NULL);
+ quartet_tree->setRate(NULL);
+
+ if (isSuperTree()) {
+ PhyloSuperTree *quartet_super_tree = (PhyloSuperTree*)quartet_tree;
+ for (int i = 0; i < quartet_super_tree->size(); i++) {
+ quartet_super_tree->at(i)->setModelFactory(NULL);
+ quartet_super_tree->at(i)->setModel(NULL);
+ quartet_super_tree->at(i)->setRate(NULL);
+ }
+ }
+
+ delete quartet_tree;
+ delete quartet_aln;
+
+ // determine likelihood order
+ int qworder[3]; // local (thread-safe) vector for sorting
+
+ if (lmap_quartet_info[qid].logl[0] > lmap_quartet_info[qid].logl[1]) {
+ if(lmap_quartet_info[qid].logl[2] > lmap_quartet_info[qid].logl[0]) {
+ qworder[0] = 2;
+ qworder[1] = 0;
+ qworder[2] = 1;
+ } else if (lmap_quartet_info[qid].logl[2] < lmap_quartet_info[qid].logl[1]) {
+ qworder[0] = 0;
+ qworder[1] = 1;
+ qworder[2] = 2;
+ } else {
+ qworder[0] = 0;
+ qworder[1] = 2;
+ qworder[2] = 1;
+ }
+ } else {
+ if(lmap_quartet_info[qid].logl[2] > lmap_quartet_info[qid].logl[1]) {
+ qworder[0] = 2;
+ qworder[1] = 1;
+ qworder[2] = 0;
+ } else if (lmap_quartet_info[qid].logl[2] < lmap_quartet_info[qid].logl[0]) {
+ qworder[0] = 1;
+ qworder[1] = 0;
+ qworder[2] = 2;
+ } else {
+ qworder[0] = 1;
+ qworder[1] = 2;
+ qworder[2] = 0;
+ }
+ }
+
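+        // Turn the three log-likelihoods into weights w_i = L_i/(L_1+L_2+L_3):
+        // the best topology gets 1, the other two exp(logl_i - logl_best)
+        // (or 0 if the difference is below -TP_MAX_EXP_DIFF, to avoid
+        // underflow), and the three values are then normalized to sum to 1.
+        // They are the barycentric coordinates of the dot in the triangle.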
+ // compute Bayesian weights
+ double temp;
+
+ lmap_quartet_info[qid].qweight[0] = lmap_quartet_info[qid].logl[0];
+ lmap_quartet_info[qid].qweight[1] = lmap_quartet_info[qid].logl[1];
+ lmap_quartet_info[qid].qweight[2] = lmap_quartet_info[qid].logl[2];
+
+ temp = lmap_quartet_info[qid].qweight[qworder[1]]-lmap_quartet_info[qid].qweight[qworder[0]];
+ if(temp < -TP_MAX_EXP_DIFF) /* possible, since 1.0+exp(>36) == 1.0 */
+ lmap_quartet_info[qid].qweight[qworder[1]] = 0.0;
+ else
+ lmap_quartet_info[qid].qweight[qworder[1]] = exp(temp);
+
+ temp = lmap_quartet_info[qid].qweight[qworder[2]]-lmap_quartet_info[qid].qweight[qworder[0]];
+ if(temp < -TP_MAX_EXP_DIFF) /* possible, since 1.0+exp(>36) == 1.0 */
+ lmap_quartet_info[qid].qweight[qworder[2]] = 0.0;
+ else
+ lmap_quartet_info[qid].qweight[qworder[2]] = exp(temp);
+
+ lmap_quartet_info[qid].qweight[qworder[0]] = 1.0;
+
+ temp = lmap_quartet_info[qid].qweight[0] + lmap_quartet_info[qid].qweight[1] + lmap_quartet_info[qid].qweight[2];
+ lmap_quartet_info[qid].qweight[0] = lmap_quartet_info[qid].qweight[0]/temp;
+ lmap_quartet_info[qid].qweight[1] = lmap_quartet_info[qid].qweight[1]/temp;
+ lmap_quartet_info[qid].qweight[2] = lmap_quartet_info[qid].qweight[2]/temp;
+
+ // determine which of the three corners (only meaningful if seqIDs NOT sorted)
+ if (treebits[qworder[0]] == 1) {
+ lmap_quartet_info[qid].corner=0;
+ } else {
+ if (treebits[qworder[0]] == 2) {
+ lmap_quartet_info[qid].corner=1;
+ } else {
+ lmap_quartet_info[qid].corner=2;
+ }
+ }
+
+ // determine which of the 7 regions (only meaningful if seqIDs NOT sorted)
+ double temp1, temp2, temp3;
+ unsigned char discreteweight[3];
+ double sqdiff[3];
+
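+        // Compare the sorted weight vector against the three reference
+        // distributions (1,0,0), (1/2,1/2,0) and (1/3,1/3,1/3) by squared
+        // Euclidean distance; the closest one decides whether the quartet is
+        // fully resolved (corner), partly resolved (rectangle) or unresolved
+        // (center).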
+ /* 100 distribution */
+ temp1 = 1.0 - lmap_quartet_info[qid].qweight[qworder[0]];
+ sqdiff[0] = temp1*temp1 +
+ lmap_quartet_info[qid].qweight[qworder[1]]*lmap_quartet_info[qid].qweight[qworder[1]] +
+ lmap_quartet_info[qid].qweight[qworder[2]]*lmap_quartet_info[qid].qweight[qworder[2]];
+ discreteweight[0] = treebits[qworder[0]];
+
+ /* 110 distribution */
+ temp1 = 0.5 - lmap_quartet_info[qid].qweight[qworder[0]];
+ temp2 = 0.5 - lmap_quartet_info[qid].qweight[qworder[1]];
+ sqdiff[1] = temp1*temp1 + temp2*temp2 +
+ lmap_quartet_info[qid].qweight[qworder[2]]*lmap_quartet_info[qid].qweight[qworder[2]];
+ discreteweight[1] = treebits[qworder[0]] + treebits[qworder[1]];
+
+ /* 111 distribution */
+ temp1 = onethird - lmap_quartet_info[qid].qweight[qworder[0]];
+ temp2 = onethird - lmap_quartet_info[qid].qweight[qworder[1]];
+ temp3 = onethird - lmap_quartet_info[qid].qweight[qworder[2]];
+ sqdiff[2] = temp1 * temp1 + temp2 * temp2 + temp3 * temp3;
+ discreteweight[2] = (unsigned char) 7;
+
+ /* sort in descending order */
+ int sqorder[3]; // local (thread-safe) vector for sorting
+ if (sqdiff[0] > sqdiff[1]) {
+ if(sqdiff[2] > sqdiff[0]) {
+ sqorder[0] = 2;
+ sqorder[1] = 0;
+ sqorder[2] = 1;
+ } else if (sqdiff[2] < sqdiff[1]) {
+ sqorder[0] = 0;
+ sqorder[1] = 1;
+ sqorder[2] = 2;
+ } else {
+ sqorder[0] = 0;
+ sqorder[1] = 2;
+ sqorder[2] = 1;
+ }
+ } else {
+ if(sqdiff[2] > sqdiff[1]) {
+ sqorder[0] = 2;
+ sqorder[1] = 1;
+ sqorder[2] = 0;
+ } else if (sqdiff[2] < sqdiff[0]) {
+ sqorder[0] = 1;
+ sqorder[1] = 0;
+ sqorder[2] = 2;
+ } else {
+ sqorder[0] = 1;
+ sqorder[1] = 2;
+ sqorder[2] = 0;
+ }
+ }
+
+
+ // determine which of the 7 regions (only meaningful if seqIDs NOT sorted)
+ unsigned char qpbranching = (unsigned char) discreteweight[sqorder[2]];
+
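+        // qpbranching is the bit pattern of the closest reference distribution:
+        // 1, 2 or 4 = one topology clearly wins (corner regions 1-3),
+        // 3, 5 or 6 = two topologies are nearly tied (rectangles 4-6),
+        // 7 = all three are about equal (center region 7).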
+ if (qpbranching == 1) {
+ lmap_quartet_info[qid].area=0; // LM_REG1 - top
+ }
+ if (qpbranching == 2) {
+ lmap_quartet_info[qid].area=1; // LM_REG2 - right
+ }
+ if (qpbranching == 4) {
+ lmap_quartet_info[qid].area=2; // LM_REG3 - left
+ }
+
+ if (qpbranching == 3) {
+ lmap_quartet_info[qid].area=3; // LM_REG4
+ }
+ if (qpbranching == 6) {
+ lmap_quartet_info[qid].area=4; // LM_REG5
+ }
+ if (qpbranching == 5) {
+ lmap_quartet_info[qid].area=5; // LM_REG6
+ }
+
+ if (qpbranching == 7) {
+ lmap_quartet_info[qid].area=6; // LM_REG7 - center
+ }
+
+ } /*** end draw lmap_num_quartets quartets randomly ***/
+
+#ifdef _OPENMP
+ finish_random(rstream);
+ }
+#endif
+
+} // end PhyloTree::computeQuartetLikelihoods
+
+
+//**************************************
+
+
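+/*
+ * Read the likelihood mapping cluster file (parsed with MSetsBlock as named
+ * sets of taxon names).  Up to four clusters plus an optional cluster named
+ * "ignored"/"IGNORED" are accepted; the alignment indices of the listed
+ * sequences are stored in LMGroups.GroupA..GroupD (GroupX for ignored ones),
+ * and unknown sequence names are reported and skipped.
+ */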
+void PhyloTree::readLikelihoodMappingGroups(char *filename, QuartetGroups &LMGroups) {
+
+ int numsets, numtax, taxid;
+ char clustchar;
+ MSetsBlock *lmclusters;
+ lmclusters = new MSetsBlock();
+ cout << endl << "Reading likelihood mapping cluster file " << filename << "..." << endl;
+ cout << "(The leading numbers represent the order from the master alignment.)" << endl << endl;
+
+ MyReader nexus(filename);
+
+ nexus.Add(lmclusters);
+
+ MyToken token(nexus.inf);
+ nexus.Execute(token);
+
+ // lmclusters->Report(cout);
+
+ TaxaSetNameVector *allsets = lmclusters->getSets();
+ numsets = allsets->size();
+
+ if(numsets > 5) outError("Only up to 4 Likelihood Mapping clusters allowed, plus one 'ignored' cluster!");
+
+ int n = 0;
+ for (TaxaSetNameVector::iterator i = allsets->begin(); i != allsets->end(); i++) {
+ if ((*i)->name.compare("ignored")==0 || (*i)->name.compare("IGNORED")==0) {
+ LMGroups.Name[4] = (*i)->name;
+ numtax = (*i)->taxlist.size();
+ LMGroups.numGrpSeqs[4] = numtax;
+ LMGroups.GroupX.resize(numtax);
+ cout << "Cluster \"" << LMGroups.Name[4] << "\" lists " << (*i)->taxlist.size() << " sequences to be ignored:" << endl;
+ int t = 0;
+ for (vector<string>::iterator it = (*i)->taxlist.begin(); it != (*i)->taxlist.end(); it++) {
+ taxid = aln->getSeqID(*it);
+ if (taxid < 0) {
+ cout << "Warning: unknown sequence name \"" << (*it) << "\"! Will be ignored." << endl;
+ } else {
+ LMGroups.GroupX[t] = taxid;
+ // cout << " " << (*it) << " (" << taxid << "," << LMGroups.GroupX[t] << ")" << endl;
+ cout << " " << LMGroups.GroupX[t]+1 << ". " << (*it) << endl;
+ t++;
+ }
+ }
+ if (numtax != t) {
+			cout << "Warning: ignored cluster contained unknown sequence names!" << endl;
+ LMGroups.numGrpSeqs[4] = t;
+ }
+ } else {
+ switch(n){
+ case 0: clustchar='A'; break;
+ case 1: clustchar='B'; break;
+ case 2: clustchar='C'; break;
+ case 3: clustchar='D'; break;
+ default: outError("Only up to 4 Likelihood Mapping clusters allowed, plus one 'ignored' cluster!"); break;
+ }
+ LMGroups.Name[n] = (*i)->name;
+ numtax = (*i)->taxlist.size();
+ LMGroups.numGrpSeqs[n] = numtax;
+ switch(n){
+ case 0: LMGroups.GroupA.resize(numtax); break;
+ case 1: LMGroups.GroupB.resize(numtax); break;
+ case 2: LMGroups.GroupC.resize(numtax); break;
+ case 3: LMGroups.GroupD.resize(numtax); break;
+ default: outError("Only up to 4 Likelihood Mapping clusters allowed, plus one 'ignored' cluster!"); break;
+ }
+
+ cout << "Cluster " << n+1 << " \"" << LMGroups.Name[n] << "\" lists " << (*i)->taxlist.size() << " sequences: " << endl;
+
+ int t = 0;
+ for (vector<string>::iterator it = (*i)->taxlist.begin(); it != (*i)->taxlist.end(); it++) {
+ taxid = aln->getSeqID(*it);
+ if (taxid < 0) {
+				cout << "Warning: unknown sequence name \"" << (*it) << "\"! Will be ignored." << endl;
+ } else {
+ switch(n){
+ case 0: LMGroups.GroupA[t] = taxid;
+ cout << " " << LMGroups.GroupA[t]+1 << ". " << (*it) << endl;
+ break;
+ case 1: LMGroups.GroupB[t] = taxid;
+ cout << " " << LMGroups.GroupB[t]+1 << ". " << (*it) << endl;
+ break;
+ case 2: LMGroups.GroupC[t] = taxid;
+ cout << " " << LMGroups.GroupC[t]+1 << ". " << (*it) << endl;
+ break;
+ case 3: LMGroups.GroupD[t] = taxid;
+ cout << " " << LMGroups.GroupD[t]+1 << ". " << (*it) << endl;
+ break;
+ default: outError("Only up to 4 Likelihood Mapping clusters allowed, plus one 'ignored' cluster!"); break;
+ }
+ t++;
+ }
+ }
+ LMGroups.numGrpSeqs[n] = t;
+ n++;
+ }
+ cout << endl;
+ }
+ LMGroups.numGroups = n;
+
+
+} // end PhyloTree::readLikelihoodMappingGroups
+
+//**************************************
+
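+/*
+ * Run the likelihood mapping analysis: read the optional cluster file,
+ * compute the quartet likelihoods, tally every quartet into the 7 regions
+ * and 3 corners (overall and per sequence), write the SVG and EPS plots
+ * and, if requested, a .lmap.quartetlh file with the raw quartet
+ * log-likelihoods and weights plus the likelihood mapping report.
+ */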
+void PhyloTree::doLikelihoodMapping() {
+ // TODO For Heiko: Please add code here
+ // vector<QuartetInfo> lmap_quartet_info;
+ // vector<SeqQuartetInfo> lmap_seq_quartet_info;
+ // int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ // int cornercount[4] = {0, 0, 0, 0};
+ int resolved, partly, unresolved;
+ int qid;
+ ofstream out;
+ string filename;
+
+ if(params->lmap_cluster_file != NULL) {
+ // cout << "YYY: test reading" << params->lmap_cluster_file << endl;
+ readLikelihoodMappingGroups(params->lmap_cluster_file, LMGroups);
+ } else {
+ LMGroups.numGroups = 0; /* no clusterfile -> un-initialized */
+ }
+
+ areacount[0] = 0;
+ areacount[1] = 0;
+ areacount[2] = 0;
+ areacount[3] = 0;
+ areacount[4] = 0;
+ areacount[5] = 0;
+ areacount[6] = 0;
+ areacount[7] = 0;
+ cornercount[0] = 0;
+ cornercount[1] = 0;
+ cornercount[2] = 0;
+ cornercount[3] = 0;
+
+ lmap_seq_quartet_info.resize(leafNum+1);
+    for (qid = 0; qid <= leafNum; qid++) { // element leafNum holds the overall counts
+ lmap_seq_quartet_info[qid].countarr[0] = 0;
+ lmap_seq_quartet_info[qid].countarr[1] = 0;
+ lmap_seq_quartet_info[qid].countarr[2] = 0;
+ lmap_seq_quartet_info[qid].countarr[3] = 0;
+ lmap_seq_quartet_info[qid].countarr[4] = 0;
+ lmap_seq_quartet_info[qid].countarr[5] = 0;
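+/* Convert the barycentric weights (w1,w2,w3), w3 = 1-w1-w2, into SVG
+   coordinates of an equilateral triangle with side length 1000:
+   x = (w1/2 + w2)*1000, y = -w1*866.0254038 (866.0254038 = 1000*sqrt(3)/2;
+   the sign is negative because the SVG y axis points downwards). */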
+ lmap_seq_quartet_info[qid].countarr[6] = 0;
+ lmap_seq_quartet_info[qid].countarr[7] = 0;
+ lmap_seq_quartet_info[qid].countarr[8] = 0;
+ lmap_seq_quartet_info[qid].countarr[9] = 0;
+ }
+
+ cout << "Computing quartet likelihoods..." << endl << endl;
+
+ computeQuartetLikelihoods(lmap_quartet_info, LMGroups);
+
+ for (qid = 0; qid < params->lmap_num_quartets; qid++) {
+ int tempreg;
+
+ tempreg = lmap_quartet_info[qid].area;
+ areacount[tempreg]++;
+ lmap_seq_quartet_info[leafNum].countarr[tempreg]++; /* which of the 7 regions */
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[0]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[1]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[2]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[3]].countarr[tempreg]++;
+
+ tempreg = LM_AR1+lmap_quartet_info[qid].corner; /* which of the 3 corners */
+ cornercount[lmap_quartet_info[qid].corner]++;
+ lmap_seq_quartet_info[leafNum].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[0]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[1]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[2]].countarr[tempreg]++;
+ lmap_seq_quartet_info[lmap_quartet_info[qid].seqID[3]].countarr[tempreg]++;
+ }
+
+ if (params->print_lmap_quartet_lh) {
+ // print quartet file
+ filename = (string)params->out_prefix + ".lmap.quartetlh";
+ out.open(filename.c_str());
+ }
+
+ string lmap_svgfilename = (string)params->out_prefix + ".lmap.svg";
+ FILE *svgout;
+ svgout = fopen(lmap_svgfilename.c_str(), "w");
+ initsvg(svgout, LMGroups);
+
+ string lmap_epsfilename = (string)params->out_prefix + ".lmap.eps";
+ FILE *epsout;
+ epsout = fopen(lmap_epsfilename.c_str(), "w");
+ initeps(epsout, LMGroups);
+
+ for (qid = 0; qid < params->lmap_num_quartets; qid++) {
+
+ plotlmpointeps(epsout,
+ // double w1, double w2)
+ lmap_quartet_info[qid].qweight[0],
+ lmap_quartet_info[qid].qweight[1]);
+
+ plotlmpointsvg(svgout,
+ // double w1, double w2)
+ lmap_quartet_info[qid].qweight[0],
+ lmap_quartet_info[qid].qweight[1]);
+
+ }
+
+ if (params->print_lmap_quartet_lh) {
+ // print quartet file
+ for (qid = 0; qid < params->lmap_num_quartets; qid++) {
+ out << "(" << lmap_quartet_info[qid].seqID[0] << ","
+ << lmap_quartet_info[qid].seqID[1] << ","
+ << lmap_quartet_info[qid].seqID[2] << ","
+ << lmap_quartet_info[qid].seqID[3] << ")"
+ << "\t" << lmap_quartet_info[qid].logl[0]
+ << "\t" << lmap_quartet_info[qid].logl[1]
+ << "\t" << lmap_quartet_info[qid].logl[2]
+ << "\t" << lmap_quartet_info[qid].qweight[0]
+ << "\t" << lmap_quartet_info[qid].qweight[1]
+ << "\t" << lmap_quartet_info[qid].qweight[2] << endl;
+ }
+
+ PhyloTree::reportLikelihoodMapping(out);
+
+ /**** begin of report output ****/
+ /**** moved to PhyloTree::reportLikelihoodMapping ****/
+#if 0
+ // LM_REG1 0 /* top corner */
+ // LM_REG2 1 /* bottom-right corner */
+ // LM_REG3 2 /* bottom-left corner */
+ // LM_REG4 3 /* right rectangle */
+ // LM_REG5 4 /* bottom rectangle */
+ // LM_REG6 5 /* left rectangle */
+ // LM_REG7 6 /* center */
+ // LM_AR1 7 /* top third */
+ // LM_AR2 8 /* bottom-right third */
+ // LM_AR3 9 /* bottom-left third */
+
+ out << "LIKELIHOOD MAPPING ANALYSIS" << endl << endl;
+	out << "Number of quartets: " << params->lmap_num_quartets << " (random choice)" << endl << endl;
+ out << "Quartet trees are based on the selected model of substitution." << endl << endl;
+ out << "Sequences are not grouped in clusters." << endl;
+
+ out << endl << endl;
+ out << "LIKELIHOOD MAPPING STATISTICS" << endl << endl;
+
+ out << " (a,b)-(c,d) (a,b)-(c,d) " << endl;
+ out << " /\\ /\\ " << endl;
+ out << " / \\ / \\ " << endl;
+ out << " / \\ / 1 \\ " << endl;
+ out << " / a1 \\ / \\ / \\ " << endl;
+ out << " /\\ /\\ / \\/ \\ " << endl;
+ out << " / \\ / \\ / /\\ \\ " << endl;
+ out << " / \\ / \\ / 6 / \\ 4 \\ " << endl;
+ out << " / \\/ \\ /\\ / 7 \\ /\\ " << endl;
+ out << " / | \\ / \\ /______\\ / \\ " << endl;
+ out << " / a3 | a2 \\ / 3 | 5 | 2 \\ " << endl;
+ out << " /__________|_________\\ /_____|________|_____\\ " << endl;
+ out << "(a,d)-(b,c) (a,c)-(b,d) (a,b)-(c,d) (a,c)-(b,d) " << endl << endl;
+ out << "For more information about likelihood-mapping refer to" << endl;
+ out << " Strimmer and von Haeseler (1997) PNAS 94:6815-6819" << endl;
+ out << " http://www.ncbi.nlm.nih.gov/pubmed/9192648" << endl;
+ out << "and/or" << endl;
+ out << " Schmidt and von Haeseler (2003) Current Protocols in Bioinformatics" << endl;
+ out << " (by Baxevanis et al., Eds.), Unit 6, Wiley&Sons, New York." << endl;
+ out << " http://dx.doi.org/10.1002/0471250953.bi0606s17" << endl;
+
+
+ out << endl << endl;
+ out << "Quartet support of regions a1, a2, a3:" << endl << endl;
+ out << " #quartets a1 (% a1) a2 (% a2) a3 (% a3) name" << endl;
+ for (qid = 0; qid < leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[7]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[7]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[8]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[8]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[9]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[9]/sumq << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ }
+
+ out << endl << endl << "Quartet support of areas 1-7:" << endl << endl;
+ out << " resolved partly unresolved name" << endl;
+ out << " #quartets 1 (% 1) 2 (% 2) 3 (% 3) 4 (% 4) 5 (% 5) 6 (% 6) 7 (% 7)" << endl;
+ for (qid = 0; qid < leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[0]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[0]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[1]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[1]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[2]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[2]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[3]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[3]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[4]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[4]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[5]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[5]/sumq << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[6]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[6]/sumq << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ }
+
+ out << endl << endl << "Quartet resolution per sequence:" << endl << endl;
+ out << " #quartets resolved partly unresolved name" << endl;
+ for (qid = 0; qid < leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long resolved = lmap_seq_quartet_info[qid].countarr[LM_REG1] + lmap_seq_quartet_info[qid].countarr[LM_REG2] + lmap_seq_quartet_info[qid].countarr[LM_REG3];
+ unsigned long partly = lmap_seq_quartet_info[qid].countarr[LM_REG4] + lmap_seq_quartet_info[qid].countarr[LM_REG5] + lmap_seq_quartet_info[qid].countarr[LM_REG6];
+ unsigned long unres = lmap_seq_quartet_info[qid].countarr[LM_REG7];
+ unsigned long sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << resolved
+ << " (" << setw(6) << (double) 100.0*resolved/sumq << ") "
+ << setw(7) << partly
+ << " (" << setw(6) << (double) 100.0*partly/sumq << ") "
+ << setw(7) << unres
+ << " (" << setw(6) << (double) 100.0*unres/sumq << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ }
+#endif
+ }
+
+ resolved = areacount[0] + areacount[1] + areacount[2];
+ partly = areacount[3] + areacount[4] + areacount[5];
+ unresolved = areacount[6];
+
+ out << endl << "LIKELIHOOD MAPPING SUMMARY" << endl << endl;
+ out << "Number of quartets: " << (resolved+partly+unresolved)
+ << " (randomly drawn with replacement)" << endl << endl;
+ out << "Overall quartet resolution:" << endl;
+ out << "Number of fully resolved quartets: " << resolved
+ << " (" << 100.0 * resolved/(resolved+partly+unresolved) << "%)" << endl;
+ out << "Number of partly resolved quartets: " << partly
+ << " (" << 100.0 * partly/(resolved+partly+unresolved) << "%)" << endl;
+ out << "Number of unresolved quartets: " << unresolved
+ << " (" << 100.0 * unresolved/(resolved+partly+unresolved) << "%)" << endl << endl;
+
+
+ /**** end of report output ****/
+ /**** moved to PhyloTree::reportLikelihoodMapping ****/
+
+
+ if (params->print_lmap_quartet_lh) {
+ // print quartet file
+ out.close();
+ cout << "likelihood mapping results written to " << filename << endl;
+ }
+
+ finishsvg(svgout, lmap_seq_quartet_info, leafNum, params->lmap_num_quartets);
+ fclose(svgout);
+ cout << "likelihood mapping plot (SVG) written to " << lmap_svgfilename << endl;
+
+ finisheps(epsout, lmap_seq_quartet_info, leafNum, params->lmap_num_quartets);
+ fclose(epsout);
+ cout << "likelihood mapping plot (EPS) written to " << lmap_epsfilename << endl;
+
+
+// cout << "\nOverall quartet resolution: (from " << (resolved+partly+unresolved) << " randomly drawn quartets)" << endl;
+// cout << "Fully resolved quartets: " << resolved << " (= "
+// << (double) resolved * 100.0 / (resolved+partly+unresolved) << "%)" << endl;
+// cout << "Partly resolved quartets: " << partly << " (= "
+// << (double) partly * 100.0 / (resolved+partly+unresolved) << "%)" << endl;
+// cout << "Unresolved quartets: " << unresolved << " (= "
+// << (double) unresolved * 100.0 / (resolved+partly+unresolved) << "%)" << endl << endl;
+
+} // end PhyloTree::doLikelihoodMapping
+
+
+
+
+void PhyloTree::reportLikelihoodMapping(ofstream &out) {
+ // int areacount[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ // int cornercount[4] = {0, 0, 0, 0};
+ int resolved, partly, unresolved;
+ int qid;
+ int leafNum;
+ leafNum = PhyloTree::aln->getNSeq();
+ // vector<QuartetInfo> lmap_quartet_info;
+ // vector<SeqQuartetInfo> lmap_seq_quartet_info;
+
+
+ // LM_REG1 0 /* top corner */
+ // LM_REG2 1 /* bottom-right corner */
+ // LM_REG3 2 /* bottom-left corner */
+ // LM_REG4 3 /* right rectangle */
+ // LM_REG5 4 /* bottom rectangle */
+ // LM_REG6 5 /* left rectangle */
+ // LM_REG7 6 /* center */
+ // LM_AR1 7 /* top third */
+ // LM_AR2 8 /* bottom-right third */
+ // LM_AR3 9 /* bottom-left third */
+#if 0
+#define LM_REG1 0 /* top corner */
+#define LM_REG2 1 /* bottom-right corner */
+#define LM_REG3 2 /* bottom-left corner */
+#define LM_REG4 3 /* right rectangle */
+#define LM_REG5 4 /* bottom rectangle */
+#define LM_REG6 5 /* left rectangle */
+#define LM_REG7 6 /* center */
+#define LM_AR1 7 /* top third */
+#define LM_AR2 8 /* bottom-right third */
+#define LM_AR3 9 /* bottom-left third */
+#endif
+
+ out << "LIKELIHOOD MAPPING ANALYSIS" << endl;
+ out << "---------------------------" << endl << endl;
+ out << "Number of quartets: " << params->lmap_num_quartets << " (randomly chosen from "
+ << LMGroups.uniqueQuarts << " existing unique quartets)" << endl << endl;
+ out << "Quartet trees are based on the selected model of substitution." << endl << endl;
+
+ if(LMGroups.numGroups == 1) {
+ int n=0, t;
+ out << "Sequences are not grouped in clusters. Using sequences:" << endl;
+ for (t=0; t<LMGroups.numGrpSeqs[n]; t++){
+ out << " " << LMGroups.GroupA[t]+1 << ". "
+ << PhyloTree::aln->getSeqName(LMGroups.GroupA[t]) << endl;
+ }
+ out << endl << "Ordered as in user-given cluster file, numbers according to alignment order." << endl;
+ out << "All other sequences have been ignored." << endl << endl;
+ }
+ if((LMGroups.numGroups > 1)&&(LMGroups.numGroups <= 4)) {
+ int n, t;
+ out << "Sequences are grouped into " << LMGroups.numGroups << " clusters." << endl << endl;
+ for (n=0; n<LMGroups.numGroups; n++){
+ out << "Cluster " << n+1 << " \"" << LMGroups.Name[n] << "\" lists " << LMGroups.numGrpSeqs[n] << " sequences: " << endl;
+ for (t=0; t<LMGroups.numGrpSeqs[n]; t++){
+ switch(n){
+ case 0:
+ out << " " << LMGroups.GroupA[t]+1 << ". "
+ << PhyloTree::aln->getSeqName(LMGroups.GroupA[t]) << endl;
+ break;
+ case 1:
+ out << " " << LMGroups.GroupB[t]+1 << ". "
+ << PhyloTree::aln->getSeqName(LMGroups.GroupB[t]) << endl;
+ break;
+ case 2:
+ out << " " << LMGroups.GroupC[t]+1 << ". "
+ << PhyloTree::aln->getSeqName(LMGroups.GroupC[t]) << endl;
+ break;
+ case 3:
+ out << " " << LMGroups.GroupD[t]+1 << ". "
+ << PhyloTree::aln->getSeqName(LMGroups.GroupD[t]) << endl;
+ break;
+ default: outError("Number of Likelihood Mapping groups too high! PLEASE report this to the developers!"); break;
+ }
+ }
+ out << endl;
+ }
+ out << "Ordered as in user-given cluster file, numbers according to alignment order." << endl;
+ out << "All other sequences have been ignored." << endl << endl;
+ }
+
+ out << endl << endl;
+ out << "LIKELIHOOD MAPPING STATISTICS" << endl;
+ out << "-----------------------------" << endl << endl;
+
+ switch(LMGroups.numGroups){
+ case 1:
+ out << " (a,b)-(c,d) (a,b)-(c,d) " << endl; break;
+ case 2:
+ out << " (a,a)-(b,b) (a,a)-(b,b) " << endl; break;
+ case 3:
+ out << " (a,b)-(c,c) (a,b)-(c,c) " << endl; break;
+ case 4:
+ out << " (a,b)-(c,d) (a,b)-(c,d) " << endl; break;
+ default: outError("Number of Likelihood Mapping groups too high! PLEASE report this to the developers!"); break;
+ }
+ out << " /\\ /\\ " << endl;
+ out << " / \\ / \\ " << endl;
+ out << " / \\ / 1 \\ " << endl;
+ out << " / a1 \\ / \\ / \\ " << endl;
+ out << " /\\ /\\ / \\/ \\ " << endl;
+ out << " / \\ / \\ / /\\ \\ " << endl;
+ out << " / \\ / \\ / 6 / \\ 4 \\ " << endl;
+ out << " / \\/ \\ /\\ / 7 \\ /\\ " << endl;
+ out << " / | \\ / \\ /______\\ / \\ " << endl;
+ out << " / a3 | a2 \\ / 3 | 5 | 2 \\ " << endl;
+ out << " /__________|_________\\ /_____|________|_____\\ " << endl;
+ switch(LMGroups.numGroups){
+ case 1:
+ out << "(a,d)-(b,c) (a,c)-(b,d) (a,d)-(b,c) (a,c)-(b,d) "
+ << endl << endl; break;
+ case 2:
+ out << "(a,b)-(a,b) (a,b)-(a,b) (a,b)-(a,b) (a,b)-(a,b) "
+ << endl << endl; break;
+ case 3:
+ out << "(a,c)-(b,c) (a,c)-(b,c) (a,c)-(b,c) (a,c)-(b,c) "
+ << endl << endl; break;
+ case 4:
+ out << "(a,d)-(b,c) (a,c)-(b,d) (a,d)-(b,c) (a,c)-(b,d) "
+ << endl << endl; break;
+ default: outError("Number of Likelihood Mapping groups too high! PLEASE report this to the developers!"); break;
+ }
+ // |<--- 80 chars --->|
+ out << "Division of the likelihood mapping plots into 3 or 7 areas." << endl;
+    out << "On the left, the areas show support for one of the different groupings" << endl;
+ out << "like (a,b|c,d)." << endl;
+
+    out << "On the right, the quartets falling into the areas 1, 2 and 3 are" << endl;
+ out << "informative. Those in the rectangles 4, 5 and 6 are partly informative" << endl;
+ out << "and those in the center (7) are not informative." << endl << endl;
+
+ switch(LMGroups.numGroups){
+ case 1:
+ out << "Sequences a,b,c,d are drawn from all included sequences." << endl << endl; break;
+ case 2:
+ out << "Sequences a(2x) and b(2x) are drawn from clusters 1 and 2, respectively, with" << endl;
+ out << "Cluster 1: " << LMGroups.Name[0] << endl;
+ out << "Cluster 2: " << LMGroups.Name[1] << endl << endl;
+ break;
+ case 3:
+ out << "Sequences a, b and c(2x) are drawn from clusters 1, 2 and 3, respectively, with" << endl;
+ out << "Cluster 1: " << LMGroups.Name[0] << endl;
+ out << "Cluster 2: " << LMGroups.Name[1] << endl;
+ out << "Cluster 3: " << LMGroups.Name[2] << endl << endl;
+ break;
+ case 4:
+ out << "Sequences a,b,c,d are drawn from clusters 1, 2, 3 and 4, respectively, with" << endl;
+ out << "Cluster 1: " << LMGroups.Name[0] << endl;
+ out << "Cluster 2: " << LMGroups.Name[1] << endl;
+ out << "Cluster 3: " << LMGroups.Name[2] << endl;
+ out << "Cluster 4: " << LMGroups.Name[3] << endl << endl;
+ break;
+ default: outError("Number of Likelihood Mapping groups too high! PLEASE report this to the developers!"); break;
+ }
+
+    out << "Note that the corners only make a difference if the sequences are" << endl;
+ out << "clustered in groups. Furthermore, while sequences should occur about" << endl;
+ out << "equally often in unclustered mappings, in clustered mappings their" << endl;
+ out << "occurrence rates depend on the group sizes the quartets are drawn from." << endl << endl;
+
+ out << "For more information about likelihood-mapping refer to" << endl;
+ out << " - Schmidt and von Haeseler (2009) The Phylogenetic Handbook, 2nd Ed." << endl;
+ out << " (by Lemey et al., Eds.), 181-209, Cambridge Univ. Press, UK." << endl;
+ out << " http://www.thephylogenetichandbook.org" << endl;
+ out << " - Schmidt and von Haeseler (2003) Current Protocols in Bioinformatics" << endl;
+ out << " (by Baxevanis et al., Eds.), Unit 6, Wiley&Sons, New York." << endl;
+ out << " http://dx.doi.org/10.1002/0471250953.bi0606s17" << endl;
+ out << "and/or" << endl;
+ out << " - Strimmer and von Haeseler (1997) PNAS 94:6815-6819" << endl;
+ out << " http://www.ncbi.nlm.nih.gov/pubmed/9192648" << endl;
+
+
+ out << endl << endl;
+ out << "Quartet support of regions a1, a2, a3 (mainly for clustered analysis):" << endl << endl;
+ out << " #quartets a1 (% a1) a2 (% a2) a3 (% a3) name" << endl;
+ out << "-----------------------------------------------------------------------------" << endl;
+ for (qid = 0; qid <= leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ if (sumq>0) sumq0=sumq;
+ else sumq0=1;
+
+ if (qid < leafNum) {
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[7]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[7]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[8]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[8]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[9]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[9]/sumq0 << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ } else {
+ out << "-----------------------------------------------------------------------------" << endl;
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << " "
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[7]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[7]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[8]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[8]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[9]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[9]/sumq0 << ") " << endl;
+ }
+ }
+
+ out << endl << endl << "Quartet support of areas 1-7 (mainly for clustered analysis):" << endl << endl;
+ out << " resolved partly unresolved name" << endl;
+ out << " #quartets 1 (% 1) 2 (% 2) 3 (% 3) 4 (% 4) 5 (% 5) 6 (% 6) 7 (% 7)" << endl;
+ out << "------------------------------------------------------------------------------------------------------------------------------------------------" << endl;
+ for (qid = 0; qid <= leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ if (sumq>0) sumq0=sumq;
+ else sumq0=1;
+
+ if (qid < leafNum) {
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[0]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[0]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[1]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[1]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[2]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[2]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[3]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[3]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[4]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[4]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[5]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[5]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[6]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[6]/sumq0 << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ } else {
+ out << "------------------------------------------------------------------------------------------------------------------------------------------------" << endl;
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << " "
+ << setw(9) << sumq
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[0]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[0]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[1]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[1]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[2]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[2]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[3]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[3]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[4]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[4]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[5]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[5]/sumq0 << ") "
+ << setw(7) << lmap_seq_quartet_info[qid].countarr[6]
+ << " (" << setw(6) << (double) 100.0*lmap_seq_quartet_info[qid].countarr[6]/sumq0 << ") "
+ << endl << endl;
+ }
+ }
+
+ out << endl << endl << "Quartet resolution per sequence (phylogenetic information):" << endl << endl;
+ out << " #quartets resolved partly unresolved name" << endl;
+ out << "-----------------------------------------------------------------------------" << endl;
+ for (qid = 0; qid <= leafNum; qid++) {
+ //unsigned long sumq = lmap_seq_quartet_info[qid].countarr[0] + lmap_seq_quartet_info[qid].countarr[1] + lmap_seq_quartet_info[qid].countarr[2] + lmap_seq_quartet_info[qid].countarr[3] + lmap_seq_quartet_info[qid].countarr[4] + lmap_seq_quartet_info[qid].countarr[5] + lmap_seq_quartet_info[qid].countarr[6];
+ unsigned long resolved = lmap_seq_quartet_info[qid].countarr[LM_REG1] + lmap_seq_quartet_info[qid].countarr[LM_REG2] + lmap_seq_quartet_info[qid].countarr[LM_REG3];
+ unsigned long partly = lmap_seq_quartet_info[qid].countarr[LM_REG4] + lmap_seq_quartet_info[qid].countarr[LM_REG5] + lmap_seq_quartet_info[qid].countarr[LM_REG6];
+ unsigned long unres = lmap_seq_quartet_info[qid].countarr[LM_REG7];
+ unsigned long sumq0, sumq = lmap_seq_quartet_info[qid].countarr[7] + lmap_seq_quartet_info[qid].countarr[8] + lmap_seq_quartet_info[qid].countarr[9];
+ if (sumq>0) sumq0=sumq;
+ else sumq0=1;
+
+ if (qid < leafNum) {
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << setw(4) << qid+1
+ << setw(9) << sumq
+ << setw(7) << resolved
+ << " (" << setw(6) << (double) 100.0*resolved/sumq0 << ") "
+ << setw(7) << partly
+ << " (" << setw(6) << (double) 100.0*partly/sumq0 << ") "
+ << setw(7) << unres
+ << " (" << setw(6) << (double) 100.0*unres/sumq0 << ") "
+ << PhyloTree::aln->getSeqName(qid) << endl;
+ } else {
+ out << "-----------------------------------------------------------------------------" << endl;
+ out.setf(ios::fixed, ios::floatfield); // set fixed floating format
+ out.precision(2);
+ out << " "
+ << setw(9) << sumq
+ << setw(7) << resolved
+ << " (" << setw(6) << (double) 100.0*resolved/sumq0 << ") "
+ << setw(7) << partly
+ << " (" << setw(6) << (double) 100.0*partly/sumq0 << ") "
+ << setw(7) << unres
+ << " (" << setw(6) << (double) 100.0*unres/sumq0 << ") " << endl;
+ }
+ }
+
+ resolved = areacount[0] + areacount[1] + areacount[2];
+ partly = areacount[3] + areacount[4] + areacount[5];
+ unresolved = areacount[6];
+
+ // out << endl << "LIKELIHOOD MAPPING ANALYSIS" << endl << endl;
+ // out << "Number of quartets: " << (resolved+partly+unresolved)
+ // << " (randomly drawn with replacement)" << endl << endl;
+ out << "Overall quartet resolution:" << endl << endl;
+ out << "Number of fully resolved quartets (regions 1+2+3): " << resolved
+ << " (=" << 100.0 * resolved/(resolved+partly+unresolved) << "%)" << endl;
+ out << "Number of partly resolved quartets (regions 4+5+6): " << partly
+ << " (=" << 100.0 * partly/(resolved+partly+unresolved) << "%)" << endl;
+ out << "Number of unresolved quartets (region 7) : " << unresolved
+ << " (=" << 100.0 * unresolved/(resolved+partly+unresolved) << "%)" << endl << endl;
+
+} // end PhyloTree::reportLikelihoodMapping
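A minimal standalone sketch (not part of the patch, toy numbers only) of how the seven per-quartet region counters collapse into the resolved / partly resolved / unresolved totals that reportLikelihoodMapping() prints above: regions 1-3 are the corners, 4-6 the rectangles, 7 the center, indexed 0..6 as in the LM_REG1..LM_REG7 convention.

#include <cstdio>

int main() {
    unsigned long areacount[7] = {850, 720, 690, 120, 95, 110, 415};          // made-up counts
    unsigned long resolved   = areacount[0] + areacount[1] + areacount[2];    // corners 1+2+3
    unsigned long partly     = areacount[3] + areacount[4] + areacount[5];    // rectangles 4+5+6
    unsigned long unresolved = areacount[6];                                  // center 7
    unsigned long total = resolved + partly + unresolved;
    std::printf("resolved %lu (%.2f%%)  partly %lu (%.2f%%)  unresolved %lu (%.2f%%)\n",
                resolved,   100.0 * resolved   / total,
                partly,     100.0 * partly     / total,
                unresolved, 100.0 * unresolved / total);
    return 0;
}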
diff --git a/splitgraph.cpp b/splitgraph.cpp
index 915b08d..3479c14 100644
--- a/splitgraph.cpp
+++ b/splitgraph.cpp
@@ -158,6 +158,58 @@ void SplitGraph::init(Params &params)
}
+
+void SplitGraph::saveCheckpoint() {
+ if (empty()) return;
+ int ntax = getNTaxa();
+// checkpoint->startStruct("S");
+ CKP_SAVE(ntax);
+ int nsplits = size();
+ CKP_SAVE(nsplits);
+ checkpoint->startList(size());
+ for (iterator it = begin(); it != end(); it++) {
+ checkpoint->addListElement();
+ stringstream ss;
+ ss << (*it)->getWeight();
+ for (int i = 0; i < ntax; i++)
+ if ((*it)->containTaxon(i))
+ ss << " " << i;
+ checkpoint->put("", ss.str());
+ }
+ checkpoint->endList();
+// checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void SplitGraph::restoreCheckpoint() {
+ int ntax, nsplits;
+ CheckpointFactory::restoreCheckpoint();
+// checkpoint->startStruct("S");
+
+ if (!CKP_RESTORE(ntax)) return;
+ CKP_RESTORE(nsplits);
+ checkpoint->startList(nsplits);
+ for (int split = 0; split < nsplits; split++) {
+ checkpoint->addListElement();
+ string str;
+ assert(checkpoint->getString("", str));
+ stringstream ss(str);
+ double weight;
+ ss >> weight;
+ Split *sp = new Split(ntax, weight);
+ for (int i = 0; i < ntax; i++) {
+ int tax;
+ if (ss >> tax) {
+ sp->addTaxon(tax);
+ } else
+ break;
+ }
+ push_back(sp);
+ }
+ checkpoint->endList();
+// checkpoint->endStruct();
+}
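A small round-trip sketch (not part of the patch) of the per-split string format used by saveCheckpoint()/restoreCheckpoint() above: each checkpoint list entry is the split weight followed by the indices of the taxa the split contains, separated by spaces.

#include <iostream>
#include <sstream>
#include <vector>

int main() {
    // "save" side: weight 0.75, split contains taxa 0, 2 and 5
    std::stringstream save;
    save << 0.75;
    int taxa[] = {0, 2, 5};
    for (int i = 0; i < 3; i++)
        save << " " << taxa[i];

    // "restore" side: read the weight back, then taxon indices until the end
    std::stringstream restore(save.str());
    double weight;
    restore >> weight;
    std::vector<int> members;
    int tax;
    while (restore >> tax)
        members.push_back(tax);

    std::cout << "weight=" << weight << " ntaxa=" << members.size() << std::endl;
    return 0;
}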
+
int SplitGraph::getNTrivialSplits() {
int count = 0;
for (iterator it = begin(); it != end(); it++)
@@ -701,3 +753,16 @@ int SplitGraph::findLeafName(string &name) {
return i;
return -1;
}
+
+int SplitGraph::removeTrivialSplits() {
+ int removed = 0;
+ for (iterator itg = begin(); itg != end(); ) {
+ if ((*itg)->trivial() >= 0) {
+ removed++;
+ delete (*itg);
+ (*itg) = back();
+ pop_back();
+ } else itg++;
+ }
+ return removed;
+}
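For reference, a standalone sketch (not part of the patch) of the swap-with-back erase idiom that removeTrivialSplits() above relies on, written with an index instead of an iterator: overwrite the doomed element with back() and pop, which removes in O(1) but does not preserve element order; the index is deliberately not advanced so the moved-in element is examined on the next pass.

#include <cstddef>
#include <vector>

int remove_negatives(std::vector<int> &v) {
    int removed = 0;
    for (std::size_t i = 0; i < v.size(); ) {
        if (v[i] < 0) {
            v[i] = v.back();   // move the last element into the hole
            v.pop_back();      // shrink; index i is re-checked on the next pass
            removed++;
        } else {
            i++;
        }
    }
    return removed;
}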
diff --git a/splitgraph.h b/splitgraph.h
index 6002a67..f7e5dd8 100644
--- a/splitgraph.h
+++ b/splitgraph.h
@@ -31,7 +31,7 @@
#include "node.h"
#include "splitset.h"
#include "mtree.h"
-
+#include "checkpoint.h"
class MTreeSet;
@@ -42,7 +42,7 @@ SplitGraph class
@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
*/
-class SplitGraph : public vector<Split*>
+class SplitGraph : public vector<Split*>, public CheckpointFactory
{
public:
@@ -70,6 +70,16 @@ public:
@param params program parameters
*/
    void init(Params &params);
+
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
/**
if no taxa block found, but the sets block is present, then
@@ -368,6 +378,11 @@ public:
*/
void calcDistance(matrix(double) &dist, vector<int> &taxa_order);
+ /**
+ * remove all trivial splits
+ * @return number of trivial splits removed
+ */
+ int removeTrivialSplits();
protected:
diff --git a/stoprule.cpp b/stoprule.cpp
index f0afc7e..a5018bf 100644
--- a/stoprule.cpp
+++ b/stoprule.cpp
@@ -20,7 +20,7 @@
#include "stoprule.h"
#include "timeutil.h"
-StopRule::StopRule()
+StopRule::StopRule() : CheckpointFactory()
{
// nTime_ = 0;
predicted_iteration = 0;
@@ -52,6 +52,26 @@ void StopRule::initialize(Params &params) {
StopRule::~StopRule()
{
}
+
+void StopRule::saveCheckpoint() {
+ checkpoint->startStruct("StopRule");
+ CKP_SAVE(curIteration);
+ CKP_SAVE(start_real_time);
+ CKP_VECTOR_SAVE(time_vec);
+ checkpoint->endStruct();
+ CheckpointFactory::saveCheckpoint();
+}
+
+void StopRule::restoreCheckpoint() {
+ CheckpointFactory::restoreCheckpoint();
+ checkpoint->startStruct("StopRule");
+ CKP_RESTORE(curIteration);
+ CKP_RESTORE(start_real_time);
+ CKP_VECTOR_RESTORE(time_vec);
+ checkpoint->endStruct();
+}
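The same save/restore shape recurs for every checkpointable object in this patch; a sketch of the pattern with a hypothetical class name (assuming, as the calls above suggest, that CheckpointFactory, the checkpoint pointer and the CKP_SAVE/CKP_RESTORE macros all come from checkpoint.h).

#include "checkpoint.h"

class MyStage : public CheckpointFactory {   // hypothetical example class
    int curIteration;
    double start_real_time;
public:
    virtual void saveCheckpoint() {
        checkpoint->startStruct("MyStage");  // namespace the keys
        CKP_SAVE(curIteration);
        CKP_SAVE(start_real_time);
        checkpoint->endStruct();
        CheckpointFactory::saveCheckpoint(); // then defer to the base class
    }
    virtual void restoreCheckpoint() {
        CheckpointFactory::restoreCheckpoint();
        checkpoint->startStruct("MyStage");
        CKP_RESTORE(curIteration);
        CKP_RESTORE(start_real_time);
        checkpoint->endStruct();
    }
};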
+
+
//
//int StopRule::getNumIterations() {
// if (stop_condition == SC_FIXED_ITERATION || predicted_iteration == 0)
diff --git a/stoprule.h b/stoprule.h
index 1619e8f..972a714 100644
--- a/stoprule.h
+++ b/stoprule.h
@@ -21,13 +21,13 @@
#define STOPRULE_H
#include "tools.h"
-
+#include "checkpoint.h"
/**
Stopping rule
@author BUI Quang Minh <minh.bui at univie.ac.at>
*/
-class StopRule
+class StopRule : public CheckpointFactory
{
public:
@@ -42,6 +42,16 @@ public:
*/
~StopRule();
+ /**
+ save object into the checkpoint
+ */
+ virtual void saveCheckpoint();
+
+ /**
+ restore object from the checkpoint
+ */
+ virtual void restoreCheckpoint();
+
/**
read improved iteration number from a file
@param fileName file name
diff --git a/superalignment.cpp b/superalignment.cpp
index 542f0b0..6d71ca0 100644
--- a/superalignment.cpp
+++ b/superalignment.cpp
@@ -319,9 +319,9 @@ void SuperAlignment::createBootstrapAlignment(IntVector &pattern_freq, const cha
}
-void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
+void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spec, int *rstream) {
if (!isSuperAlignment()) outError("Internal error: ", __func__);
- if (spec && strncmp(spec, "GENE", 4) != 0) outError("Unsupported yet. ", __func__);
+// if (spec && strncmp(spec, "GENE", 4) != 0) outError("Unsupported yet. ", __func__);
if (spec && strncmp(spec, "GENE", 4) == 0) {
// resampling whole genes
@@ -333,12 +333,12 @@ void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spe
}
memset(pattern_freq, 0, nptn * sizeof(int));
for (int i = 0; i < partitions.size(); i++) {
- int part = random_int(partitions.size());
+ int part = random_int(partitions.size(), rstream);
Alignment *aln = partitions[part];
if (strncmp(spec,"GENESITE",8) == 0) {
// then resampling sites in resampled gene
for (int j = 0; j < aln->getNSite(); j++) {
- int ptn_id = aln->getPatternID(random_int(aln->getNPattern()));
+ int ptn_id = aln->getPatternID(random_int(aln->getNPattern(), rstream));
pattern_freq[ptn_id + part_pos[part]]++;
}
@@ -351,7 +351,10 @@ void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spe
// resampling sites within genes
int offset = 0;
for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
- (*it)->createBootstrapAlignment(pattern_freq + offset);
+ if (spec && strncmp(spec, "SCALE=", 6) == 0)
+ (*it)->createBootstrapAlignment(pattern_freq + offset, spec, rstream);
+ else
+ (*it)->createBootstrapAlignment(pattern_freq + offset, NULL, rstream);
offset += (*it)->getNPattern();
}
}
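A toy standalone sketch (not part of the patch) of what the GENE / GENESITE branches above do: draw whole partitions with replacement, and for GENESITE additionally resample sites inside each drawn partition. rand() stands in here for the SPRNG-backed random_int(); the partition lengths are made up.

#include <cstdio>
#include <cstdlib>

int main() {
    const int nparts = 3;
    int gene_nsites[nparts] = {300, 150, 450};  // toy partition lengths
    int contributed[nparts] = {0, 0, 0};
    std::srand(1);
    for (int i = 0; i < nparts; i++) {
        int part = std::rand() % nparts;        // draw a gene with replacement
        contributed[part] += gene_nsites[part]; // GENE: take the whole gene;
                                                // GENESITE would instead draw
                                                // gene_nsites[part] sites within it
    }
    for (int p = 0; p < nparts; p++)
        std::printf("partition %d contributes %d sites\n", p + 1, contributed[p]);
    return 0;
}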
@@ -429,20 +432,26 @@ SuperAlignment::~SuperAlignment()
partitions.clear();
}
-void SuperAlignment::printCombinedAlignment(ostream &out, bool append) {
+void SuperAlignment::printCombinedAlignment(ostream &out, bool print_taxid) {
vector<Alignment*>::iterator pit;
int final_length = 0;
for (pit = partitions.begin(); pit != partitions.end(); pit++)
- final_length += (*pit)->getNSite();
+ if ((*pit)->seq_type == SEQ_CODON)
+ final_length += 3*(*pit)->getNSite();
+ else
+ final_length += (*pit)->getNSite();
out << getNSeq() << " " << final_length << endl;
- StrVector::iterator it;
int max_len = getMaxSeqNameLength();
+ if (print_taxid) max_len = 10;
if (max_len < 10) max_len = 10;
- int seq_id = 0;
- for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
+ int seq_id;
+ for (seq_id = 0; seq_id < seq_names.size(); seq_id++) {
out.width(max_len);
- out << left << (*it) << " ";
+ if (print_taxid)
+ out << left << seq_id << " ";
+ else
+ out << left << seq_names[seq_id] << " ";
int part = 0;
for (pit = partitions.begin(); pit != partitions.end(); pit++, part++) {
int part_seq_id = taxa_index[seq_id][part];
@@ -460,13 +469,6 @@ void SuperAlignment::printCombinedAlignment(ostream &out, bool append) {
}
void SuperAlignment::printCombinedAlignment(const char *file_name, bool append) {
- vector<Alignment*>::iterator pit;
- int final_length = 0;
- for (pit = partitions.begin(); pit != partitions.end(); pit++)
- if ((*pit)->seq_type == SEQ_CODON)
- final_length += 3*(*pit)->getNSite();
- else
- final_length += (*pit)->getNSite();
try {
ofstream out;
out.exceptions(ios::failbit | ios::badbit);
@@ -475,28 +477,7 @@ void SuperAlignment::printCombinedAlignment(const char *file_name, bool append)
out.open(file_name, ios_base::out | ios_base::app);
else
out.open(file_name);
- out << getNSeq() << " " << final_length << endl;
- StrVector::iterator it;
- int max_len = getMaxSeqNameLength();
- if (max_len < 10) max_len = 10;
- int seq_id = 0;
- for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
- out.width(max_len);
- out << left << (*it) << " ";
- int part = 0;
- for (pit = partitions.begin(); pit != partitions.end(); pit++, part++) {
- int part_seq_id = taxa_index[seq_id][part];
- int nsite = (*pit)->getNSite();
- if (part_seq_id >= 0) {
- for (int i = 0; i < nsite; i++)
- out << (*pit)->convertStateBackStr((*pit)->getPattern(i) [part_seq_id]);
- } else {
- string str(nsite, '?');
- out << str;
- }
- }
- out << endl;
- }
+ printCombinedAlignment(out);
out.close();
cout << "Concatenated alignment was printed to " << file_name << endl;
} catch (ios::failure) {
diff --git a/superalignment.h b/superalignment.h
index 7ae6a11..c3552ad 100644
--- a/superalignment.h
+++ b/superalignment.h
@@ -149,8 +149,9 @@ public:
resampling pattern frequency by a non-parametric bootstrap
@param pattern_freq (OUT) resampled pattern frequencies
@param spec bootstrap specification, see above
+ @param rstream random generator stream, NULL to use the global randstream
*/
- virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL);
+ virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL);
/**
* shuffle alignment by randomizing the order of sites over all sub-alignments
@@ -181,7 +182,13 @@ public:
*/
void printCombinedAlignment(const char *filename, bool append = false);
- void printCombinedAlignment(ostream &out, bool append = false);
+ /**
+ * print the super-alignment to a stream
+ * @param out output stream
+ * @param print_taxid true to print taxa IDs instead of names, default: false
+ */
+
+ void printCombinedAlignment(ostream &out, bool print_taxid = false);
/**
* print all sub alignments into files with prefix, suffix is the charset name
diff --git a/tools.cpp b/tools.cpp
index 3281686..fc01036 100644
--- a/tools.cpp
+++ b/tools.cpp
@@ -770,6 +770,7 @@ void parseArg(int argc, char *argv[], Params &params) {
params.SSE = LK_EIGEN_SSE;
params.lk_no_avx = false;
params.print_site_lh = WSL_NONE;
+ params.print_site_state_freq = 0;
params.print_site_rate = false;
params.print_site_posterior = 0;
params.print_tree_lh = false;
@@ -881,7 +882,13 @@ void parseArg(int argc, char *argv[], Params &params) {
params.freq_const_patterns = NULL;
params.no_rescale_gamma_invar = false;
params.compute_seq_identity_along_tree = false;
+ params.lmap_num_quartets = 0;
+ params.lmap_cluster_file = NULL;
+ params.print_lmap_quartet_lh = false;
params.link_alpha = false;
+ params.ignore_checkpoint = false;
+ params.checkpoint_dump_interval = 20;
+ params.force_unfinished = false;
if (params.nni5) {
@@ -895,7 +902,7 @@ void parseArg(int argc, char *argv[], Params &params) {
// initialize random seed based on current time
gettimeofday(&tv, &tz);
//params.ran_seed = (unsigned) (tv.tv_sec+tv.tv_usec);
- params.ran_seed = (unsigned) (tv.tv_usec);
+ params.ran_seed = (tv.tv_usec);
for (cnt = 1; cnt < argc; cnt++) {
try {
@@ -1153,7 +1160,7 @@ void parseArg(int argc, char *argv[], Params &params) {
params.tree_gen = BALANCED;
continue;
}
- if (strcmp(argv[cnt], "-keep_ident") == 0) {
+ if (strcmp(argv[cnt], "-keep_ident") == 0 || strcmp(argv[cnt], "-keep-ident") == 0) {
params.ignore_identical_seqs = false;
continue;
}
@@ -1239,7 +1246,7 @@ void parseArg(int argc, char *argv[], Params &params) {
cnt++;
if (cnt >= argc)
throw "Use -seed <random_seed>";
- params.ran_seed = (unsigned) convert_int(argv[cnt]);
+ params.ran_seed = abs(convert_int(argv[cnt]));
continue;
}
if (strcmp(argv[cnt], "-pdgain") == 0) {
@@ -1490,7 +1497,7 @@ void parseArg(int argc, char *argv[], Params &params) {
params.do_weighted_test = true;
continue;
}
- if (strcmp(argv[cnt], "-zau") == 0) {
+ if (strcmp(argv[cnt], "-au") == 0) {
params.do_au_test = true;
continue;
}
@@ -1853,7 +1860,7 @@ void parseArg(int argc, char *argv[], Params &params) {
if (cnt >= argc)
throw "Use -fs <site_freq_file>";
params.site_freq_file = argv[cnt];
- params.SSE = LK_NORMAL;
+// params.SSE = LK_EIGEN;
continue;
}
@@ -2164,6 +2171,10 @@ void parseArg(int argc, char *argv[], Params &params) {
params.print_site_posterior = 1;
continue;
}
+ if (strcmp(argv[cnt], "-wsf") == 0) {
+ params.print_site_state_freq = 1;
+ continue;
+ }
if (strcmp(argv[cnt], "-wba") == 0) {
params.print_bootaln = true;
continue;
@@ -2560,6 +2571,7 @@ void parseArg(int argc, char *argv[], Params &params) {
continue;
}
if (strcmp(argv[cnt], "-pll") == 0) {
+ outError("-pll option is discontinued.");
params.pll = true;
continue;
}
@@ -2814,11 +2826,55 @@ void parseArg(int argc, char *argv[], Params &params) {
continue;
}
+ if (strcmp(argv[cnt], "-lmap") == 0) {
+ cnt++;
+ if (cnt >= argc)
+ throw "Use -lmap <likelihood_mapping_num_quartets>";
+ params.lmap_num_quartets = convert_int(argv[cnt]);
+ if (params.lmap_num_quartets < 1)
+ throw "Number of quartets must be >= 1";
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "-lmclust") == 0) {
+ cnt++;
+ if (cnt >= argc)
+ throw "Use -lmclust <likelihood_mapping_cluster_file>";
+ params.lmap_cluster_file = argv[cnt];
+ // '-keep_ident' is currently required to allow a 1-to-1 mapping of the
+        // user-given groups (HAS) - possibly obsolete in future versions
+ params.ignore_identical_seqs = false;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "-wql") == 0) {
+ params.print_lmap_quartet_lh = true;
+ continue;
+ }
+
if (strcmp(argv[cnt], "--link-alpha") == 0) {
params.link_alpha = true;
continue;
}
+ if (strcmp(argv[cnt], "-redo") == 0) {
+ params.ignore_checkpoint = true;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "--force-unfinish") == 0) {
+ params.force_unfinished = true;
+ continue;
+ }
+
+ if (strcmp(argv[cnt], "-cptime") == 0) {
+ cnt++;
+ if (cnt >= argc)
+ throw "Use -cptime <checkpoint_time_interval>";
+ params.checkpoint_dump_interval = convert_int(argv[cnt]);
+ continue;
+ }
+
if (argv[cnt][0] == '-') {
string err = "Invalid \"";
err += argv[cnt];
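For orientation only (file names and the -s alignment flag are placeholders, not taken from this hunk): a clustered likelihood-mapping run with the new options would look something like "iqtree -s example.phy -lmap 2000 -lmclust clusters.nex -wql", while -redo forces a fresh run and -cptime 60 would raise the checkpoint dump interval to 60 seconds.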
@@ -2955,8 +3011,8 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -q <partition_file> Edge-linked partition model (file in NEXUS/RAxML format)" << endl
<< " -spp <partition_file> Like -q option but allowing partition-specific rates" << endl
<< " -sp <partition_file> Edge-unlinked partition model (like -M option of RAxML)" << endl
- << " -t <start_tree_file> | BIONJ | RANDOM" << endl
- << " Starting tree (default: 100 parsimony trees and BIONJ)" << endl
+ << " -t <start_tree_file> or -t BIONJ or -t RANDOM" << endl
+        << "                       Starting tree (default: 99 parsimony trees and BIONJ)" << endl
<< " -te <user_tree_file> Like -t but fixing user tree (no tree search performed)" << endl
<< " -o <outgroup_taxon> Outgroup taxon name for writing .treefile" << endl
<< " -pre <PREFIX> Using <PREFIX> for output files (default: aln/partition)" << endl
@@ -2965,8 +3021,16 @@ void usage_iqtree(char* argv[], bool full_command) {
#endif
<< " -seed <number> Random seed number, normally used for debugging purpose" << endl
<< " -v, -vv, -vvv Verbose mode, printing more messages to screen" << endl
+ << " -keep-ident Keep identical sequences (default: remove & finally add)" << endl
+ << endl << "CHECKPOINTING TO RESUME STOPPED RUN:" << endl
+ << " -redo Redo analysis even for successful runs (default: resume)" << endl
+ << " -cptime <seconds> Minimum checkpoint time interval (default: 20)" << endl
+ << endl << "LIKELIHOOD MAPPING ANALYSIS:" << endl
+ << " -lmap <#quartets> Number of quartets for likelihood mapping analysis" << endl
+ << " -lmclust <clustfile> NEXUS file containing clusters for likelihood mapping" << endl
+ << " -wql Print quartet log-likelihoods to .quartetlh file" << endl
<< endl << "NEW STOCHASTIC TREE SEARCH ALGORITHM:" << endl
- << " -pll Use phylogenetic likelihood library (PLL) (default: off)" << endl
+// << " -pll Use phylogenetic likelihood library (PLL) (default: off)" << endl
<< " -numpars <number> Number of initial parsimony trees (default: 100)" << endl
<< " -toppars <number> Number of best parsimony trees (default: 20)" << endl
<< " -sprrad <number> Radius for parsimony SPR search (default: 6)" << endl
@@ -3056,7 +3120,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " Invar, Gamma, Invar+Gamma, or FreeRate model where 'n' is" << endl
<< " number of categories (default: n=4)" << endl
<< " -a <Gamma_shape> Gamma shape parameter for site rates (default: estimate)" << endl
- << " -gmedian Computing mean for Gamma rate category (default: mean)" << endl
+ << " -gmedian Median approximation for +G site rates (default: mean)" << endl
<< " --test-alpha More thorough estimation for +I+G model parameters" << endl
<< " -i <p_invar> Proportion of invariable sites (default: estimate)" << endl
<< " -mh Computing site-specific rates to .mhrate file using" << endl
@@ -3099,6 +3163,7 @@ void usage_iqtree(char* argv[], bool full_command) {
<< " -z <trees_file> Evaluating a set of user trees" << endl
<< " -zb <#replicates> Performing BP,KH,SH,ELW tests for trees passed via -z" << endl
<< " -zw Also performing weighted-KH and weighted-SH tests" << endl
+ << " -au Also performing approximately unbiased (AU) test" << endl
<< endl;
cout << "GENERATING RANDOM TREES:" << endl;
@@ -3394,32 +3459,44 @@ int finish_random() {
int *randstream;
-int init_random(int seed) {
+int init_random(int seed, bool write_info, int** rstream) {
// srand((unsigned) time(NULL));
if (seed < 0)
seed = make_sprng_seed();
#ifndef PARALLEL
- cout << "(Using SPRNG - Scalable Parallel Random Number Generator)" << endl;
- randstream = init_sprng(0, 1, seed, SPRNG_DEFAULT); /*init stream*/
- if (verbose_mode >= VB_MED) {
- print_sprng(randstream);
+ if (write_info)
+ cout << "(Using SPRNG - Scalable Parallel Random Number Generator)" << endl;
+ if (rstream) {
+ *rstream = init_sprng(0, 1, seed, SPRNG_DEFAULT); /*init stream*/
+ } else {
+ randstream = init_sprng(0, 1, seed, SPRNG_DEFAULT); /*init stream*/
+ if (verbose_mode >= VB_MED) {
+ print_sprng(randstream);
+ }
}
#else /* PARALLEL */
- if (PP_IamMaster) {
+ if (PP_IamMaster && write_info) {
cout << "(Using SPRNG - Scalable Parallel Random Number Generator)" << endl;
}
/* MPI_Bcast(&seed, 1, MPI_UNSIGNED, PP_MyMaster, MPI_COMM_WORLD); */
- randstream = init_sprng(PP_Myid, PP_NumProcs, seed, SPRNG_DEFAULT); /*initialize stream*/
- if (verbose_mode >= VB_MED) {
- cout << "(" << PP_Myid << ") !!! random seed set to " << seed << " !!!" << endl;
- print_sprng(randstream);
+ if (rstream) {
+ *rstream = init_sprng(PP_Myid, PP_NumProcs, seed, SPRNG_DEFAULT); /*initialize stream*/
+ } else {
+ randstream = init_sprng(PP_Myid, PP_NumProcs, seed, SPRNG_DEFAULT); /*initialize stream*/
+ if (verbose_mode >= VB_MED) {
+ cout << "(" << PP_Myid << ") !!! random seed set to " << seed << " !!!" << endl;
+ print_sprng(randstream);
+ }
}
#endif /* PARALLEL */
return (seed);
} /* initrandom */
-int finish_random() {
- return free_sprng(randstream);
+int finish_random(int *rstream) {
+ if (rstream)
+ return free_sprng(rstream);
+ else
+ return free_sprng(randstream);
}
#endif /* USE_SPRNG */
@@ -3427,8 +3504,8 @@ int finish_random() {
/******************/
/* returns a random integer in the range [0; n - 1] */
-int random_int(int n) {
- return (int) floor(random_double() * n);
+int random_int(int n, int *rstream) {
+ return (int) floor(random_double(rstream) * n);
} /* randominteger */
//int randint(int a, int b) {
@@ -3436,19 +3513,25 @@ int random_int(int n) {
//}
//
-double random_double() {
+double random_double(int *rstream) {
#ifndef FIXEDINTRAND
#ifndef PARALLEL
#if RAN_TYPE == RAN_STANDARD
return ((double) rand()) / ((double) RAND_MAX + 1);
#elif RAN_TYPE == RAN_SPRNG
- return sprng(randstream);
+ if (rstream)
+ return sprng(rstream);
+ else
+ return sprng(randstream);
#else /* NO_SPRNG */
return randomunitintervall();
#endif /* NO_SPRNG */
#else /* NOT PARALLEL */
#if RAN_TYPE == RAN_SPRNG
- return sprng(randstream);
+ if (rstream)
+ return sprng(rstream);
+ else
+ return sprng(randstream);
#else /* NO_SPRNG */
int m;
for (m = 1; m < PP_NumProcs; m++)
diff --git a/tools.h b/tools.h
index b7808f3..ef24f80 100644
--- a/tools.h
+++ b/tools.h
@@ -950,7 +950,7 @@ public:
/**
random number seed
*/
- unsigned int ran_seed;
+ int ran_seed;
/**
run time of the algorithm
@@ -1335,6 +1335,12 @@ public:
*/
SiteLoglType print_site_lh;
+ /**
+ 0: print nothing
+ 1: print site state frequency vectors
+ */
+ int print_site_state_freq;
+
/** TRUE to print site-specific rates, default: FALSE */
bool print_site_rate;
@@ -1675,6 +1681,25 @@ public:
/** true to compute sequence identity along tree */
bool compute_seq_identity_along_tree;
+
+ /** true to ignore checkpoint file */
+ bool ignore_checkpoint;
+ /** number of quartets for likelihood mapping */
+ int lmap_num_quartets;
+
+ /**
+ file containing the cluster information for clustered likelihood mapping
+ */
+ char *lmap_cluster_file;
+
+ /** time (in seconds) between checkpoint dump */
+ int checkpoint_dump_interval;
+ /** TRUE to print quartet log-likelihoods to .quartetlh file */
+ bool print_lmap_quartet_lh;
+
+ /** true if ignoring the "finished" flag in checkpoint file */
+ bool force_unfinished;
+
};
/**
@@ -2069,22 +2094,25 @@ double computePValueChiSquare(double x, int df);
/* random number generator */
/*--------------------------------------------------------------*/
+extern int *randstream;
+
/**
* initialize the random number generator
* @param seed seed for generator
+ * @param write_info true to write information, false otherwise (default)
*/
-int init_random(int seed);
+int init_random(int seed, bool write_info = false, int** rstream = NULL);
/**
* finalize random number generator (e.g. free memory
*/
-int finish_random();
+int finish_random(int *rstream = NULL);
/**
* returns a random integer in the range [0; n - 1]
* @param n upper-bound of random number
*/
-int random_int(int n);
+int random_int(int n, int *rstream = NULL);
/**
* return a random integer in the range [a,b]
@@ -2095,12 +2123,12 @@ int random_int(int n);
* returns a random integer in the range [0; RAND_MAX - 1]
* = random_int(RAND_MAX)
*/
-int random_int();
+//int random_int(int *rstream = NULL);
/**
 * returns a random floating-point number in the range [0; 1)
*/
-double random_double();
+double random_double(int *rstream = NULL);
template <class T>
void my_random_shuffle (T first, T last)
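A short usage sketch (not part of the patch) for the per-stream interface declared above; giving each bootstrap replicate its own stream is an assumption about intent, but the signatures match the declarations in this header.

#include "tools.h"

void bootstrap_replicate(int seed) {
    int *rstream = NULL;
    init_random(seed, false, &rstream); // allocate a private SPRNG stream
    int r = random_int(100, rstream);   // in [0; 99], drawn from that stream
    double u = random_double(rstream);  // in [0; 1)
    (void) r; (void) u;                 // placeholders for real work
    finish_random(rstream);             // free the private stream
}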
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git