[med-svn] [dindel] 01/02: Imported Upstream version 1.01+dfsg
Steffen Möller
moeller at moszumanska.debian.org
Fri Mar 18 15:37:00 UTC 2016
This is an automated email from the git hooks/post-receive script.
moeller pushed a commit to branch master
in repository dindel.
commit 6328b6425902329c9fc70bc71092e9b83a07ab51
Author: Steffen Möller <moeller at debian.org>
Date: Fri Mar 18 16:32:43 2016 +0100
Imported Upstream version 1.01+dfsg
---
DInDel.cpp | 4309 +++++++++++++++++++++++++++++++++++++++++++++
DInDel.hpp | 397 +++++
Fasta.hpp | 72 +
Faster.cpp | 785 +++++++++
Faster.hpp | 101 ++
GetCandidates.cpp | 498 ++++++
GetCandidates.hpp | 107 ++
HapBlock.cpp | 204 +++
HapBlock.hpp | 57 +
Haplotype.hpp | 389 ++++
HaplotypeDistribution.cpp | 486 +++++
HaplotypeDistribution.hpp | 498 ++++++
Library.hpp | 258 +++
MLAlignment.hpp | 78 +
Makefile | 15 +
MyBam.hpp | 98 ++
ObservationModel.hpp | 103 ++
ObservationModelFB.cpp | 1829 +++++++++++++++++++
ObservationModelFB.hpp | 169 ++
ObservationModelSeqAn.hpp | 377 ++++
OutputData.hpp | 121 ++
README | 15 +
Read.hpp | 451 +++++
ReadIndelErrorModel.hpp | 57 +
StringHash.hpp | 40 +
Utils.hpp | 51 +
Variant.hpp | 179 ++
VariantFile.hpp | 304 ++++
digamma.hpp | 450 +++++
foreach.hpp | 812 +++++++++
30 files changed, 13310 insertions(+)
diff --git a/DInDel.cpp b/DInDel.cpp
new file mode 100644
index 0000000..a292606
--- /dev/null
+++ b/DInDel.cpp
@@ -0,0 +1,4309 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <stdlib.h>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <set>
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <boost/math/special_functions/digamma.hpp>
+#include <seqan/align.h>
+#include <seqan/graph_align.h>
+#include "foreach.hpp"
+#include "bam.h"
+#include "DInDel.hpp"
+#include "Haplotype.hpp"
+#include "HaplotypeDistribution.hpp"
+#include "ObservationModelFB.hpp"
+#include "Utils.hpp"
+#include "faidx.h"
+#include "GetCandidates.hpp"
+#include "ObservationModelSeqAn.hpp"
+#include "VariantFile.hpp"
+#include "Faster.hpp"
+#include <ext/hash_map>
+#include <exception>
+
+const int USECALLWINDOW=0; // NOTE(review): not referenced in the visible part of this file -- confirm it is still needed
+//using namespace seqan;
+using namespace seqan;
+namespace po = boost::program_options; // short alias for the Boost.Program_options namespace
+
+using namespace std;
+//using namespace fasta;
+
+
+/*
+ Constructs the detector: optionally loads the indexed reference sequence and
+ creates one MyBam per input BAM file.
+
+ bfName         path of a single BAM file (multipleFiles==0), otherwise path of
+                a text file in which the first whitespace-delimited token of
+                every non-empty line is the path of a BAM file
+ _params        run parameters; copied into the params member
+ multipleFiles  0: bfName is one BAM file; non-zero: bfName is a list file
+
+ Throws std::string("File open error.") when the list file cannot be opened.
+ Terminates the process with exit status 1 when params.alignAgainstReference
+ is set but the reference index cannot be loaded.
+*/
+DetInDel::DetInDel(const string & bfName, const Parameters & _params, int multipleFiles) : params(_params)
+{
+    fai=NULL;
+    if (params.alignAgainstReference) {
+        fai = fai_load(params.refFileName.c_str());
+        if (!fai) {
+            cerr << "Cannot open reference sequence file." << endl;
+            params.alignAgainstReference=false;
+            exit(1);
+        }
+    }
+
+    if (multipleFiles==0) {
+        myBams.push_back(new MyBam(bfName));
+        return;
+    }
+
+    ifstream file(bfName.c_str());
+    if (!file.is_open()) {
+        cout << "Cannot open file with BAM files: " << bfName << endl;
+        throw string("File open error.");
+    }
+
+    // Let getline() drive the loop instead of testing eof(): a stream that
+    // fails without reaching end-of-file (e.g. bfName names a directory)
+    // never sets eofbit, which made the previous loop spin forever.
+    string line;
+    while (getline(file, line)) {
+        istringstream is(line);
+        string fname;
+        is >> fname; // only the first token of the line is used
+        if (fname.empty()) continue; // blank or whitespace-only line
+
+        cout << "Reading BAM file " << fname << endl;
+        myBams.push_back(new MyBam(fname));
+        myBamsFileNames.push_back(fname);
+    }
+    // the ifstream closes itself when it goes out of scope
+}
+
+/*
+ Releases the reference index (when one was loaded for alignment against the
+ reference) and deletes every MyBam stored in myBams.
+*/
+DetInDel::~DetInDel()
+{
+    const bool indexLoaded = params.alignAgainstReference && fai;
+    if (indexLoaded) fai_destroy(fai);
+
+    size_t idx=0;
+    while (idx!=myBams.size()) {
+        delete myBams[idx];
+        ++idx;
+    }
+}
+/*
+ Prints, to stdout, alignments of reads against the two haplotypes in hp1.
+
+ When params.analyzeLowFreqDiffThreshold is below -100, every read is aligned
+ to both haplotypes and printed under "h.1" if its likelihood for hp1.first is
+ >= that for hp1.second, and under "h.2" if the likelihood for hp1.second is
+ >= that for hp1.first (a tie is printed under both).
+
+ Otherwise only reads for which (lp - lm) exceeds the threshold are collected,
+ where lm / lp are the likelihoods under hp1.first / hp1.second, and each such
+ read is printed aligned to both haplotypes. If no read qualifies a single
+ message is printed instead.
+
+ leftPos is forwarded to every ObservationModelFBMax; rightPos is not used.
+ NOTE(review): ll, l1 and l2 are accumulated in the second branch but are only
+ consumed by the commented-out debug line, so they have no visible effect.
+*/
+void DetInDel::analyzeDifference(const pair<Haplotype, Haplotype> & hp1, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos)
+{
+ cout << "Inference results" << endl;
+
+ if (params.analyzeLowFreqDiffThreshold<-100.0) {
+ size_t offset=50;
+ cout << "Haplotype pair: " << endl;
+ cout << hp1.first << endl << hp1.second << endl;
+
+ // first pass: reads that fit hp1.first at least as well as hp1.second
+ cout << "h.1 alignment: " << endl;
+ cout << string(offset,' ') << hp1.first.seq << endl;
+ for (size_t r=0;r<reads.size();r++) {
+ ObservationModelFBMax om(hp1.first, reads[r], leftPos,params.obsParams);
+ MLAlignment ml=om.calcLikelihood();
+
+ double lm=ml.ll;
+ ObservationModelFBMax op(hp1.second, reads[r], leftPos, params.obsParams);
+ MLAlignment ml2=op.calcLikelihood();
+ double lp=ml2.ll;
+
+ if (lm>=lp) om.printAlignment(offset);
+ }
+ // second pass: same likelihoods are recomputed, now printing reads that fit hp1.second at least as well
+ cout << "h.2 alignment: " << endl;
+ cout << string(offset,' ') << hp1.second.seq << endl;
+ for (size_t r=0;r<reads.size();r++) {
+ ObservationModelFBMax om(hp1.first, reads[r], leftPos,params.obsParams);
+ MLAlignment ml=om.calcLikelihood();
+
+ double lm=ml.ll;
+ ObservationModelFBMax op(hp1.second, reads[r], leftPos, params.obsParams);
+ MLAlignment ml2=op.calcLikelihood();
+ double lp=ml2.ll;
+
+ if (lp>=lm) op.printAlignment(offset);
+
+
+ }
+
+
+
+ } else {
+ // threshold mode: remember indices of reads whose lp-lm exceeds the threshold
+ double ll=0.0,l1=0.0, l2=0.0;
+ vector<size_t > show;
+ for (size_t r=0;r<reads.size();r++) {
+ ObservationModelFBMax om(hp1.first, reads[r], leftPos,params.obsParams);
+ MLAlignment ml=om.calcLikelihood();
+
+ double lm=ml.ll;
+ ObservationModelFBMax op(hp1.second, reads[r], leftPos, params.obsParams);
+ MLAlignment ml2=op.calcLikelihood();
+ double lp=ml2.ll;
+ double dll=log(exp(lp)+exp(lm))+log(.5);
+ l1+=(addLogs(lm,lm)+log(.5));
+ l2+=(addLogs(lp,lp)+log(.5));
+ ll+=dll;
+
+ if (lp-lm>params.analyzeLowFreqDiffThreshold) {
+ show.push_back(r);
+ }
+ // cout << "read[" << r <<"]: 1-mq: " << 1.0-reads[r].mapQual << " first hap lik: " << lm << " second hap lik: " << lp << " combined: " << dll << " lp+lm: " << ll << " lm+lm: " << l1 << " lp+lp: " << l2 << endl;
+
+ }
+
+ size_t offset=50;
+ if (show.size()) {
+ cout << "Haplotype pair: " << endl;
+ cout << hp1.first << endl << hp1.second << endl;
+
+
+ cout << "h.1 alignment: " << endl;
+ cout << string(offset,' ') << hp1.first.seq << endl;
+
+ for (size_t i=0;i<show.size();i++) {
+ Read rr=reads[show[i]];
+ ObservationModelFBMax om2(hp1.first, rr, leftPos, params.obsParams);
+ om2.calcLikelihood();
+
+ // om2.computeMarginals();
+ om2.printAlignment(offset);
+ }
+
+ cout << endl << endl;
+ cout << "h.2 alignment: " << endl;
+ cout << string(offset,' ') << hp1.second.seq << endl;
+
+ for (size_t i=0;i<show.size();i++) {
+ Read rr=reads[show[i]];
+ ObservationModelFBMax om2(hp1.second, rr, leftPos, params.obsParams);
+ om2.calcLikelihood();
+ // om2.computeMarginals();
+ om2.printAlignment(offset);
+ }
+ }
+ else { cout << "No differences in log-likelihoods over threshold." << endl; };
+ }
+
+}
+
+/*
+ Prints, for each of the two haplotypes in hp1, the alignment of every read
+ whose log-likelihood under that haplotype is below
+ params.analyzeLowFreqDiffThreshold, followed by the total log-likelihood
+ obtained by combining both per-read likelihoods with addLogs and adding
+ log(0.5) per read. rightPos is not used.
+*/
+void DetInDel::showAlignments(const pair<Haplotype, Haplotype> & hp1, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos)
+{
+    cout << "Inference results" << endl;
+
+    const int offset=50;
+    const size_t numReads=reads.size();
+    vector<double> firstLik(numReads, 0.0);
+
+    cout << "h.1 alignment: " << endl;
+    cout << string(offset,' ') << hp1.first.seq << endl;
+    for (size_t i=0; i<numReads; ++i) {
+        ObservationModelFBMax model(hp1.first, reads[i], leftPos, params.obsParams);
+        firstLik[i]=model.getLogLikelihood();
+        if (firstLik[i]<params.analyzeLowFreqDiffThreshold) model.printAlignment(offset);
+    }
+
+    double total=0.0;
+    cout << "h.2 alignment: " << endl;
+    cout << string(offset,' ') << hp1.second.seq << endl;
+    for (size_t i=0; i<numReads; ++i) {
+        ObservationModelFBMax model(hp1.second, reads[i], leftPos, params.obsParams);
+        const double secondLik=model.getLogLikelihood();
+        if (secondLik<params.analyzeLowFreqDiffThreshold) model.printAlignment(offset);
+        // each haplotype contributes with weight 1/2
+        total+=addLogs(firstLik[i], secondLik)+log(.5);
+    }
+    cout << "Total loglikelihood: " << total << endl;
+}
+
+/*
+ Prints every haplotype together with the alignments of the reads for which
+ that haplotype has the highest likelihood (ties go to the lowest haplotype
+ index).
+
+ haps     candidate haplotypes
+ reads    reads to assign
+ liks     liks[h][r].ll is the log-likelihood of read r given haplotype h
+ candPos  not used
+ leftPos  forwarded to ObservationModelFBMax when re-aligning for display
+
+ Throws std::string when liks does not hold one entry per (haplotype, read)
+ pair, or when there are reads but no haplotypes; previously these cases
+ indexed past the end of liks / maxHap.
+*/
+void DetInDel::showAlignmentsPerHaplotype(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, uint32_t candPos, uint32_t leftPos)
+{
+    cout << "ALIGNMENTS" << endl;
+
+    if (haps.empty() && !reads.empty()) throw string("showAlignmentsPerHaplotype: no haplotypes.");
+    if (liks.size()<haps.size()) throw string("showAlignmentsPerHaplotype: likelihoods have not been computed.");
+    for (size_t h=0;h<haps.size();h++) {
+        if (liks[h].size()<reads.size()) throw string("showAlignmentsPerHaplotype: likelihoods have not been computed.");
+    }
+
+    // maxHap[h] collects the indices of the reads that fit haplotype h best
+    vector<std::set<size_t> > maxHap(haps.size());
+    for (size_t r=0;r<reads.size();r++) {
+        size_t idx=0;
+        double ml=-HUGE_VAL;
+        for (size_t h=0;h<haps.size();h++) {
+            if (liks[h][r].ll>ml) {
+                ml=liks[h][r].ll;
+                idx=h;
+            }
+        }
+        maxHap[idx].insert(r);
+    }
+
+    int offset=50;
+    for (size_t h=0;h<haps.size();h++) {
+        cout << "*******************************************" << endl;
+        cout << endl << "HAPLOTYPE " << h << endl << endl;
+        cout << string(offset,' ') << haps[h].seq << endl;
+        BOOST_FOREACH(size_t r, maxHap[h]) {
+            ObservationModelFBMax om(haps[h], reads[r], leftPos,params.obsParams);
+            om.calcLikelihood();
+            om.printAlignment(offset);
+        }
+    }
+}
+
+
+
+
+/*
+ Fetches the reference sequence for region "params.tid:lpos-rpos" from the
+ loaded faidx index and returns it in upper case.
+
+ lpos, rpos  region bounds, passed to fai_fetch unchanged
+
+ Throws std::string("FAI error.") when no index is loaded and
+ std::string("faidx error: len==0") when nothing could be fetched.
+*/
+string DetInDel::getRefSeq(uint32_t lpos, uint32_t rpos)
+{
+    if (!fai) throw string("FAI error.");
+
+    // Build the region with a stream: the positions are unsigned, so the old
+    // sprintf("%d") printed values above INT_MAX as negative numbers.
+    ostringstream region;
+    region << params.tid << ":" << lpos << "-" << rpos;
+
+    int len=0;
+    char *ref = fai_fetch(fai, region.str().c_str(), &len);
+    if (ref==NULL || len<=0) {
+        // fai_fetch can return NULL on failure; never hand that to std::string,
+        // and release whatever was returned before throwing.
+        free(ref);
+        throw string("faidx error: len==0");
+    }
+
+    string res(ref);
+    free(ref);
+
+    transform(res.begin(), res.end(), res.begin(), ::toupper);
+    return res;
+}
+
+
+/*
+ Selects the haplotype pair with the highest log-likelihood.
+
+ h1, h2    receive copies of the two haplotypes of the best pair
+ hpl       receives a copy of the best HapPairLik
+ haps      haplotypes indexed by HapPairLik::h1 / HapPairLik::h2
+ likPairs  candidate pairs; the first pair with the maximal ll wins
+
+ Returns the maximal ll, or -HUGE_VAL when no pair has a larger ll (the first
+ pair is then reported).
+ Throws std::string when likPairs is empty; the previous code indexed likPairs
+ with an uninitialised value in both of these situations.
+*/
+double DetInDel::getMaxHap(Haplotype & h1, Haplotype &h2, HapPairLik & hpl, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs)
+{
+    if (likPairs.empty()) throw string("getMaxHap: no haplotype pairs.");
+
+    size_t midx=0;
+    double maxll=-HUGE_VAL;
+    for (size_t idx=0;idx<likPairs.size();idx++) {
+        const double ll=likPairs[idx].ll;
+        if (ll>maxll) {
+            maxll=ll;
+            midx=idx;
+        }
+    }
+
+    h1=haps[likPairs[midx].h1];
+    h2=haps[likPairs[midx].h2];
+    hpl=likPairs[midx];
+    return maxll;
+}
+
+/*
+ Writes one line describing the most likely haplotype pair (as chosen by
+ getMaxHap) to *output: the prefix, the pair's likelihood and read counts,
+ both haplotype sequences, and then four '!'-separated lists of
+ "[position,startRead,variant]" entries -- indels of the first haplotype,
+ indels of the second, SNPs of the first, SNPs of the second. Variants whose
+ string is "*REF" are omitted.
+*/
+void DetInDel::outputMaxHap(ostream *output, const string & prefix, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs)
+{
+    typedef map<int, AlignedVariant>::const_iterator VarIt;
+
+    Haplotype first, second;
+    HapPairLik best;
+    getMaxHap(first, second, best, haps, likPairs);
+
+    ostream & out = *output;
+    out << prefix << " " << best.ll << " " << best.numFirst << " " << best.numSecond << " ";
+    out << best.numIndFirst << " " << best.numIndSecond << " " << best.numOffBoth << " ";
+    out << first.seq << " " << second.seq << " ";
+
+    for (VarIt v=first.indels.begin(); v!=first.indels.end(); ++v) {
+        if (v->second.getString()!="*REF") {
+            out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+        }
+    }
+    out << "!";
+    for (VarIt v=second.indels.begin(); v!=second.indels.end(); ++v) {
+        if (v->second.getString()!="*REF") {
+            out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+        }
+    }
+    out << "!";
+    for (VarIt v=first.snps.begin(); v!=first.snps.end(); ++v) {
+        if (v->second.getString()!="*REF") {
+            out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+        }
+    }
+    out << "!";
+    for (VarIt v=second.snps.begin(); v!=second.snps.end(); ++v) {
+        if (v->second.getString()!="*REF") {
+            out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+        }
+    }
+    out << endl;
+}
+
+/*
+ Writes the first n entries of likPairs (or all of them when there are fewer)
+ to *output, one line per pair: prefix, 1-based rank, likelihood and read
+ counts, both haplotype sequences, and four '!'-separated lists of
+ "[position,startRead,variant]" entries (indels of the first haplotype, indels
+ of the second, SNPs of the first, SNPs of the second). Variants whose string
+ is "*REF" are omitted. The pairs are written in the order they are stored.
+*/
+void DetInDel::outputTopHaps(ostream *output, const string & prefix, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs, int n)
+{
+    typedef map<int, AlignedVariant>::const_iterator VarIt;
+
+    ostream & out = *output;
+    const int available = int(likPairs.size());
+    const int numPairs = (n<available) ? n : available;
+
+    for (int rank=0; rank<numPairs; ++rank) {
+        const HapPairLik & pairLik = likPairs[rank];
+        const Haplotype & first = haps[pairLik.h1];
+        const Haplotype & second = haps[pairLik.h2];
+
+        out << prefix << " " << rank+1 << " " << pairLik.ll << " " << pairLik.numFirst << " " << pairLik.numSecond << " ";
+        out << pairLik.numIndFirst << " " << pairLik.numIndSecond << " " << pairLik.numOffBoth << " ";
+        out << first.seq << " " << second.seq << " ";
+
+        for (VarIt v=first.indels.begin(); v!=first.indels.end(); ++v) {
+            if (v->second.getString()!="*REF") {
+                out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+            }
+        }
+        out << "!";
+        for (VarIt v=second.indels.begin(); v!=second.indels.end(); ++v) {
+            if (v->second.getString()!="*REF") {
+                out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+            }
+        }
+        out << "!";
+        for (VarIt v=first.snps.begin(); v!=first.snps.end(); ++v) {
+            if (v->second.getString()!="*REF") {
+                out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+            }
+        }
+        out << "!";
+        for (VarIt v=second.snps.begin(); v!=second.snps.end(); ++v) {
+            if (v->second.getString()!="*REF") {
+                out << "[" << v->first << "," << v->second.getStartRead() << "," << v->second.getString() << "]";
+            }
+        }
+        out << endl;
+    }
+}
+
+/*
+ Writes one line per haplotype to *output: prefix, 1-based haplotype number,
+ freqs[h], and a '|'-terminated list of "position,variant" entries for every
+ indel of the haplotype whose string is not "*REF". Positions are printed as
+ leftPos plus the indel's key in the haplotype's indel map.
+*/
+void DetInDel::outputHapsAndFreqs(ostream *output, const string & prefix, const vector<Haplotype> & haps, const vector<double> & freqs, uint32_t leftPos)
+{
+    typedef map<int, AlignedVariant>::const_iterator VarIt;
+
+    ostream & out = *output;
+    for (size_t idx=0; idx!=haps.size(); ++idx) {
+        const Haplotype & hap = haps[idx];
+        out << prefix << " " << idx+1 << " " << freqs[idx] << " ";
+        for (VarIt v=hap.indels.begin(); v!=hap.indels.end(); ++v) {
+            if (v->second.getString()!="*REF") {
+                out << leftPos+v->first << "," << v->second.getString() << "|";
+            }
+        }
+        out << endl;
+    }
+}
+
+
+/*
+ Runs the haplotype-based analysis for one window.
+
+ Steps visible in this function:
+  1. getHaplotypes() builds the candidate haplotypes; leftPos/rightPos are
+     passed by value here, so any adjustment only affects this call's copies.
+  2. If reads.size()*haps.size() exceeds params.maxHapReadProd a std::string
+     starting with "skipped_numhap_times_numread>" is thrown.
+  3. If getHaplotypes() asked to skip, a message is written to cerr and nothing
+     else happens.
+  4. With params.estimateHapFreqs: read/haplotype likelihoods are computed
+     (slow or fast variant), estimateHaplotypeFrequenciesBayesEM() is run with
+     glfData, and optionally a realigned BAM file is written (only when
+     params.slower is also set) using each read's best haplotype.
+  5. With params.showHapAlignments: per-haplotype alignments are printed.
+  6. With params.doDiploid: likelihoods are computed again, diploidGLF() is
+     run, and optionally a realigned BAM file is written using the better of
+     the two haplotypes of the most likely pair for each read.
+
+ NOTE(review): liks is only filled inside steps 4 and 6, yet step 5 reads it;
+ with showHapAlignments set and estimateHapFreqs unset it is still empty there.
+ NOTE(review): the command passed to system() is assembled from parameter
+ strings without quoting -- confirm these values are trusted.
+*/
+void DetInDel::empiricalDistributionMethod(int index, const vector<Read> & reads, uint32_t pos, uint32_t leftPos, uint32_t rightPos, const AlignedCandidates & candidateVariants, OutputData & oData, OutputData & glfData)
+{
+ vector<Haplotype> haps;
+
+ vector<vector<MLAlignment> > liks;
+ vector<HapPairLik> likPairs;
+
+
+
+ // get the haplotypes
+ // changes leftPos and rightPos to haplotype blocks in HDIterator
+
+ // NOTE leftPos will be the left position of the reference sequence each haplotype will be aligned to
+ bool skip=getHaplotypes(haps, reads, pos, leftPos, rightPos, candidateVariants);
+
+ if (int(reads.size()*haps.size())>params.maxHapReadProd) {
+ stringstream os;
+ os << "skipped_numhap_times_numread>" << params.maxHapReadProd;
+ throw os.str();
+ }
+ // NOTE(review): the size check above runs, and may throw, even when skip is true
+ int refSeqPos=leftPos;
+
+ if (skip) {
+ cerr << "tid: " << params.tid << " pos: " << pos << " SKIPPING!" << endl;
+ }
+ else {
+
+ if (!params.quiet) cout << "[empiricalDistributionMethod] Number of haplotypes: " << haps.size() << endl;
+ // compute likelihood of every read given every haplotype
+
+ if (params.estimateHapFreqs) {
+ vector<double> hapFreqs;
+ map <int, vector<tuple<AlignedVariant, double, double> > > posteriors;
+ vector<HapEstResult> her;
+
+ OutputData::Line prefilledLine(oData);
+ prefilledLine.set("index", index);
+ prefilledLine.set("tid", params.tid);
+ prefilledLine.set("center_position",pos);
+ prefilledLine.set("num_reads", reads.size());
+ prefilledLine.set("msg","ok");
+ // string rseq = getRefSeq(leftPos+1, rightPos+1);
+ prefilledLine.set("lpos",leftPos);
+ prefilledLine.set("rpos",rightPos);
+ // prefilledLine.set("refseq", rseq);
+
+
+ vector<int> onHap(reads.size(),1); // which reads were mapped inside the haplotype window given an artificially high mapping quality
+
+ if (params.slower) {
+ computeLikelihoods(haps, reads, liks, leftPos, rightPos, onHap);
+ } else {
+ computeLikelihoodsFaster(haps, reads, liks, leftPos, rightPos, onHap);
+ // int nrOffAll=0; // number of reads mapped outside all haplotypes
+ // for (size_t x=0;x<onHap.size();x++) if (!onHap[x]) nrOffAll++;
+ }
+
+ // count reads with mapQual above 1-1e-6, and among those the ones flagged offHap for every haplotype
+ int numReadOffAllHaps=0;
+ int numHQReads=0;
+ for (size_t r=0;r<reads.size();r++) if (reads[r].mapQual > (1.0-1e-6) ) {
+ numHQReads++;
+ int offall=1;
+ for (size_t h=0;h<haps.size();h++) if (!liks[h][r].offHap) offall=0;
+ if (offall) numReadOffAllHaps++;
+ }
+ prefilledLine.set("num_off_hap", numReadOffAllHaps);
+ prefilledLine.set("num_hqreads", numHQReads);
+
+
+ //estimateHaplotypeFrequenciesPosterior(haps, reads,liks, hapFreqs, posteriors, pos, leftPos, glfOutput);
+ /*
+ estimateHaplotypeFrequenciesBayesEM(haps, reads,liks, hapFreqs, her, pos, leftPos, glfOutput, "all");
+ BOOST_FOREACH(HapEstResult hr, her) {
+ //cout << "EMA " << params.tid << " " << pos << " " << leftPos+hr.pos << " " <<reads.size() << " " << hr.av.getString() << " " << hr.prob << " " << hr.freq << " " << hr.freq*double(reads.size()) << " " << hr.nrf << " " << hr.nrr << endl;
+ OutputData::Line EMALine = prefilledLine;
+ EMALine.set("analysis_type", "EMA");
+ EMALine.set("realigned_position", leftPos+hr.pos);
+ EMALine.set("first_called_all", hr.av.getString());
+ EMALine.set("post_prob_variant", hr.prob);
+ EMALine.set("est_freq", hr.freq);
+ EMALine.set("first_var_cover_forward", hr.nrf);
+ EMALine.set("first_var_cover_reverse", hr.nrr);
+ oData.output(EMALine);
+ }
+ */
+
+ hapFreqs.clear();
+ her.clear();
+ estimateHaplotypeFrequenciesBayesEM(haps, reads,liks, hapFreqs, her, pos, leftPos, rightPos, glfData, index, candidateVariants, params.bayesType);
+
+ for (size_t x=0;x<her.size();x++) {
+ HapEstResult & hr = her[x];
+ //cout << "EMSV " << params.tid << " " << pos << " " << leftPos+hr.pos << " " <<reads.size() << " " << hr.av.getString() << " " << hr.prob << " " << hr.freq << " " << hr.freq*double(reads.size()) << " " << hr.nrf << " " << hr.nrr << endl;
+
+ int var_in_window=0;
+ const AlignedVariant & avar = hr.av;
+ const AlignedVariant *av = candidateVariants.findVariant(hr.pos+leftPos, avar.getType(), avar.getString());
+ if (av!=NULL) {
+ var_in_window=1;
+ }
+
+ OutputData::Line EMSVLine = prefilledLine;
+ EMSVLine.set("analysis_type", params.bayesType);
+ EMSVLine.set("realigned_position", leftPos+hr.pos);
+ EMSVLine.set("first_called_all", hr.av.getString());
+ EMSVLine.set("post_prob_variant", hr.prob);
+ EMSVLine.set("est_freq", hr.freq);
+ EMSVLine.set("first_var_cover_forward", hr.nrf);
+ EMSVLine.set("first_var_cover_reverse", hr.nrr);
+ EMSVLine.set("was_candidate_in_window",var_in_window);
+ // oData.output(EMSVLine);
+ }
+
+ // NOTE(review): EMSVLine is filled in above but never written, because the oData.output call is commented out
+
+
+ if (params.outputRealignedBAM && params.slower) {
+ stringstream os;
+ os << index << "_" << params.tid << "_" << leftPos+params.minReadOverlap << "_" << rightPos-params.minReadOverlap << ".bam";
+
+ vector < CIGAR > cigars(reads.size());
+ for (size_t r=0;r<reads.size();r++) {
+ if (onHap[r]) {
+ double llmax=-HUGE_VAL;
+ int hidx=0;
+ for (size_t h=0;h<haps.size();h++) if (liks[h][r].ll>llmax) {
+ llmax=liks[h][r].ll;
+ hidx=h;
+ }
+ cigars[r]=getCIGAR(haps[hidx], reads[r], liks[hidx][r], refSeqPos);
+ }
+ }
+ int leftOk = leftPos + params.minReadOverlap;
+ int rightOk = rightPos - params.minReadOverlap;
+
+ string newBAMFileName=params.fileName;
+ newBAMFileName.append(".ra.").append(os.str());
+ writeRealignedBAMFile(newBAMFileName, cigars, reads, onHap, myBams[0]->bh);
+ // optional post-processing: any value other than "no" is run as a shell command
+ if (params.processRealignedBAM != "no") {
+ stringstream cmd;
+ cmd << params.processRealignedBAM << " " << newBAMFileName << " " << params.fileName << "_realigned" << " " << params.tid << " " << leftOk << " " << rightOk;
+ cout << "Executing: " << cmd.str() << endl;
+ system(cmd.str().c_str());
+ }
+
+
+ /*
+ newBAMFileName=params.fileName;
+ newBAMFileName.append(".ua.").append(os.str());
+ writeUnalignedBAMFile(newBAMFileName, reads, onHap, myBams[0].bh);
+ */
+ }
+
+ }
+ if (params.showHapAlignments) {
+ showAlignmentsPerHaplotype(haps, reads, liks, pos, leftPos);
+ }
+
+ if (params.doDiploid) {
+ // cout << "A" << endl;
+ vector<double> hapFreqs;
+ map <int, vector<tuple<AlignedVariant, double, double> > > posteriors;
+ vector<HapEstResult> her;
+
+ vector<int> onHap(reads.size(),1); // which reads were mapped inside the haplotype window given an artificially high mapping quality
+
+ if (params.slower) {
+ computeLikelihoods(haps, reads, liks, leftPos, rightPos, onHap);
+ } else {
+ computeLikelihoodsFaster(haps, reads, liks, leftPos, rightPos, onHap);
+ }
+
+ diploidGLF(haps, reads, liks, hapFreqs, her, pos, leftPos, rightPos, glfData, index, candidateVariants,"dip");
+
+
+ //statisticsHaplotypePair(haps,reads,liks, hpl, prefilledLine);
+
+ /*
+ string prefix("FMAP");
+
+
+ callIndel(haps, reads, liks, likPairs, pos, leftPos, rightPos, prefix, prefilledLine, oData);
+ callSNP(haps, reads, liks, likPairs, pos, leftPos, rightPos, prefix, prefilledLine, oData);
+ */
+ /*
+ if (!(nci==0 && ncs==0 && params.printCallsOnly)) {
+ stringstream os; os << " " << params.tid << " " << pos << " " << leftPos; prefix.append(os.str());
+ outputTopHaps(&output, prefix, haps, likPairs, params.numOutputTopHap);
+ }
+ */
+
+ /*
+ if (params.analyzeLowFreq) {
+ pair<Haplotype, Haplotype> oh;
+ oh.first=h2;
+ oh.second=h1;
+ analyzeDifference(oh, reads, leftPos, rightPos);
+
+ oh.first=h1;
+ oh.second=h2;
+ analyzeDifference(oh, reads, leftPos, rightPos);
+ //oh.first.printHaps();
+ //oh.second.printHaps();
+ }
+ */
+
+ if (params.outputRealignedBAM && params.slower) {
+ // computes pair likelihoods using priors
+ computePairLikelihoods(haps, reads, liks, likPairs, true,candidateVariants, leftPos);
+ Haplotype h1, h2; HapPairLik hpl;
+ getMaxHap(h1, h2, hpl, haps, likPairs);
+
+ vector < CIGAR > cigars(reads.size());
+ for (size_t r=0;r<reads.size();r++) {
+ int hmax = hpl.h1;
+ if (fabs(liks[hpl.h1][r].ll-liks[hpl.h2][r].ll)<1e-8) {
+ if (haps[hpl.h1].countIndels()<haps[hpl.h2].countIndels()) hmax = hpl.h1; else hmax = hpl.h2;
+ } else {
+ if (liks[hpl.h1][r].ll>liks[hpl.h2][r].ll) {
+ hmax = hpl.h1;
+ } else {
+ hmax = hpl.h2;
+ }
+ }
+ const Haplotype & hx = haps[hmax];
+ const Read & rd = reads[r];
+ const MLAlignment & ml = liks[hmax][r];
+ cigars[r]=getCIGAR(haps[hmax], reads[r], liks[hmax][r], refSeqPos);
+ }
+ // NOTE(review): hx, rd and ml above are bound but not used by the getCIGAR call
+ stringstream os;
+ int leftOk = leftPos + params.minReadOverlap;
+ int rightOk = rightPos - params.minReadOverlap;
+ os << index << "_" << params.tid << "_" << leftPos+params.minReadOverlap << "_" << rightPos-params.minReadOverlap << ".bam";
+ string newBAMFileName=params.fileName;
+ newBAMFileName.append(".ra.").append(os.str());
+ writeRealignedBAMFile(newBAMFileName, cigars, reads, onHap, myBams[0]->bh);
+ /*
+ newBAMFileName=params.fileName;
+ newBAMFileName.append(".ua.").append(os.str());
+ writeUnalignedBAMFile(newBAMFileName, reads, onHap, myBams[0]->bh);
+ */
+ if (params.processRealignedBAM != "no") {
+ stringstream cmd;
+ cmd << params.processRealignedBAM << " " << newBAMFileName << " " << params.fileName << "_realigned" << " " << params.tid << " " << leftOk << " " << rightOk;
+ cout << "Executing: " << cmd.str() << endl;
+ system(cmd.str().c_str());
+ }
+
+
+ }
+
+
+ }
+// glf.writeToFile(string(""), output);
+ }
+
+}
+
+
+
+/*
+ Writes every read r with onHap[r]==0 to a new BAM file.
+
+ fileName  path of the BAM file to create
+ reads     all reads of the window
+ onHap     one flag per read; reads whose flag is zero are written
+ bh        header to write first, or NULL to write no header
+
+ Does nothing (and creates no file) when onHap and reads differ in size or
+ when no read has a zero flag.
+ Throws std::string when the file cannot be opened or a record cannot be
+ written; the file handle is now closed before a write error is reported
+ (it used to leak).
+*/
+void DetInDel::writeUnalignedBAMFile(const string & fileName, const vector<Read> & reads, const vector<int> & onHap, const bam_header_t *bh=NULL)
+{
+    if (onHap.size()!=reads.size()) return;
+
+    bool hasUnaligned=false;
+    for (size_t x=0;x<onHap.size();x++) if (!onHap[x]) {
+        hasUnaligned=true;
+        break;
+    }
+    if (!hasUnaligned) return;
+
+    bamFile bf = bam_open(fileName.c_str(),"wb");
+    if (bf==NULL) throw string("Cannot open bamfile ").append(fileName).append(" for writing!");
+
+    if (bh!=NULL) {
+        bam_header_write(bf, bh);
+    }
+
+    for (size_t r=0;r<reads.size();r++) {
+        if (onHap[r]) continue;
+        bam1_t *b=reads[r].bam;
+        if (bam_write1(bf,b)<=0) {
+            bam_close(bf); // do not leak the handle on failure
+            throw string("Error writing to unalignable read to bamfile.");
+        }
+    }
+
+    bam_close(bf);
+}
+
+/*
+ Writes all reads to a new BAM file, replacing the CIGAR, position and insert
+ size of every read r with onHap[r]!=0 by the values derived from cigars[r].
+ Reads with onHap[r]==0 are written unchanged.
+
+ fileName  path of the BAM file to create
+ cigars    one CIGAR per read; cigars[r].refPos becomes the new position
+ reads     reads to write
+ onHap     one flag per read selecting which reads get the new CIGAR
+ bh        header to write first, or NULL to write no header
+
+ Throws std::string when cigars or onHap do not match reads in size, when the
+ file cannot be opened, when memory for a record cannot be allocated, or when
+ a record cannot be written. The file handle and the temporary record are
+ released before an error is reported (both used to leak).
+*/
+void DetInDel::writeRealignedBAMFile(const string & fileName, const vector<CIGAR> & cigars, const vector<Read> & reads, const vector<int> & onHap, const bam_header_t *bh=NULL)
+{
+    if (cigars.size()!=reads.size()) throw string("Problem with the cigars.");
+    // onHap is indexed with the read index below, so it must cover every read
+    if (onHap.size()!=reads.size()) throw string("Problem with the onHap flags.");
+
+    bamFile bf = bam_open(fileName.c_str(),"wb");
+    if (bf==NULL) throw string("Cannot open bamfile ").append(fileName).append(" for writing!");
+
+    if (bh!=NULL) {
+        bam_header_write(bf, bh);
+    }
+
+    for (size_t r=0;r<reads.size();r++) {
+        bam1_t *b=reads[r].bam;
+
+        if (!onHap[r]) {
+            if (bam_write1(bf,b)<=0) {
+                bam_close(bf);
+                throw string("Error writing alignment to realigned BAM file.");
+            }
+            continue;
+        }
+
+        bam1_t *nb=bam_init1();
+
+        uint32_t old_ncig=b->core.n_cigar;
+        uint32_t new_ncig=cigars[r].size();
+        int old_data_len=b->data_len;
+        // only the CIGAR part of the record changes size (4 bytes per operation)
+        int new_data_len=old_data_len - old_ncig*4 + new_ncig*4;
+
+        // start from a shallow copy, then give the copy its own data buffer
+        *nb=*b;
+        nb->data = (uint8_t*)calloc(new_data_len, 1);
+        if (nb->data==NULL) {
+            bam_destroy1(nb);
+            bam_close(bf);
+            throw string("Error writing alignment to realigned BAM file.");
+        }
+        nb->data_len=new_data_len;
+        nb->m_data=nb->data_len;
+
+        // copy cigar
+        for (uint32_t n=0;n<new_ncig;n++) {
+            bam1_cigar(nb)[n]=( (((uint32_t) cigars[r][n].second) << BAM_CIGAR_SHIFT) | ( ( (uint32_t) cigars[r][n].first ) ) );
+        }
+        nb->core.n_cigar=(unsigned int) new_ncig;
+
+        // query name precedes the CIGAR in the record data
+        for (size_t n=0;n<b->core.l_qname;n++) {
+            nb->data[n]=b->data[n];
+        }
+
+        // everything after the old CIGAR is copied behind the new CIGAR
+        int y=b->core.l_qname+4*new_ncig;
+        for (int n=b->core.l_qname+4*old_ncig;n<b->data_len;n++,y++) {
+            nb->data[y]=b->data[n];
+        }
+
+        // update position of read
+        nb->core.pos=cigars[r].refPos;
+        // update insert size if mapped
+        nb->core.isize=cigars[r].refPos-nb->core.mpos;
+
+        const bool written = bam_write1(bf,nb)>0;
+        bam_destroy1(nb);
+        if (!written) {
+            bam_close(bf);
+            throw string("Error writing alignment to realigned BAM file.");
+        }
+    }
+
+    bam_close(bf);
+}
+
+
+/*
+ Derives the CIGAR of a read relative to the reference by composing the
+ read-to-haplotype alignment (ml) with the haplotype-to-reference alignment
+ (hap.ml).
+
+ hap          haplotype the read was aligned to; hap.ml must cover every
+              haplotype base
+ read         the read
+ ml           alignment of the read to hap; ml.hpos must cover every read base
+ refSeqStart  added to the position of the first aligned base to obtain the
+              CIGAR's refPos
+
+ Leading and trailing read bases without a non-negative reference position
+ become soft clips. If no base has one, the result is a single soft clip
+ spanning the whole read (refPos is then left as default-constructed).
+
+ Throws std::string when the inputs are not aligned or when the sequence of
+ positions is inconsistent with the CIGAR operation being built.
+*/
+DetInDel::CIGAR DetInDel::getCIGAR(const Haplotype & hap, const Read & read, const MLAlignment & ml, int refSeqStart)
+{
+    if (hap.ml.hpos.size()!=hap.size()) throw string("Haplotype has not been aligned!");
+    if (ml.hpos.size()!=read.size()) throw string("Read is not properly aligned!");
+    const MLAlignment & hml=hap.ml; // alignment of haplotype to reference
+
+    // npos records position of read base on the reference sequence; negative
+    // codes from the read alignment are carried over unchanged
+    vector<int> npos(read.size());
+    for (int i=0;i<int(read.size());i++) {
+        if (ml.hpos[i]>=0) npos[i]=hml.hpos[ml.hpos[i]]; else npos[i]=ml.hpos[i];
+    }
+
+    CIGAR cig;
+
+    // determine last base in read aligned to the haplotype; the b>=0 test
+    // keeps the scan from reading npos[-1] when no base is aligned at all
+    int b=read.size()-1;
+    while (b>=0 && npos[b]<0) b--;
+    const int lastbonh=b;
+
+    if (lastbonh<0) {
+        // clip the whole read, read is de facto off haplotype
+        cig.push_back(CIGAR::CIGOp(BAM_CSOFT_CLIP, read.size()));
+        return cig;
+    }
+
+    // find first base in read that is aligned to haplotype and to reference
+    // all sequence before that is considered 'soft clipped', ie not aligned. This may include sequence that matches perfectly to the reference
+    // (the scan is bounded because npos[lastbonh] is known to be >= 0)
+    b=0;
+    while (npos[b]<0) b++;
+    if (b>0) cig.push_back(CIGAR::CIGOp(BAM_CSOFT_CLIP, b));
+
+    // position on reference of the previous base aligned on the reference (ie no deletion/insertion/LO/RO)
+    int prevponr=npos[b];
+    cig.refPos=refSeqStart+prevponr;
+
+    int curr_cop=BAM_CMATCH;
+    int len_curr_cop=1;
+
+    while (b<lastbonh) {
+
+        int chp=npos[b]; // position on reference of current base in read
+        int nhp=npos[b+1]; // position on reference of next base in read
+
+        if (nhp==MLAlignment::INS) {
+            if (chp==MLAlignment::INS) {
+                // stay on inserted sequence
+                if (curr_cop!=BAM_CINS) throw string("Error(1)!");
+                len_curr_cop++;
+            } else {
+                if (chp>=0) {
+                    // going from on reference to insertions
+                    if (curr_cop!=BAM_CMATCH) throw string("Error(2)!");
+                    // write CIGAR
+                    cig.push_back(CIGAR::CIGOp(BAM_CMATCH,len_curr_cop));
+
+                    // update current CIGAR operation
+                    len_curr_cop=1;
+                    curr_cop=BAM_CINS;
+
+                    prevponr=chp;
+                } else throw string("How is this possible? (1)");
+            }
+
+        } else if (chp>=0 && nhp>=0 && nhp-chp==1) {
+            // MATCH to MATCH
+            if (curr_cop!=BAM_CMATCH) {
+                cout << "b: " << b << " chp: " << chp << " nhp: " << nhp << endl;
+                throw string("Error(3)!");
+            }
+            len_curr_cop++;
+            prevponr=nhp;
+        } else if (chp>=0 && nhp>=0 && nhp-chp>1) {
+            // deletion
+            if (curr_cop!=BAM_CMATCH) throw string("Error(4)!");
+            // write CIGAR
+            cig.push_back(CIGAR::CIGOp(BAM_CMATCH,len_curr_cop));
+
+            // write deletion CIGAR
+            cig.push_back(CIGAR::CIGOp(BAM_CDEL,nhp-chp-1));
+
+            curr_cop=BAM_CMATCH;
+            len_curr_cop=1;
+
+            prevponr=nhp;
+        } else if (chp==MLAlignment::INS && nhp-prevponr==1) {
+            // from inserted bases to bases matched to the reference
+            cig.push_back(CIGAR::CIGOp(BAM_CINS,len_curr_cop));
+
+            curr_cop=BAM_CMATCH;
+            len_curr_cop=1;
+
+            prevponr=nhp;
+        } else if (chp==MLAlignment::INS && nhp-prevponr>1) {
+            // next base is again on reference but more than 1 reference base from the last read base aligned to the haplotype
+            cig.push_back(CIGAR::CIGOp(BAM_CINS,len_curr_cop));
+            cig.push_back(CIGAR::CIGOp(BAM_CDEL,nhp-prevponr-1));
+
+            curr_cop=BAM_CMATCH;
+            len_curr_cop=1;
+
+            prevponr=nhp;
+        }
+        // any other combination leaves the current operation untouched
+        b++;
+    }
+
+    // write last cigar
+    cig.push_back(CIGAR::CIGOp(curr_cop,len_curr_cop));
+
+    // write soft_clip at the end
+    if (read.size()-1 - lastbonh>0) {
+        cig.push_back(CIGAR::CIGOp(BAM_CSOFT_CLIP,read.size()-1 - lastbonh));
+    }
+
+    return cig;
+}
+
+
+/**
+ * Collects the reads used to analyse the candidate window [leftPos, rightPos].
+ *
+ * Reads are fetched from every BAM file in myBams into readBuffer. The buffer
+ * is carried over between consecutive windows (which must be sorted on their
+ * left position), so that only the region to the right of the previous fetch
+ * has to be read again. The buffer is then copied into `reads`, mates are
+ * paired up by query name, unsuitable reads are filtered out, and the
+ * survivors are returned sorted by decreasing mapping quality, capped at
+ * params.maxReads.
+ *
+ * @param leftPos               left boundary of the haplotype window
+ * @param rightPos              right boundary of the haplotype window
+ * @param reads                 output; cleared first, then filled with the selected reads
+ * @param oldLeftPos            left boundary of the previous window (only read here)
+ * @param oldRightFetchReadPos  in/out; right end of the region fetched so far
+ * @param readBuffer            in/out; heap-allocated reads that persist between calls
+ * @param reset                 if true, readBuffer is emptied and refilled from scratch
+ *
+ * Throws a string when the window is too narrow for params.minReadOverlap,
+ * when a query name occurs more than twice in the buffer ("duplicate reads!"),
+ * when fewer than two reads remain ("too_few_reads"), or when the number of
+ * selected reads reaches params.maxReads ("above_read_count_threshold").
+ * Calls exit() on unsorted windows and on inconsistent mate information.
+ */
+void DetInDel::getReads(uint32_t leftPos, uint32_t rightPos, vector<Read> & reads, uint32_t & oldLeftPos, uint32_t & oldRightFetchReadPos, vector<Read *> & readBuffer, bool reset)
+{
+    // comparator used to sort reads by decreasing mapping quality
+    class SortFunc {
+    public:
+        static bool sortFunc(const Read & r1, const Read & r2)
+        {
+            return r1.mapQual>r2.mapQual;
+        }
+    };
+
+    if (leftPos<oldLeftPos) {
+        cerr << "Windows are not sorted!" << endl;
+        exit(3);
+    }
+
+    reads.clear();
+
+    if (int(rightPos-leftPos)<3*params.minReadOverlap) throw string("Choose a larger width or a smaller minReadOverlap.");
+
+    int maxDev = int ( libraries.getMaxInsertSize());
+
+    string_hash< list<int> > mapped_name_to_idx, unmapped_name_to_idx; // query name to read idx
+    string_hash< list<int> >::const_iterator hash_it;
+
+    int numUnknownLib = 0;
+    const int LEFTPAD = 200;
+    // The idea is to consider only reads starting at or after leftMostReadPos
+    // (and not ones merely overlapping it). LEFTPAD should take care of overlap
+    // effects; leftMostReadPos is already generous because it is based on the
+    // library insert size.
+
+    uint32_t rightFetchReadPos = rightPos+maxDev;
+    uint32_t rightMostReadPos = rightPos+maxDev;
+
+    // Compute the left bound in signed 64-bit arithmetic and clamp it at 0.
+    // Doing this in uint32_t wraps around for windows closer than
+    // maxDev+LEFTPAD to the start of the sequence, which made the fetch region
+    // empty and caused every such window to be skipped.
+    int64_t leftBound = int64_t(leftPos)-int64_t(maxDev)-int64_t(LEFTPAD);
+    if (leftBound<0) leftBound = 0;
+
+    uint32_t leftFetchReadPos = uint32_t(leftBound);
+    uint32_t leftMostReadPos = uint32_t(leftBound); // left most position of reads we want to seriously consider
+
+    // reset indicates whether we want to remake the readBuffer
+    vector<Read*> newReadBuffer;
+    if (reset) {
+        for (size_t r=0;r<readBuffer.size();r++) {
+            if (readBuffer[r]!=NULL) delete readBuffer[r];
+        }
+        readBuffer.clear();
+        oldRightFetchReadPos = rightFetchReadPos;
+    } else {
+        // drop buffered reads that start to the left of the new window
+        for (size_t r=0;r<readBuffer.size();r++) {
+            uint32_t rbeg = readBuffer[r]->getBam()->core.pos;
+            if (rbeg<leftMostReadPos) {
+                delete readBuffer[r];
+                readBuffer[r]=NULL;
+            } else {
+                newReadBuffer.push_back(readBuffer[r]);
+            }
+        }
+
+        // The new window starts at or after the old one, so if it reaches into
+        // the region that was already fetched only the part to the right of
+        // oldRightFetchReadPos still has to be read.
+        if (leftMostReadPos<oldRightFetchReadPos) {
+            leftFetchReadPos = oldRightFetchReadPos;
+        }
+    }
+
+    // store updated readBuffer (empty after a reset, pruned otherwise)
+    readBuffer.swap(newReadBuffer);
+
+    int numReads = readBuffer.size();
+
+    vector<Read> newReads;
+    if (leftFetchReadPos<=rightFetchReadPos) {
+        cout << "Fetching reads...." << endl;
+        for (size_t b=0;b<myBams.size();b++) {
+            Read::FetchReadData data(&newReads, int(b), &(this->libraries), &myBams, numReads, params.maxReads*100);
+            bam_fetch(myBams[b]->bf, myBams[b]->idx, myBams[b]->getTID(params.tid), leftFetchReadPos , rightFetchReadPos,&data , &Read::fetchFuncVectorPooled);
+            numUnknownLib += data.numUnknownLib;
+            numReads = data.numReads;
+        }
+        oldRightFetchReadPos = rightFetchReadPos;
+    }
+
+    // add new reads to readBuffer
+    for (size_t r=0;r<newReads.size();r++) {
+        if (newReads[r].getBam()->core.pos>=leftFetchReadPos) {
+            // only store reads that do not overlap with the boundary;
+            // reads overlapping with boundary will have been picked up before.
+            readBuffer.push_back(new Read(newReads[r]));
+        }
+    }
+
+    // sanity check: a query name may occur at most twice in the buffer
+    {
+        string_hash <int> qnameCount;
+        for (size_t r=0;r<readBuffer.size();r++) {
+            string qname = string(bam1_qname(readBuffer[r]->getBam()));
+            string_hash<int>::iterator it = qnameCount.find(qname);
+            if (it == qnameCount.end()) {
+                qnameCount[qname]=1;
+            } else {
+                qnameCount[qname]++;
+                if (qnameCount[qname]>2) {
+                    cerr << "Duplicate reads: readbuffer problem!" << endl;
+                    throw string("duplicate reads!");
+                }
+            }
+        }
+    }
+
+    newReads.clear();
+
+    size_t oldNumReads=readBuffer.size();
+
+    // copy readBuffer to reads
+    for (size_t r=0;r<readBuffer.size();r++) {
+        reads.push_back(Read(*readBuffer[r]));
+    }
+
+    // index the reads by query name, separately for mapped and unmapped reads
+    for (size_t r=0; r<reads.size();r++) {
+        if (reads[r].isUnmapped()) {
+            unmapped_name_to_idx[ string(bam1_qname(reads[r].getBam())) ].push_back(r);
+        } else {
+            mapped_name_to_idx[ string(bam1_qname(reads[r].getBam())) ].push_back(r);
+        }
+    }
+
+    // Filter reads based on haplotype window, the minimum read overlap for
+    // mapped reads, minimum mapping quality and maximum read length.
+    int numTIDmismatch = 0, numOrphan =0, numOrphanUnmapped = 0;
+
+    // reads are filtered by setting mapping quality to -1
+    vector<Read> filteredReads;
+    double minMapQual = params.mapQualThreshold;
+    if (minMapQual<0.0) minMapQual=0.0;
+    for (int r=0;r<int(reads.size());r++) {
+        bool filter = false;
+        if (reads[r].size()>params.maxReadLength) filter=true;
+
+        if (reads[r].getEndPos()<leftMostReadPos || reads[r].pos>rightMostReadPos) filter=true;
+
+        // NOTE(review): the branches below assign filter=false once a mate is
+        // found, which also undoes the two checks above (e.g. maxReadLength).
+        // Kept as in the original; confirm whether that is intended.
+
+        if (!reads[r].isUnmapped()) {
+            if (reads[r].pos+int(reads[r].size())<int(leftPos)+params.minReadOverlap || reads[r].pos>int(rightPos)-params.minReadOverlap) {
+                // read does not overlap the window by at least minReadOverlap
+                filter=true;
+            } else if (reads[r].mateIsUnmapped() == false ){
+                if (reads[r].getBam()->core.mtid != reads[r].getBam()->core.tid) {
+                    // mate maps to another reference sequence: counted, not filtered
+                    numTIDmismatch++;
+                } else {
+                    // find mate of mapped read; filter if we cannot find it
+                    hash_it = mapped_name_to_idx.find(string(bam1_qname(reads[r].getBam())));
+                    if (hash_it == mapped_name_to_idx.end()) { numOrphan++; filter=true; } else {
+                        if (hash_it->second.size()>2) cerr << "HUH? DUPLICATE READ LABELS???" << endl;
+                        if (reads[r].mateIsUnmapped() == false) {
+                            filter = true;
+                        }
+
+                        BOOST_FOREACH(int idx, hash_it->second) {
+                            if (idx != r) {
+                                reads[r].mateLen = reads[idx].size();
+                                reads[r].matePos = reads[idx].pos;
+                                filter = false;
+                                if (reads[r].matePos != reads[r].getBAMMatePos()) {
+                                    cerr << "matepos inconsistency!" << endl;
+                                    cerr << reads[r].matePos << " " << reads[r].getBAMMatePos() << endl;
+                                    exit(1);
+                                }
+                            }
+                        }
+
+                        if (filter == true) {
+                            numOrphan++;
+                        }
+                    }
+                }
+            } else if (reads[r].mateIsUnmapped() == true) {
+                // mapped read with unmapped mate: the mate must be in the buffer too
+                reads[r].matePos=reads[r].pos;
+                hash_it = unmapped_name_to_idx.find(string(bam1_qname(reads[r].getBam())));
+                if (hash_it == unmapped_name_to_idx.end()) { filter=true; } else {
+                    filter = true;
+                    if (hash_it->second.size()>2) cerr << "HUH? DUPLICATE READ LABELS???" << endl;
+                    BOOST_FOREACH(int idx, hash_it->second) {
+                        if (idx != r) {
+                            reads[r].mateLen = reads[idx].size();
+                            filter = false;
+                        }
+                    }
+                }
+                if (filter==true) {
+                    numOrphan++;
+                }
+            }
+        } else {
+            // read is unmapped
+            if (params.mapUnmappedReads) {
+                // lookup mapped mate
+                hash_it = mapped_name_to_idx.find(string(bam1_qname(reads[r].getBam())));
+                int idx;
+                if (hash_it == mapped_name_to_idx.end()) { numOrphanUnmapped++; filter=true; } else {
+                    if (hash_it->second.size()!=1) {
+                        cerr << "UNMAPPED READ HAS MORE THAN ONE MATE!" << endl;
+                        exit(1);
+                    }
+                    idx = *(hash_it->second.begin());
+
+                    // Range in which the unmapped read may lie, derived from
+                    // the position and strand of its mapped mate. Signed 64-bit
+                    // values are used so that rpos-maxInsert cannot wrap around
+                    // near the start of the sequence.
+                    const int maxInsert = (int) reads[idx].getLibrary().getMaxInsertSize();
+                    const int minInsert = 0;
+                    const int64_t rpos = int64_t(reads[idx].pos);
+                    int64_t range_l, range_r;
+
+                    if (reads[idx].isReverse()) {
+                        range_l = rpos-maxInsert;
+                        range_r = rpos-minInsert;
+                    } else {
+                        range_l = rpos+minInsert;
+                        range_r = rpos+maxInsert;
+                    }
+
+                    if (range_r>int64_t(leftPos) && range_l<int64_t(rightPos)) {
+                        // the unmapped read inherits mapping quality and mate
+                        // information from its mapped mate
+                        filter=false;
+                        reads[r].mapQual = reads[idx].mapQual;
+                        reads[r].matePos = reads[idx].pos;
+                        reads[r].mateLen = reads[idx].size();
+                        if (reads[r].isReverse() == reads[idx].isReverse()) {
+                            reads[r].reverse();
+                            reads[r].complement();
+                        }
+                    } else {
+                        filter=true;
+                    }
+                }
+            } else {
+                filter = true;
+            }
+        }
+        if (filter == true) reads[r].mapQual = -1.0;
+    }
+
+    // Keep the best reads: after sorting, filtered reads (mapQual -1) are at
+    // the end, so stop at the first read below the threshold.
+    int nUnmapped = 0;
+    int nMateposError = 0;
+    sort(reads.begin(), reads.end(), SortFunc::sortFunc);
+    for (size_t i=0;i<params.maxReads && i<reads.size();i++) {
+        if (reads[i].mapQual<minMapQual) break;
+        if (reads[i].matePos==-1 && reads[i].isPaired() && !reads[i].mateIsUnmapped() ) {
+            nMateposError++;
+            reads[i].matePos = reads[i].pos;
+        }
+        filteredReads.push_back(Read(reads[i]));
+        if (reads[i].isUnmapped()) nUnmapped++;
+    }
+
+    filteredReads.swap(reads);
+    filteredReads.clear();
+
+    // optional filter on a match string; a leading '+' selects the non-exclude mode
+    if (params.filterReadAux.size()>1) {
+        size_t before=reads.size();
+        int exclude=1;
+        if (params.filterReadAux[0]=='+') exclude=0;
+        string match=params.filterReadAux.substr(1,params.filterReadAux.size());
+        Read::filterReads(reads, exclude, match);
+        size_t after=reads.size();
+        if (!params.quiet) cout << "filterAux: " << before-after << " reads were filtered based on match string " << params.filterReadAux << endl;
+    }
+
+    if (!params.quiet) cout << "Number of reads: " << reads.size() << " out of " << oldNumReads << " # unmapped reads: " << nUnmapped << " numReadsUnknownLib: " << numUnknownLib << " numChrMismatch: " << numTIDmismatch << " numMappedWithoutMate: " << numOrphan << " numUnmappedWithoutMate: " << numOrphanUnmapped << endl;
+    if (nMateposError) {
+        cerr << "The mate position of " << nMateposError << " reads was recorded as -1 in the BAM file" << endl;
+    }
+
+    if (params.showReads) {
+        for (size_t r=0;r<reads.size();r++) {
+            cout << "read[" << r << "]: " << reads[r] << endl;
+        }
+    }
+
+    if (reads.size()<2) {
+        throw string("too_few_reads");
+    } else if (reads.size()>=params.maxReads) {
+        throw string("above_read_count_threshold");
+    }
+
+}
+
+
+/**
+ * Processes every candidate window listed in the variants file.
+ *
+ * For each non-empty line of the variants file the reads for the window are
+ * collected with getReads() and passed to empiricalDistributionMethod(); the
+ * only supported value of params.inferenceMethod is "empirical". Results are
+ * written to "<params.fileName>.glf.txt". A window that raises an exception is
+ * reported on stderr, recorded in the GLF output with an "error_..." message,
+ * and forces the read buffer to be rebuilt for the next window.
+ *
+ * @param variantsFileName  file with candidate variants, sorted on the left
+ *                          position of the window within each sequence
+ *
+ * Throws a string if the GLF file cannot be opened. Calls exit(1) if the
+ * windows are not sorted on their left position.
+ */
+void DetInDel::detectIndels(const string & variantsFileName)
+{
+    // the stream behind oData is never opened in this function; only the GLF file is
+    ofstream output;
+    ofstream glfOutput;
+
+    string callsFile(params.fileName);
+    callsFile.append(".calls.txt");
+    string glfFile(params.fileName);
+    glfFile.append(".glf.txt");
+
+    OutputData oData=params.makeOutputData(output);
+
+    glfOutput.open(glfFile.c_str());
+    if (!glfOutput.is_open()) {
+        throw(string("Cannot open file ").append(glfFile).append(" for writing."));
+    }
+
+    OutputData glfData=params.makeGLFOutputData(glfOutput);
+    glfData.outputLine(glfData.headerString());
+
+    VariantFile vf(variantsFileName);
+
+    // state shared between consecutive calls to getReads()
+    vector<Read *> readBuffer;
+    uint32_t oldLeftPos=0, oldRightFetchReadPos=0;
+    string oldTid("-1");
+
+    // the read buffer has to be rebuilt on first use and whenever the sequence changes
+    bool resetReadBuffer = true;
+
+    int index=0;
+
+    while (!vf.eof()) {
+        AlignedCandidates candidateVariants;
+        candidateVariants=vf.getLineVector(params.varFileIsOneBased);
+        if (candidateVariants.variants.size()==0) continue;
+
+        vector<Read> reads;
+
+        uint32_t leftPos = candidateVariants.leftPos;
+        uint32_t rightPos = candidateVariants.rightPos;
+        uint32_t pos = candidateVariants.centerPos;
+        params.tid=candidateVariants.tid;
+
+        if (params.tid!=oldTid) {
+            // new sequence: start over
+            resetReadBuffer = true;
+            oldTid = params.tid;
+            oldLeftPos = 0;
+        }
+
+        if (leftPos < oldLeftPos) {
+            cerr << "leftPos: " << leftPos << " oldLeftPos: " << oldLeftPos << endl;
+            cerr << "Candidate variant files must be sorted on left position of window!" << endl;
+            exit(1);
+        }
+
+        index++;
+
+        if (!params.quiet) cout << "****" << endl << " tid: " << params.tid << " pos: " << pos << " leftPos: " << leftPos << " " << " rightPos: " << rightPos << endl;
+
+        // outcome of this window; anything other than "ok" means it was skipped
+        string message="ok";
+        bool skipped = false;
+
+        try {
+            getReads(leftPos, rightPos, reads, oldLeftPos, oldRightFetchReadPos, readBuffer, resetReadBuffer);
+
+            if (params.inferenceMethod=="empirical") {
+                empiricalDistributionMethod(index, reads, pos, leftPos, rightPos, candidateVariants, oData, glfData);
+            } else {
+                throw string("Unknown inference method");
+            }
+        }
+        catch (const string & err) {
+            // spaces are replaced so that the message stays a single token
+            string tag(err);
+            for (size_t x=0;x<tag.size();x++) {
+                if (tag[x]==' ') tag[x]='_';
+            }
+            message=string("error_").append(tag);
+            skipped=true;
+        }
+        catch (const std::bad_alloc &) {
+            message = string("error_bad_alloc");
+            skipped = true;
+        }
+        catch (const std::exception & e) {
+            message = string("error_exception_").append(e.what());
+            skipped = true;
+        }
+
+        if (!skipped) {
+            resetReadBuffer = false;
+        } else {
+            cerr << "skipped " << params.tid << " " << pos << " reason: " << message << endl;
+
+            OutputData::Line gline(glfData);
+            gline.set("msg", message);
+            gline.set("index", index);
+            gline.set("tid", params.tid);
+            gline.set("lpos", leftPos);
+            gline.set("rpos", rightPos);
+            glfData.output(gline);
+
+            // reset read buffer: all reads will be fetched again
+            resetReadBuffer = true;
+        }
+
+        oldLeftPos = leftPos;
+    }
+
+    glfOutput.close();
+
+    // clean up read buffer
+    for (size_t r=0;r<readBuffer.size();r++) {
+        if (readBuffer[r]!=NULL) delete readBuffer[r];
+    }
+
+}
+
+
+
+
+/**
+ * Aligns every candidate haplotype to the reference sequence of the window and
+ * records the indels and SNPs implied by each alignment.
+ *
+ * Each haplotype is wrapped in an artificial read and aligned to the reference
+ * with ObservationModelSeqAn. The resulting variants are stored on the
+ * haplotype and collected, per position, in `variants`. Haplotypes whose
+ * alignment starts with MLAlignment::LO or ends with MLAlignment::RO are
+ * removed from `haps`; their variants nevertheless remain in `variants`.
+ * Finally a REF variant is added to every remaining haplotype at each variant
+ * position.
+ *
+ * @param haps      in/out; candidate haplotypes, annotated and filtered in place
+ * @param pos       position of the candidate, used for logging only
+ * @param leftPos   left boundary of the window (not modified here)
+ * @param rightPos  right boundary of the window (not modified here)
+ * @param variants  output; cleared first, then filled with the variants found per position
+ * @return always true
+ */
+bool DetInDel::alignHaplotypes(vector<Haplotype> & haps, uint32_t pos, uint32_t & leftPos, uint32_t & rightPos, map<int, std::set<AlignedVariant> > & variants)
+{
+    variants.clear();
+
+    seqan::Score<int> score(-1, -460, -100,-960);
+
+    // artificial read that carries the haplotype sequence during alignment
+    Read rh1;
+    rh1.pos=0;
+    rh1.posStat.first=0;
+    rh1.mapQual=1.0-1e-32;
+    ObservationModelParameters alignParams("probabilistic");
+    alignParams.pError=0.0001;
+    alignParams.pMut=0.01;
+    alignParams.maxLengthDel=50;
+    alignParams.forceReadOnHaplotype=true;
+    alignParams.bMid=0;
+
+    // The reference sequence is the same for every haplotype, so fetch it once.
+    // It is only requested when there is something to align, as before.
+    string refSeq;
+    if (!haps.empty()) refSeq=getRefSeq(leftPos+1, rightPos+1);
+
+    vector<Haplotype> tmp_haps;
+    for (size_t h=0;h<haps.size();h++) {
+        rh1.seq.seq=haps[h].seq;
+        rh1.setAllQual(1.0-1e-16);
+
+        Haplotype hRef;
+        hRef.append(refSeq);
+
+        ObservationModelSeqAn om(hRef, rh1, 0, alignParams, score);
+        haps[h].indels.clear();
+        haps[h].snps.clear();
+        om.align();
+        const MLAlignment & ml=om.getMLAlignment();
+        haps[h].indels=ml.indels;
+        haps[h].snps=ml.snps;
+        haps[h].align=ml.align;
+        haps[h].ml=ml;
+
+        // Does the alignment begin with LO or end with RO? The first element
+        // is only inspected when hpos is not empty.
+        bool hasStartEndIndel = false;
+        const int hs = int(ml.hpos.size())-1;
+        if (hs>=0 && ml.hpos[0] == MLAlignment::LO) hasStartEndIndel = true;
+        if (hs>0 && ml.hpos[hs] == MLAlignment::RO) hasStartEndIndel = true;
+
+        for (map<int, AlignedVariant>::const_iterator it=haps[h].indels.begin(); it!=haps[h].indels.end();it++) variants[it->first].insert(it->second);
+        for (map<int, AlignedVariant>::const_iterator it=haps[h].snps.begin(); it!=haps[h].snps.end();it++) variants[it->first].insert(it->second);
+        if (!hasStartEndIndel) {
+            tmp_haps.push_back(haps[h]);
+        }
+    }
+
+    haps.swap(tmp_haps);
+
+    // add REF allele as variant to each haplotype in order to compute coverage statistics
+    for (map<int, std::set<AlignedVariant> >::const_iterator it=variants.begin();it!=variants.end();it++) {
+        for (size_t h=0;h<haps.size();h++) haps[h].addRefVariant(it->first);
+    }
+
+    if (!params.quiet) {
+        for (map<int, std::set<AlignedVariant> >::const_iterator it=variants.begin();it!=variants.end();it++) {
+            cout << "aligned_var at pos " << pos << " " << leftPos+it->first;
+            BOOST_FOREACH(const AlignedVariant & av, it->second) {
+                cout << " " << av.getString();
+            }
+            cout << endl;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Builds the set of candidate haplotypes for a window.
+ *
+ * An empirical haplotype distribution is inferred from the reads, combined
+ * with the candidate variants to generate haplotypes, and the haplotypes are
+ * aligned to the reference with alignHaplotypes(). Of all haplotypes that
+ * carry neither indels nor SNPs only the first one is kept.
+ *
+ * @param haps               output; cleared first, then filled with the candidate haplotypes
+ * @param reads              reads selected for the window
+ * @param pos                position of the candidate
+ * @param leftPos            in/out; replaced by the start of the haplotype window
+ * @param rightPos           in/out; replaced by the end of the haplotype window
+ * @param candidateVariants  candidate variants to place on the haplotypes
+ * @return true when the window was given up on because there are too many
+ *         haplotypes (params.skipMaxHap / params.maxHapReadProd), false otherwise
+ *
+ * Throws string("hapblock") when the haplotype blocks are not consecutive;
+ * any other string exception is passed on unchanged.
+ */
+bool DetInDel::getHaplotypes(vector<Haplotype> &haps, const vector<Read> & reads,uint32_t pos, uint32_t & leftPos, uint32_t & rightPos, const AlignedCandidates & candidateVariants)
+{
+    // reference sequence of the window, extended by minReadOverlap on both sides
+    uint32_t rs=(int(leftPos)>params.minReadOverlap)?(leftPos-params.minReadOverlap):0;
+    uint32_t re=rightPos+params.minReadOverlap;
+    string refSeq=getRefSeq(rs+1, re+1);
+
+    HaplotypeDistribution hd(pos, refSeq, rs);
+
+    // infer empirical haplotype distribution from the read alignments
+    for (size_t r=0;r<reads.size();r++) {
+        hd.insertRead(reads[r].getBam());
+    }
+    hd.setFrequencies();
+
+    haps.clear();
+
+    if (!params.quiet) {
+        cout << "candidate_var at pos: " << pos ;
+        BOOST_FOREACH(AlignedVariant v, candidateVariants.variants) {
+            cout << " " << v.getStartHap() << "," << v.getString();
+        }
+        cout << endl;
+    }
+
+    // get haplotypes from empirical distribution
+    try {
+        HDIterator2 hdi(hd, params.maxHap, pos, leftPos, rightPos, params.noIndelWindow);
+
+        double logNumHaps=hdi.getLogNumHaps();
+        if (logNumHaps>log(params.skipMaxHap)) {
+            cerr << "tid: " << params.tid << " pos: " << pos << " too many haplotypes [>exp(" << logNumHaps << ")]" << endl;
+            return true;
+        }
+
+        hdi.generateHapsWithAlignedVariants(haps, candidateVariants, 0, params.changeINStoN);
+
+        if (haps.size()>params.skipMaxHap || haps.size()*reads.size()>params.maxHapReadProd) {
+            cerr << "tid: " << params.tid << " pos: " << pos << " too many haplotypes [>" << haps.size() << "]" << " numreads: " << reads.size() << endl;
+            return true;
+        }
+
+        if (params.showHapDist) {
+            cout << endl << "Empirical distribution: " << endl;
+            cout << hdi << endl;
+        }
+
+        leftPos=hdi.start();
+        rightPos=hdi.end();
+
+        map<int, std::set<AlignedVariant> > variants;
+        alignHaplotypes(haps,pos, leftPos, rightPos, variants);
+
+        // remove duplicate reference-haplotypes of different length
+        vector<Haplotype> tmp_haps;
+        bool foundRef = false;
+        for (size_t th=0;th<haps.size();th++) {
+            const Haplotype & hap=haps[th];
+            const bool isRef = (hap.countIndels() == 0 && hap.countSNPs() == 0);
+            if (isRef && foundRef) continue;
+            if (isRef) foundRef = true;
+            tmp_haps.push_back(Haplotype(hap));
+        }
+        haps.swap(tmp_haps);
+
+        if (params.showCandHap) {
+            for (size_t i=0;i<haps.size();i++) {
+                cout << "POSTFILTER hdi[" << i << "]:" << haps[i] << endl;
+            }
+        }
+    }
+    catch (const string & s) {
+        if (s=="Blocks are not consecutive.") {
+            // separators added: the fields used to run together in the log
+            cerr << "tid: " << params.tid << " pos: " << pos << " " << s << endl;
+            throw string("hapblock");
+        } else {
+            // pass any other string exception on unchanged
+            throw;
+        }
+    }
+    return false;
+}
+
+
+
+
+/*
+// HaplotypeDistribution method
+void DetInDel::computeLikelihoods(const vector<Haplotype> &haps, const vector<Read> & reads, vector<vector<MLAlignment> > & liks, uint32_t leftPos, uint32_t rightPos, vector<int> & onHap)
+{
+// cout << "Computing likelihoods for all reads and haplotypes.\n";
+ map<size_t, vector<size_t> > hapSizeToIdx;
+ onHap = vector<int>(reads.size(),0); // records whether a read was aligned onto at least one haplotype
+
+ typedef map<size_t, vector<size_t> >::const_iterator hapsCIt;
+
+ for (size_t x=0;x<haps.size();x++) {
+ hapSizeToIdx[haps[x].size()].push_back(x);
+ }
+
+ liks=vector<vector<MLAlignment> >(haps.size(),vector<MLAlignment>(reads.size()));
+
+ for (hapsCIt it=hapSizeToIdx.begin();it!=hapSizeToIdx.end();it++) {
+ const vector<size_t> hapIdx=it->second;
+ // setup observation models for all the reads
+ vector<ObservationModelFBMaxErr> oms; oms.reserve(reads.size());
+ for (size_t r=0;r<reads.size();r++) {
+ const Haplotype & hap=haps[hapIdx[0]];
+ oms.push_back(ObservationModelFBMaxErr(hap, reads[r], leftPos, params.obsParams));
+ }
+
+ for (size_t h=0;h<hapIdx.size();h++) {
+ size_t hidx=hapIdx[h];
+ const Haplotype & hap=haps[hidx];
+
+// cout << "Haplotype[" << hidx << "]: " << endl;
+
+
+ for (size_t r=0;r<reads.size();r++) {
+ oms[r].changeHaplotype(hap);
+ liks[hidx][r]=oms[r].calcLikelihood();
+ if (!liks[hidx][r].offHapHMQ) onHap[r]=1;
+// cout << "[" << r << ": " << liks[hidx][r].ll << "] ";
+ if (liks[hidx][r].ll>0.1) {
+ ObservationModelFBMaxErr om(hap, reads[r], leftPos, params.obsParams);
+ liks[hidx][r]=om.calcLikelihood();
+ cout << string(25,' ') << hap.seq << endl;
+ om.printAlignment(25);
+ cout << "h: " << h << " r: " << r << endl;
+ cout << bam1_qname(reads[r].getBam()) << endl;
+ cerr << "Likelihood>0" << endl;
+ exit(1);
+ }
+ if (isnan(liks[hidx][r]) || isinf(liks[hidx][r])) {
+ cout << "NAN/Inf error" << endl;
+ throw string("Nan detected");
+ }
+ }
+ // cout << endl;
+ }
+ }
+}
+*/
+/**
+ * Computes the alignment likelihood of every read against every haplotype
+ * using ObservationModelFBMaxErr.
+ *
+ * @param haps      candidate haplotypes
+ * @param reads     reads to align
+ * @param liks      output; liks[h][r] is the result for haplotype h and read r
+ * @param leftPos   left boundary of the window, passed to the observation model
+ * @param rightPos  not used
+ * @param onHap     output; onHap[r] is 1 if read r has offHapHMQ unset for at
+ *                  least one haplotype, 0 otherwise
+ *
+ * Calls exit(1) when a log-likelihood exceeds 0.1 and throws
+ * string("Nan detected") when a result is NaN or infinite.
+ */
+void DetInDel::computeLikelihoods(const vector<Haplotype> &haps, const vector<Read> & reads, vector<vector<MLAlignment> > & liks, uint32_t leftPos, uint32_t rightPos, vector<int> & onHap)
+{
+    const size_t numHaps = haps.size();
+    const size_t numReads = reads.size();
+
+    onHap.assign(numReads, 0);
+    liks.assign(numHaps, vector<MLAlignment>(numReads));
+
+    for (size_t hidx=0; hidx<numHaps; ++hidx) {
+        const Haplotype & hap=haps[hidx];
+        for (size_t r=0; r<numReads; ++r) {
+            MLAlignment & result = liks[hidx][r];
+
+            ObservationModelFBMaxErr model(hap, reads[r], leftPos, params.obsParams);
+            result = model.calcLikelihood();
+            if (!result.offHapHMQ) {
+                onHap[r]=1;
+            }
+
+            if (result.ll>0.1) {
+                // a positive log-likelihood is impossible: redo the alignment,
+                // print it for diagnosis and abort
+                ObservationModelFBMaxErr om(hap, reads[r], leftPos, params.obsParams);
+                result = om.calcLikelihood();
+                cout << string(25,' ') << hap.seq << endl;
+                om.printAlignment(25);
+                cout << "hidx: " << hidx << " r: " << r << endl;
+                cout << bam1_qname(reads[r].getBam()) << endl;
+                cerr << "Likelihood>0" << endl;
+                exit(1);
+            }
+
+            if (isnan(result) || isinf(result)) {
+                cout << "NAN/Inf error" << endl;
+                throw string("Nan detected");
+            }
+        }
+    }
+}
+
+
+/**
+ * Looks up the reference alignment of a read in the haplotype-to-reference
+ * alignment, using the aligned position of the first and last base of the read.
+ *
+ * The CIGAR of the read is walked to obtain the reference position just past
+ * its last aligned base, the number of soft-clipped bases in front of the
+ * alignment, and the number of read bases consumed up to the last operation
+ * that advances the reference. Start and end are then made relative to
+ * leftPos and searched for in hap.ml.hpos; a value is appended to alPos for
+ * each of the two that is found.
+ *
+ * @param hap      haplotype with its alignment to the reference in hap.ml
+ * @param read     read whose BAM record supplies position and CIGAR
+ * @param alPos    output; zero, one or two values are appended
+ * @param leftPos  reference position that hap.ml.hpos is relative to
+ */
+void DetInDel::computeHapPosition(const Haplotype & hap, const Read & read, vector<int> & alPos, int leftPos)
+{
+    const bam1_t *b=read.getBam();
+    const bam1_core_t *c=&b->core;
+    uint32_t* cigar=bam1_cigar(b);
+    int end = c->pos;
+
+    int offs=0; // offset due to SOFT_CLIP at the beginning of the read
+    int l=0;    // number of read bases consumed so far
+    // lend is the read base for which end is computed (there might be a
+    // SOFT_CLIP at the end of the read). It is initialised because a CIGAR
+    // without M/D/N operations never assigns it, and it is read below.
+    int lend=0;
+
+    bool al=false; // becomes true once the aligned part of the read has started
+    for (int k = 0; k < (int) c->n_cigar; ++k) {
+        const int op = cigar[k] & BAM_CIGAR_MASK;
+        const int len = cigar[k] >> BAM_CIGAR_SHIFT;
+
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CREF_SKIP) al=true;
+
+        if (!al && op == BAM_CSOFT_CLIP) offs += len;
+
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP)
+            l += len;
+
+        if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) {
+            end += len;
+            lend=l;
+        }
+    }
+
+    const int start=c->pos-leftPos; // make relative to alignment of haplotypes to reference
+    end-=leftPos;
+
+    // lookup start and end in alignment of haplotype to reference
+    // NOTE(review): the values pushed are hpos[x]-offs and hpos[x]-lend, i.e.
+    // start-offs and end-lend; if a position on the haplotype is wanted this
+    // should presumably be x-offs and x-lend. Left unchanged; confirm.
+    for (int x=0;x<int(hap.ml.hpos.size());x++) {
+        if (hap.ml.hpos[x]==start) {
+            alPos.push_back(hap.ml.hpos[x]-offs);
+            break;
+        }
+    }
+
+    for (int x=int(hap.ml.hpos.size())-1;x>=0;x--) {
+        if (hap.ml.hpos[x]==end) {
+            alPos.push_back(hap.ml.hpos[x]-lend);
+            break;
+        }
+    }
+}
+
+/**
+ * Computes the alignment of every read against every haplotype with
+ * ObservationModelS, using a k-mer hash (k=4) of each haplotype.
+ *
+ * @param haps      candidate haplotypes
+ * @param reads     reads to align
+ * @param liks      output; liks[h][r] is the result for haplotype h and read r
+ * @param leftPos   left boundary of the window, passed to the observation model
+ * @param rightPos  not used
+ * @param onHap     output; onHap[r] is 1 if read r has offHapHMQ unset for at
+ *                  least one haplotype, 0 otherwise
+ */
+void DetInDel::computeLikelihoodsFaster(const vector<Haplotype> &haps, const vector<Read> & reads, vector<vector<MLAlignment> > & liks, uint32_t leftPos, uint32_t rightPos, vector<int> & onHap)
+{
+    const size_t numHaps = haps.size();
+    const size_t numReads = reads.size();
+
+    liks.clear();
+    liks.resize(numHaps, vector<MLAlignment>(numReads));
+    onHap.assign(numReads, 0);
+
+    const unsigned int kmer=4;
+
+    for (size_t h=0; h<numHaps; ++h) {
+        const Haplotype & hap = haps[h];
+        HapHash hash(kmer, hap);
+
+        for (size_t r=0; r<numReads; ++r) {
+            // candidate positions derived from the read's alignment to the
+            // reference; computed here but not passed to the model below
+            vector<int> alPos;
+            computeHapPosition(hap, reads[r], alPos, leftPos);
+
+            ObservationModelS om(hap, reads[r], leftPos, params.obsParams);
+
+            // align using the haplotype hash
+            MLAlignment & result = liks[h][r];
+            result = om.align(hash);
+
+            // the read counts as on-haplotype for at least one haplotype
+            if (!result.offHapHMQ) {
+                onHap[r]=1;
+            }
+        }
+    }
+
+    // Ideas noted in the original code, not implemented:
+    //  - check high-mapping-quality reads that ended up off-haplotype
+    //  - realign a couple of those reads to obtain new candidate indels
+    //  - propose a new set of candidate haplotypes by realigning all reads to it
+}
+
+/**
+ * Returns the log prior of a pair of variants.
+ *
+ * The two variants are put in a set first, so a variant that occurs twice is
+ * counted once. For each variant the frequency of the matching candidate
+ * variant is used when one is found and its frequency is not negative;
+ * otherwise the default prior for its type applies (params.priorSNP for SNPs,
+ * params.priorIndel for insertions and deletions, a log prior of 0 for any
+ * other type).
+ *
+ * @param av1                first variant
+ * @param av2                second variant
+ * @param leftPos            offset added to the variant's haplotype start for the lookup
+ * @param candidateVariants  candidate variants with their frequencies
+ * @return sum of the log priors
+ */
+double DetInDel::getPairPrior(const AlignedVariant & av1, const AlignedVariant & av2, int leftPos, const AlignedCandidates & candidateVariants)
+{
+    std::set<AlignedVariant> distinct;
+    distinct.insert(av1);
+    distinct.insert(av2);
+
+    double logPrior = 0.0;
+    BOOST_FOREACH(AlignedVariant var, distinct) {
+        const int type = var.getType();
+        const AlignedVariant *known = candidateVariants.findVariant(var.getStartHap()+leftPos, var.getType(), var.getString());
+
+        // default log prior for this type of variant
+        double fallback = 0.0;
+        if (type==Variant::SNP) {
+            fallback = log(params.priorSNP);
+        } else if (type==Variant::DEL || type==Variant::INS) {
+            fallback = log(params.priorIndel);
+        }
+
+        if (known!=NULL) {
+            const double freq = known->getFreq();
+            if (freq<0.0) {
+                logPrior += fallback;
+            } else {
+                logPrior += log(freq);
+            }
+        } else {
+            logPrior += fallback;
+        }
+    }
+
+    return logPrior;
+}
+
+/**
+ * Returns the log prior probability of a pair of haplotypes, based on the
+ * settings in params.
+ *
+ * The indels and SNPs of both haplotypes are collected in sets, so a variant
+ * present on both haplotypes is counted once. Entries whose string contains
+ * "*REF" are ignored, as are indel entries containing "=>" and SNP entries
+ * containing "=>D". For each remaining variant the frequency of the matching
+ * candidate variant is used when one is found and its frequency is not
+ * negative; otherwise the default prior applies: params.priorIndel for indels
+ * and params.priorSNP for SNPs.
+ *
+ * @param h1                 first haplotype
+ * @param h2                 second haplotype
+ * @param leftPos            offset added to a variant's haplotype start for the lookup
+ * @param candidateVariants  candidate variants with their frequencies
+ * @return sum of the log priors of the distinct variants
+ */
+double DetInDel::getHaplotypePrior(const Haplotype & h1, const Haplotype & h2, int leftPos, const AlignedCandidates & candidateVariants)
+{
+    double ll=0.0;
+
+    typedef map <int, AlignedVariant>::const_iterator AVIt;
+    std::set <AlignedVariant> indels, snps;
+
+    // collect the distinct non-reference indels of both haplotypes
+    for (AVIt it=h1.indels.begin();it!=h1.indels.end();it++) {
+        if (it->second.getString().find("*REF")==string::npos && it->second.getString().find("=>")==string::npos ) {
+            indels.insert(it->second);
+        }
+    }
+    for (AVIt it=h2.indels.begin();it!=h2.indels.end();it++) {
+        if (it->second.getString().find("*REF")==string::npos && it->second.getString().find("=>")==string::npos ) {
+            indels.insert(it->second);
+        }
+    }
+
+    // collect the distinct non-reference SNPs of both haplotypes
+    for (AVIt it=h1.snps.begin();it!=h1.snps.end();it++) {
+        if (it->second.getString().find("*REF")==string::npos && it->second.getString().find("=>D")==string::npos) {
+            snps.insert(it->second);
+        }
+    }
+    for (AVIt it=h2.snps.begin();it!=h2.snps.end();it++) {
+        if (it->second.getString().find("*REF")==string::npos && it->second.getString().find("=>D")==string::npos) {
+            snps.insert(it->second);
+        }
+    }
+
+    BOOST_FOREACH(AlignedVariant indel, indels) {
+        const AlignedVariant *av = candidateVariants.findVariant(indel.getStartHap()+leftPos, indel.getType(), indel.getString());
+        if (av==NULL) {
+            ll += log(params.priorIndel);
+        } else {
+            double prior = av->getFreq();
+            if (prior<0.0) ll += log(params.priorIndel); else ll+=log(prior);
+        }
+    }
+
+    // SNPs fall back to the SNP prior. This loop used params.priorIndel, a
+    // leftover from copying the indel loop above; getPairPrior() also uses
+    // params.priorSNP for SNPs.
+    BOOST_FOREACH(AlignedVariant snp, snps) {
+        const AlignedVariant *av = candidateVariants.findVariant(snp.getStartHap()+leftPos, snp.getType(), snp.getString());
+        if (av==NULL) {
+            ll += log(params.priorSNP);
+        } else {
+            double prior = av->getFreq();
+            if (prior<0.0) ll += log(params.priorSNP); else ll+=log(prior);
+        }
+    }
+
+    return ll;
+}
+
+#define FILTERHAPS
+#ifdef FILTERHAPS
+
+// Checks, for every candidate haplotype, whether each insertion/deletion it
+// carries is supported by at least one read, and collects per-variant read
+// coverage split by strand.
+//
+// Only reads whose alignment to haplotype h is not flagged offHapHMQ and that
+// align without indels (liks[h][r].numIndels == 0) are considered for h.
+//  - DEL: a single read must span the whole interval
+//    [leftFlank-padCover, rightFlank+padCover] with at most
+//    params.obsParams.maxMismatch mismatches (haplotype 'N' bases are not
+//    counted as mismatches).
+//  - INS: a read counts when it covers every base of that interval with at
+//    most params.obsParams.maxMismatch mismatches.
+//
+// Parameters:
+//   haps        candidate haplotypes.
+//   reads       reads; liks[h][r] is the alignment of reads[r] to haps[h].
+//   liks        read-haplotype alignments, indexed [haplotype][read].
+//   filtered    output; reset to haps.size() zeros. filtered[h] is set to 1
+//               only when doFilter is true and some INS/DEL of haps[h] is not
+//               covered.
+//   varCoverage output; cleared, then filled for every (position, variant)
+//               entry of haps[h].indels with the number of distinct
+//               supporting reads on each strand, counting only haplotypes
+//               that were not filtered.
+//   doFilter    when false the coverage is still computed but no haplotype is
+//               marked as filtered.
+//
+// Side effect: prints the number of filtered haplotypes to stdout.
+void DetInDel::filterHaplotypes(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<int> & filtered, map<pair<int, AlignedVariant>, VariantCoverage> & varCoverage, bool doFilter)
+{
+
+ const int debugfh = 0;
+ int numFiltered = 0;
+ int numHaps = int(haps.size());
+ filtered = vector<int>(haps.size(),0);
+
+ varCoverage.clear();
+
+
+ typedef pair<int, AlignedVariant> PAV;
+ // Coverage of each variant per haplotype, so that eventually only coverage of reads
+ // on haplotypes that were not filtered out is reported.
+ // Slot h holds strand-0 reads for haplotype h, slot h+numHaps holds strand-1 reads.
+ map<PAV, vector< std::set<int> > > hVarCoverage;
+
+ for (int h=0;h<int(haps.size());h++) {
+ // check all reads aligned to this haplotype and select the ones that are not off-haplotype and aligned without indels
+ std::set<int> selReads;
+ for (size_t r=0;r<reads.size();r++) {
+ int sel = 0;
+ if (!liks[h][r].offHapHMQ && liks[h][r].numIndels == 0) { // && liks[h][r].numMismatch<=3) {
+ selReads.insert(int(r));
+ sel = 1;
+ }
+ if (debugfh) cout << "sel: " << "h: " << h << " " << bam1_qname(reads[r].getBam()) << " mpos: " << reads[r].matePos << " selected: " << sel << endl;
+
+ }
+
+ // check each variant in the haplotype
+
+ bool allCovered = true; // all variants in haplotype should be covered by at least one read.
+ for (map<int, AlignedVariant>::const_iterator it = haps[h].indels.begin();it!= haps[h].indels.end();it++) {
+ const AlignedVariant & av = it->second;
+
+ PAV pav(it->first, av);
+
+ // make sure every variant has a (possibly empty) coverage entry
+ map<PAV,vector<std::set<int> > >::iterator pit = hVarCoverage.find(pav);
+ if (pit == hVarCoverage.end()) {
+ hVarCoverage[pav] = vector< std::set<int> >(haps.size()*2);
+ }
+
+ if (av.getType() == Variant::INS || av.getType() == Variant::DEL) {
+ int left = av.getLeftFlankRead() - params.obsParams.padCover; // readFlankLeft is the first left unique base flanking the indel
+ int right = av.getRightFlankRead() + params.obsParams.padCover;
+
+ int len = right-left+1;
+ bool covered = false;
+ int numdelcovered = 0;
+ //cout << "left: " << left << " right: " << right << " len: " << len << endl;
+ if (av.getType() == Variant::DEL) {
+ // see if there is at least one read spanning the interval with at most maxMismatch mismatches
+ BOOST_FOREACH(int r, selReads) {
+ std::set <int> c;
+ // determine read strand; unmapped reads take the strand opposite to their mate
+ int strand = 0;
+ if (reads[r].isUnmapped()) {
+ if (!reads[r].mateIsReverse()) strand = 1;
+ } else {
+ if (reads[r].isReverse()) strand = 1;
+ }
+
+ int nmm = 0;
+ // iterate over the read bases; the bound must be strict, hpos[size()] does not exist
+ for (int b=0;b<int(liks[h][r].hpos.size());b++) {
+ int hb = liks[h][r].hpos[b];
+ if (hb>=left && hb<=right) {
+ c.insert(hb);
+ if ( haps[h].seq[hb]!='N' && haps[h].seq[hb]!=reads[r].seq.seq[b]) nmm++;
+ }
+
+ }
+ if (int(c.size())>= len && nmm<=params.obsParams.maxMismatch) {
+ numdelcovered++;
+ hVarCoverage[pav][h+strand*numHaps].insert(r);
+ }
+ }
+
+ if (numdelcovered>=1) {
+ covered = true;
+ }
+ } else if (av.getType() == Variant::INS) {
+ // see if all bases in the haplotype from left to right are covered by at least one read that matches the haplotype without indels and at most maxMismatch mismatches
+ vector<int> hapBaseCovered(len, 0);
+ vector<int> thisReadCovered(len, 0);
+ int lenins = av.getSeq().size();
+
+ BOOST_FOREACH(int r, selReads) {
+ for (int x=0;x<len;x++) thisReadCovered[x]=0;
+ int nmm = 0;
+ std::set <int> c;
+
+ // determine read strand
+ int strand = 0;
+ if (reads[r].isUnmapped()) {
+ if (!reads[r].mateIsReverse()) strand = 1;
+ } else {
+ if (reads[r].isReverse()) strand = 1;
+ }
+
+ // iterate over the read bases; the bound must be strict, hpos[size()] does not exist
+ for (int b=0;b<int(liks[h][r].hpos.size());b++) {
+ int hb = liks[h][r].hpos[b];
+ if (hb>=left && hb<=right) {
+ // covered even if there is a mismatch
+ thisReadCovered[hb-left]+=1;
+ c.insert(hb);
+ // count number of mismatches
+ if (haps[h].seq[hb]!=reads[r].seq.seq[b]) nmm++;
+ }
+ }
+
+ bool thisread_covered = false;
+ // for insertion <= 10 bp, whole insertion+padCover must be covered with at most maxMismatch errors by at least one read (ie not just covered by multiple reads together)
+ if ( (lenins>10 && nmm<=params.obsParams.maxMismatch) || (lenins<=10 && int(c.size())>=len && nmm<=params.obsParams.maxMismatch)) {
+ thisread_covered = true;
+ for (size_t x=0;x<thisReadCovered.size();x++) {
+ hapBaseCovered[x] += thisReadCovered[x];
+ if (thisReadCovered[x]==0) {
+ thisread_covered = false;
+ }
+ if (debugfh) cout << " " << hapBaseCovered[x];
+ }
+ if (thisread_covered) {
+ hVarCoverage[pav][h+strand*numHaps].insert(r);
+ }
+ }
+
+
+ if (0) cout << " hap " << h << " var: " << av.getString() << " len: " << len << " " << bam1_qname(reads[r].getBam()) << " nmm: " << nmm << " c.size(): " << c.size() << " mpos: " << reads[r].matePos << " covered: " << thisread_covered << endl;
+ if (thisread_covered) covered=true;
+
+
+ }
+ }
+
+ if (!covered) {
+ // NOTE(review): this stops at the first uncovered indel even when doFilter is false,
+ // so later variants of this haplotype get no coverage from it -- confirm this is intended.
+ allCovered = false;
+ break;
+ }
+ if (debugfh) cout << "hap" << h << " var: " << av.getString() << " COVERED:" << covered << endl;
+
+ }
+
+ }
+ if (doFilter) {
+ if (!allCovered) {
+ numFiltered++;
+ filtered[h]=1;
+ }
+ }
+ if (debugfh) cout << "Haplotype[" << h << "]: filtered " << filtered[h] << endl;
+
+ }
+ cout << "Filtered " << numFiltered << " haplotypes." << endl;
+ // determine coverage of each variant: union of supporting reads over all unfiltered haplotypes
+ for (map<PAV, vector <std::set<int> > >::const_iterator it = hVarCoverage.begin();it != hVarCoverage.end(); it++) {
+ const PAV & pav = it->first;
+ const vector< std::set<int> > & perHap = it->second;
+ std::set<int> rf, rr; // forward and reverse strand reads
+ for (int h=0;h<numHaps;h++) if (filtered[h]!=1) {
+ rf.insert(perHap[h].begin(), perHap[h].end());
+ rr.insert(perHap[h+numHaps].begin(), perHap[h+numHaps].end());
+ }
+ varCoverage[pav]=VariantCoverage(int(rf.size()), int(rr.size()));
+ }
+
+
+}
+#endif
+
+// Estimates haplotype frequencies and per-variant posteriors from the
+// read-haplotype alignment likelihoods.
+//
+// Outline:
+//  1. Copy liks[h][r].ll into a flat matrix rl[r*nh+h] and count the reads that
+//     are off all haplotypes.
+//  2. Call filterHaplotypes(); haplotypes marked as filtered are excluded.
+//  3. Build one or more sets of "active" variants depending on `program`
+//     ("all", "singlevariant" or "priorpersite"; anything else prints an error
+//     and calls exit(1)).
+//  4. For every active set run an EM loop with a Dirichlet prior
+//     (params.bayesa0) over the haplotypes compatible with that set; the loop
+//     stops when the change of the expected log-likelihood is below
+//     params.EMtol or after more than 25 iterations.
+//  5. Average the per-set frequencies, weighted by likelihood*prior, into
+//     hapFreqs, and compute for every variant a posterior probability and an
+//     estimated frequency.
+//  6. Optionally write genotype likelihood lines (params.outputGLF) and debug
+//     files (params.outputPooledLikelihoods).
+//
+// Parameters:
+//   haps, reads, liks   haplotypes, reads and alignments; liks is indexed [haplotype][read].
+//   hapFreqs            output; overwritten with one frequency per haplotype.
+//   posteriors          output; one HapEstResult per variant is appended (the vector is not cleared).
+//   candPos             candidate position, used for reporting and for the debug file names.
+//   leftPos, rightPos   window coordinates; leftPos is added to haplotype-relative positions.
+//   glfData             sink for the GLF output lines.
+//   index               value written to the "index" column of the GLF output.
+//   candidateVariants   candidate variants; used to look up prior frequencies and to decide
+//                       whether a variant is reported in the GLF output.
+//   program             selects how the active variant sets are built (see 3).
+//
+// Throws std::string when one of the debug files cannot be opened for writing.
+void DetInDel::estimateHaplotypeFrequenciesBayesEM(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs, vector <HapEstResult > & posteriors, uint32_t candPos, uint32_t leftPos, uint32_t rightPos, OutputData & glfData, int index, const AlignedCandidates & candidateVariants, string program="all")
+
+{
+
+ // estimate haplotype frequencies using EM
+ hapFreqs.clear();
+
+ size_t nh=haps.size();
+ size_t nr=reads.size();
+
+ vector<double> rl(nh*nr,0.0); // read given haplotype likelihoods, indexed [r*nh+h]
+
+ vector<double> z(nh*nr,0.0); // expectations of read-haplotype indicator variables
+ vector<double> pi(nh); // log haplotype frequencies
+ vector<double> lpi(nh); // expectation of log frequencies
+ vector<double> nk(nh,0.0), ak(nh,0.0); // counts for each haplotype
+
+ hapFreqs=nk;
+
+ int numUnmappedRealigned=0;
+ int idx=0;
+ int numReadOffAllHaps=0;
+ for (size_t r=0;r<nr;r++) {
+ int offallhap=1;
+ for (size_t h=0;h<nh;h++) {
+ // initialize read-haplotype likelihoods
+ rl[idx]=liks[h][r].ll;
+ if (!liks[h][r].offHap) offallhap=0;
+ idx++;
+ }
+ if (offallhap) {
+ numReadOffAllHaps++;
+
+ }else {
+ if (reads[r].isUnmapped()) numUnmappedRealigned++;
+ }
+ }
+
+
+ // filter reads
+
+ vector<int> filtered(nh, 0);
+ map<pair<int, AlignedVariant>, VariantCoverage> varCoverage;
+
+ // filterHaplotypes is always called because it also computes varCoverage;
+ // params.filterHaplotypes only controls whether haplotypes are actually marked as filtered.
+ //if (params.filterHaplotypes) {
+ cout << "ALWAYS CALLING ::filterHaplotypes" << endl;
+ filterHaplotypes(haps, reads,liks, filtered, varCoverage, params.filterHaplotypes);
+ //}
+
+
+ typedef pair<int, AlignedVariant> PAV;
+
+ std::set< PAV > allVariants;
+ map<int, std::set<PAV> > allVariantsByPos; //
+
+ typedef map<int, AlignedVariant>::const_iterator It;
+ typedef map<int, std::set<PAV> >::const_iterator PIt;
+
+ // collect all non-reference variants, skipping SNPs whose string has 'D' at index 3
+ for (size_t th=0;th<nh;th++) {
+ const Haplotype & hap=haps[th];
+ for (It it=hap.indels.begin();it!=hap.indels.end();it++) {
+ if (!it->second.isRef() && !(it->second.isSNP() && it->second.getString()[3]=='D')) {
+ allVariants.insert(PAV(it->first,it->second));
+ allVariantsByPos[it->first].insert(PAV(it->first,it->second));
+ }
+ }
+ }
+
+ // set active variants, and divide into snps and indels
+ vector< std::set< PAV > > activeVariants, activeSNPs, activeIndels;
+
+ int nav=0;
+ int PRID=-1;
+ if (program=="all") {
+ // a single set in which all variants are active
+ std::set<PAV> snps, indels;
+ BOOST_FOREACH(PAV pav, allVariants) {
+ if (pav.second.isSNP()) {
+ snps.insert(pav);
+ } else if (pav.second.isIndel()) {
+ indels.insert(pav);
+ }
+ }
+
+ // both (double prior)
+ activeVariants.push_back(allVariants);
+ activeIndels.push_back(indels);
+ activeSNPs.push_back(snps);
+ nav++;
+ PRID=1;
+ } else if (program=="singlevariant") {
+ // one set per distinct variant combination found on an unfiltered haplotype
+ std::set < std::set<PAV> > ssPAV;
+ for (size_t h=0;h<haps.size();h++) if (filtered[h]==0) {
+ const Haplotype & hap=haps[h];
+
+
+ //cout << "hap[" << h << "].seq: " << hap.seq << endl;
+
+ //cout << "vars:";
+ std::set<PAV> act;
+ for (It it=hap.indels.begin();it!=hap.indels.end();it++) {
+ if (!it->second.isRef() && !(it->second.isSNP() && it->second.getString()[3]=='D')) {
+ act.insert(PAV(it->first, it->second));
+ // cout << "[" << it->first << "," << it->second.getString() << "]";
+ }
+ }
+ //cout << endl;
+ ssPAV.insert(act);
+ }
+
+ nav=0;
+ BOOST_FOREACH(std::set<PAV> s, ssPAV) {
+ std::set<PAV> snps, indels;
+ BOOST_FOREACH(PAV pav, s) {
+ if (pav.second.isSNP()) {
+ snps.insert(pav);
+ } else if (pav.second.isIndel()) {
+ indels.insert(pav);
+ }
+ }
+
+ // both (double prior)
+ activeVariants.push_back(s);
+ activeIndels.push_back(indels);
+ activeSNPs.push_back(snps);
+ nav++;
+ }
+
+ PRID=2;
+ } else if (program == "priorpersite") {
+ nav = 0;
+
+ // add reference haplotype
+ activeVariants.push_back(std::set<PAV>());
+ activeIndels.push_back(std::set<PAV>());
+ activeSNPs.push_back(std::set<PAV>());
+
+
+
+ // every site doubles the number of sets: each existing set is kept and also
+ // extended with all variants at this site
+ for (map<int, std::set<PAV> >::const_iterator site_it = allVariantsByPos.begin();site_it!=allVariantsByPos.end();site_it++) {
+ std::set<PAV> snps, indels;
+ BOOST_FOREACH(PAV pav, site_it->second) {
+ if (pav.second.isSNP()) snps.insert(pav);
+ else if (pav.second.isIndel()) indels.insert(pav);
+ }
+
+ int maxStateSnp = (snps.size()==0)?1:2;
+ int maxStateIndel = (indels.size()==0)?1:2;
+
+ int prevNumActive = activeVariants.size();
+ int sSnp = 1, sIndel = 1;
+ //for (int sSnp = 0;sSnp<maxStateSnp;sSnp++) {
+ // for (int sIndel = 0; sIndel < maxStateIndel; sIndel++) {
+ //
+ // if (sSnp == 1 || sIndel == 1) {
+ // extend previous activeVariants
+
+ for (int pna = 0;pna<prevNumActive;pna++) {
+ std::set<PAV> av = activeVariants[pna];
+ std::set<PAV> aIndels = activeIndels[pna];
+ std::set<PAV> aSNPs = activeSNPs[pna];
+ if (sSnp == 1) {
+ av.insert(snps.begin(),snps.end());
+ aSNPs.insert(snps.begin(),snps.end());
+ }
+ if (sIndel == 1) {
+ av.insert(indels.begin(),indels.end());
+ aIndels.insert(indels.begin(),indels.end());
+ }
+
+ activeVariants.push_back(av);
+ activeIndels.push_back(aIndels);
+ activeSNPs.push_back(aSNPs);
+ }
+
+ // }
+ // }
+ // }
+
+ }
+ nav = activeVariants.size();
+
+ PRID = 3;
+
+ } else {
+ cerr << "Unknown EM option" << endl;
+ exit(1);
+ }
+
+ vector<int> compatible(nh,0);
+ vector<double> logliks(nav,0.0);
+ vector<double> logpriors(nav, 0.0);
+ vector<double> post(nav,0.0);
+ vector<double> freqs(nav*nh,0.0);
+
+ // create matrix of which variant is active in which set
+ idx=0;
+ int nv=int(allVariants.size());
+ vector<int> active(nav*nv,0), hapHasVar(nh*nv,0);
+
+ //cout << "active: " << endl;
+ BOOST_FOREACH(PAV pav, allVariants) {
+ // cout << pav.first << " " << pav.second.getString() << " ";
+ for (int a=0;a<nav;a++) {
+ if (activeVariants[a].find(pav)!=activeVariants[a].end()) active[a*nv+idx]=1;
+ // cout << " " << active[a*nv+idx];
+ }
+ // cout << endl;
+ for (size_t h=0;h<nh;h++) {
+ It it=haps[h].indels.find(pav.first);
+ if (it!=haps[h].indels.end() && it->second.getString()==pav.second.getString()) hapHasVar[h*nv+idx]=1;
+// cout << "[" << active[h*nv+idx] << " " << hapHasVar[h*nv+idx]<< " ]";
+
+ }
+// cout << endl;
+ idx++;
+ }
+
+ /*
+ cout << "allVariants: ";
+ BOOST_FOREACH(PAV pav, allVariants) {
+ cout << " [" << pav.first << " " << pav.second.getString() << "]";
+ }
+ cout << endl;
+ */
+
+
+
+
+
+ double logz=-HUGE_VAL; // log of the sum over active sets of likelihood*prior
+
+ double a0=params.bayesa0;
+
+ for (int th=0;th<nav;th++) {
+
+ // set active variants
+
+ double logprior=0.0;
+
+ // the log prior of a set is the sum of the log priors of its active variants:
+ // the candidate frequency if the candidate is known with a non-negative frequency,
+ // otherwise params.priorSNP / params.priorIndel
+ map <int, int> sites;
+ BOOST_FOREACH(PAV pav, activeSNPs[th]) {
+ sites[pav.first]=1;
+
+ const AlignedVariant & avar = pav.second;
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ int type = pav.second.getType();
+
+ double lnf = 0.0;
+
+ if (type==Variant::SNP) lnf = log(params.priorSNP); else if (type==Variant::DEL || type==Variant::INS) lnf = log(params.priorIndel);
+ if (av==NULL) {
+ logprior += lnf;
+ } else {
+ double prior = av->getFreq();
+ if (prior<0.0) logprior += lnf; else logprior+=log(prior);
+ }
+
+ }
+ BOOST_FOREACH(PAV pav, activeIndels[th]) {
+
+ const AlignedVariant & avar = pav.second;
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ int type = pav.second.getType();
+
+ double lnf = 0.0;
+
+ if (type==Variant::SNP) lnf = log(params.priorSNP); else if (type==Variant::DEL || type==Variant::INS) lnf = log(params.priorIndel);
+ if (av==NULL) {
+ logprior += lnf;
+ } else {
+ double prior = av->getFreq();
+ if (prior<0.0) logprior += lnf; else logprior+=log(prior);
+ }
+
+ sites[pav.first]=2;
+ }
+
+
+ /*
+ for (map<int,int>::const_iterator it=sites.begin();it!=sites.end();it++) {
+ if (it->second==2) logprior+=log(params.priorIndel); else if (it->second==1) logprior+=log(params.priorSNP);
+ }
+ */
+
+ logpriors[th]=logprior;
+
+// cout << "Number of indels: " << ni << " number of SNPs: " << ns << endl;
+
+ // check haplotypes
+
+ int numah=0; // number of haplotypes for which frequencies will be estimated
+ for (size_t h=0;h<nh;h++) {
+ compatible[h]=1;
+ if (filtered[h]!=0) {
+ compatible[h]=0;
+ } else {
+ for (It it=haps[h].indels.begin();it!=haps[h].indels.end();it++) {
+ if (!it->second.isRef() && !(it->second.isSNP() && it->second.getString()[3]=='D') && activeVariants[th].find(PAV(it->first,it->second))==activeVariants[th].end()) {
+ // haplotype h has a non-ref variant that is not one of the active variants
+ compatible[h]=0;
+ break;
+ }
+ }
+ }
+ if (compatible[h]) numah++;
+ }
+
+
+ // run EM for this set of active variants
+ bool converged=false;
+ double tol=params.EMtol;
+
+ double eOld=-HUGE_VAL, eNew;
+
+ // initialize frequencies: uniform over compatible haplotypes, -100 (log scale) for the others
+ for (size_t h=0;h<nh;h++) if (compatible[h]) lpi[h]=log(1.0/double(numah)); else lpi[h]=-100;
+
+ /*
+ for (size_t r=0;r<nr;r++) {
+ cout << "rl[" << r << "]:";
+ for (size_t h=0;h<nh;h++) {
+ cout << " " << rl[r*nh+h];
+ }
+ cout << endl;
+ }
+ */
+ double loglik, llNew, llOld=-HUGE_VAL;
+ int iter=0;
+ while (!converged) {
+ //cout << endl << "EM[" << iter << "]:" << endl;
+ // compute expectation of indicator variables
+ for (size_t h=0;h<nh;h++) nk[h]=0.0;
+
+ loglik=0.0;
+
+ int idx=0;
+ for (size_t r=0;r<nr;r++) {
+ double lognorm=-HUGE_VAL;
+ // compute responsibilities
+ for (size_t h=0;h<nh;h++) {
+ z[idx]=lpi[h]+(rl[idx]);
+ lognorm=addLogs(lognorm, z[idx]);
+ idx++;
+ }
+ idx-=nh;
+ // normalize and compute counts
+ for (size_t h=0;h<nh;h++) {
+ z[idx]-=lognorm;
+ z[idx]=exp(z[idx]);
+ nk[h]+=z[idx];
+ idx++;
+ }
+ loglik+=lognorm;
+
+ }
+ // compute frequencies
+ //cout << "pi: ";
+
+ double ahat=0.0;
+ for (size_t h=0;h<nh;h++) if (compatible[h]) {
+ ak[h]=nk[h]+a0; // a0 is Dirichlet prior parameter
+ ahat+=ak[h];
+ }
+ double dahat=boost::math::digamma(ahat);
+
+ for (size_t h=0;h<nh;h++) {
+
+ // do variational bayes
+ if (compatible[h]) {
+ lpi[h]=boost::math::digamma(ak[h])-dahat;
+ pi[h]=log((a0+nk[h])/(double(numah)*a0+double(nr)));
+ } else {
+ lpi[h]=-100;
+ pi[h]=-100;
+ }
+ // cout << " " << pi[h];
+ // zp+=exp(pi[h]);
+ }
+ //cout << " zp: " << zp << endl;
+
+
+ // expected complete-data log-likelihood, used as the convergence criterion
+ idx=0;
+ eNew=0.0;
+ for (size_t r=0;r<nr;r++) {
+ for (size_t h=0;h<nh;h++) {
+ // compute responsibilities
+ eNew+=z[idx]*( pi[h]+rl[idx]);
+ idx++;
+ }
+ }
+ //cout << "eOld: " << eOld << " " << " eNew: " << eNew; for (size_t h=0;h<nh;h++) { cout << " " << pi[h]; } cout << endl;
+ /*
+ for (size_t r=0;r<nr;r++) {
+ cout << "z[" << r << "]:";
+ for (size_t h=0;h<nh;h++) {
+ cout << " " << z[r*nh+h];
+ }
+ cout << endl;
+ }
+ */
+ llNew=loglik;
+ //cout << "loglik: " << loglik << endl;
+ if (0 && llOld>llNew+1e-5) {
+ cerr << "ERROR: nr: " << nr << " eOld: " << eOld << " eNew: " << eNew << " diff: " << eOld-eNew << endl;
+ cout << "ERROR: nr: " << nr << " eOld: " << eOld << " eNew: " << eNew << " diff: " << eOld-eNew << endl;
+ cerr << "ERROR: nr: " << nr << " llOld: " << llOld << " eNew: " << llNew << " diff: " << llOld-llNew << endl;
+ cout << "ERROR: nr: " << nr << " llOld: " << llOld << " eNew: " << llNew << " diff: " << llOld-llNew << endl;
+
+ //throw string("EM Error in estimateHapFreqs");
+ //iter=100;
+
+ }
+ converged=(fabs(eOld-eNew))<tol || iter>25;
+ //cout << "iter: " << iter << " eOld: " << eOld << " eNew: " << eNew << endl;
+
+ eOld=eNew;
+ llOld=llNew;
+ iter++;
+
+
+ }
+
+
+ // check sum
+
+ // normalisation constant of the frequencies; this local double shadows the vector z
+ // for the remainder of this loop iteration
+ double z=0.0;
+ for (size_t y=0;y<nh;y++) {
+ z+=exp(pi[y]);
+ }
+
+ if (0) {
+ cout << "th: " << th << endl;
+ for (size_t y=0;y<nh;y++) {
+ cout << "[" << y << "," << exp(pi[y]) << "]";
+ }
+ cout << endl << endl;
+ }
+
+ logliks[th]=loglik;
+ logz=addLogs(logz, logliks[th]+logprior);
+ for (size_t h=0;h<nh;h++) { freqs[th*nh+h]=exp(pi[h])/z; }
+ //for (size_t h=0;h<nh;h++) { cout << " " << freqs[th*nh+h]; } cout << endl;
+
+ //cout << "loglik: " << loglik << " " << logliks[th] << " logprior: " << logprior << endl << endl;
+
+ }
+
+ for (int a=0;a<nav;a++) {
+ post[a]=exp(logliks[a]+logpriors[a]-logz);
+ //cout << "post[" << a << "]: " << post[a] << endl;
+ }
+
+ for (size_t h=0;h<nh;h++) {
+ hapFreqs[h]=0.0;
+ }
+
+ // average the per-set frequencies, weighted by the posterior of each set
+ for (int th=0;th<nav;th++) for (size_t h=0;h<nh;h++) {
+
+ hapFreqs[h]+=exp(logliks[th]+logpriors[th]-logz)*freqs[th*nh+h];
+ }
+
+ /*
+ cout << "hapFreqs:\n ";
+ for (size_t th=0;th<nh;th++) {
+ cout << "hapFreq[" << th << "]: " << hapFreqs[th] << endl;
+ cout << "H" << th << ": " << logliks[th] << " " << logpriors[th] << " ";
+ for (size_t h=0;h<nh;h++) {
+ cout << " " << freqs[th*nh+h];
+ }
+ cout << endl;
+ }
+ */
+
+ cout << endl;
+
+ // compute marginal posteriors for the individual variants
+ vector< std::set<int> > readidx(myBams.size()); // read indices grouped by reads[r].poolID
+ for (int r=0;r<int(nr);r++) readidx[reads[r].poolID].insert(r);
+
+ idx=-1;
+ BOOST_FOREACH(PAV pav, allVariants) {
+ idx++;
+
+ double logp=-HUGE_VAL;
+ double freq=0.0;
+ for (int th=0;th<nav;th++) {
+ if (active[th*nv+idx]) {
+ logp=addLogs(logp, logliks[th]+logpriors[th]);
+ }
+ }
+
+ for (size_t h=0;h<nh;h++) {
+ if (hapHasVar[h*nv+idx]) {
+ freq+=hapFreqs[h];
+ }
+ }
+
+ logp-=logz;
+
+ // compute marginalized haplotype frequencies
+ vector<double> prior(nh*nh,0.0);
+
+ bool doGLF=false; //(candPos==leftPos+pav.first)?true:false;
+
+ // GLF output is only produced for variants that are present in the candidate set
+ const AlignedVariant & avar = pav.second;
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ doGLF = (av==NULL)?false:true;
+
+ //change this
+if (params.outputGLF && doGLF) {
+ map<int,int> otn; // old to new haplotype index
+ // vector for mapping old haplotype index to new haplotype index
+ vector<int> marsum(nv, 0);
+ int s=1;
+ for (int y=0;y<nv;y++) {
+ if (y!=idx) {
+ marsum[y]=s;
+ s*=2;
+ } else marsum[y]=0;
+ // cout << "marsum[" << y << "]: " << marsum[y] << endl;
+ }
+
+
+ // make a map of new marginalized states to the corresponding new index in the haplotype arrays.
+ map<int, int> mar_states;
+ for (int h=0;h<int(nh);h++) {
+ int nidx=0;
+ for (int v=0;v<nv;v++) nidx+=marsum[v]*hapHasVar[h*nv+v];
+ map<int,int>::iterator it=mar_states.find(nidx);
+ if (it!=mar_states.end()) otn[h]=it->second; else {
+ int ns=int(mar_states.size());
+ mar_states[nidx]=ns;
+ otn[h]=ns;
+ }
+ // cout << "oth["<<h<<"]: " << otn[h] << endl;
+ }
+
+ int nmarhap=mar_states.size();
+
+ vector<double> marFreqs(nmarhap,0.0); //marginal frequencies
+ for (size_t h=0;h<nh;h++) {
+ int newh=otn[h];
+ marFreqs[newh]+=hapFreqs[h];
+ }
+ // convert back to conditional frequencies/probabilities (log scale, floored at -50)
+ for (int h=0;h<nmarhap;h++) {
+ if (marFreqs[h]<1e-16) marFreqs[h]=-50; else marFreqs[h]=log(marFreqs[h]);
+ }
+ // cout << "marFreq[]: "; for (size_t x=0;x<nmarhap;x++) cout << " " << marFreqs[x]; cout << endl;
+
+ // compute for every pair of haplotypes the likelihood of drawing it given priors on the variants and the estimated frequencies
+ for (size_t h1=0;h1<nh;h1++) {
+ for (size_t h2=h1;h2<nh;h2++) {
+ prior[h1*nh+h2]=marFreqs[otn[h1]]+marFreqs[otn[h2]];
+ // cout << "prior: " << h1 << " " << h2 << ": " << prior[h1*nh+h2] << endl;
+ }
+ }
+}
+ // count how many reads support a haplotype with this variant
+
+ std::set<int> support;
+ int totnf=0, totnr=0;
+ double log5=log(0.5);
+ if (1) for (size_t b=0;b<myBams.size();b++) {
+ double msq=0.0;
+
+ int nf=0;
+ int nr=0; // number of reverse-strand supporting reads; shadows the read count nr inside this loop
+ vector<double> lik(3,0.0);
+
+ if (readidx[b].size()) {
+ // compute RMS of mapping qualities
+
+ int pos=pav.first;
+ msq=0.0;
+ int n=0;
+ if (params.outputGLF && doGLF) {
+ // genotype likelihoods: index 0, 1, 2 is the number of haplotypes in the pair that carry the variant
+ for (int i=0;i<3;i++) lik[i]=-HUGE_VAL;
+ for (size_t h1=0;h1<nh;h1++) for (size_t h2=h1;h2<nh;h2++) {
+ int genotype=hapHasVar[h1*nv+idx]+hapHasVar[h2*nv+idx];
+ double pr=prior[h1*nh+h2];
+ // cout << "prior: " << h1 << " " << h2 << ": " << prior[h1*nh+h2];
+ double ll=pr;
+ BOOST_FOREACH(int r, readidx[b]) {
+ ll+=log5+addLogs(rl[r*nh+h1],rl[r*nh+h2]);
+ }
+
+ lik[genotype]=addLogs(lik[genotype],ll);
+ // cout << " ll: " << ll << " " << lik[genotype] << endl;
+ }
+ }
+
+ BOOST_FOREACH(int r, readidx[b]) {
+ //cout << " " << 10*log10(1-reads[r].mapQual);
+
+ // find the maximum likelihood and all haplotypes within 1e-7 of it
+ double ml=-HUGE_VAL;
+ std::set<size_t> mlis;
+ for (size_t hi=0;hi<nh;hi++) {
+ if (liks[hi][r].ll>=ml) {
+ ml=liks[hi][r].ll;
+ }
+ }
+ for (size_t hi=0;hi<nh;hi++) {
+ if (liks[hi][r].ll>=ml-1e-7) {
+ mlis.insert(hi);
+ }
+ }
+ bool nrt=false, nft=false;
+
+ map<int,bool>::const_iterator it;
+ BOOST_FOREACH(size_t h, mlis) {
+ bool covered=false;
+ if (pav.second.isIndel()) {
+ it=liks[h][r].hapIndelCovered.find(pos);
+ if (it!=liks[h][r].hapIndelCovered.end() && it->second) covered=true;
+ } else if (pav.second.isSNP()) {
+ it=liks[h][r].hapSNPCovered.find(pos);
+ if (it!=liks[h][r].hapSNPCovered.end() && it->second) covered=true;
+ }
+ if (covered) { // hapHasVar[] is to check whether haplotype truly has variant or only the reference
+ if (hapHasVar[h*nv+idx]) {
+ //if (pav.first+leftPos==43017596) {
+ // cout << "h: " << h << " r: " << r << " read: " << reads[r] << endl;
+ //}
+
+
+ if (reads[r].onReverseStrand) nrt=true; else nft=true;
+ }
+
+ }
+ }
+ double mq=-10*log10(1.0-reads[r].mapQual);
+ msq+=mq*mq;
+ n++;
+ if (nft) nf++;
+ if (nrt) nr++;
+ } // foreach read r
+ // cout << endl;
+ if (n!=0) msq=sqrt(msq/double(n)); else msq=0.0;
+ if (nf+nr>0) support.insert(b);
+ totnf+=nf;
+ totnr+=nr;
+ } // readidx[b].size()
+
+ if (params.outputGLF && doGLF) {
+ OutputData::Line line(glfData);
+ line.set("msg", "ok");
+ line.set("index", index);
+ line.set("tid", params.tid);
+ line.set("analysis_type",program);
+ line.set("indidx",b);
+ line.set("was_candidate_in_window",1);
+ line.set("lpos",leftPos);
+ line.set("rpos",rightPos);
+ line.set("center_position",candPos);
+ line.set("realigned_position",pav.first+leftPos);
+ line.set("post_prob_variant", exp(logp));
+ line.set("est_freq", freq);
+ line.set("logZ", logz);
+ line.set("nref_all", pav.second.getString());
+ line.set("num_reads", readidx[b].size());
+ line.set("msq",msq);
+ line.set("num_cover_forward", nf);
+ line.set("num_cover_reverse", nr);
+ line.set("num_unmapped_realigned", numUnmappedRealigned);
+ line.set("var_coverage_forward", varCoverage[pav].nf);
+ line.set("var_coverage_reverse", varCoverage[pav].nr);
+
+ if (b==0) {
+ // output haplotypes and frequencies
+
+ ostringstream os;
+ bool ifh = true;
+ for (size_t h=0;h<haps.size();h++) {
+ if (hapFreqs[h]>1.0/double(2*reads.size())) {
+ bool isfirst = true;
+ if (!ifh) os << ";";
+ ifh = false;
+ int nvars = 0;
+ for (map<int, AlignedVariant>::const_iterator it=haps[h].indels.begin();it!=haps[h].indels.end();it++) if (it->second.getString()!="*REF") {
+ nvars++;
+ if (!isfirst) os << ",";
+ isfirst = false;
+ os << leftPos+it->first << "," << it->second.getString();
+ }
+ if (nvars==0) os << "REF";
+ os << ":" << hapFreqs[h];
+ }
+ }
+ line.set("hapfreqs", os.str());
+
+ }
+
+ string likstring;
+
+ for (int n=0;n<3;n++) {
+ ostringstream o;
+ o << lik[n];
+ if (n==0) likstring.append("0/0:"); else if (n==1) likstring.append("0/1:"); else likstring.append("1/1:");
+ likstring.append(o.str());
+ if (n<2) likstring.append(";");
+ }
+ line.set("glf",likstring);
+ glfData.output(line);
+ }
+ // glfOutput << PRID << " " << b << " " << params.tid << " " << candPos << " " << pav.first+leftPos << " " << pav.second.getString() << " " << reads.size() << " " << msq << " " << nf << " " << nr << " " << lik[0] << " " << lik[1] << " " << lik[2] << endl;
+
+
+ } // foreach b
+ posteriors.push_back(HapEstResult(pav.second, pav.first,exp(logp),freq, totnf, totnr));
+ }
+
+
+ cout << "candPos: " << candPos << " numReadOffAllHaps: " << numReadOffAllHaps << " logz: " << logz << endl;
+
+
+
+ if (params.outputPooledLikelihoods) {
+
+ // output which variants are active in which haplotype
+
+
+ stringstream os; os << params.fileName << "." << params.tid << "." << candPos;
+ string oprefix = os.str(); // common prefix of all debug files; must not be modified below
+
+ string fname = oprefix;
+ fname.append(".hapvars");
+ ofstream of(fname.c_str());
+ if (!of.is_open()) {
+ throw string("Cannot open file ").append(fname).append(" for writing .hapvars file");
+ }
+
+ idx=0;
+ BOOST_FOREACH(PAV pav, allVariants) {
+ stringstream o;
+ o << params.tid << " " << leftPos+pav.first << " " << pav.second.getString();
+ const string label = o.str();
+ of << label;
+ // pad to 50 columns; labels of 50 or more characters are written unpadded
+ // (computing 50-length for a longer label would wrap around)
+ if (label.length()<50) of << string(50-label.length(),' ');
+ for (size_t h=0;h<nh;h++) {
+ of << " " << hapHasVar[h*nv+idx];
+ }
+
+ of << endl;
+ idx++;
+
+ }
+
+ of.close();
+
+
+ string prefix;
+ stringstream os5; os5 << "EM " << params.tid << " " << candPos << " " << reads.size(); prefix.append(os5.str());
+
+ fname = oprefix;
+ fname.append(".hapfreqs");
+ of.open(fname.c_str());
+ if (!of.is_open()) {
+ throw string("Cannot open file ").append(fname).append(" for writing .hapfreqs file");
+ }
+ outputHapsAndFreqs(&of,prefix,haps,hapFreqs, leftPos);
+ of.close();
+
+ // the extension belongs on the file name, not on the shared prefix
+ fname=oprefix;
+ fname.append(".liks");
+
+ cout << "fname: " << fname << endl;
+ of.open(fname.c_str());
+ if (!of.is_open()) {
+ throw string("Cannot open file ").append(fname).append(" for writing .liks file");
+ }
+
+
+
+ // output all likelihoods
+
+ for (size_t r=0;r<nr;r++) {
+
+ of << r << " " << bam1_qname(reads[r].getBam()) << " " << log(1.0-reads[r].mapQual) << " " << reads[r].poolID;
+
+
+ for (size_t h=0;h<nh;h++) {
+ of << " " << liks[h][r].ll;
+ }
+
+
+ for (size_t h=0;h<nh;h++) {
+ of << " " << liks[h][r].offHap;
+ }
+ of << endl;
+ }
+ of.close();
+
+ // output alignments
+ fname = oprefix;
+ fname.append(".alignments");
+ cout << "fname: " << fname << endl;
+ of.open(fname.c_str());
+ if (!of.is_open()) {
+ throw string("Cannot open file ").append(fname).append(" for writing .alignments file");
+ }
+
+
+
+ // NOTE(review): the alignments below are printed to stdout; nothing is written to the
+ // .alignments file opened above -- confirm whether that is intended.
+ for (size_t r=0;r<nr;r++) {
+ cout << "###" << endl;
+ cout << "read: " << bam1_qname(reads[r].getBam()) << " mpos: " << reads[r].matePos << endl;
+ cout << "isUnmapped: " << reads[r].isUnmapped() << endl;
+ // compute maximum alignment likelihood
+ double lq = 0.0;
+ for (size_t b=0;b<reads[r].size();b++) lq += log(reads[r].qual[b]);
+
+ cout << "Max alignment loglik: " << lq << endl;
+
+ double maxll = -HUGE_VAL;
+ std::set <int> mlhaps;
+ for (int h=nh-1;h>=0;h--) if (liks[h][r]>maxll) { maxll = liks[h][r]; }
+ for (int h=nh-1;h>=0;h--) mlhaps.insert(h); //if (fabs(liks[h][r]-maxll)<0.01) mlhaps.insert(h);
+ BOOST_FOREACH(int hidx, mlhaps) {
+ cout << "r: " << r << " hidx: " << hidx << " maxll:" << maxll << endl;
+ ObservationModelFBMaxErr obs(haps[hidx], reads[r], leftPos, params.obsParams);
+ cout << string(50,' ') << haps[hidx].seq << endl;
+ obs.printAlignment(50);
+ }
+ }
+
+ of.close();
+ }
+}
+
+#ifndef DIPLOIDGLF
+void DetInDel::diploidGLF(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs, vector <HapEstResult > & posteriors, uint32_t candPos, uint32_t leftPos, uint32_t rightPos, OutputData & glfData, int index, const AlignedCandidates & candidateVariants, string program="all")
+
+{
+ size_t nh=haps.size();
+ size_t nr=reads.size();
+
+ vector<int> filtered(nh, 0);
+ map<pair<int, AlignedVariant>, VariantCoverage> varCoverage;
+ filterHaplotypes(haps, reads, liks, filtered, varCoverage, params.filterHaplotypes);
+
+ vector<double> rl(nh*nr,0.0); // read given haplotype likelihoods
+
+ // get read-hap likelihoods and get number of reads that are off all-haplotypes.
+
+ int idx=0;
+ int numReadOffAllHaps=0;
+ for (size_t r=0;r<nr;r++) {
+ int no=1;
+ for (size_t h=0;h<nh;h++) {
+ // initialize read-haplotype likelihoods
+ rl[idx]=liks[h][r].ll;
+ if (!liks[h][r].offHap) no=0;
+ idx++;
+ }
+ if (no) {
+ numReadOffAllHaps++;
+ }
+ }
+
+ // get all variants
+
+ typedef pair<int, AlignedVariant> PAV;
+ const int VARSNP=1;
+ const int VARINDEL=2;
+
+
+ std::set< PAV > allVariants;
+ map<int, std::set<PAV> > allVariantsByPos; //
+
+
+ typedef map<int, AlignedVariant>::const_iterator It;
+ typedef map<int, std::set<PAV> >::const_iterator PIt;
+
+ vector<int> hap_num_indels(nh, 0);
+ vector<int> hap_num_candidate_indels(nh, 0);
+ vector<int> hap_num_snps(nh, 0);
+
+ int ref_hap_idx = -1;
+ for (size_t th=0;th<nh;th++) {
+ const Haplotype & hap=haps[th];
+ //cout << "hap[" << th << "]: ";
+ hap_num_indels[th] = hap.countIndels();
+ hap_num_snps[th] = hap.countSNPs();
+
+ if (hap_num_indels[th] == 0 && hap_num_snps[th] == 0) {
+ //if (ref_hap_idx != -1) cout << string("Already have ref-hap!") << " " << ref_hap_idx << endl;
+ //if (ref_hap_idx!=-1) cout << "RH: " << haps[ref_hap_idx].seq << endl;
+ //cout << "TH: " << haps[th].seq << endl;
+ ref_hap_idx = th;
+ //int h1 = th;
+ //cout << "RRRR IN: " << hap_num_indels[h1] << " SNP: " << hap_num_snps[h1]; cout << " H1:"; for (It it=haps[h1].indels.begin();it!=haps[h1].indels.end();it++) cout << it->first << "," << it->second.getString() << ";" << endl;
+
+ }
+ if (hap_num_indels[th] != 0) {
+ //check how many were candidates in the input file
+ int nc = 0;
+ for (It it=hap.indels.begin();it!=hap.indels.end();it++) {
+ const AlignedVariant & avar = it->second;
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ if (av!=NULL) nc+=1;
+ }
+ hap_num_candidate_indels[th] = nc;
+ }
+
+
+
+ for (It it=hap.indels.begin();it!=hap.indels.end();it++) {
+ if (!it->second.isRef() && !(it->second.isSNP() && it->second.getString()[3]=='D')) {
+ //cout << " " << it->first << "," << it->second.getString();
+ allVariants.insert(PAV(it->first,it->second));
+ allVariantsByPos[it->first].insert(PAV(it->first,it->second));
+ }
+ }
+ //cout << endl;
+ }
+
+ idx=0;
+ map<int,int> posToPosIdx;
+ vector<int> varPositions;
+ for (PIt pit=allVariantsByPos.begin();pit!=allVariantsByPos.end(); pit++) {
+ posToPosIdx[pit->first]=idx;
+ varPositions.push_back(pit->first);
+ idx++;
+ }
+
+ int numVarPos = allVariantsByPos.size();
+ int nv=int(allVariants.size());
+
+ vector<int> hapVar(nh*numVarPos,0);
+ vector<int> varType(nv+1);
+ vector<PAV> variants(nv+1);
+ idx=1;
+
+
+ BOOST_FOREACH(PAV pav, allVariants) {
+ int type=VARSNP;
+ if (pav.second.isIndel()) type=VARINDEL;
+ varType[idx]=type;
+ int posIdx=posToPosIdx[pav.first];
+ for (size_t h=0;h<nh;h++) {
+ It it=haps[h].indels.find(pav.first);
+ if (it!=haps[h].indels.end() && it->second.getString()==pav.second.getString()) hapVar[h*numVarPos+posIdx]=idx;
+ }
+ variants[idx]=pav;
+
+
+
+ idx++;
+ }
+
+
+
+ /*
+ cout << "allVariants: ";
+ BOOST_FOREACH(PAV pav, allVariants) {
+ cout << " [" << pav.first << " " << pav.second.getString() << "]";
+ }
+ cout << endl;
+ */
+
+ // check which how many reads to a haplotype with this variant
+
+ std::set<int> readidx;
+ for (size_t r=0;r<nr;r++) readidx.insert(r);
+
+
+
+ // compute marginal posteriors for the individual variants
+ vector<double> prior(nh*nh,0.0), pairs_posterior(nh*nh, 0);
+ // compute for every pair of haplotypes the likelihood of drawing it given priors on the variants and the estimated frequencies
+ for (size_t h1=0;h1<nh;h1++) {
+ for (size_t h2=h1;h2<nh;h2++) {
+ prior[h1*nh+h2]=getHaplotypePrior(haps[h1],haps[h2], leftPos, candidateVariants);
+ }
+ }
+
+
+ vector<int> max_indel_pair(2,-1);
+ vector<int> max_noindel_pair(2,-1);
+
+ double max_ll_indel = -HUGE_VAL;
+ double max_ll_noindel = -HUGE_VAL;
+ for (size_t h1=0;h1<nh;h1++) if (filtered[h1]==0) for (size_t h2=h1;h2<nh;h2++) if (filtered[h2]==0) {
+ double ll=0.0;
+ for (size_t r=0;r<reads.size();r++){
+ ll+=log(0.5)+addLogs(rl[r*nh+h1],rl[r*nh+h2]);
+ }
+ // now we have the log likelihood, store posterior
+ pairs_posterior[h1*nh+h2] = ll + prior[h1*nh+h2];
+
+ // prinhaplotype pair
+ /*
+ cout << "POST: " << h1 << " " << h2 << " " << pairs_posterior[h1*nh+h2] << " PR: " << prior[h1*nh+h2];
+
+ cout << " IN: " << hap_num_indels[h1] << " SNP: " << hap_num_snps[h1]; cout << " H1:"; for (It it=haps[h1].indels.begin();it!=haps[h1].indels.end();it++) cout << it->first << "," << it->second.getString() << ";";
+ cout << " IN: " << hap_num_indels[h2] << " SNP: " << hap_num_snps[h2]; cout << " H2:"; for (It it=haps[h2].indels.begin();it!=haps[h2].indels.end();it++) cout << it->first << "," << it->second.getString() << ";";
+ cout << endl;
+ */
+
+ if (pairs_posterior[h1*nh+h2]>max_ll_indel && (hap_num_candidate_indels[h1]>0 || hap_num_candidate_indels[h2]>0)) {
+ max_ll_indel = pairs_posterior[h1*nh+h2];
+ max_indel_pair[0] = h1;
+ max_indel_pair[1] = h2;
+ }
+ if (pairs_posterior[h1*nh+h2]>max_ll_noindel && (hap_num_candidate_indels[h1]==0 && hap_num_candidate_indels[h2]==0)) {
+ max_ll_noindel = pairs_posterior[h1*nh+h2];
+ max_noindel_pair[0] = h1;
+ max_noindel_pair[1] = h2;
+ }
+
+ }
+
+ // output map-based variant calls
+
+ double qual = 0.0;
+ double ll_ref = max_ll_noindel;
+ qual = - 10.0*( ll_ref - addLogs(max_ll_indel, ll_ref) )/log(10.0);
+ cout << "ll_ref: " << ll_ref << " max_ll_indel: " << max_ll_indel << " qual: " << qual << endl;
+ if (max_indel_pair[0]==-1 || max_indel_pair[1]==-1) throw string("Could not find indel allele");
+ if (1) {
+ int numUnmappedRealigned = 0;
+ int hx1 = max_indel_pair[0];
+ int hx2 = max_indel_pair[1];
+ for (size_t r=0;r<reads.size();r++) {
+ if (reads[r].isUnmapped()) {
+ if (liks[hx1][r].offHap == false || liks[hx2][r].offHap == false) {
+ numUnmappedRealigned++;
+ }
+ }
+ }
+
+ map<int, std::set<AlignedVariant> > indels;
+ for (int i=0;i<2;i++) {
+
+ const Haplotype & hap = haps[ max_indel_pair[i] ];
+
+ for (It it=hap.indels.begin();it!=hap.indels.end();it++) if (!it->second.isRef() || (it->second.isSNP() && it->second.getString()[3]=='D')){
+ indels[it->first].insert(it->second);
+ }
+ }
+
+ for (map<int, std::set<AlignedVariant> >::const_iterator it = indels.begin();it!=indels.end();it++) {
+
+ double msq = 0;
+ int numf=0, numr=0, n=0;
+ int m =2;
+ if (max_indel_pair[0]==max_indel_pair[1]) m = 1;
+ for (int i=0;i<m;i++) {
+ int h=max_indel_pair[i];
+ It iter = haps[h].indels.find(it->first);
+ if (iter != haps[h].indels.end() && iter->second.isIndel()) {
+
+ for (int r=0;r<nr;r++) {
+ bool covered=false, nft=false, nrt=false;
+
+ map<int, bool>::const_iterator it2=liks[h][r].hapIndelCovered.find(it->first);
+ if (it2!=liks[h][r].hapIndelCovered.end() && it2->second) covered=true;
+
+ if (covered) { // hapHasVar[] is to check whether haplotype truely has variant or only the reference
+ if (reads[r].onReverseStrand) nrt=true; else nft=true;
+ double mq=-10*log10(1.0-reads[r].mapQual);
+ msq+=mq*mq;
+ n++;
+ }
+ if (nft) numf++;
+ if (nrt) numr++;
+ }
+ }
+ }
+
+ if (n!=0) msq=sqrt(msq/double(n)); else msq=0.0;
+
+ int was_candidate = 0;
+
+ // determine genotype
+ const std::set< AlignedVariant> & alleles = it->second;
+ string genotype;
+ std::set<string> all_genotype;
+ string nref_all;
+
+ int vc_f = 0;
+ int vc_r = 0;
+ if (1) {
+ const AlignedVariant & avar = *alleles.begin();
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ if (av!=NULL) was_candidate=1;
+ vc_f += varCoverage[PAV(it->first, avar)].nf;
+ vc_r += varCoverage[PAV(it->first, avar)].nr;
+ }
+
+ string a1="*REF", a2="*REF";
+ bool a1_ref=true, a2_ref=true;
+ It ita1 = haps[hx1].indels.find(it->first);
+ It ita2 = haps[hx2].indels.find(it->first);
+
+ if (ita1 != haps[hx1].indels.end() && !ita1->second.isRef()) {
+ a1 = ita1->second.getString();
+ a1_ref=false;
+ }
+ if (ita2 != haps[hx2].indels.end() && !ita2->second.isRef()) {
+ a2 = ita2->second.getString();
+ a2_ref=false;
+ }
+ all_genotype.insert(a1);
+ all_genotype.insert(a2);
+
+ //cout << "a1: " << a1 << " a2: " << a2 << " a1ref " << a1_ref << " a2ref " << a2_ref << endl;
+
+ if (a1_ref && a2_ref) throw string("genotyping error");
+
+ if (a1==a2) {
+ genotype = string("1/1");
+ nref_all = a1;
+ } else {
+
+ if (a1_ref) {
+ genotype = string("0/1");
+ nref_all = a2;
+ } else if (a2_ref) {
+ genotype = string("0/1");
+ nref_all = a1;
+ } else {
+ nref_all = a1+','+a2;
+ genotype = string("1/2");
+
+ if (1) {
+ const AlignedVariant & avar = *alleles.rbegin();
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ if (av!=NULL) was_candidate=1;
+ vc_f += varCoverage[PAV(it->first, avar)].nf;
+ vc_r += varCoverage[PAV(it->first, avar)].nr;
+ }
+ }
+ }
+
+ // determine genotype quality
+ //cout << "POS: " << it->first << endl;
+ double max_ll_altgeno = -HUGE_VAL;
+ for (size_t h1=0;h1<nh;h1++) if (filtered[h1]==0) for (size_t h2=h1;h2<nh;h2++) if (filtered[h2]==0) {
+ if (!( (h1==hx1 && h2 == hx2) || (h2==hx1 && h1 == hx2))) {
+ std::set<string> _alt_genotype;
+ It it2=haps[h1].indels.find(it->first);
+ if (it2 == haps[h1].indels.end() || it2->second.isRef()) {
+ _alt_genotype.insert(string("*REF"));
+ } else {
+ _alt_genotype.insert(it2->second.getString());
+ }
+ it2=haps[h2].indels.find(it->first);
+ if (it2 == haps[h2].indels.end() || it2->second.isRef()) {
+ _alt_genotype.insert(string("*REF"));
+ } else {
+ _alt_genotype.insert(it2->second.getString());
+ }
+ // cout << "CALLED: " << h1 << " " << h2 << " geno: " << *all_genotype.begin() << " " << *all_genotype.rbegin() << " ALT: " << *_alt_genotype.begin() << " " << *_alt_genotype.rbegin() << " " << pairs_posterior[h1*nh+h2] << " " << max_ll_altgeno << endl;
+
+ if (_alt_genotype != all_genotype && max_ll_altgeno<pairs_posterior[h1*nh+h2]) {
+ max_ll_altgeno = pairs_posterior[h1*nh+h2];
+
+ }
+
+ }
+ }
+ double genoqual = 0.0;
+ genoqual = - 10.0*( max_ll_altgeno - addLogs(max_ll_indel, max_ll_altgeno) )/log(10.0);
+
+
+
+
+
+
+ ostringstream glfs; glfs << genotype << ":" << genoqual;
+
+
+
+ OutputData::Line line(glfData);
+ line.set("msg", "ok");
+ line.set("index", index);
+ line.set("tid", params.tid);
+ line.set("analysis_type",string("dip.map"));
+ line.set("indidx",0);
+ line.set("lpos",leftPos);
+ line.set("rpos",rightPos);
+ line.set("center_position",candPos);
+ line.set("realigned_position",it->first+leftPos);
+ line.set("was_candidate_in_window",was_candidate);
+ line.set("qual", qual );
+ //line.set("post_prob_variant", exp(logp));
+ //line.set("est_freq", freq);
+ line.set("nref_all", nref_all);
+ line.set("num_reads", readidx.size());
+ line.set("msq",msq);
+ //line.set("numOffAll",numOffBoth);
+ //line.set("num_indel",numMappedIndels);
+ line.set("num_cover_forward", numf);
+ line.set("num_cover_reverse", numr);
+ line.set("var_coverage_forward", vc_f);
+ line.set("var_coverage_reverse", vc_r);
+ line.set("num_unmapped_realigned", numUnmappedRealigned);
+ line.set("glf", glfs.str());
+ //line.set("likrr", lik[0]);
+ //line.set("likrn", lik[1]);
+ //line.set("liknn", lik[2]);
+ glfData.output(line);
+ }
+ }
+
+
+ for (map<int, std::set<PAV> >::const_iterator it= allVariantsByPos.begin();it!=allVariantsByPos.end();it++)
+ {
+ bool has_variants_in_window = 0;
+ BOOST_FOREACH(PAV pav, it->second) {
+ const AlignedVariant & avar = pav.second;
+ const AlignedVariant *av = candidateVariants.findVariant(avar.getStartHap()+leftPos, avar.getType(), avar.getString());
+ //cout << "VAR " << avar.getString() << endl;
+ if (av!=NULL) {
+ has_variants_in_window=1;
+ break;
+ }
+ }
+
+
+
+ std::set<int> support;
+ int totnf=0, totnr=0;
+ double log5=log(0.5);
+ int nf=0;
+ int nr=0;
+
+ // compute RMS of mapping qualities
+
+ int pos=it->first;
+ int posIdx = posToPosIdx[pos];
+ double msq=0.0;
+ int n=0;
+
+ //cout << endl;
+ //cout << "pos: " << pos << " posIdx " << posIdx << endl;
+
+ typedef std::set<int> IntGenotype;
+ map<IntGenotype, double> genLiks;
+
+ typedef map<IntGenotype,double>::iterator IGIt;
+
+ double maxll=-HUGE_VAL;
+ int hx1, hx2;
+
+ for (size_t h1=0;h1<nh;h1++) if (filtered[h1]==0) for (size_t h2=h1;h2<nh;h2++) if (filtered[h2]==0) {
+ IntGenotype genotype;
+ int v1=hapVar[h1*numVarPos+posIdx];
+ int v2=hapVar[h2*numVarPos+posIdx];
+
+ genotype.insert(hapVar[h1*numVarPos+posIdx]);
+ genotype.insert(hapVar[h2*numVarPos+posIdx]);
+
+ double logPriorPos=0.0;
+ //cerr << "FIX THIS!\n" << endl;
+// if ( (v1>0 && varType[v1]==VARSNP) || (v2>0 && varType[v2]==VARSNP)) { logPriorPos += log(params.priorSNP); } else if ( (v1>0 && varType[v1]==VARINDEL) || (v2>0 && varType[v2]==VARINDEL) ) { logPriorPos+=log(params.priorIndel); };
+
+ AlignedVariant av1, av2;
+ if (v1) av1=variants[v1].second; else av1=AlignedVariant("*REF",-1);
+ if (v2) av2=variants[v2].second; else av2=AlignedVariant("*REF",-1);
+
+ logPriorPos = getPairPrior(av1,av2,leftPos, candidateVariants);
+ double pr=prior[h1*nh+h2]-logPriorPos; // substract prior for this site to obtain likelihood
+
+ //cout << "prior: " << h1 << " " << h2 << ": " << prior[h1*nh+h2] << " logPriorPos " << logPriorPos << endl;
+ double ll=pr;
+ for (size_t r=0;r<reads.size();r++){
+ ll+=log5+addLogs(rl[r*nh+h1],rl[r*nh+h2]);
+ }
+
+ //cout << "genotype: " << *(genotype.begin()) << " " << *(genotype.rbegin()) << " lik: " << ll <<endl;
+
+
+ IGIt igit = genLiks.find(genotype);
+ if (igit==genLiks.end()) {
+ genLiks[genotype]=ll;
+ } else {
+ genLiks[genotype]=addLogs(genLiks[genotype],ll);
+ }
+
+ if (ll>maxll) {
+ maxll=ll;
+ hx1=h1;
+ hx2=h2;
+ }
+
+ }
+ //cout << "hx1: " << hx1 << " hx2: " << hx2 << " postprob: " << maxll << endl;
+
+ // see how many unmapped reads were realigned to the MAP haplotypes
+
+ int numUnmappedRealigned = 0;
+ for (size_t r=0;r<reads.size();r++) {
+ if (reads[r].isUnmapped()) {
+ if (liks[hx1][r].offHap == false || liks[hx2][r].offHap == false) {
+ numUnmappedRealigned++;
+ }
+ }
+ }
+
+ if (params.outputPooledLikelihoods) {
+ string ofLiksFile = params.fileName;
+ ofLiksFile.append(".check.txt");
+ ofstream ofLiks(ofLiksFile.c_str());
+ if (!ofLiks.is_open()) {
+ throw string("Cannot open file for writing");
+ }
+
+ // output haplotypes
+ //
+ ofLiks << "HAPLOTYPES" << endl;
+ for (size_t h=0;h<haps.size();h++) {
+ ofLiks << h;
+ stringstream varss;
+
+ for(map<int, AlignedVariant>::const_iterator it = haps[h].indels.begin(); it != haps[h].indels.end(); it++) {
+ varss << leftPos+it->first << "," << it->second.getString() << ";";
+ }
+ ofLiks << "\t" << varss.str() << endl;
+ }
+
+ ofLiks << "READS" << endl;
+
+
+ for (size_t r=0;r<reads.size();r++) {
+ int offBoth = 0;
+ if (liks[hx1][r].offHap == true && liks[hx2][r].offHap == true) {
+ offBoth =1;
+ }
+ ofLiks << r << "\t" << bam1_qname(reads[r].getBam()) << "\t" << reads[r].pos << "\t" << reads[r].mapQual;
+ for (size_t h=0;h<haps.size();h++) {
+ ofLiks << "\t" << liks[h][r].ll;
+ }
+ for (size_t h=0;h<haps.size();h++) {
+ ofLiks << "\t" << int(liks[h][r].offHap);
+ }
+ ofLiks << endl;
+
+ }
+ ofLiks.close();
+ }
+#define DEBUGDIPLOIDGLF
+#ifdef DEBUGDIPLOIDGLF
+ if (params.outputPooledLikelihoods) {
+ for (size_t r=0;r<reads.size();r++) {
+ int mhx1 = 0;
+ int mhx2 = 1;
+
+ cout << endl << "**READ** " << r << " " << bam1_qname(reads[r].getBam()) << " mapQual: " << reads[r].mapQual << " liks: " << liks[mhx1][r].ll << " " << liks[mhx2][r].ll << " unMapped: " << reads[r].isUnmapped() << endl;
+
+ if (1) {
+ /*
+ cout << string(50,' '); cout << reads[r].seq.seq << endl;
+ cout << haps[hx1].seq << endl;
+ cout << haps[hx2].seq << endl;
+ */
+ Read newread(reads[r]);
+
+ /*
+ newread.mapQual = 1.0 - 1e-20;
+ newread.complement();
+ newread.reverse();
+ */
+
+ cout << "first: " << endl;
+ ObservationModelFBMaxErr obs(haps[mhx1], newread, leftPos, params.obsParams);
+ cout << string(50,' ') << haps[mhx1].seq << endl;
+ obs.printAlignment(50);
+
+
+ cout << "second: " << endl;
+ ObservationModelFBMaxErr obs2(haps[mhx2], newread, leftPos, params.obsParams);
+ cout << string(50,' ') << haps[mhx2].seq << endl;
+ obs2.printAlignment(50);
+
+
+ } else {
+ int hidx = hx1; if (liks[hx2][r].ll>liks[hx1][r].ll) hidx = hx2;
+ cout << "hidx: " << hidx << " hx1: " << hx1 << " hx2: " << hx2 << endl;
+ ObservationModelFBMaxErr obs(haps[hidx], reads[r], leftPos, params.obsParams);
+ cout << string(50,' ') << haps[hidx].seq << endl;
+ obs.printAlignment(50);
+ }
+ }
+ }
+
+#endif
+
+
+
+
+ double allmsq=0.0;
+ int numMappedIndels=0;
+
+ int nBQT=0, nmmBQT=0;
+ double mLogBQ=0.0;
+ int nMMLeft=0;
+ int nMMRight=0;
+ int numOffBoth =0;
+
+ BOOST_FOREACH(int r, readidx) {
+ double mq=-10*log10(1.0-reads[r].mapQual);
+ allmsq+=(mq*mq);
+ //cout << " " << 10*log10(1-reads[r].mapQual);
+
+ int mlidx; double ml=-HUGE_VAL;
+ std::set<int> mlis;
+
+ if (liks[hx1][r].offHap && liks[hx2][r].offHap) numOffBoth++;
+
+ if (liks[hx1][r].ll>=liks[hx2][r]) {
+ mlidx=hx1;
+ ml=liks[hx1][r].ll;
+ } else {
+ mlidx=hx2;
+ ml=liks[hx2][r].ll;
+ }
+
+ mlis.insert(mlidx);
+
+ bool nrt=false, nft=false;
+
+ map<int,bool>::const_iterator it;
+ BOOST_FOREACH(int h, mlis) {
+ bool covered=false;
+ numMappedIndels += int(liks[h][r].indels.size());
+ nBQT+=liks[h][r].nBQT;
+ nmmBQT+=liks[h][r].nmmBQT;
+ mLogBQ+=liks[h][r].mLogBQ;
+ if (liks[h][r].nMMLeft>=2) nMMLeft++;
+ if (liks[h][r].nMMRight>=2) nMMRight++;
+
+
+ map<int, AlignedVariant>::const_iterator hit=haps[h].indels.find(pos);
+ if (hit->second.isIndel()) {
+ it=liks[h][r].hapIndelCovered.find(pos);
+ if (it!=liks[h][r].hapIndelCovered.end() && it->second) covered=true;
+ } else if (hit->second.isSNP()) {
+ it=liks[h][r].hapSNPCovered.find(pos);
+ if (it!=liks[h][r].hapSNPCovered.end() && it->second) covered=true;
+ }
+ if (covered) { // hapHasVar[] is to check whether haplotype truely has variant or only the reference
+ if (reads[r].onReverseStrand) nrt=true; else nft=true;
+ double mq=-10*log10(1.0-reads[r].mapQual);
+ msq+=mq*mq;
+ n++;
+ }
+ }
+ if (nft) nf++;
+ if (nrt) nr++;
+ } // foreach read r
+ // cout << endl;
+
+ if (n!=0) msq=sqrt(msq/double(n)); else msq=0.0;
+ totnf+=nf;
+ totnr+=nr;
+
+ allmsq=(readidx.size()!=0)?sqrt(allmsq/double(readidx.size())):0;
+
+
+ // recode variant indexes to glf/vcf indexes
+ int nidx=1;
+ map <int, int> toVCFidx;
+ vector<string> alleles;
+ alleles.push_back("R");
+ toVCFidx[0]=0;
+
+ ostringstream oAlleles;
+ ostringstream oCovForward;
+ ostringstream oCovReverse;
+ int first=1;
+ for (size_t h=0;h<nh;h++) {
+ int v = hapVar[h*numVarPos+posIdx];
+ if (v!=0) {
+ map<int,int>::iterator tit = toVCFidx.find(v);
+ if (tit==toVCFidx.end()) {
+ toVCFidx[v]=nidx++;
+ alleles.push_back(variants[v].second.getString());
+ string str=(first==1) ? string(""):string(",");
+
+ oAlleles << str << variants[v].second.getString();
+ oCovForward << str << varCoverage[variants[v]].nf;
+ oCovReverse << str << varCoverage[variants[v]].nr;
+ first = 0;
+ }
+ }
+ }
+
+
+ // compute genotype posterior
+
+ ostringstream o;
+
+ first=1;
+ for (map<IntGenotype, double>::iterator git=genLiks.begin();git!=genLiks.end();git++) {
+ int v1 = *(git->first.begin());
+ int v2 = *(git->first.rbegin());
+ int a1 = toVCFidx[ *(git->first.begin()) ];
+ int a2 = toVCFidx[ *(git->first.rbegin()) ];
+
+ string str=(first==1) ? string(""):string(",");
+ o << str << a1 << "/" << a2 << ":" << git->second;
+
+ double logPrior=0.0;
+ if ( (v1>0 && varType[v1]==VARSNP) || (v2>0 && varType[v2]==VARSNP)) { logPrior += log(params.priorSNP); } else if ( (v1>0 && varType[v1]==VARINDEL) || (v2>0 && varType[v2]==VARINDEL) ) { logPrior+=log(params.priorIndel); };
+
+ git->second -= logPrior;
+ first = 0;
+ }
+
+
+
+
+
+ if (params.outputGLF) {
+ OutputData::Line line(glfData);
+ line.set("msg", "ok");
+ line.set("index", index);
+ line.set("tid", params.tid);
+ line.set("analysis_type",program);
+ line.set("indidx",0);
+ line.set("lpos",leftPos);
+ line.set("rpos",rightPos);
+ line.set("center_position",candPos);
+ line.set("realigned_position",pos+leftPos);
+ line.set("was_candidate_in_window",has_variants_in_window);
+ line.set("logZ", maxll );
+ //line.set("post_prob_variant", exp(logp));
+ //line.set("est_freq", freq);
+ line.set("nBQT", nBQT);
+ line.set("nmmBQT", nmmBQT);
+ line.set("mLogBQ", mLogBQ/double(nBQT));
+ line.set("nMMLeft", nMMLeft);
+ line.set("nMMRight", nMMRight);
+ line.set("nref_all", oAlleles.str());
+ line.set("num_reads", readidx.size());
+ line.set("msq",allmsq);
+ line.set("numOffAll",numOffBoth);
+ line.set("num_indel",numMappedIndels);
+ line.set("num_cover_forward", nf);
+ line.set("num_cover_reverse", nr);
+
+ line.set("var_coverage_forward", oCovForward.str());
+ line.set("var_coverage_reverse", oCovReverse.str());
+
+
+ line.set("glf", o.str());
+ line.set("num_unmapped_realigned", numUnmappedRealigned);
+ //line.set("likrr", lik[0]);
+ //line.set("likrn", lik[1]);
+ //line.set("liknn", lik[2]);
+ glfData.output(line);
+ }
+ // glfOutput << PRID << " " << b << " " << params.tid << " " << candPos << " " << pav.first+leftPos << " " << pav.second.getString() << " " << reads.size() << " " << msq << " " << nf << " " << nr << " " << lik[0] << " " << lik[1] << " " << lik[2] << endl;
+
+
+ }
+
+}
+#endif
+
+/**
+ * Estimate haplotype frequencies from read-haplotype likelihoods using EM.
+ *
+ * @param haps     candidate haplotypes; only their number is used here
+ * @param reads    reads; only their number is used here
+ * @param liks     liks[h][r].ll is treated as the log-likelihood of read r given haplotype h
+ * @param hapFreqs output; replaced by a vector of haps.size() estimated frequencies
+ *
+ * @throws string  when the EM objective computed at the end of an iteration is
+ *                 smaller than the one from the previous iteration
+ *
+ * The estimated frequencies are also written to stdout on a line that starts
+ * with "EM HapFreqs:".
+ */
+void DetInDel::estimateHaplotypeFrequencies(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs)
+{
+    const size_t nh=haps.size();
+    const size_t nr=reads.size();
+
+    hapFreqs=vector<double>(nh,0.0);
+
+    cout << "EM HapFreqs:";
+
+    if (nr==0) {
+        // Without reads the frequency update nk[h]/nr would be 0/0 and every
+        // frequency would come out as NaN; report the uniform starting point instead.
+        for (size_t h=0;h<nh;h++) {
+            hapFreqs[h]=1.0/double(nh);
+            cout << " " << hapFreqs[h];
+        }
+        cout << endl;
+        return;
+    }
+
+    // All per-(haplotype, read) tables are stored haplotype-major: index h*nr+r.
+    vector<double> rl(nh*nr,0.0); // read given haplotype log-likelihoods
+    vector<double> z(nh*nr,0.0);  // expectations of read-haplotype indicator variables
+    vector<double> pi(nh);        // log of haplotype frequencies
+    vector<double> nk(nh,0.0);    // expected number of reads per haplotype
+
+    // Uniform starting frequencies, expressed in log space like every later use of pi.
+    for (size_t h=0;h<nh;h++) pi[h]=-log(double(nh));
+
+    for (size_t h=0;h<nh;h++) for (size_t r=0;r<nr;r++) rl[h*nr+r]=liks[h][r].ll;
+
+    const double tol=params.EMtol;
+    double eOld=-HUGE_VAL;
+    bool converged=false;
+    int iter=0;
+
+    while (!converged) {
+        // E-step: responsibilities of every haplotype for every read
+        for (size_t h=0;h<nh;h++) nk[h]=0.0;
+
+        for (size_t r=0;r<nr;r++) {
+            double lognorm=-HUGE_VAL;
+            for (size_t h=0;h<nh;h++) {
+                z[h*nr+r]=pi[h]+rl[h*nr+r];
+                lognorm=addLogs(lognorm, z[h*nr+r]);
+            }
+            // normalize over haplotypes and accumulate the expected counts
+            for (size_t h=0;h<nh;h++) {
+                z[h*nr+r]=exp(z[h*nr+r]-lognorm);
+                nk[h]+=z[h*nr+r];
+            }
+        }
+
+        // M-step: new log-frequencies from the expected counts
+        for (size_t h=0;h<nh;h++) pi[h]=log(nk[h]/double(nr));
+
+        // objective used for the convergence test, evaluated with the updated pi
+        double eNew=0.0;
+        for (size_t h=0;h<nh;h++) {
+            for (size_t r=0;r<nr;r++) {
+                eNew+=z[h*nr+r]*( pi[h]+rl[h*nr+r]);
+            }
+        }
+
+        if (eOld>eNew) throw string("EM Error in estimateHapFreqs");
+        // stop on a small change of the objective, or once the iteration counter exceeds 25
+        converged=(fabs(eOld-eNew))<tol || iter>25;
+
+        eOld=eNew;
+        iter++;
+    }
+
+    // convert back from log space, print and return the estimates
+    for (size_t h=0;h<nh;h++) {
+        hapFreqs[h]=exp(pi[h]);
+        cout << " " << hapFreqs[h];
+    }
+    cout << endl;
+}
+
+
+/**
+ * Score every unordered pair of haplotypes (hp <= hm, including hp == hm)
+ * against all reads and return the pairs sorted by decreasing score.
+ *
+ * For each pair the score is the sum over reads of
+ * addLogs(liks[hp][r].ll, liks[hm][r].ll) + log(0.5), plus
+ * getHaplotypePrior(...) when usePrior is set. Alongside the score, per-pair
+ * read statistics and per-variant strand coverage are collected.
+ *
+ * @param haps              candidate haplotypes
+ * @param reads             reads; mapQual and onReverseStrand are read
+ * @param liks              liks[h][r] is the alignment record of read r on haplotype h
+ * @param likPairs          output; cleared, filled with one entry per pair and sorted
+ *                          so that the highest-scoring pair is likPairs.front()
+ * @param usePrior          whether to add the haplotype-pair prior to the score
+ * @param candidateVariants forwarded to getHaplotypePrior
+ * @param leftPos           forwarded to getHaplotypePrior
+ */
+void DetInDel::computePairLikelihoods(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<HapPairLik> & likPairs, bool usePrior, const AlignedCandidates & candidateVariants, int leftPos)
+{
+    typedef map<int, bool>::const_iterator CoveredIt;
+
+    likPairs.clear();
+    const size_t lh=haps.size();
+    // number of unordered pairs with repetition: lh*(lh+1)/2
+    likPairs.reserve(lh*(lh+1)/2);
+
+    const double logHalf=log(.5);
+
+    for (size_t hp=0;hp<lh;hp++) for (size_t hm=hp;hm<lh;hm++) {
+        HapPairLik hpl;
+        hpl.numIndFirst=0;
+        hpl.numIndSecond=0;
+        hpl.numOffBoth=0;
+        hpl.numOffBothError=0.0;
+        hpl.numFirst=0;
+        hpl.numSecond=0;
+        hpl.h1=hp;
+        hpl.h2=hm;
+
+        double ll=0.0;
+        for (size_t r=0;r<reads.size();r++) {
+            const MLAlignment & ml1=liks[hp][r];
+            const MLAlignment & ml2=liks[hm][r];
+
+            ll+=(addLogs(ml1.ll,ml2.ll)+logHalf);
+
+            // record which indel was on which haplotype
+            if (!ml1.offHap && ml1.ll>ml2.ll && ml1.indels.size()!=0) hpl.numIndFirst++;
+            else if (!ml2.offHap && ml2.ll>ml1.ll && ml2.indels.size()!=0) hpl.numIndSecond++;
+            if (ml1.offHapHMQ && ml2.offHapHMQ) { hpl.numOffBoth++; hpl.numOffBothError+=reads[r].mapQual; }
+
+            // A read counts for a haplotype when that haplotype is at least as
+            // likely as the other one; on a tie it counts for both.
+            const bool forFirst=(ml1.ll>=ml2.ll);
+            const bool forSecond=(ml2.ll>=ml1.ll);
+
+            // determine read coverage of all the variants; the covered-maps are
+            // only walked for the haplotype(s) the read is assigned to
+            if (forFirst) {
+                hpl.numFirst++;
+                for (CoveredIt it=ml1.hapIndelCovered.begin();it!=ml1.hapIndelCovered.end();++it) if (it->second) {
+                    if (reads[r].onReverseStrand) hpl.hapIndelCoverage1[it->first].nr++; else hpl.hapIndelCoverage1[it->first].nf++;
+                }
+                for (CoveredIt it=ml1.hapSNPCovered.begin();it!=ml1.hapSNPCovered.end();++it) if (it->second) {
+                    if (reads[r].onReverseStrand) hpl.hapSNPCoverage1[it->first].nr++; else hpl.hapSNPCoverage1[it->first].nf++;
+                }
+            }
+            if (forSecond) {
+                hpl.numSecond++;
+                for (CoveredIt it=ml2.hapIndelCovered.begin();it!=ml2.hapIndelCovered.end();++it) if (it->second) {
+                    if (reads[r].onReverseStrand) hpl.hapIndelCoverage2[it->first].nr++; else hpl.hapIndelCoverage2[it->first].nf++;
+                }
+                for (CoveredIt it=ml2.hapSNPCovered.begin();it!=ml2.hapSNPCovered.end();++it) if (it->second) {
+                    if (reads[r].onReverseStrand) hpl.hapSNPCoverage2[it->first].nr++; else hpl.hapSNPCoverage2[it->first].nf++;
+                }
+            }
+        }
+        if (usePrior) ll+=getHaplotypePrior(haps[hp], haps[hm], leftPos, candidateVariants);
+
+        hpl.ll=ll;
+        likPairs.push_back(hpl);
+    }
+
+    // sort in decreasing order of score
+    class SortFunc {
+    public:
+        static bool sortFunc(const HapPairLik & hpl1, const HapPairLik & hpl2)
+        {
+            return hpl1.ll>hpl2.ll;
+        }
+    };
+    sort(likPairs.begin(), likPairs.end(),SortFunc::sortFunc);
+}
+
+/**
+ * Recompute the read statistics of one haplotype pair and report three of
+ * them on an output line.
+ *
+ * The pair is identified by hpl.h1 and hpl.h2. The six counters of hpl are
+ * reset and recounted over all reads; the per-variant strand coverage maps of
+ * hpl are incremented.
+ * NOTE(review): the coverage maps are not cleared here, so an hpl that
+ * arrives with coverage already filled in would be counted twice - confirm
+ * against the callers.
+ *
+ * @param haps  not used by this function
+ * @param reads reads; mapQual and onReverseStrand are read
+ * @param liks  liks[h][r] is the alignment record of read r on haplotype h
+ * @param hpl   in: h1, h2; out: counters and coverage maps
+ * @param line  receives "num_off_hap", "num_mapped_to_first" and "num_mapped_to_second"
+ */
+void DetInDel::statisticsHaplotypePair(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, HapPairLik & hpl, OutputData::Line & line)
+{
+    typedef map<int, bool>::const_iterator CoveredIt;
+
+    hpl.numIndFirst=0;
+    hpl.numIndSecond=0;
+    hpl.numOffBoth=0;
+    hpl.numOffBothError=0.0;
+    hpl.numFirst=0;
+    hpl.numSecond=0;
+
+    const int first=hpl.h1;
+    const int second=hpl.h2;
+
+    for (size_t r=0;r<reads.size();++r) {
+        const MLAlignment & onFirst=liks[first][r];
+        const MLAlignment & onSecond=liks[second][r];
+
+        // record which indel was on which haplotype
+        if (!onFirst.offHap && onFirst.ll>onSecond.ll && onFirst.indels.size()!=0) {
+            hpl.numIndFirst++;
+        } else if (!onSecond.offHap && onSecond.ll>onFirst.ll && onSecond.indels.size()!=0) {
+            hpl.numIndSecond++;
+        }
+
+        if (onFirst.offHapHMQ && onSecond.offHapHMQ) {
+            hpl.numOffBoth++;
+            hpl.numOffBothError+=reads[r].mapQual;
+        }
+
+        // a read is assigned to every haplotype that is at least as likely as the other
+        if (onFirst.ll>=onSecond.ll) {
+            hpl.numFirst++;
+            for (CoveredIt c=onFirst.hapIndelCovered.begin();c!=onFirst.hapIndelCovered.end();++c) {
+                if (!c->second) continue;
+                if (reads[r].onReverseStrand) hpl.hapIndelCoverage1[c->first].nr++; else hpl.hapIndelCoverage1[c->first].nf++;
+            }
+            for (CoveredIt c=onFirst.hapSNPCovered.begin();c!=onFirst.hapSNPCovered.end();++c) {
+                if (!c->second) continue;
+                if (reads[r].onReverseStrand) hpl.hapSNPCoverage1[c->first].nr++; else hpl.hapSNPCoverage1[c->first].nf++;
+            }
+        }
+        if (onSecond.ll>=onFirst.ll) {
+            hpl.numSecond++;
+            for (CoveredIt c=onSecond.hapIndelCovered.begin();c!=onSecond.hapIndelCovered.end();++c) {
+                if (!c->second) continue;
+                if (reads[r].onReverseStrand) hpl.hapIndelCoverage2[c->first].nr++; else hpl.hapIndelCoverage2[c->first].nf++;
+            }
+            for (CoveredIt c=onSecond.hapSNPCovered.begin();c!=onSecond.hapSNPCovered.end();++c) {
+                if (!c->second) continue;
+                if (reads[r].onReverseStrand) hpl.hapSNPCoverage2[c->first].nr++; else hpl.hapSNPCoverage2[c->first].nf++;
+            }
+        }
+    }
+
+    line.set("num_off_hap", hpl.numOffBoth);
+    line.set("num_mapped_to_first",hpl.numFirst);
+    line.set("num_mapped_to_second",hpl.numSecond);
+}
+
+
+
+/**
+ * Parse a region given as "start-end" (for example "1000-2000") into two integers.
+ *
+ * Every '-' is treated as a separator and every ',' is dropped before parsing,
+ * so "1,000-2,000" is read like "1000-2000".
+ *
+ * @param region text to parse
+ * @param start  receives the first number
+ * @param end    receives the second number
+ * @throws string when from_string rejects the token used for start or for end
+ *
+ * NOTE(review): when the text holds only one number the second extraction
+ * fails and the token appears to keep its previous value, so end would come
+ * out equal to start instead of raising - confirm whether that is intended.
+ */
+void parseRegionString(const string & region, int & start, int & end)
+{
+    // normalise the text: separators become blanks, thousands commas vanish
+    string cleaned;
+    for (string::const_iterator ch=region.begin(); ch!=region.end(); ++ch) {
+        if (*ch==',') continue;
+        cleaned+=(*ch=='-') ? ' ' : *ch;
+    }
+
+    istringstream tokens(cleaned);
+    string token;
+
+    tokens >> token;
+    if (!from_string(start,token,std::dec)) {
+        throw string("Cannot parse region for start!");
+    }
+
+    tokens >> token;
+    if (!from_string(end,token,std::dec)) {
+        throw string("Cannot parse region end!");
+    }
+}
+
+/**
+ * Copy the parsed command-line options into a DetInDel::Parameters object.
+ *
+ * Valued options are read with vm[name].as<T>(); switch-like options are
+ * turned into booleans by testing whether they are present in vm.
+ *
+ * @param vm     parsed program options
+ * @param params receives the settings; outputGLF is always set to true
+ */
+void getParameters(po::variables_map & vm, DetInDel::Parameters & params)
+{
+    // presence test for switch-like options
+    struct OptionFlag {
+        static bool given(const po::variables_map & options, const char *name)
+        {
+            return options.count(name)!=0;
+        }
+    };
+
+    // --- general limits and inference settings ---
+    params.maxHap = vm["maxHap"].as<uint32_t>();
+    params.maxReads = vm["maxRead"].as<uint32_t>();
+    params.width = vm["width"].as<uint32_t>();
+    params.mapQualThreshold = vm["mapQualThreshold"].as<double>();
+    params.skipMaxHap = vm["skipMaxHap"].as<uint32_t>();
+    params.inferenceMethod = vm["inferenceMethod"].as<string>();
+    params.minReadOverlap = vm["minReadOverlap"].as<uint32_t>();
+    params.maxReadLength = vm["maxReadLength"].as<uint32_t>();
+    params.maxHapReadProd = vm["maxHapReadProd"].as<uint32_t>();
+
+    // --- priors ---
+    params.priorSNP = vm["priorSNP"].as<double>();
+    params.priorIndel = vm["priorIndel"].as<double>();
+    params.bayesa0 = vm["bayesa0"].as<double>();
+    params.bayesType = vm["bayesType"].as<string>();
+
+    // --- reference sequence: aligning against it is enabled only when "ref" is given ---
+    if (OptionFlag::given(vm, "ref")) {
+        params.alignAgainstReference = true;
+        params.refFileName = vm["ref"].as<string>();
+    } else {
+        params.alignAgainstReference = false;
+    }
+
+    // --- observation model ---
+    params.obsParams.pError = vm["pError"].as<double>();
+    params.obsParams.pMut = vm["pMut"].as<double>();
+    params.obsParams.maxLengthIndel = vm["maxLengthIndel"].as<int>();
+    // the deletion limit is tied to the indel limit
+    params.obsParams.maxLengthDel = params.obsParams.maxLengthIndel;
+    params.obsParams.mapQualThreshold = vm["capMapQualThreshold"].as<double>();
+    params.obsParams.capMapQualFast = vm["capMapQualFast"].as<double>();
+    params.obsParams.padCover = vm["flankRefSeq"].as<int>();
+    params.obsParams.maxMismatch = vm["flankMaxMismatch"].as<int>();
+    params.checkAllCIGARs = vm["checkAllCIGARs"].as<int>();
+
+    // --- switches ---
+    params.varFileIsOneBased = OptionFlag::given(vm, "varFileIsOneBased");
+    params.outputRealignedBAM = OptionFlag::given(vm, "outputRealignedBAM");
+    params.analyzeLowFreq = OptionFlag::given(vm, "compareReadHap");
+    params.analyzeLowFreqDiffThreshold = vm["compareReadHapThreshold"].as<double>();
+    params.showHapDist = OptionFlag::given(vm, "showEmpirical");
+    params.showCandHap = OptionFlag::given(vm, "showCandHap");
+    params.showReads = OptionFlag::given(vm, "showReads");
+    params.quiet = OptionFlag::given(vm, "quiet");
+    params.computeML = OptionFlag::given(vm, "computeML");
+    params.computeMAP = OptionFlag::given(vm, "computeMAP");
+    params.doDiploid = OptionFlag::given(vm, "doDiploid");
+
+    params.filterHaplotypes = OptionFlag::given(vm, "filterHaplotypes");
+
+    params.printCallsOnly = OptionFlag::given(vm, "printCallsOnly");
+    params.estimateHapFreqs = OptionFlag::given(vm, "doPooled");
+    params.outputPooledLikelihoods = OptionFlag::given(vm, "opl");
+    params.showHapAlignments = OptionFlag::given(vm, "showHapAlignments");
+
+    // optional strings: left untouched when the option is absent
+    if (OptionFlag::given(vm, "filterReadAux")) {
+        params.filterReadAux = vm["filterReadAux"].as<string>();
+    }
+    if (OptionFlag::given(vm, "processRealignedBAM")) {
+        params.processRealignedBAM = vm["processRealignedBAM"].as<string>();
+    }
+
+    // "slower" is the default; passing "faster" turns it off
+    params.slower = !OptionFlag::given(vm, "faster");
+    params.changeINStoN = OptionFlag::given(vm, "changeINStoN");
+    params.outputGLF = true;
+
+    // Options that are no longer read here (previously present as disabled code):
+    // glfNumHap, mapScaleError, minCount, baseQualThreshold, fixedBaseQual,
+    // obsScaleError, numE, modelType, mapUnmapped, pFirstgLO, numOutputTopHap.
+}
+
+
+
+// Align a single read to a single haplotype with ObservationModelFBMaxErr
+// (default ObservationModelParameters, offset 0) and print the likelihood
+// returned by calcLikelihood(), the haplotype sequence and the alignment to stdout.
+static void printTestAlignment(const char *hapSeq, const char *readSeq)
+{
+    Haplotype hap;
+    Read read;
+
+    hap.seq=hapSeq;
+    read.seq.seq=readSeq;
+    read.mapQual=1-1e-16;
+
+    ObservationModelParameters obsParams;
+    read.setAllQual(0.99);
+
+    ObservationModelFBMaxErr omfbe(hap, read, 0, obsParams);
+    /*
+    alternative model that was tried here:
+    ObservationModelS oms(hap, read, 0, obsParams);
+    HapHash hash(4, hap);
+
+    oms.align(hash);
+    */
+
+    double ll= omfbe.calcLikelihood();
+
+    cout << "ll: " << ll << endl;
+    // the haplotype is printed behind 50 blanks; printAlignment is given the same 50
+    cout << string(50,' ') << hap.seq << endl;
+    omfbe.printAlignment(50);
+}
+
+/**
+ * Small manual test driver for the read-haplotype alignment model.
+ *
+ * Runs one hard-coded haplotype/read scenario and prints the result to
+ * stdout; a second scenario is kept but disabled.
+ *
+ * @param argc unused
+ * @param argv unused
+ * @return always 0
+ */
+int smain(int argc, char *argv[])
+{
+    (void)argc;
+    (void)argv;
+
+    // other inputs that were tried with this driver:
+    //   hap  "ATCGTGTAGCTCTCTGGCTGGCTAGCTGATTGGCTCTTGCC"
+    //   read "CTCTCTGGCTGGCTAGCGAT"
+    //   hap  "ATCGATTCGTGATATATATATTCAATGTAGTCGCTAG"
+    //   read "ATCGATTCGTGATATATATATAATTCAATGTAGTCGCTAG"
+    //   hap  "ATCGATTCGTGTTTTTTCAATGTAGTCGCTAG"
+    //   read "ATCGATTCGTGTTTTTCAATGTAGTCGCTAG"
+
+    // active scenario
+    printTestAlignment("ATCGATTCGTGATATATATATTCAATGTAGTCGCTAG",
+                       "ATCGATTCGTGATAATATTCAATGTAGTCGCTAG");
+
+    // disabled scenario; switch the condition to 1 to run it
+    if (0) {
+        printTestAlignment("AAAATCACCAACACTTCATAATCTATTTTTTCCCCTGAGGAACTTCCTAAAATGAATAAAAAAAAACCCCAGCCACATCTGCATTTGCAAACAGGAAACTCTGCAAGCCATACTAAGACCAAAGCTTAGTT",
+                           "CAAACAGGAAACTCTGCAAGCCATACTAAGACCAAAGCTTAGTTA");
+    }
+
+    return 0;
+}
+
+
+
+
+
+
+#ifdef DINDEL
+/*
+ * Command-line entry point (only compiled when DINDEL is defined).
+ *
+ * Declares all program options, parses them, and dispatches on --analysis:
+ *   getCIGARindels     extract candidate indels from read CIGARs, either for
+ *                      one region (--tid and --region) or for a whole BAM file
+ *   indels             test the candidates in --varFile
+ *   realignCandidates  realign the candidates in --varFile and write them to
+ *                      <outputFile>.variants.txt
+ *
+ * Returns 0 on success; every error path prints a message and calls exit(1).
+ */
+int main(int argc, char *argv[])
+{
+    po::options_description which("[Required] Program option");
+    which.add_options()
+    ("analysis", po::value<string>()->default_value("indels"),"Analysis type:\n"
+     "getCIGARindels: Extract indels from CIGARs of mapped reads, and infer libary insert size distributions\n"
+     "indels: infer indels\n"
+     "realignCandidates: Realign/reposition candidates in candidate file\n");
+
+    po::options_description required("[Required] ");
+    required.add_options()
+    ("ref", po::value<string>(),"fasta reference sequence (should be indexed with .fai file)")
+    ("outputFile", po::value<string>(),"file-prefix for output results");
+
+    po::options_description baminput("[Required] BAM input. Choose one of the following");
+    baminput.add_options()
+    ("bamFile",po::value<string>(), "read alignment file (should be indexed)")
+    ("bamFiles",po::value<string>(), "file containing filepaths for BAMs to be jointly analysed (not possible for --analysis==indels");
+
+
+    po::options_description regioninput("[Required for analysis == getCIGARindels]: \nRegion to be considered for extraction of candidate indels.");
+    regioninput.add_options()
+    ("region", po::value<string>(),"region to be analysed in format start-end, eg. 1000-2000")
+    ("tid", po::value<string>(),"target sequence (eg 'X') ");
+
+    po::options_description varfileinput("[Required for analysis == indels]");
+    varfileinput.add_options()
+    ("varFile", po::value<string>(), "file with candidate variants to be tested.")
+    ("varFileIsOneBased", "coordinates in varFile are one-based");
+
+    po::options_description output_options("Output options");
+    output_options.add_options()
+    ("outputRealignedBAM", "output BAM file with realigned reads")
+    ("processRealignedBAM", po::value<string>(),"ABSOLUTE path to script to process realigned BAM file")
+    //("outputGLF", "outputGLF for individuals in each bam file")
+    ("quiet", "quiet output");
+    //("printCallsOnly", "print only genotypes where call_lik_ref>0.0001 (only affects --single)");
+
+    po::options_description single_analysis("parameters for analysis==indels option");
+    single_analysis.add_options()
+    ("doDiploid", "analyze data assuming a diploid sequence")
+    ("doPooled", "estimate haplotype frequencies using Bayesian EM algorithm.\nMay be applied to single individual and pools.");
+
+    po::options_description analysis_opt("General algorithm parameters");
+    analysis_opt.add_options()
+    //("mapUnmapped", "remap unmapped reads for which mate is mapped")
+    ("faster","use faster but less accurate ungapped read-haplotype alignment model")
+    ("filterHaplotypes","prefilter haplotypes based on coverage")
+    ("flankRefSeq",po::value<int>()->default_value(2),"#bases of reference sequence of indel region")
+    ("flankMaxMismatch",po::value<int>()->default_value(2),"max number of mismatches in indel region")
+    ("priorSNP", po::value<double>()->default_value(1.0/1000.0), "prior probability of a SNP site")
+    ("priorIndel", po::value<double>()->default_value(1.0/10000.0), "prior probability of a detected indel not being a sequencing error")
+    ("width", po::value<uint32_t>()->default_value(60), "number of bases to left and right of indel")
+    ("maxHap", po::value<uint32_t>()->default_value(8), "maximum number of haplotypes in likelihood computation")
+    ("maxRead", po::value<uint32_t>()->default_value(10000), "maximum number of reads in likelihood computation")
+    ("mapQualThreshold", po::value<double>()->default_value(0.99), "lower limit for read mapping quality")
+    ("capMapQualThreshold", po::value<double>()->default_value(100.0), "upper limit for read mapping quality in observationmodel_old (phred units)")
+    ("capMapQualFast", po::value<double>()->default_value(45.0), "cap mapping quality in alignment using fast ungapped method\n (WARNING: setting it too high (>50) might result in significant overcalling!)")
+    ("skipMaxHap", po::value<uint32_t>()->default_value(200), "skip computation if number of haplotypes exceeds this number")
+    //("glfNumHap", po::value<uint32_t>()->default_value(5), "number of haplotypes per glf-class")
+    //("numOutputTopHap", po::value<int>()->default_value(5), "number of haplotype pairs output to haplotype file")
+    ("minReadOverlap", po::value<uint32_t>()->default_value(20),"minimum overlap between read and haplotype")
+    ("maxReadLength", po::value<uint32_t>()->default_value(500),"maximum length of reads")
+    ("minCount", po::value<uint32_t>()->default_value(1), "minimum number of WS observations of indel")
+    ("maxHapReadProd",po::value<uint32_t>()->default_value(10000000), "skip if product of number of reads and haplotypes exceeds this value")
+    ("changeINStoN", "change sequence of inserted sequence to 'N', so that no penalty is incurred if a read mismatches the inserted sequence");
+    po::options_description pooled_analysis("parameters for --pooled option");
+    pooled_analysis.add_options()
+    ("bayesa0", po::value<double>()->default_value(0.001), "Dirichlet a0 parameter haplotype frequency prior")
+    ("bayesType",po::value<string>()->default_value("singlevariant"), "Bayesian EM program type (all or singlevariant or priorpersite)");
+
+
+    po::options_description option_filter("General algorithm filtering options");
+    option_filter.add_options()
+    ("checkAllCIGARs",po::value<int>()->default_value(1),"include all indels at the position of the call site")
+    ("filterReadAux", po::value<string>(), "match string for exclusion of reads based on auxilary information");
+
+
+    po::options_description obsModel("Observation model parameters");
+    obsModel.add_options()
+    ("pError", po::value<double>()->default_value(5e-4), "probability of a read indel")
+    //("modelType", po::value<string>()->default_value("probabilistic"), "probabilistic/threshold")
+    ("pMut", po::value<double>()->default_value(1e-5), "probability of a mutation in the read")
+    ("maxLengthIndel", po::value<int>()->default_value(5), "maximum length of a _sequencing error_ indel in read [not for --faster option]");
+    //("pFirstgLO",po::value<double>()->default_value(0.01),"probability of transition from off the haplotype to on the haplotype");
+
+    po::options_description libParams("Library options");
+    libParams.add_options()
+    ("libFile", po::value<string>(), "file with library insert histograms (as generated by --analysis getCIGARindels)");
+
+
+    po::options_description miscAnalysis("Misc results analysis options");
+    miscAnalysis.add_options()
+    ("compareReadHap", "compare likelihood differences in reads against haplotypes")
+    ("compareReadHapThreshold", po::value<double>()->default_value(0.5), "difference threshold for viewing")
+    ("showEmpirical", "show empirical distribution over nucleotides")
+    ("showCandHap", "show candidate haplotypes for fast method")
+    ("showHapAlignments","show for each haplotype which reads map to it")
+    ("showReads","show reads")
+    ("inferenceMethod",po::value<string>()->default_value("empirical"), "inference method")
+    ("opl","output likelihoods for every read and haplotype");
+
+    // Collect every group under "required" so that a single description is
+    // used both for parsing and for the usage message.
+    required.add(which).add(baminput).add(regioninput).add(varfileinput).add(output_options).add(single_analysis).add(analysis_opt).add(pooled_analysis).add(option_filter).add(obsModel).add(libParams).add(miscAnalysis);
+
+    po::variables_map vm;
+
+    try {
+        po::store(po::parse_command_line(argc, argv, required), vm);
+        // notify() is kept inside the guarded region so that any option
+        // error it reports also ends in the usage message.
+        po::notify(vm);
+    } catch (const boost::program_options::error &) {
+        cout << "Error parsing input options. Usage: \n\n" << required <<"\n";
+        exit(1);
+    }
+
+    // analysis
+    if (!(vm.count("analysis"))) {
+        cerr << "Error: Specify which analysis (--analysis) is required." << endl;
+        exit(1);
+    }
+    const string analysis=vm["analysis"].as<string>();
+
+    // required
+    if (!(vm.count("ref") && vm.count("outputFile"))) {
+        cerr << "Error: One of the following options was not specified: --ref or --outputFile" << endl;
+        exit(1);
+    }
+
+    // "getCIGARindels" is a value of --analysis, not an option of its own, so
+    // the analysis string has to be compared here. Without this check a
+    // --region given without --tid fails later when vm["tid"] is read.
+    if (analysis=="getCIGARindels" && vm.count("region") && !vm.count("tid")) {
+        cerr << "--tid must be specified if analysis==getCIGARindels and --region option is used. " << endl;
+        exit(1);
+    }
+//#define DEBUGGING
+#ifndef DEBUGGING
+    try {
+#endif
+        // extract required parameters
+        string file;
+        int multipleFiles=0;
+
+        // baminput
+        if (analysis=="indels" || analysis=="getCIGARindels") {
+            if (!(vm.count("bamFile") || vm.count("bamFiles"))) {
+                cerr << "Error: Specify either --bamFile or --bamFiles." << endl;
+                exit(1);
+            }
+
+            // --bamFile takes precedence when both are given
+            if (vm.count("bamFile")) {
+                file=vm["bamFile"].as< string >();
+                cout << "Reading BAM file: " << file << endl;
+            } else if (vm.count("bamFiles")) {
+                file=vm["bamFiles"].as<string>();
+                multipleFiles=1;
+            }
+        }
+
+        string faFile=vm["ref"].as<string>();
+        string outputFile=vm["outputFile"].as< string >();
+
+        string modelType="probabilistic"; //vm["modelType"].as< string >();
+        DetInDel::Parameters params(string("1"), outputFile, modelType);
+        getParameters(vm, params);
+
+        if (analysis=="getCIGARindels") {
+            GetCandidatesFromCIGAR gcfc;
+            fasta::Fasta fa(faFile);
+            if (vm.count("region")) {
+                string tid=vm["tid"].as<string>();
+                string region=vm["region"].as<string>();
+                int start, end;
+                parseRegionString(region, start, end);
+                DetInDel detInDel(file, params, multipleFiles);
+                const vector<MyBam *> & bams = detInDel.getMyBams();
+
+                cout << "Getting indels from CIGARs in mapped reads from region " << tid << ":" << start << "-" << end << endl;
+                gcfc.getIndelFromCIGARRegion(bams,tid, start, end, outputFile, fa);
+
+            } else {
+                if (multipleFiles) {
+                    cerr << "Can extract the full set of indels from only BAM file at a time." << endl;
+                    exit(1);
+                }
+                gcfc.get(file, outputFile, faFile);
+            }
+        } else if (analysis=="indels") {
+            if (!vm.count("varFile")) {
+                cerr << "Please specify the file with the candidate variants." << endl;
+                exit(1);
+            }
+
+            string varFile = vm["varFile"].as<string>();
+
+            DetInDel detInDel(file, params, multipleFiles);
+
+            // Supplying a library file also switches on mapping of unmapped reads.
+            if (vm.count("libFile")) {
+                cout << "Detected library file..." << endl;
+                detInDel.params.mapUnmappedReads=true;
+                detInDel.params.obsParams.mapUnmappedReads=true;
+                detInDel.addLibrary(vm["libFile"].as<string>());
+            }
+            detInDel.params.print();
+
+
+            detInDel.detectIndels(varFile);
+        } else if (analysis == "realignCandidates") {
+            GetCandidatesFromCIGAR gcfc;
+            const string realignedFile = outputFile + ".variants.txt";
+
+            if (!vm.count("varFile")) {
+                cerr << "Please specify the file with the candidate variants." << endl;
+                exit(1);
+            }
+
+            string varFile = vm["varFile"].as<string>();
+
+            // refuse to overwrite the input candidate file
+            if (varFile == realignedFile) {
+                cerr << "outputFile is same as variant file used for input!" << endl;
+                exit(1);
+            }
+
+            gcfc.realignCandidateFile(varFile, params.varFileIsOneBased,realignedFile, faFile);
+        } else {
+            cerr << "Unrecognized --analysis option." << endl;
+            exit(1);
+        }
+#ifndef DEBUGGING
+    }
+    catch (const string & s) {
+        cout << "Exception: " << s << endl;
+        exit(1);
+    }
+    catch (const std::exception & e) {
+        // Exceptions that are not plain strings (e.g. from the option
+        // library) previously escaped main() without any message.
+        cout << "Exception: " << e.what() << endl;
+        exit(1);
+    }
+#endif
+    return 0;
+}
+#endif
+
diff --git a/DInDel.hpp b/DInDel.hpp
new file mode 100644
index 0000000..3c60a33
--- /dev/null
+++ b/DInDel.hpp
@@ -0,0 +1,397 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef DINDEL_HPP_
+#define DINDEL_HPP_
+#include <stdlib.h>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <boost/tuple/tuple.hpp>
+#include <ext/hash_map>
+
+#include "MyBam.hpp"
+#include "faidx.h"
+#include "Haplotype.hpp"
+#include "ObservationModel.hpp"
+#include "HaplotypeDistribution.hpp"
+#include "ObservationModelFB.hpp"
+//#include "Fast.hpp"
+#include "MLAlignment.hpp"
+#include "Read.hpp"
+#include "StringHash.hpp"
+
+#include "OutputData.hpp"
+#include "Library.hpp"
+#include "VariantFile.hpp"
+
+const int SHIFTSTRAND = 1000000; // used to keep track of forward and reverse matches in ::filterHaplotypes
+
+using namespace std;
+using namespace boost;
+using __gnu_cxx::hash;
+// Pair of per-read values for one haplotype.
+// NOTE(review): presumably the likelihoods of the read being off (pOff) and
+// on (pOn) the haplotype -- confirm against the code that fills them in.
+struct HapReadLik
+{
+    double pOff;
+    double pOn;
+};
+
+
+
+
+
+// Two counters for one variant: nf ("forward") and nr ("reverse").
+class VariantCoverage {
+public:
+    // Both counters start at zero.
+    VariantCoverage() : nf(0), nr(0) {}
+
+    // Starts from the given forward (_nf) and reverse (_nr) counts.
+    VariantCoverage(int _nf, int _nr) : nf(_nf), nr(_nr) {}
+
+    int nf, nr; // forward and reverse
+};
+
+class DetInDel
+{
+public:
+ //DetInDel(const string & bfName, const string & tid, const string &outputFileName, const string & modelType) : params(tid, outputFileName, modelType) { fai=NULL; };
+ static int fetchFuncFindInDel(const bam1_t *b, void *data);
+ void findInDels(uint32_t start, uint32_t end, bool report);
+ void detectIndels(const string & variantsFileName);
+ void callVariants(const string & variantsFile);
+ void findInDelsPositionsFile(const string & fileName);
+ string getRefSeq(uint32_t lpos, uint32_t rpos);
+ void empiricalDistributionMethod(int index, const vector<Read> & reads, uint32_t pos, uint32_t leftPos, uint32_t rightPos, const AlignedCandidates & candidateVariants, OutputData & oData, OutputData & glfData);
+ void fastMethod(const vector<Read> & reads, uint32_t pos, uint32_t leftPos, uint32_t rightPos, ostream & output);
+ bool getHaplotypes(vector<Haplotype> & haps, const vector<Read> & reads, uint32_t pos, uint32_t & leftPos, uint32_t & rightPos, const AlignedCandidates & candidateVariants);
+ const vector<MyBam *> & getMyBams() const { return myBams; }
+ // Result record for one pair of haplotypes (h1, h2). Converts implicitly to
+ // double, yielding `ll`. No constructor is defined, so a freshly created
+ // object has unset scalar members.
+ class HapPairLik {
+ public:
+     double ll;
+     int h1;
+     int h2;
+     int numIndFirst;
+     int numIndSecond;
+     // number of reads mapped to first and second
+     int numFirst;
+     int numSecond;
+     // number of reads that do not map to either haplotype
+     int numOffBoth;
+     double numOffBothError;
+     // indels and snps in the _haplotype_ covered by the read
+     map<int, VariantCoverage> hapIndelCoverage1;
+     map<int, VariantCoverage> hapSNPCoverage1;
+     map<int, VariantCoverage> hapIndelCoverage2;
+     map<int, VariantCoverage> hapSNPCoverage2;
+
+     // Lets a HapPairLik be used wherever a plain double is expected.
+     operator double() const
+     {
+         return ll;
+     }
+ };
+
+ // One estimation result for a single variant: the variant itself, its
+ // position, a probability, a frequency and two read counts.
+ class HapEstResult {
+ public:
+     // Declared only; the definition is not in this header.
+     HapEstResult();
+
+     // Stores every argument in the member of the same name.
+     HapEstResult(const AlignedVariant & _av, int _pos, double _prob, double _freq, int _nrf, int _nrr)
+         : av(_av), pos(_pos), prob(_prob), freq(_freq), nrf(_nrf), nrr(_nrr)
+     {
+     }
+     AlignedVariant av;
+     int pos;
+     double prob;
+     double freq;
+     // NOTE(review): the previous comments labelled nrf as "reverse" and nrr
+     // as "forward", which contradicts the member names and the nf/nr naming
+     // of VariantCoverage. Presumably nrf = forward and nrr = reverse --
+     // confirm against the code that constructs HapEstResult.
+     int nrf; // number of reads on forward strand
+     int nrr; // number of reads on reverse strand
+ };
+
+ // Stores a copy of `lib` in the `libraries` collection under `name`.
+ // The key is passed as a C string (name.c_str()), so whatever conversion
+ // LibraryCollection::operator[] applies to a const char* is what determines
+ // the stored key.
+ void addLibrary ( const string & name, const Library & lib)
+ {
+ libraries[name.c_str()]=lib;
+ }
+ // Forwards to LibraryCollection::addFromFile with the given file name.
+ void addLibrary ( const string & fileName)
+ {
+ libraries.addFromFile(fileName);
+ }
+
+
+protected:
+ void outputHapsAndFreqs(ostream *output, const string & prefix, const vector<Haplotype> & haps, const vector<double> & freqs, uint32_t leftPos);
+ //void getReads(uint32_t leftPos, uint32_t rightPos, vector<Read> & reads);
+ void getReads(uint32_t leftPos, uint32_t rightPos, vector<Read> & reads, uint32_t & oldLeftPos, uint32_t & oldRightFetchReadPos, vector<Read *> & readBuffer, bool reset);
+
+ double getMaxHap(Haplotype & h1, Haplotype &h2, HapPairLik & hpl, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs);
+ void outputMaxHap(ostream *output, const string & prefix, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs);
+ void outputTopHaps(ostream *output, const string & prefix, const vector<Haplotype> & haps, vector<HapPairLik> & likPairs, int n);
+ bool alignHaplotypes(vector<Haplotype> & haps, uint32_t pos, uint32_t & leftPos, uint32_t & rightPos, map<int, set<AlignedVariant> > & variants);
+ bool generateHaplotypes(vector<Haplotype> & haps, uint32_t pos, uint32_t & leftPos, uint32_t & rightPos, const map<int, set<Variant> > & variants);
+ double getHaplotypePrior(const Haplotype & h1, const Haplotype & h2, int leftPos, const AlignedCandidates & candidateVariants);
+ void computeLikelihoods(const vector<Haplotype> &haps, const vector<Read> & reads, vector<vector<MLAlignment> > & liks, uint32_t leftPos, uint32_t rightPos, vector<int> & onHap);
+
+ void computeHapPosition(const Haplotype & hap, const Read & read, vector<int> & alPos, int leftPos);
+ void computeLikelihoodsFaster(const vector<Haplotype> &haps, const vector<Read> & reads, vector<vector<MLAlignment> > & liks, uint32_t leftPos, uint32_t rightPos, vector<int> & onHap);
+
+ void computePairLikelihoods(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<HapPairLik> & likPairs, bool usePrior, const AlignedCandidates & candidateVariants, int leftPos);
+ void statisticsHaplotypePair(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, HapPairLik & hpl,OutputData::Line & line);
+
+ void estimateHaplotypeFrequencies(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs);
+ void estimateHaplotypeFrequenciesPosterior(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs, map <int, vector<tuple<AlignedVariant, double,double> > > & posteriors, uint32_t pos, uint32_t leftPos, ostream & glfOutput);
+ void estimateHaplotypeFrequenciesBayesEM(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs, vector <HapEstResult > & posteriors, uint32_t candPos, uint32_t leftPos, uint32_t rightPos, OutputData & glfData, int index, const AlignedCandidates & candidateVariants,string program);
+ void diploidGLF(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<double> & hapFreqs, vector <HapEstResult > & posteriors, uint32_t candPos, uint32_t leftPos, uint32_t rightPos, OutputData & glfData, int index, const AlignedCandidates & candidateVariants, string program);
+
+
+ void debug(const pair<Haplotype, Haplotype> & hp, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos);
+ void debug(const pair<Haplotype, Haplotype> & hp1, const pair<Haplotype, Haplotype> & hp2, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos);
+ void analyzeDifference(const pair<Haplotype, Haplotype> & hp1, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos);
+ void showAlignments(const pair<Haplotype, Haplotype> & hp1, const vector<Read> & reads, uint32_t leftPos, uint32_t rightPos);
+ void showAlignmentsPerHaplotype(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, uint32_t candPos, uint32_t leftPos);
+
+ double getPairPrior(const AlignedVariant & av1, const AlignedVariant & av2, int leftPos,const AlignedCandidates & candidateVariants);
+
+ void filterHaplotypes(const vector<Haplotype> & haps, const vector<Read> & reads, const vector<vector<MLAlignment> > & liks, vector<int> & filtered, map<pair<int, AlignedVariant>, VariantCoverage> & varCoverage, bool doFilter);
+
+ //MyBam myBam;
+
+ //MyBam myBam;
+ vector<MyBam *> myBams;
+ vector<string> myBamsFileNames;
+ LibraryCollection libraries;
+
+
+ // A vector of (int,int) pairs with one extra integer, refPos.
+ // NOTE(review): presumably each pair is one CIGAR operation and refPos the
+ // reference position where the alignment starts -- the encoding of the pair
+ // is not visible here; confirm against getCIGAR().
+ // No constructor is defined, so refPos is unset until assigned.
+ class CIGAR : public vector<pair<int,int> >
+ {
+ public:
+ typedef pair<int,int> CIGOp;
+ int refPos;
+ };
+ CIGAR getCIGAR(const Haplotype & hap, const Read & read, const MLAlignment & ml, int refSeqStart);
+ void writeRealignedBAMFile(const string & fileName, const vector<CIGAR> & cigars, const vector<Read> & reads, const vector<int> & onHap, const bam_header_t *bh);
+ void writeUnalignedBAMFile(const string & fileName, const vector<Read> & reads, const vector<int> & onHap, const bam_header_t *bh);
+ // An indel record: a kind (In or Del) plus two counters.
+ class InDel {
+ public:
+     typedef enum { In, Del} Type;
+
+     // Zeroes both counters; `type` is left unset.
+     InDel()
+     {
+         for (size_t slot = 0; slot < 2; ++slot) {
+             count[slot] = 0;
+         }
+     }
+
+     Type type;
+     size_t count[2];
+ };
+
+public:
+ class Parameters {
+ public:
+ // Records the target id and output file name, builds the observation-model
+ // parameters from modelType, then fills in every other setting through
+ // setDefaultValues().
+ Parameters(const string & _tid, string _fileName, const string & modelType) : fileName(_fileName), tid(_tid), obsParams(modelType)
+ {
+     setDefaultValues();
+ }
+ // Resets every tunable setting to its built-in default. tid, fileName,
+ // refFileName and obsParams are not touched.
+ // Note that these defaults are not the same as the command-line defaults
+ // declared in main() (e.g. width is 30 here).
+ void setDefaultValues()
+ {
+     bayesa0=0.001;
+     width=30;
+     maxHap=100;
+     skipMaxHap=1000;
+     maxReads=500;
+     mapQualThreshold=0.995;
+     glfNumHap=5;
+     inferenceMethod="empirical";
+     minReadOverlap=5;
+     minCount=2;
+     maxReadLength=40;
+     numOutputTopHap=5;
+     checkAllCIGARs=1;
+     bayesType="all";
+
+     fastWidth=4;
+     analyzeLowFreq=false;
+     analyzeLowFreqDiffThreshold=1.0;
+     showHapDist=true;
+     showHapAlignments=false;
+     showCandHap=false;
+     showReads=false;
+     fastWidthOverlap=4;
+     noIndelWindow=-1;
+     mapUnmappedReads=false;
+     priorIndel=1.0/10000;
+     priorSNP=1.0/1000.0;
+     filterReadAux=string("");
+     quiet=true;
+     computeML=false;
+     computeMAP=false;
+     doDiploid=false;
+     slower=true;
+     estimateHapFreqs=false;
+     printCallsOnly=true;
+     outputPooledLikelihoods=false;
+     filterHaplotypes = false;
+
+     outputGLF=true;
+     outputRealignedBAM=false;
+     processRealignedBAM="no";
+     changeINStoN = false;
+
+
+     EMtol=1e-4;
+
+     // The members below were previously left uninitialized, so reading
+     // them (print() reads maxHapReadProd) before an explicit assignment
+     // was undefined behavior.
+     // maxHapReadProd uses the same value as the --maxHapReadProd
+     // command-line default.
+     maxHapReadProd=10000000;
+     varFileIsOneBased=false;
+     alignAgainstReference=false;
+     // NOTE(review): neutral placeholders; the intended defaults are not
+     // visible in this header. Presumably the option-parsing code overrides
+     // the ones that are actually used -- confirm.
+     checkBaseQualThreshold=0.0;
+     scaleErr=1.0;
+     meanInsert=0.0;
+     stdInsert=0.0;
+ }
+ // Builds an OutputData object writing to `out` and passes it the field
+ // labels below, in this order, through chained operator() calls; the object
+ // is returned by value.
+ // NOTE(review): presumably each label declares one output column -- confirm
+ // in OutputData.hpp.
+ OutputData makeOutputData(ostream & out)
+ {
+ OutputData oData(out);
+ oData("msg")("index");
+ oData("analysis_type");
+ oData("tid")("lpos")("rpos")("center_position")("realigned_position");
+ oData("ref_all")("num_reads")("num_hqreads");
+ oData("post_prob_variant")("est_freq")("was_candidate_in_window");
+
+ oData("num_mapped_to_first")("num_mapped_to_second");
+ oData("num_off_hap")("loglik_hap_pair")("loglik_next_hap_pair");
+ oData("first_var_cover_forward")("first_var_cover_reverse")("second_var_cover_forward")("second_var_cover_reverse");
+ oData("first_called_all")("second_called_all")("loglik_called_genotype")("loglik_ref_ref")("alt_genotypes");
+ return oData;
+ }
+
+ // Same construction as makeOutputData(), but with the label set used for
+ // the GLF output (ends with the "glf" label). The object is returned by
+ // value.
+ // NOTE(review): presumably each label declares one output column -- confirm
+ // in OutputData.hpp.
+ OutputData makeGLFOutputData(ostream & out)
+ {
+ OutputData oData(out);
+ oData("msg")("index");
+ oData("analysis_type");
+ oData("tid")("lpos")("rpos")("center_position")("realigned_position")("was_candidate_in_window");
+ oData("ref_all")("nref_all")("num_reads");
+ oData("post_prob_variant")("qual")("est_freq")("logZ")("hapfreqs");
+
+ oData("indidx")("msq")("numOffAll")("num_indel")("num_cover_forward")("num_cover_reverse")("num_unmapped_realigned");
+ oData("var_coverage_forward")("var_coverage_reverse");
+ oData("nBQT")("nmmBQT")("mLogBQ")("nMMLeft")("nMMRight");
+ oData("glf");
+ return oData;
+ }
+
+ // Shorter variant of makeGLFOutputData(): builds an OutputData object on
+ // `out` with a reduced label set (candidate/realigned position instead of
+ // lpos/rpos/center_position) and returns it by value.
+ // NOTE(review): presumably each label declares one output column -- confirm
+ // in OutputData.hpp.
+ OutputData makeGLFv2OutputData(ostream & out)
+ {
+ OutputData oData(out);
+ oData("msg")("index");
+ oData("analysis_type");
+ oData("tid")("candidate_position")("realigned_position");
+ oData("ref_all")("nref_all")("num_reads");
+ oData("post_prob_variant")("est_freq");
+
+ oData("indidx")("msq")("num_cover_forward")("num_cover_reverse");
+ oData("glf");
+ return oData;
+ }
+
+
+ // Writes the current settings to stdout, one "\tname: value" line per
+ // setting, followed by the observation-model parameters via
+ // obsParams.print(). Booleans are streamed as-is; "faster" is printed as
+ // the negation of `slower`, and "doEM" shows estimateHapFreqs.
+ // Settings whose output line is commented out below are intentionally not
+ // printed.
+ void print()
+ {
+ cout << "DetInDel parameters: " << endl;
+ cout << "\ttid: " << tid << " width: " << width << " maxHap: " << maxHap << " maxReads: " << maxReads << " skipMaxHap: " << skipMaxHap << endl;
+ cout << "\toutputFilename: " << fileName << endl;
+ cout << "\tmapQualThreshold: " << mapQualThreshold << endl;
+ //cout << "\tscaleError: " << scaleErr << endl;
+ cout << "\tinferenceMethod: " << inferenceMethod << endl;
+ //cout << "\tglfNumHap: " << glfNumHap << endl;
+ cout << "\tanalyzeLowFreq: " << analyzeLowFreq << endl;
+ cout << "\tanalyzeLowFreqDiffThreshold: " << analyzeLowFreqDiffThreshold << endl;
+ cout << "\tshowHapDist: " << showHapDist << endl;
+ cout << "\tminReadOverlap: " << minReadOverlap << endl;
+ cout << "\tmaxReadLength: " << maxReadLength << endl;
+ //cout << "\tminCount: " << minCount << endl;
+ cout << "\tmaxHapReadProd: " << maxHapReadProd << endl;
+ //cout << "\tfastWidth: " << fastWidth << endl;
+ //cout << "\tfastWidthOverlap: " << fastWidthOverlap << endl;
+ cout << "\tshowCandHap: " << showCandHap << endl;
+ cout << "\tshowReads: " << showReads << endl;
+ cout << "\tfilterHaplotypes: " << filterHaplotypes << endl;
+ cout << "\tnoIndelWindow: " << noIndelWindow << endl;
+ cout << "\tmapUnmappedReads: " << mapUnmappedReads << endl;
+
+ cout << "\tnumOutputTopHap: " << numOutputTopHap << endl;
+
+ cout << "\tcheckAllCIGARs: " << checkAllCIGARs << endl;
+ cout << "\tchangeINStoN: " << changeINStoN << endl;
+
+
+
+
+ cout << endl;
+ cout << "\tquiet: " << quiet << endl;
+ cout << "\tprintCallsOnly: " << printCallsOnly << endl;
+ cout << "\tfaster: " << !slower << endl;
+ cout << "\tdoDiploid: " << doDiploid << endl;
+ cout << "\tdoEM: " << estimateHapFreqs << endl;
+
+ cout << "\toutputPooledLikelihoods: " << outputPooledLikelihoods << endl;
+ cout << "\toutputRealignedBAM: " << outputRealignedBAM << endl;
+ cout << "\tprocessRealignedBAM: " << processRealignedBAM << endl;
+ cout << "\tshowHapAlignments: " << showHapAlignments << endl;
+
+ cout << "\tEM tol: " << EMtol << endl;
+ cout << "\tbayesEM a0: " << bayesa0 << endl;
+ cout << "\tbayesType: " << bayesType << endl;
+
+
+ cout << "\tpriorIndel: " << priorIndel << endl;
+ cout << "\tpriorSNP: " << priorSNP << endl;
+
+ //cout << "\tmeanInsert: " << meanInsert << endl;
+ //cout << "\tstdInsert: " << stdInsert << endl;
+
+ cout << "\tfilterReadAux: " << filterReadAux << endl;
+
+ cout << "Observation model parameters: " << endl;
+ obsParams.print();
+ }
+ int noIndelWindow, numOutputTopHap, checkAllCIGARs, minReadOverlap, maxHapReadProd;
+ uint32_t width, maxHap, maxReads, skipMaxHap, glfNumHap, maxReadLength, minCount, fastWidth, fastWidthOverlap;
+ double checkBaseQualThreshold;
+ double mapQualThreshold, scaleErr, priorIndel, priorSNP, EMtol, bayesa0;
+ string fileName, inferenceMethod, refFileName, tid, filterReadAux, bayesType, processRealignedBAM;
+ bool analyzeLowFreq, showHapDist, showCandHap, showReads, showHapAlignments, alignAgainstReference, mapUnmappedReads, quiet, estimateHapFreqs, doDiploid, computeML, computeMAP, slower,printCallsOnly, outputPooledLikelihoods, filterHaplotypes;
+ bool outputRealignedBAM, outputGLF, varFileIsOneBased, changeINStoN;
+ double analyzeLowFreqDiffThreshold;
+ double meanInsert, stdInsert;
+ ObservationModelParameters obsParams;
+ };
+ Parameters params;
+
+ DetInDel(const string & bfName, const Parameters & _params, int multipleFiles);
+ ~DetInDel();
+
+
+ map<uint32_t, InDel> indels;
+ // Counter record kept while scanning reads; holds a single counter,
+ // numUnmappedMate, which starts at zero.
+ class ScanStats
+ {
+ public:
+     ScanStats() : numUnmappedMate(0) {}
+
+     int numUnmappedMate;
+ };
+protected:
+ faidx_t *fai;
+
+};
+
+
+
+// Plain data bundle: a [start, end] pair, a pointer to a DetInDel object and
+// three maps. No constructor is defined, so start, end and det are unset
+// until assigned.
+// NOTE(review): presumably the user-data object handed to
+// DetInDel::fetchFuncFindInDel through its void* argument -- confirm.
+class FFData
+{
+public:
+    uint32_t start;
+    uint32_t end;
+    DetInDel *det;
+    // keyed by string; presumably read names -- confirm against the caller
+    map<string, int> unmappedMate;
+    // presumably histograms of insertion / deletion lengths -- confirm
+    map<int, int > insHisto;
+    map<int, int > delHisto;
+};
+
+#endif /*DINDEL_HPP_*/
diff --git a/Fasta.hpp b/Fasta.hpp
new file mode 100644
index 0000000..d2bdf40
--- /dev/null
+++ b/Fasta.hpp
@@ -0,0 +1,72 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Fasta.hpp
+ *
+ * Created on: May 27, 2009
+ * Author: caa
+ */
+
+#ifndef FASTA_HPP_
+#define FASTA_HPP_
+
+#include <string>
+#include "bam.h"
+#include "faidx.h"
+
+
+namespace fasta {
+using namespace std;
+// Thin wrapper around a faidx index of a FASTA reference file.
+//
+// NOTE(review): the class owns `fai` (released in the destructor) but does not
+// define a copy constructor or assignment operator, so copying a Fasta object
+// would release the same index twice. Pass it by reference.
+class Fasta {
+public:
+    // Creates an object without an index; getSequence() throws on it.
+    Fasta() : fai(NULL)
+    {
+    }
+
+    // Loads the index for `fileName` with fai_load().
+    // Throws std::string if the index cannot be loaded.
+    Fasta(const string & fileName) : fai(fai_load(fileName.c_str()))
+    {
+        if (!fai) {
+            throw string("Fasta: cannot open reference file.");
+        }
+    }
+
+    // Fetches the region "<tid>:<start>-<end>" through fai_fetch() and returns
+    // it converted to upper case.
+    // Throws std::string if no index is loaded or if nothing could be fetched.
+    string getSequence(const string & tid, int start, int end)
+    {
+        if (!fai) throw string("Fasta: no reference index loaded.");
+
+        // ":" + two ints + "-" + terminator needs at most 25 characters
+        char range[64];
+        sprintf(range, ":%d-%d", start, end);
+        const string region = tid + range;
+
+        int len = 0;
+        char *ref = fai_fetch(fai, region.c_str(), &len);
+        // A failed fetch may be reported through a NULL pointer, a zero
+        // length or a negative length; none of these may reach the
+        // std::string constructor below.
+        if (ref == NULL || len <= 0) {
+            free(ref); // no-op for NULL; avoids leaking an empty buffer
+            throw string("faidx error, len==0");
+        }
+
+        string result(ref);
+        free(ref);
+        transform(result.begin(), result.end(), result.begin(), ::toupper);
+        return result;
+    }
+
+    // Releases the index, if one was loaded.
+    ~Fasta()
+    {
+        if (fai) fai_destroy(fai);
+    }
+
+protected:
+    faidx_t *fai;
+};
+}
+#endif /* FASTA_HPP_ */
diff --git a/Faster.cpp b/Faster.cpp
new file mode 100644
index 0000000..7ed09d5
--- /dev/null
+++ b/Faster.cpp
@@ -0,0 +1,785 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Faster.cpp
+ *
+ * Created on: Feb 25, 2009
+ * Author: caa
+ */
+
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <stdint.h>
+#include <vector>
+#include <list>
+#include <set>
+#include <string>
+#include <map>
+#include <cmath>
+#include <sstream>
+#include <algorithm>
+#include "bam.h"
+#include "Haplotype.hpp"
+#include "Faster.hpp"
+#include "Utils.hpp"
+#include "foreach.hpp"
+using namespace std;
+const int DEBUGS=0; // set to a non-zero value to print the hash lookups and HMM traces guarded by "if (DEBUGS)" to stdout
+
+/*
+ * Sets up the fast observation model for one read against one haplotype.
+ *
+ * _hap      haplotype to align against (only its address is stored)
+ * r         read to align (only its address is stored)
+ * hapStart  start coordinate of the haplotype, used by computeBMid()
+ * _params   model parameters (copied)
+ *
+ * Throws std::string("hapSize error.") when params.maxLengthIndel exceeds the
+ * haplotype size. The anchor base and the per-base likelihoods are computed
+ * before the constructor returns.
+ */
+ObservationModelS::ObservationModelS(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & _params)
+    : params(_params),
+      hap_ptr(&_hap),
+      read_ptr(&r),
+      hapStart(hapStart),
+      likelihoodComputed(false),
+      bMidError(true)
+{
+    if (params.maxLengthIndel>(int) hap_ptr->size()) throw string("hapSize error.");
+
+    hlen=(int) hap_ptr->seq.size();
+    rlen=(int) read_ptr->size();
+
+    // Both helpers rely on the members initialised above.
+    computeBMid();
+    setupReadLikelihoods();
+}
+
+/*
+ * Chooses bMid, the index of the read base used as anchor by SStateHMM().
+ *
+ * When the read (positioned at read.posStat.first) overlaps the haplotype
+ * window, bMid is the read index at the middle of the overlap and bMidError
+ * is cleared. Otherwise bMid is the first or last read base and bMidError
+ * stays true. The result is finally clamped to [0, read.size()-1].
+ */
+void ObservationModelS::computeBMid()
+{
+    const Haplotype & haplotype = *hap_ptr;
+    const Read & rd = *read_ptr;
+
+    const uint32_t hapLast=hapStart+haplotype.size();
+    const uint32_t readFirst=uint32_t(rd.posStat.first);
+    const uint32_t readLast=readFirst+uint32_t(rd.size())-1;
+
+    bMidError=true;
+    if (readFirst>hapLast) {
+        // read lies completely to the right of the haplotype
+        bMid=0;
+    } else if (readLast<hapStart) {
+        // read lies completely to the left of the haplotype
+        bMid=int(rd.size())-1;
+    } else {
+        const uint32_t overlapFirst=(hapStart>readFirst)?hapStart:readFirst;
+        const uint32_t overlapLast=(hapLast>readLast)?readLast:hapLast;
+        const int centre=(int(overlapLast)-int(overlapFirst))/2+int(overlapFirst);
+        bMid=centre-int(readFirst);
+        bMidError=false;
+    }
+
+    // keep the anchor inside the read
+    if (bMid<0) bMid=0;
+    if (bMid>=int(rd.size())) bMid=int(rd.size())-1;
+
+    if (DEBUGS) cout << "bMid: " << bMid << endl;
+}
+
+
+/*
+ * Precomputes the per-base log-likelihoods of the read and the
+ * off-haplotype likelihoods.
+ *
+ * Fills logMatch, logMismatch and cumLogMatch (running sum of logMatch),
+ * sets llMatch to the log-likelihood of a read in which every base matches,
+ * derives pOffFirst from the read's mapping quality (limited by
+ * params.capMapQualFast) and computes llOff / llOffHMQ.
+ *
+ * Throws std::string("Model not implemented.") for any params.modelType
+ * other than "probabilistic".
+ */
+void ObservationModelS::setupReadLikelihoods()
+{
+    const Read & rd = *read_ptr;
+
+    logMatch.resize(rd.size());
+    logMismatch.resize(rd.size());
+    cumLogMatch.resize(rd.size());
+
+    // initialize with prior
+    llMatch=0.0;
+    if (!(params.modelType=="probabilistic")) throw string("Model not implemented.");
+
+    for (size_t idx=0;idx<rd.size();idx++) {
+        const double baseQual=rd.qual[idx];
+        const double pCorrect=baseQual*(1.0-params.pMut);
+        const double logEqual=log(.25+.75*pCorrect);
+        const double logUnequal=log(.75+1e-10-.75*pCorrect);
+        logMatch[idx]=logEqual;
+        logMismatch[idx]=logUnequal;
+        llMatch+=logEqual;
+        cumLogMatch[idx]=llMatch;
+    }
+
+    // probability that the read is mismapped, bounded below via capMapQualFast
+    double pMismapped=1.0-rd.mapQual;
+    if (-10.0*log10(pMismapped)>params.capMapQualFast) {
+        pMismapped=pow(10.0,-params.capMapQualFast/10.0);
+    }
+    pOffFirst=pMismapped;
+    pOffFirstHMQ=1e-10;
+
+    const double logNoIndelError=double(rlen)*log(1.0-params.pError);
+    llOff=log(pOffFirst)+llMatch+logNoIndelError;
+    llOffHMQ=log(pOffFirstHMQ)+llMatch+logNoIndelError;
+}
+
+
+
+/*
+ * Seeds the sparse HMM with candidate read offsets found by k-mer lookup.
+ *
+ * Every k-mer of the read is looked up in the haplotype hash. Each hit at
+ * haplotype position hp for read index x votes for the relative position
+ * hp-x of the first read base. The (at most) 15 relative positions with the
+ * most votes are handed to SStateHMM().
+ *
+ * A read shorter than the k-mer length yields no seeds; SStateHMM() is then
+ * run with an empty candidate list. (Previously read.size()-kmer wrapped
+ * around as size_t and the scan ran past the end of the read.)
+ */
+void ObservationModelS::AlignHash(const HapHash & hash)
+{
+    const Read & read = *read_ptr;
+    hash_map<int,int> hposFreq; // votes per relative position of the read wrt the haplotype
+
+    const unsigned int kmer = hash.getKmer();
+
+    if (read.size()>=kmer) {
+        size_t x=0;
+        const size_t xl=read.size()-kmer; // index of the last k-mer start
+
+        unsigned int key=hash.convert(read.seq.seq,x);
+        for (;x<=xl;x++) {
+            const set<int> & hpSet = hash.lookup(key);
+            if (DEBUGS) cout << "hash: " << x << " :";
+            BOOST_FOREACH(int hp, hpSet) {
+                int rpfb=hp-x; // relative position of first base wrt haplotype
+                if (DEBUGS) cout << " " << rpfb;
+                // todo weight according to bMid?
+                hposFreq[rpfb]++;
+            }
+            if (DEBUGS) cout << endl;
+            // roll the key forward by one base, except after the last k-mer
+            if (x!=xl) key = hash.pushBack(key, read.seq.seq[x+kmer]);
+        }
+    }
+
+    // group relative positions by vote count (ascending key order)
+    map<int, set<int> > freqToPos;
+    for (hash_map<int,int>::const_iterator it=hposFreq.begin();it!=hposFreq.end();++it) {
+        if (DEBUGS) cout << "il : " << it->first << " " << it->second << endl;
+        freqToPos[it->second].insert(it->first);
+    }
+
+    // keep the most frequent relative positions, highest vote count first
+    const int maxRelPos=15;
+    vector<int> relPos; relPos.reserve(maxRelPos);
+
+    for (map<int,set<int> >::reverse_iterator rit=freqToPos.rbegin();
+         rit!=freqToPos.rend() && int(relPos.size())<maxRelPos; ++rit) {
+        BOOST_FOREACH(int rp, rit->second) {
+            if (int(relPos.size())>=maxRelPos) break;
+            relPos.push_back(rp);
+            if (DEBUGS) cout << "rp: " << rp << " freq: " << rit->first << endl;
+        }
+    }
+
+    if (DEBUGS) cout << "done"<<endl;
+    // run HMM with sparse set of positions
+    SStateHMM(relPos);
+}
+
+/*
+ * Aligns the read to the haplotype and returns the resulting alignment.
+ *
+ * Runs the hash-seeded HMM (AlignHash), marks the likelihood as computed so
+ * that printAlignment() may be called, extracts SNPs/indels from the state
+ * path (reportVariants) and returns a copy of the member ml.
+ */
+MLAlignment ObservationModelS::align(const HapHash & hash)
+{
+    AlignHash(hash);          // fills mapState and the likelihood fields of ml
+    likelihoodComputed=true;
+
+    reportVariants();         // fills the variant/coverage fields of ml
+
+    return ml;
+}
+/*
+inline void ObservationModelS::doTransition(int cr, int nr, const vector<int> & state, vector<double> & alpha, vector<double> & bt, const vector<double> & tr, const int & S)
+{
+ int r=cr;
+ if (state[cr]==-1) {
+ // current readbase not fixed
+ if (state[nr]==-1) {
+ // next base is not fixed
+ for (int cs=0;cs<S;cs++) {
+ for (int ns=0;ns<S;ns++) {
+ double nv=obs[r*S+cs]+alpha[r*S+cs]+tr[cs*S+ns];
+ if (nv>alpha[cr*S+ns]+EPS) { alpha[cr*S+ns]=nv; bt[cr*S+ns]=ns; }
+ }
+ }
+
+ } else {
+ // next base is fixed
+ for (int cs=0;cs<S;cs++) {
+ ns=state[nr];
+ double nv=obs[r*S+cs]+alpha[r*S+cs]+tr[cs*S+ns];
+ if (nv>alpha[cr*S+ns]+EPS) { alpha[cr*S+ns]=nv; bt[cr*S+ns]=ns; }
+ }
+ }
+ } else {
+ // current readbase is fixed
+ if (state[nr]==-1) {
+ // next base is not fixed
+ int cs=state[r];
+ for (int ns=0;ns<S;ns++) {
+ double nv=obs[r*S+cs]+alpha[r*S+cs]+tr[cs*S+ns];
+ if (nv>alpha[cr*S+ns]+EPS) { alpha[cr*S+ns]=nv; bt[cr*S+ns]=ns; }
+ }
+ } else {
+ // next base is fixed
+ int cs=state[r];
+ int ns=state[nr];
+ double nv=obs[r*S+cs]+alpha[r*S+cs]+tr[cs*S+ns];
+ if (nv>alpha[cr*S+ns]+EPS) { alpha[cr*S+ns]=nv; bt[cr*S+ns]=ns; }
+ }
+ }
+}
+
+inline void ObservationModelS::doTransitionNF(int cr, int nr, const vector<int> & state, vector<double> & alpha, vector<double> & bt, const vector<double> & tr, const int & S)
+{
+ int r=cr;
+ // next base is not fixed
+ for (int cs=0;cs<S;cs++) {
+ for (int ns=0;ns<S;ns++) {
+ double nv=obs[r*S+cs]+alpha[r*S+cs]+tr[cs*S+ns];
+ if (nv>alpha[cr*S+ns]+EPS) { alpha[cr*S+ns]=nv; bt[cr*S+ns]=ns; }
+ }
+ }
+}
+*/
+
+/*
+ * Viterbi alignment of the read over a sparse set of candidate offsets.
+ *
+ * relPos  candidate positions of the first read base relative to the
+ *         haplotype. The vector is modified: the offset -readLen (whole read
+ *         left of the haplotype) is appended and the vector is sorted.
+ *
+ * Each read base has T=2*S states: S "aligned at relPos[s]" states followed
+ * by S insertion states. Messages are passed from both read ends towards
+ * the anchor base bMid, where they are combined with the prior.
+ *
+ * Results: ml.ll, ml.offHap, ml.offHapHMQ and mapState. mapState uses the
+ * encoding expected by reportVariants()/printAlignment():
+ *   0            read base left of the haplotype (LO)
+ *   1..hlen      read base aligned to haplotype base mapState-1
+ *   hlen+1       read base right of the haplotype (RO)
+ *   hlen+2+p     inserted base; p is the state of the last aligned base
+ *
+ * Throws std::string if a read base ends up without a state.
+ */
+void ObservationModelS::SStateHMM(vector<int> & relPos)
+{
+    // note that this HMM does not keep track of the last base before the insertion, so after the insertion it may transition not to the next haplotype base
+    // also, the length of the insertion must be present as the difference between one of the positions in relPos vector.
+
+    if (DEBUGS) cout << "hlen: " << hlen << " rlen: " << rlen << endl;
+    const double EPS=1e-7;
+    int readLen=read_ptr->size();
+
+    // extra candidate that places every read base left of the haplotype
+    relPos.push_back(-readLen);
+    std::sort(relPos.begin(), relPos.end());
+
+    mapState=vector<int>(readLen,0);
+
+    int S=relPos.size();
+    int T=2*S; // total number of states per slice
+
+    // note that obs will encode observation potentials only for the non-inserted states
+    vector<double> tr(S*S, -1000.0), trI(S*S, -1000.0), alpha(readLen*T,-1000.0), obs(readLen*S,0);
+
+    // NOTE alpha is defined as the message that readbase r sends to its neighbour, where neighbour depends on the readbase and bmid
+
+    vector<int> bt(readLen*T,0); // backtracking matrix for Viterbi
+
+    // state of every read base, -1 = undetermined
+    vector<int> state(readLen,-1);
+
+    // initialize obs (log-emission-probabilities) for every read-base
+    for (int r=0;r<readLen;r++) {
+        for (int s=0;s<S;s++) {
+            int p1=relPos[s];
+            if (p1+r>=0 && p1+r<hlen) {
+                obs[r*S+s]=(read_ptr->seq.seq[r]==hap_ptr->seq[p1+r])?logMatch[r]:logMismatch[r];
+            } else {
+                // this corresponds to LO/RO in ObservationModelFB
+                obs[r*S+s]=logMatch[r];
+            }
+        }
+
+        if (DEBUGS) { cout << "obs: "; for (int s=0;s<S;s++) cout << " " << -int(round(obs[r*S+s])); cout << endl; }
+    }
+
+    // todo : add code to fix state to OffHaplotype if to the left or right of a fixed base?
+
+    // prior distribution over the states of bMid, for the read's own mapping
+    // quality (prior) and for a fixed high mapping quality (priorHMQ)
+    vector<double> prior(T, -1000.0), priorHMQ(T, -1000.0);
+
+    for (int ins=0;ins<2;ins++) {
+        double pins=(ins==0)?log(1.0-params.pError):log(params.pError);
+        for (int y=0;y<S;y++) {
+            int x=y+ins*S;
+            int hp=relPos[y]+bMid;
+            if (hp>=0 && hp<hlen) {
+                prior[x]=log(1.0-pOffFirst)+pins;
+                priorHMQ[x]=log(1.0-pOffFirstHMQ)+pins;
+            } else {
+                prior[x]=log(pOffFirst)+pins;
+                priorHMQ[x]=log(pOffFirstHMQ)+pins;
+            }
+            if (DEBUGS) cout << "prior[" << x << "]: " << prior[x] << " " << priorHMQ[x] << endl;
+        }
+    }
+
+    double logpInsgNoIns = log(params.pError);
+    double logpInsgIns = -0.25;
+    double logpNoInsgIns = log(1-exp(logpInsgIns));
+
+    // transitions between relPos
+    for (int s1=0;s1<S;s1++) for (int s2=0;s2<S;s2++) {
+        double ll=-1000.0;
+        // relpos to relpos
+        // for non-inserted states only deletions are allowed.
+        // you can only transition to a lower relPos from an insertion-state (ie x>=S)
+        if (s1!=s2) {
+            double d=fabs(double(relPos[s1]-relPos[s2]));
+            ll=(d-1.0)*logpInsgIns+log(params.pError);
+            trI[s1*S+s2]=(d-1.0)*logpInsgIns;
+        } else if (s1==s2) {
+            ll=log(1.0-params.pError);
+        }
+
+        // Pr[s1 | s2 ]
+        tr[s1*S+s2]=ll;
+    }
+
+    if (DEBUGS) for (int s1=0;s1<S;s1++) {
+        cout << "tr["<< s1 << "]: "; for (int s2=0;s2<S;s2++) cout << " " << tr[s1*S+s2]; cout << endl;
+    }
+
+    // from left to bMid
+    for (int r=0;r<bMid;r++) {
+        int cr=r;
+
+        for (int cs=0;cs<S;cs++) {
+            double pv=obs[r*S+cs]; if (r) pv+=alpha[(r-1)*T+cs];
+
+            // transition to non-inserted from non-inserted
+            for (int ns=cs;ns<S;ns++) {
+                double nv=pv+tr[cs*S+ns];
+                if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=cs; }
+            }
+
+            // r <--- r+1
+            // to non-ins from ins
+            int ns=cs+S;
+            double nv=pv+logpNoInsgIns;
+            if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=cs; }
+
+            // insertion states
+
+            // r <--- r+1
+            // ins <--- ins
+            int ics=cs+S;
+            ns=ics;
+            nv=logMatch[r]+logpInsgIns; if (r) nv += alpha[(r-1)*T+ics];
+            if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=ics; }
+
+            // ins <--- noins
+            ics=cs+S; // must transition to a lower relPos in case of insertion and going from left to right
+            for (int ns=0;ns<cs;ns++) if (relPos[cs]-r>=relPos[ns]) {
+                nv=logMatch[r]+trI[cs*S+ns]+logpInsgNoIns; if (r) nv += alpha[(r-1)*T+ics];
+                if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=ics; }
+            }
+        }
+
+        // alpha has T entries per read base (the stride used to be S here)
+        if (DEBUGS) { cout << "alpha_fw: "; for (int x=0;x<T;x++) cout << " " << alpha[r*T+x]; cout << endl; }
+    }
+
+    if (DEBUGS) cout << endl;
+
+    // from right to bMid
+    for (int r=readLen-1;r>bMid;r--) {
+        int cr=r;
+
+        for (int cs=0;cs<S;cs++) {
+            double pv=obs[r*S+cs]; if (r<readLen-1) pv+=alpha[(r+1)*T+cs];
+
+            // transition to non-inserted from non-inserted
+            for (int ns=0;ns<=cs;ns++) {
+                double nv=pv+tr[cs*S+ns];
+                if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=cs; }
+            }
+
+            // r <--- r-1
+            // to ins from no-ins
+            double nv=logMatch[r]+logpInsgNoIns; if (r<readLen-1) nv += alpha[(r+1)*T+cs+S];
+
+            if (nv>alpha[cr*T+cs]+EPS) { alpha[cr*T+cs]=nv; bt[cr*T+cs]=cs+S; }
+
+            int ns;
+
+            // insertion states
+
+            // r <--- r-1
+            // ins <--- ins
+            int ics=cs+S;
+            ns=ics;
+            nv=logMatch[r]+logpInsgIns; if (r<readLen-1) nv+= alpha[(r+1)*T+ics];
+            if (nv>alpha[cr*T+ns]+EPS) { alpha[cr*T+ns]=nv; bt[cr*T+ns]=ics; }
+
+            // r <--- r-1
+            // noins <--- ins
+            ics=cs+S; // must transition to a lower relPos in case of insertion and going from left to right
+            for (int ns=cs+1;ns<S;ns++) if (relPos[cs]>relPos[ns]-r) {
+                nv=obs[r*S+cs]+logpNoInsgIns+trI[cs*S+ns]; if (r<readLen-1) nv += alpha[(r+1)*T+cs];
+                if (nv>alpha[cr*T+ns+S]+EPS) { alpha[cr*T+ns+S]=nv; bt[cr*T+ns+S]=cs; }
+            }
+        }
+
+        if (DEBUGS) { cout << "alpha_bw: "; for (int x=0;x<T;x++) cout << " " << alpha[r*T+x]; cout << endl; }
+    }
+
+    // combine both messages with the prior at bMid
+    double max=-HUGE_VAL;
+    int xmax=0;
+
+    for (int ins=0;ins<2;ins++)
+        for (int y=0;y<S;y++) {
+            int x=ins*S+y;
+            double obsv=(ins==0)?obs[bMid*S+y]:logMatch[bMid];
+            alpha[bMid*T+x]=obsv+prior[x];
+            if (bMid<readLen-1) alpha[bMid*T+x]+=alpha[(bMid+1)*T+x];
+            if (bMid>0) alpha[bMid*T+x]+=alpha[(bMid-1)*T+x];
+
+            if (alpha[bMid*T+x]>max) {
+                max=alpha[bMid*T+x];
+                xmax=x;
+            }
+        }
+
+    if (DEBUGS) { cout << "alpha_bmid: "; for (int x=0;x<T;x++) cout << " " << alpha[bMid*T+x]; cout << endl; }
+
+    // Is bMid on the haplotype in the best state? The test used to be
+    // "hp>=0 || hp<hlen", which is always true, so offHap was never set.
+    int hp=relPos[xmax%S]+bMid;
+    ml.offHap=!(hp>=0 && hp<hlen);
+
+    ml.ll=max;
+
+    // same maximisation with the high-mapping-quality prior; alpha[bMid] now
+    // already contains the first prior, so the neighbours are re-read directly
+    max=-HUGE_VAL;
+    xmax=0;
+
+    if (DEBUGS) cout << "alpha_bmid_HMQ: ";
+    for (int ins=0;ins<2;ins++)
+        for (int y=0;y<S;y++) {
+            int x=ins*S+y;
+            double obsv=(ins==0)?obs[bMid*S+y]:logMatch[bMid];
+            double v=obsv+priorHMQ[x];
+            if (bMid<readLen-1) v+=alpha[(bMid+1)*T+x];
+            if (bMid>0) v+=alpha[(bMid-1)*T+x];
+
+            if (v>max) {
+                max=v;
+                xmax=x;
+            }
+            if (DEBUGS) cout << " " << v;
+        }
+    if (DEBUGS) cout << endl;
+
+    hp=relPos[xmax%S]+bMid;
+    ml.offHapHMQ=!(hp>=0 && hp<hlen);
+
+    // the state path is traced back from the best state under the HMQ prior
+    state[bMid]=xmax;
+
+    // backtrack to get the map state
+    for (int b=bMid; b>0;b--) {
+        state[b-1]=bt[(b-1)*T+state[b]];
+    }
+
+    for (int b=bMid;b<readLen-1;b++) {
+        state[b+1]=bt[(b+1)*T+state[b]];
+    }
+
+    if (DEBUGS){ cout << "state: "; for (int r=0;r<readLen;r++) cout << "[" << r << " " << read_ptr->seq.seq[r] << " " << state[r] << "]"; cout << endl;}
+
+    // convert relative positions to absolute positions, using LO, RO, x convention
+    int lhp=1;
+    for (int r=0; r<readLen; r++) {
+        if (state[r]==-1) throw string("error in mapstate fast");
+        if (state[r]<S) {
+            int hp=relPos[state[r]]+r;
+            if (hp>=0 && hp<hlen) {
+                mapState[r]=hp+1;
+                lhp=hp+1;
+            } else if (hp<0) {
+                mapState[r]=0;       // LO
+            } else {
+                // RO. hlen itself is the state of the last haplotype base
+                // (hp+1 with hp==hlen-1), so the overhang state is hlen+1.
+                mapState[r]=hlen+1;
+            }
+            if (DEBUGS) cout << "ms: " << r << " " << state[r] << " " << hp << endl;
+        } else {
+            // insertion
+            mapState[r]=hlen+2+lhp;
+        }
+        if (DEBUGS) cout << "ms: " << r << " " << state[r] << " mapstate " << mapState[r] << endl;
+    }
+}
+
+
+/*
+ * Translates the state path in mapState into the fields of ml.
+ *
+ * With numS=hlen+2 states per layer, a read base with state s is
+ *   - off the haplotype when s%numS is 0 (LO) or larger than hlen (RO),
+ *   - an inserted base when it is on the haplotype and s>=numS,
+ *   - aligned to haplotype base s-1 otherwise.
+ * Insertions, deletions (a jump of more than one state to the next aligned
+ * base) and mismatches are stored in ml.indels / ml.snps, ml.hpos gets the
+ * haplotype position of every read base, ml.align marks the haplotype
+ * ('R' untouched, 'D' deleted, read base for a mismatch) and
+ * ml.firstBase/ml.lastBase give the covered haplotype range. Finally the
+ * haplotype's own indels and SNPs are flagged as covered or not.
+ */
+void ObservationModelS::reportVariants()
+{
+    const int nHap=hlen;
+    const int nRead=rlen;
+    const int statesPerLayer=nHap+2;
+
+    const Read & rd = *read_ptr;
+    const Haplotype & haplotype = *hap_ptr;
+
+    // reset everything this function reports
+    ml.align=string(nHap, 'R');
+    ml.indels.clear();
+    ml.snps.clear();
+    ml.firstBase=-1;
+    ml.lastBase=-1;
+    ml.hapIndelCovered.clear();
+    ml.hapSNPCovered.clear();
+    ml.hpos.clear();
+    ml.hpos.resize(nRead);
+
+    for (int b=0;b<nRead;) {
+        const int s=mapState[b];
+        const int hapState=s%statesPerLayer;
+
+        // off the haplotype: nothing to report
+        if (hapState<=0 || hapState>nHap) {
+            if (hapState==0) ml.hpos[b]=MLAlignment::LO; else ml.hpos[b]=MLAlignment::RO;
+            b++;
+            continue;
+        }
+
+        if (s>=statesPerLayer) {
+            // insertion; MAINTAIN CONVENTION OF INSERTION BEFORE BASE X
+            const int insHapPos=hapState;
+            const int insReadFirst=b;
+            // swallow the whole run of inserted read bases
+            while (b<nRead && mapState[b]>=statesPerLayer) {
+                ml.hpos[b]=MLAlignment::INS;
+                b++;
+            }
+            const int insLen=b-insReadFirst;
+            const int insReadLast=b-1;
+            const string inserted=rd.seq.seq.substr(insReadFirst,insLen);
+            ml.indels[insHapPos]=AlignedVariant(string("+").append(inserted), insHapPos, insHapPos, insReadFirst, insReadLast);
+            continue; // b already points at the first base after the insertion
+        }
+
+        // read base aligned to haplotype base hapIdx
+        const int hapIdx=s-1;
+        ml.hpos[b]=hapIdx;
+        if (ml.firstBase==-1 || hapIdx<ml.firstBase) ml.firstBase=hapIdx;
+        if (ml.lastBase==-1 || hapIdx>ml.lastBase) ml.lastBase=hapIdx;
+
+        // check for SNP
+        if (rd.seq[b]!=haplotype.seq[hapIdx]) {
+            string snp;
+            snp+=haplotype.seq[hapIdx];
+            snp.append("=>");
+            snp+=rd.seq[b];
+            ml.snps[hapIdx]=AlignedVariant(snp, hapIdx, hapIdx, b, b);
+            ml.align[hapIdx]=rd.seq[b];
+        }
+
+        // check for deletion: the next read base is aligned (not inserted)
+        // and skips at least one haplotype base
+        if (b<nRead-1) {
+            const int nextState=mapState[b+1];
+            if (nextState<statesPerLayer && nextState-s>1) {
+                const int delFirst=s;
+                const int delLen=nextState-s-1;
+                const int delLast=delFirst+delLen-1;
+                for (int y=delFirst;y<delFirst+delLen;y++) ml.align[y]='D';
+                const string deleted=haplotype.seq.substr(delFirst,delLen);
+                ml.indels[delFirst]=AlignedVariant(string("-").append(deleted), delFirst, delLast, b, b+1);
+            }
+        }
+
+        b++;
+    }
+
+    // which of the haplotype's own variants does the aligned read cover?
+    for (map<int,AlignedVariant>::const_iterator vi=haplotype.indels.begin();vi!=haplotype.indels.end();vi++) {
+        const AlignedVariant & var=vi->second;
+        ml.hapIndelCovered[vi->first]=var.isCovered(params.padCover, ml.firstBase, ml.lastBase) ? true : false;
+    }
+    for (map<int,AlignedVariant>::const_iterator vi=haplotype.snps.begin();vi!=haplotype.snps.end();vi++) {
+        const AlignedVariant & var=vi->second;
+        ml.hapSNPCovered[vi->first]=var.isCovered(params.padCover, ml.firstBase, ml.lastBase) ? true : false;
+    }
+}
+
+/*
+ * Prints the alignment of the read against the haplotype to stdout.
+ *
+ * hapScrPos  screen column at which the haplotype starts; the statistics
+ *            prefix and the left overhang are fitted into that width.
+ *
+ * Output: one line with statistics, the read laid out along the haplotype
+ * ('_' for deleted haplotype bases), the right overhang, the inserted
+ * sequences in brackets and the raw read; one line with the positions of
+ * the haplotype's indels; one line with the haplotype position of every
+ * read base.
+ *
+ * Throws std::string when called before align().
+ */
+void ObservationModelS::printAlignment(size_t hapScrPos)
+{
+    if (!likelihoodComputed) throw string("Must align() first!");
+
+    const int nHap=hlen;
+    const int nRead=rlen;
+    const int statesPerLayer=nHap+2;
+
+    const Read & rd = *read_ptr;
+    const Haplotype & haplotype = *hap_ptr;
+
+    string leftOverhang, rightOverhang;
+    string onHap(haplotype.size(),' ');
+    string insText;
+    bool insideInsertion=false;
+
+    for (int b=0;b<nRead;b++) {
+        const int s=mapState[b];
+        const int hapState=s%statesPerLayer;
+        const char nuc=rd.seq.seq[b];
+
+        if (hapState==0) {
+            leftOverhang+=nuc;
+            continue;
+        }
+        if (hapState<0 || hapState>nHap) {
+            rightOverhang+=nuc;
+            continue;
+        }
+
+        if (s>=statesPerLayer) {
+            // inserted base: open a "[pos " group at the start of a run
+            if (!insideInsertion) {
+                insideInsertion=true;
+                insText+='[';
+                stringstream label; label << hapState;
+                insText.append(label.str());
+                insText+=' ';
+            }
+            insText+=nuc;
+            continue;
+        }
+
+        // aligned base: close a pending insertion group first
+        if (insideInsertion) insText+=']';
+        insideInsertion=false;
+        onHap[s-1]=nuc;
+
+        // mark haplotype bases skipped before the next aligned base
+        if (b<nRead-1) {
+            const int nextState=mapState[b+1];
+            if (nextState<statesPerLayer && nextState-s>1) {
+                const int gap=nextState-s-1;
+                onHap.replace(s, gap, string(gap,'_'));
+            }
+        }
+    }
+    if (insideInsertion) insText+=']';
+
+    // statistics prefix
+    stringstream os;
+    os << nRead << " " << ml.offHap << " " << ml.indels.size() << " " << ml.firstBase << " " << ml.lastBase << " " << ml.ll << " ";
+    for (map<int,AlignedVariant>::const_iterator vi=haplotype.indels.begin();vi!=haplotype.indels.end();vi++) {
+        os << (ml.hapIndelCovered[vi->first] ? "1 " : "0 ");
+    }
+    const string prefix=os.str();
+
+    // room left for the left overhang between the prefix and the haplotype
+    int leftSpace=int(hapScrPos)-int(prefix.size());
+    if (leftSpace<0) leftSpace=0;
+
+    string leftField=string(leftSpace,' ');
+    if (int(leftOverhang.size())>leftSpace) {
+        // too long: keep only the part next to the haplotype
+        leftField=leftOverhang.substr(leftOverhang.size()-leftSpace, leftSpace);
+    } else if (leftOverhang.size()>0) {
+        // right-align inside the field
+        leftField.replace(leftSpace-leftOverhang.size(), leftOverhang.size(), leftOverhang);
+    }
+
+    cout << prefix<<leftField<<onHap<<rightOverhang << " " << insText << " read: " << rd.seq.seq << endl;
+
+    for (map<int,AlignedVariant>::const_iterator vi=haplotype.indels.begin();vi!=haplotype.indels.end();vi++) {
+        cout << " " << vi->first;
+    }
+    cout << endl;
+
+    cout << endl;
+
+    for (int x=0;x<nRead;x++) {
+        cout << "[" << x << ":" << ml.hpos[x] << "]";
+    }
+    cout << endl;
+}
+
+// Nothing to release: the haplotype and the read are only referenced, not owned.
+ObservationModelS::~ObservationModelS()
+{
+}
+
+
diff --git a/Faster.hpp b/Faster.hpp
new file mode 100644
index 0000000..c6913b9
--- /dev/null
+++ b/Faster.hpp
@@ -0,0 +1,101 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Faster.hpp
+ *
+ * Created on: Feb 24, 2009
+ * Author: caa
+ */
+
+#ifndef FASTER_HPP_
+#define FASTER_HPP_
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <stdint.h>
+#include <vector>
+#include <list>
+#include <set>
+#include <string>
+#include <map>
+#include <cmath>
+#include "bam.h"
+#include "Haplotype.hpp"
+#include "Read.hpp"
+#include "MLAlignment.hpp"
+#include "ObservationModel.hpp"
+using namespace std;
+
+/*
+ * ObservationModelS: fast alignment of one read to one haplotype.
+ *
+ * The read's k-mers are looked up in a hash of the haplotype to obtain a
+ * small set of candidate offsets, and a Viterbi pass over those offsets
+ * (plus insertion states) yields the alignment returned by align().
+ * The haplotype and the read passed to the constructor are referenced, not
+ * copied, and must outlive the object.
+ */
+class ObservationModelS
+{
+protected:
+    // Score of an alignment without gaps at a given relative position.
+    class UngappedAlignment
+    {
+    public:
+        UngappedAlignment()
+            : ll(-HUGE_VAL), relPos(-10000), numMismatch(10000)
+        {
+        }
+        UngappedAlignment(double _ll, int _relPos, int _numMismatch)
+            : ll(_ll), relPos(_relPos), numMismatch(_numMismatch)
+        {
+        }
+        double ll;
+        int relPos;
+        int numMismatch;
+    };
+public:
+    // Creates an unusable placeholder object. All scalar members get defined
+    // values so that e.g. printAlignment() reliably throws ("Must align()
+    // first!") instead of reading an uninitialised flag.
+    ObservationModelS()
+        : llMatch(0.0), bMid(0), hlen(0), rlen(0),
+          llOff(0.0), llOffHMQ(0.0), pOffFirst(0.0), pOffFirstHMQ(0.0),
+          hap_ptr(NULL), read_ptr(NULL), hapStart(0),
+          likelihoodComputed(false), bMidError(true)
+    {
+    }
+    ObservationModelS(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & _params);
+    virtual ~ObservationModelS();
+
+    // Aligns the read and returns the alignment including detected variants.
+    MLAlignment align(const HapHash & hash);
+
+    //MLAlignment calcLikelihood();
+    //double getLogLikelihood() { calcLikelihood(); return ml.ll; };
+    // void changeHaplotype(const Haplotype & newHap);
+
+    // Prints the alignment to stdout; throws std::string before align().
+    void printAlignment(size_t hapScrPos);
+    // NOTE(review): declared but not defined in Faster.cpp.
+    void printStatistics();
+    ObservationModelParameters params;
+
+protected:
+    void computeBMid();
+    void setupReadLikelihoods();
+    // NOTE(review): declared but not defined in Faster.cpp.
+    void Align();
+    void reportVariants();
+    // NOTE(review): the only definitions of doTransition/doTransitionNF in
+    // Faster.cpp are inside a commented-out block.
+    inline void doTransition(int cs, int nr, const vector<int> & state, vector<double> & alpha, vector<double> & bt, const vector<double> & tr, const int & S);
+    inline void doTransitionNF(int cs, int nr, const vector<int> & state, vector<double> & alpha, vector<double> & bt, const vector<double> & tr, const int & S);
+    void SStateHMM(vector<int> & relPos);
+    void AlignHash(const HapHash & hash);
+
+    MLAlignment ml;                                    // result of the last alignment
+    vector<double> logMatch, cumLogMatch, logMismatch; // per read base log-likelihoods
+    vector<int> mapState;                              // state of every read base after SStateHMM()
+    double llMatch; //log likelihood when all bases in the read match
+    int bMid, hlen, rlen;                              // anchor read base, haplotype length, read length
+    double llOff, llOffHMQ, pOffFirst, pOffFirstHMQ;
+
+    const Haplotype *hap_ptr;                          // not owned
+    const Read *read_ptr;                              // not owned
+    size_t hapStart;
+    bool likelihoodComputed, bMidError;
+};
+
+
+#endif /* FASTER_HPP_ */
diff --git a/GetCandidates.cpp b/GetCandidates.cpp
new file mode 100644
index 0000000..14a0da6
--- /dev/null
+++ b/GetCandidates.cpp
@@ -0,0 +1,498 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * GetCandidates.cpp
+ *
+ * Created on: Aug 27, 2009
+ * Author: caa
+ */
+
+#include <fstream>
+#include <string>
+#include "MyBam.hpp"
+#include "foreach.hpp"
+#include "ObservationModelSeqAn.hpp"
+#include "GetCandidates.hpp"
+#include "Variant.hpp"
+#include "VariantFile.hpp"
+#include "bam.h"
+#include "sam.h"
+#include "Fasta.hpp"
+#include "StringHash.hpp"
+#include <set>
+#include "foreach.hpp"
+using namespace std;
+
+/*
+ * bam_fetch() callback: counts the indels found in the CIGAR of one read.
+ *
+ * b     alignment record handed over by bam_fetch()
+ * data  pointer to the CFFData whose hmap collects, per reference position,
+ *       how often each indel was seen
+ *
+ * Always returns 0.
+ */
+int GetCandidatesFromCIGAR::getIndelFromCIGARFetchFunc(const bam1_t *b, void *data)
+{
+    CFFData & dat = *( (CFFData *) data);
+
+    vector<CIGARindel> found;
+    getIndelFromCIGAR(b, found);
+
+    for (vector<CIGARindel>::const_iterator ci=found.begin(); ci!=found.end(); ++ci) {
+        // operator[] creates missing entries with count 0
+        ++dat.hmap[ci->refpos][*ci];
+    }
+    return 0;
+}
+
+/*
+ * Collects the CIGAR indels of all reads in tid:start-end from every BAM
+ * file, realigns them and writes them to outputFileName.
+ *
+ * Throws std::string when the output file cannot be opened.
+ */
+void GetCandidatesFromCIGAR::getIndelFromCIGARRegion(const vector<MyBam *> & myBams, const string & tid, int start, int end, const string & outputFileName, fasta::Fasta & fa)
+{
+    CFFData collected;
+
+    // every file adds its counts to the same table
+    for (vector<MyBam *>::const_iterator bi=myBams.begin(); bi!=myBams.end(); ++bi) {
+        MyBam * const bam = *bi;
+        bam_fetch(bam->bf, bam->idx, bam->getTID(tid), start, end, &collected, &GetCandidatesFromCIGAR::getIndelFromCIGARFetchFunc);
+    }
+
+    ofstream ofile(outputFileName.c_str());
+    if (!ofile.is_open()) {
+        throw string("Cannot open variants file ").append(outputFileName).append(" for writing.");
+    }
+
+    outputIndels(tid, collected.hmap, ofile, fa, 1);
+    ofile.close();
+}
+
+void GetCandidatesFromCIGAR::getIndelFromCIGAR(const bam1_t *b, vector<CIGARindel> & indels)
+{
+ const bam1_core_t *c=&b->core;
+ uint32_t *cigar=bam1_cigar(b);
+ uint32_t k, l=0;
+ uint32_t refPos = c->pos;
+ int lastop=-1;
+ uint32_t lastPos=refPos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ // cout << "cigar #" << k << endl;
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int32_t len=cigar[k] >> BAM_CIGAR_SHIFT;
+ string seq;
+
+ if (op==BAM_CINS || op==BAM_CMATCH || op==BAM_CSOFT_CLIP) {
+ for(int32_t x=0;x<len;x++) {
+ if (op==BAM_CINS) {
+ seq+=( bam_nt16_rev_table[ bam1_seqi(bam1_seq(b), l) ] );
+ }
+ l++;
+ }
+ } else if (op==BAM_CDEL) {
+ seq.insert(0, len, 'D');
+ }
+
+ if (op==BAM_CINS || op==BAM_CDEL) {
+ int ilen=len; if (op==BAM_CDEL) ilen=-ilen;
+ indels.push_back(CIGARindel(refPos, ilen, seq));
+ }
+
+ // update position for the next cigar
+ lastPos=refPos;
+ if (op == BAM_CMATCH || op == BAM_CDEL || op==BAM_CREF_SKIP) {
+ refPos+=(uint32_t) len;
+ } else if (op!=BAM_CINS && op != BAM_CSOFT_CLIP && op != BAM_CHARD_CLIP) throw string("I don't know how to smoke this CIGAR");
+ lastop=op;
+ }
+}
+
+/*
+ * Realigns one indel against the reference.
+ *
+ * A reference window around id.refpos (params.alignWindow bases on each
+ * side, or three times the indel length if that is larger) is fetched, the
+ * indel is applied to a copy of it, and the modified sequence is aligned
+ * back to the unmodified window with ObservationModelSeqAn. Every insertion
+ * or deletion in that alignment is returned with its position converted
+ * back to reference coordinates.
+ *
+ * Returns an empty vector when the window cannot be fetched, when the indel
+ * does not fit into the fetched sequence, or when the aligner runs out of
+ * memory (each case is reported on cerr/cout).
+ */
+vector<AlignedVariant> GetCandidatesFromCIGAR::alignCIGAR(const string & tid, const CIGARindel & id, fasta::Fasta & fa)
+{
+    vector<AlignedVariant> variants;
+
+    ObservationModelParameters alignParams("probabilistic");
+    seqan::Score<int> score(-1, -460, -100,-960);
+
+    Read rh1;
+    rh1.pos=0;
+    rh1.posStat.first=0;
+    rh1.mapQual=1.0-1e-32;
+
+    int width=params.alignWindow;
+    if (abs(id.len)>width/3) width=abs(id.len)*3;
+
+    // Window around the indel. Near the start of the sequence the window is
+    // cut at 0 so that "pos" below is the offset into the fetched sequence.
+    int start=id.refpos-width;
+    if (start<0) start=0;
+    int end=id.refpos+width;
+
+    string hap;
+    try {
+        hap=fa.getSequence(tid, start+1, end+1);
+    } catch (const string & s) {
+        cerr << "error: "<< s << endl;
+        cerr << "start: " << start << " end: " << end << endl;
+        return variants;
+    }
+
+    // The reference haplotype is the fetched window before the indel is
+    // applied (it used to be fetched a second time with the same coordinates).
+    const int startRef=start;
+    Haplotype hRef; hRef.append(hap);
+
+    // create haplotype with indel
+    int pos=id.refpos-start;
+
+    int testlen = (id.len>0)?0:-id.len;
+    if (hap.size()<size_t(pos+testlen)) {
+        cerr << "Cannot align variant " << id.refpos << " " << id.len << " " << id.seq << endl;
+        return variants;
+    }
+
+    if (id.len<0) {
+        hap.erase(pos,-id.len);
+    } else if (id.len>0) {
+        hap.insert(pos, id.seq);
+    }
+
+    // align indel
+    rh1.seq.seq=hap; // sequence with indel
+    rh1.setAllQual(1.0-1e-16);
+
+    try {
+        ObservationModelSeqAn om(hRef, rh1, 0, alignParams, score);
+        om.align();
+        const MLAlignment & ml=om.getMLAlignment();
+        for(map<int, AlignedVariant>::const_iterator it=ml.indels.begin();it!=ml.indels.end();it++) if (it->second.getType()==AlignedVariant::INS || it->second.getType()==AlignedVariant::DEL) {
+            const AlignedVariant & aid = it->second;
+            int refPos=startRef+it->first;
+            variants.push_back(AlignedVariant(aid.getString(),refPos,refPos,-1,-1));
+        }
+    } catch (const bad_alloc & ) {
+        cout << "SeqAN Alloc error: hRef.size(): " << hRef.size() << " rh1.size(): " << rh1.size() << endl;
+    }
+
+    return variants;
+}
+
+/*
+ * Realigns all collected indels and writes one line per realigned position.
+ *
+ * tid         name of the reference sequence, first field of every line
+ * hmap        reference position -> (indel -> number of observations)
+ * ofile       output stream
+ * fa          reference used for the realignment
+ * outputType  1: write the variant string; 2: write "length sequence"
+ *             (negative length for deletions)
+ *
+ * Line format: "tid position variant... # count...", positions in
+ * ascending order. Indels are always realigned. When several input indels
+ * realign to the same variant their counts are added (they used to
+ * overwrite each other, so only the last count was kept).
+ *
+ * Throws std::string("Huh?") for any other outputType.
+ */
+void GetCandidatesFromCIGAR::outputIndels(const string & tid, const hash_map<int,map<CIGARindel, int> > & hmap, ofstream & ofile, fasta::Fasta & fa, int outputType=1)
+{
+    hash_map<int,map<AlignedVariant, int> > realigned;
+
+    // realign indel
+    for (hash_map<int,map<CIGARindel, int> >::const_iterator it=hmap.begin();it!=hmap.end();it++) {
+        for (map<CIGARindel, int>::const_iterator i2=it->second.begin();i2!=it->second.end();i2++) {
+            const CIGARindel & id=i2->first;
+            const vector<AlignedVariant> indels=alignCIGAR(tid, id, fa);
+            BOOST_FOREACH(const AlignedVariant & aid, indels) if (aid.getType()==AlignedVariant::INS || aid.getType()==AlignedVariant::DEL) {
+                realigned[aid.getStartHap()][aid]+=i2->second;
+            }
+        }
+    }
+
+    // the hash map has no defined order, so sort the positions for output
+    std::set<int> positions;
+    for (hash_map<int,map<AlignedVariant, int> >::const_iterator it=realigned.begin();it!=realigned.end();it++) {
+        positions.insert(it->first);
+    }
+
+    for (std::set<int>::const_iterator posit = positions.begin(); posit != positions.end(); posit++) {
+        const map<AlignedVariant, int> & _variants = realigned[*posit];
+        ostringstream ovar, ocnt;
+        ovar << tid;
+        ovar << " " << *posit;
+        for (map<AlignedVariant, int>::const_iterator i2=_variants.begin();i2!=_variants.end();i2++) {
+            const AlignedVariant & aid = i2->first;
+            int len=aid.size();
+            if (aid.getType()==AlignedVariant::DEL) len=-len;
+            if (outputType==1) {
+                ovar << " " << aid.getString();
+                ocnt << " " << i2->second;
+            } else if (outputType==2) {
+                ovar << " " << len << " " << aid.getSeq();
+                ocnt << " " << i2->second;
+            } else throw string("Huh?");
+        }
+        ofile << ovar.str() << " #" << ocnt.str() << endl;
+    }
+}
+
+/*
+ * Realigns the indels listed in a variants file and writes the result.
+ *
+ * _varFile        input variants file
+ * isOneBased      passed on to VariantFile::getLine()
+ * outputFileName  file receiving the realigned indels
+ * fastaName       reference sequence file
+ *
+ * Indels are collected per target; whenever the target name changes, and
+ * once more at the end of the input, the collected indels are realigned and
+ * written with outputIndels().
+ *
+ * Throws std::string when the output file cannot be opened.
+ */
+void GetCandidatesFromCIGAR::realignCandidateFile(const string & _varFile, bool isOneBased, const string & outputFileName, const string & fastaName)
+{
+    hash_map<int,map<CIGARindel, int> > pending; // indels of the current target
+
+    fasta::Fasta fa(fastaName);
+
+    VariantFile vf(_varFile);
+
+    ofstream ofile(outputFileName.c_str());
+    if (!ofile.is_open()) {
+        throw string("Cannot open ").append(outputFileName).append(" for writing CIGAR indels.");
+    }
+
+    cout << "Realigning indels from variants file: " << _varFile << endl;
+
+    string currentTid="";
+    while (!vf.eof()) {
+        VariantFile::Candidates cand=vf.getLine(isOneBased);
+        if (cand.variants.empty()) continue;
+
+        // new target: flush what was collected for the previous one
+        if (cand.tid!=currentTid) {
+            if (pending.size()) {
+                outputIndels(currentTid, pending,ofile,fa);
+                cout << "Wrote realigned candidate indel for target " << currentTid << " to file " << outputFileName << endl;
+            }
+            pending.clear();
+            currentTid=cand.tid;
+        }
+
+        BOOST_FOREACH(Variant var, cand.variants) {
+            if (!var.isIndel()) continue;
+            int len=var.size();
+            if (var.getType()==Variant::DEL) len=-len;
+            CIGARindel id(cand.pos,len, var.getSeq());
+            // operator[] creates missing entries with count 0
+            ++pending[id.refpos][id];
+        }
+    }
+
+    // flush the last target
+    outputIndels(currentTid, pending,ofile,fa);
+    cout << "Wrote realigned candidate indels for target " << currentTid << " to file " << outputFileName << endl;
+
+    ofile.close();
+}
+
+// Writes one smoothed insert-size histogram per library to outputFile.
+// For each library the median insert size is determined, sizes of 10x the median or
+// more are ignored when computing mean and standard deviation, and a histogram with
+// int(mean+5*stddev) bins is averaged over a sliding window and written below a
+// "#LIB <name>" line as "<insert size> <value>" pairs.
+// Throws a std::string when outputFile cannot be opened.
+void GetCandidatesFromCIGAR::outputLibraries(LibInsertSize & libInsertSize, const string & outputFile)
+{
+    // open file
+    ofstream ofile(outputFile.c_str());
+    if (!ofile.is_open()) throw string("Cannot open ").append(outputFile).append(" for writing libraries.");
+
+    for (LibIterator libit = libInsertSize.begin();libit!=libInsertSize.end();libit++) {
+        string lib = string(libit->first);
+        InsertSizes & insertSizes = libit->second;
+        InsIterator insit;
+
+        // total number of observations and the distinct insert sizes in ascending order
+        long int tot = 0;
+        std::set<int> isizes;
+        for (insit = insertSizes.begin(); insit!=insertSizes.end();insit++) {
+            tot += insit->second;
+            isizes.insert(insit->first);
+        }
+
+        // median: first insert size at which the cumulative count exceeds half the total.
+        // Counts are kept as long int; narrowing tot/2 to int would truncate on very large inputs.
+        const long int half = tot/2;
+        long int cum = 0;
+        int median_isize = -1;
+        for (std::set<int>::const_iterator it = isizes.begin();it!=isizes.end();it++) {
+            cum += insertSizes[*it];
+            if (cum>half) {
+                median_isize = *it;
+                break;
+            }
+        }
+        isizes.clear();
+        const int max_isize = median_isize * 10;
+
+        // mean and variance over the insert sizes below max_isize
+        const double dtot = double(tot);
+        double mean = 0.0, variance = 0.0;
+        for (insit = insertSizes.begin(); insit!=insertSizes.end();insit++) if (insit->first<max_isize) {
+            mean += double(insit->first)*double(insit->second)/dtot;
+        }
+        for (insit = insertSizes.begin(); insit!=insertSizes.end();insit++) if (insit->first<max_isize) {
+            double dist = double(insit->first)-mean;
+            variance += double(insit->second)/dtot*dist*dist;
+        }
+        const double stddev = sqrt(variance);
+        cout << "Library: " << lib << " mean: " << mean << " stddev: " << stddev << endl;
+
+        // create histogram in vector; every bin starts at 2
+        int len = int(mean+5*stddev);
+        if (len<0) len = 0; // a negative length must not reach the vector constructor
+        vector<long int> histo(len,2), inthisto(len,2);
+        for (insit = insertSizes.begin(); insit!=insertSizes.end();insit++) {
+            const int isize = insit->first;
+            // the lower bound keeps a negative insert size from indexing before the vector
+            if (isize>=0 && isize<len) histo[isize]=insit->second;
+        }
+
+        // smooth histogram out a little: average over the bins [i-L, i+L) inside the vector
+        const int L = 5;
+        for (int i=0;i<len;i++) {
+            const int lo = (i-L<0) ? 0 : i-L;
+            const int hi = (i+L>len) ? len : i+L;
+            long int sum = 0;
+            for (int j=lo;j<hi;j++) sum += histo[j];
+            const int n = hi-lo;
+            inthisto[i] = (sum+1)/(n+1);
+        }
+
+        // write histogram to file
+        ofile << "#LIB " << lib << endl;
+        for (int i=0;i<len;i++) {
+            ofile << i << " " << inthisto[i] << endl;
+        }
+    }
+    ofile.close();
+}
+// Reads the BAM file _bamFile once and produces two files:
+//   <outputFileName>.variants.txt   indels found in the CIGAR strings, handed to
+//                                   outputIndels one target at a time
+//   <outputFileName>.libraries.txt  insert-size histograms per library, built from
+//                                   properly paired, non-duplicate, non-QC-failed reads
+//                                   whose mate is on the same target
+// Reads without a library are counted under "dindel_default".
+// Throws a std::string when the variants file or the BAM file cannot be opened.
+void GetCandidatesFromCIGAR::get(const string & _bamFile, const string & outputFileName, const string & fastaName)
+{
+    // also get histogram
+    LibInsertSize libInsertSize;
+
+    fasta::Fasta fa(fastaName);
+
+    string outputFileVariants = outputFileName;
+    string outputFileLibraries = outputFileName;
+    outputFileVariants.append(".variants.txt");
+    outputFileLibraries.append(".libraries.txt");
+
+    // report the file that is actually opened, not the common prefix
+    ofstream ofile(outputFileVariants.c_str());
+    if (!ofile.is_open()) throw string("Cannot open ").append(outputFileVariants).append(" for writing CIGAR indels.");
+
+    // samopen returns NULL on failure; the BAM handles are created last so that the
+    // throw statements above cannot leak them
+    samfile_t *bf=samopen(_bamFile.c_str(), "rb", 0);
+    if (!bf) throw string("Cannot open BAM file ").append(_bamFile);
+    bam1_t *b=bam_init1();
+
+    // hmap: reference position -> (indel -> count) for the current target
+    hash_map<int,map<CIGARindel, int> > hmap;
+
+    // index and name of the target whose indels are currently in hmap
+    int oldtid=-1;
+    string _oldtid="";
+    cout << "Parsing indels from CIGAR strings..." << endl;
+    long int numread = 0;
+
+    const string defaultlib("dindel_default");
+    while (samread(bf,b)>=0) {
+
+        int btid = b->core.tid;
+        if (btid<0) continue; // unmapped read
+        const char *tidptr = bf->header->target_name[btid];
+        if (!tidptr) continue;
+        string tid=string(tidptr);
+        if (btid!=oldtid) {
+            // first read of a new target: flush the indels of the previous target
+            if (oldtid!=-1) {
+                outputIndels(_oldtid, hmap,ofile,fa);
+                cout << "Wrote indels in CIGARS for target " << _oldtid << " to file " << outputFileVariants << endl;
+            }
+            oldtid=btid;
+            _oldtid=tid;
+            hmap.clear();
+        }
+
+        vector<CIGARindel> indels;
+        getIndelFromCIGAR(b, indels);
+        BOOST_FOREACH(CIGARindel id, indels) {
+            // operator[] starts a missing entry at 0
+            hmap[id.refpos][id]++;
+        }
+
+        // get insertsize
+        // only properly paired reads with the mate on the same target that are neither
+        // duplicates nor QC failures contribute to the insert-size statistics
+        if ((b->core.flag & BAM_FPAIRED) && (b->core.flag & BAM_FPROPER_PAIR) && (b->core.tid == b->core.mtid) && !( (b->core.flag & BAM_FDUP) || (b->core.flag & BAM_FQCFAIL) )) {
+            const char *p = bam_get_library((bam_header_t *) bf->header, b);
+
+            // Take a copy of the name. A non-const reference bound to defaultlib would be
+            // assigned through, renaming the default library for every later read.
+            const string lib = p ? string(p) : defaultlib;
+
+            int isize = abs(b->core.isize);
+
+            // both levels are created on first use with a count of 0
+            libInsertSize[lib][isize] += 1;
+        }
+
+        numread++;
+        if (numread % 1000000==0) {
+            cout << "Number of reads read: " << numread << endl;
+        }
+    }
+
+    // flush the last target
+    outputIndels(_oldtid, hmap,ofile,fa);
+    outputLibraries(libInsertSize, outputFileLibraries);
+
+    cout << "Wrote indels in CIGARS for target " << _oldtid << " to file " << outputFileVariants << endl;
+    cout << "Wrote library insert sizes to " << outputFileLibraries << endl;
+    cout << "done!" << endl;
+
+    // release the BAM record and close the BAM file opened above
+    bam_destroy1(b);
+    samclose(bf);
+
+    ofile.close();
+}
+
+GetCandidatesFromCIGAR::GetCandidatesFromCIGAR()
+{
+ // Intentionally empty: no statements are executed in the constructor body.
+}
+
+
+GetCandidatesFromCIGAR::~GetCandidatesFromCIGAR()
+{
+ // Intentionally empty: no statements are executed in the destructor body.
+}
diff --git a/GetCandidates.hpp b/GetCandidates.hpp
new file mode 100644
index 0000000..ba609bc
--- /dev/null
+++ b/GetCandidates.hpp
@@ -0,0 +1,107 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * GetCandidates.hpp
+ *
+ * Created on: Aug 27, 2009
+ * Author: caa
+ */
+
+#ifndef GETCANDIDATES_HPP_
+#define GETCANDIDATES_HPP_
+#include <map>
+#include <ext/hash_map>
+#include <vector>
+#include "MyBam.hpp"
+#include "Fasta.hpp"
+#include "Variant.hpp"
+using __gnu_cxx::hash;
+namespace std { using namespace __gnu_cxx; }
+// Generic base class for generating candidate variants.
+class GetCandidates
+{
+public:
+ GetCandidates() {}; // default constructor with an empty body
+ GetCandidates(const string & bamFile); // declared here, defined elsewhere
+
+ vector<AlignedVariant> get(const string tid, uint32_t start, uint32_t end); // NOTE(review): presumably the candidates between start and end on target tid -- confirm against the definition
+ void get(const string & outputFileName); // outputs directly to filename of the whole BAMfile
+ void outputToFile(const string & fileName);
+protected:
+ map<string, vector<AlignedVariant> > candidates; // candidates for every chromosome/reference sequence
+ MyBam bam;
+ virtual ~GetCandidates() {}; // virtual, and protected: only members, friends and derived classes may call it
+};
+
+// Candidate generator that works on indels taken from CIGAR strings.
+class GetCandidatesFromCIGAR : public GetCandidates
+{
+public:
+    // Settings together with their default values.
+    class Params
+    {
+    public:
+        Params() : alignWindow(100), refPad(10) {}
+        int alignWindow, refPad;
+    } params;
+protected:
+    // One indel: reference position, length and sequence.
+    class CIGARindel
+    {
+    public:
+        CIGARindel(const uint32_t _refpos, int _len, const string _seq)
+            : refpos(_refpos), len(_len), seq(_seq) {}
+        // Ordering: by reference position first, then by sequence, then by length.
+        bool operator<(const CIGARindel & c) const
+        {
+            if (refpos!=c.refpos) return refpos<c.refpos;
+            if (seq!=c.seq) return seq<c.seq;
+            return len<c.len;
+        }
+        uint32_t refpos;
+        int len;
+        string seq;
+    };
+    // integer key -> (indel -> int value)
+    typedef hash_map<int, map<CIGARindel, int> > HMap;
+
+    class CFFData
+    {
+    public:
+        HMap hmap;
+    };
+
+public:
+    GetCandidatesFromCIGAR();
+    static int getIndelFromCIGARFetchFunc(const bam1_t *b, void *data);
+    void getIndelFromCIGARRegion(const vector<MyBam *> & myBams, const string & tid, int start, int end, const string & outputFileName, fasta::Fasta & fa);
+    void realignCandidateFile(const string & _varFile, bool isOneBased, const string & outputFileName, const string & fastaName);
+    void get(const string & bamFile, const string & outputFileName);
+    void get(const string & bamFile, const string & outputFileName, const string & fastaName);
+    ~GetCandidatesFromCIGAR();
+protected:
+    vector<AlignedVariant> alignCIGAR(const string & tid, const CIGARindel & id, fasta::Fasta & fa);
+    static void getIndelFromCIGAR(const bam1_t *b, vector<CIGARindel> & indels);
+    void outputIndels(const string & tid, const HMap & hmap, ofstream & ofile, fasta::Fasta & fa, int outputType);
+    // container types used by outputLibraries
+    typedef hash_map<int, long int> InsertSizes;
+    typedef string_hash<InsertSizes> LibInsertSize;
+    typedef LibInsertSize::iterator LibIterator;
+    typedef InsertSizes::iterator InsIterator;
+
+    void outputLibraries(LibInsertSize & libInsertSize, const string & outputFile);
+};
+
+#endif /* GETCANDIDATES_HPP_ */
diff --git a/HapBlock.cpp b/HapBlock.cpp
new file mode 100644
index 0000000..ceeb15b
--- /dev/null
+++ b/HapBlock.cpp
@@ -0,0 +1,204 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "HapBlock.hpp"
+#include <sstream>
+#include <iostream>
+using namespace std;
+
+// Builds the part of hb that covers _len positions starting at _start: every haplotype
+// of hb is cut down to that window, and equal pieces are merged by adding their counts.
+HapBlock::HapBlock(const HapBlock & hb, uint32_t _start, uint32_t _len)
+{
+    assert(hb.end()>=_start+_len-1);
+
+    // pos0 and pos1 are the first and the last position of the new block
+    pos0=_start;
+    pos1=_start+_len-1;
+    if (pos1<pos0) cout << "SMALLER" << endl;
+    type = HapBlock::NORMAL;
+    assert(pos1>=pos0);
+    haplotypes.clear();
+
+    typedef map<Haplotype, int> HapCounts;
+    for (HapCounts::const_iterator src=hb.haplotypes.begin();src!=hb.haplotypes.end();++src) {
+        const Haplotype piece(src->first, _start-hb.pos0, _len);
+        // insert() reports through .second whether the piece was new
+        std::pair<HapCounts::iterator, bool> slot=haplotypes.insert(HapCounts::value_type(piece, src->second));
+        if (slot.second) continue;
+
+        // The piece occurred before (a sub-haplotype may occur multiple times): keep the
+        // Ref marker if this occurrence carries it and add the count.
+        if (piece.type==Haplotype::Ref) slot.first->first.type=Haplotype::Ref;
+        slot.first->second+=src->second;
+    }
+}
+
+// Adds one observation of seq: a new haplotype starts at count 1, a known one is incremented.
+void HapBlock::insert(const Haplotype & seq)
+{
+    std::pair<map<Haplotype, int>::iterator, bool> slot=haplotypes.insert(map<Haplotype, int>::value_type(seq, 1));
+    if (!slot.second) {
+        // already present: carry the Ref marker over to the stored key, then count it
+        if (seq.type==Haplotype::Ref) slot.first->first.type=Haplotype::Ref;
+        slot.first->second++;
+    }
+}
+
+// Creates a NORMAL block that begins at start and holds the single haplotype h with count 1.
+HapBlock::HapBlock(const Haplotype & h, uint32_t start)
+    : pos0(start), pos1(start+h.size()-1), type(HapBlock::NORMAL)
+{
+    if (pos1<pos0) {
+        // diagnostic output for a block whose end lies before its start
+        cout << pos0 << " " << pos1 << " " << endl;
+        cout << "h: " << h << endl;
+    }
+    assert(pos1>=pos0);
+    haplotypes[h]=1;
+}
+
+// Sets freq of every haplotype to its count divided by the sum of all counts in the block.
+void HapBlock::setFrequencies()
+{
+    typedef map<Haplotype, int>::iterator HapIt;
+    int total=0;
+    for (HapIt h=haplotypes.begin();h!=haplotypes.end();++h) total+=h->second;
+    const double denom=double(total);
+    // freq is a mutable member, so it can be written through the const map key
+    for (HapIt h=haplotypes.begin();h!=haplotypes.end();++h) h->first.freq=double(h->second)/denom;
+}
+// Prints the block to stream (and only there): a summary line, one row per position with one column per haplotype ('.' past a haplotype's end), then frequencies, counts and types.
+ostream &operator<<(ostream &stream, const HapBlock &hb)
+{
+    vector<string> output(hb.length());
+    vector<int> counts;
+    vector<double> freqs;
+    for (map<Haplotype, int>::const_iterator it=hb.haplotypes.begin();it!=hb.haplotypes.end();it++) {
+        for (size_t y=0;y<hb.length();y++) {
+            if ((it->first).size()>y) output[y]+=((it->first)[y]); else output[y]+='.';
+            output[y]+=' ';
+        }
+        counts.push_back(it->second);
+        freqs.push_back(it->first.freq);
+    }
+    stream << "start: " << hb.start() << " end: " << hb.end() << " numHap: " << hb.haplotypes.size() << endl;
+    for (size_t y=0;y<output.size();y++) stream << output[y] << endl;
+    for (size_t y=0;y<counts.size();y++) stream << freqs[y] << " ";
+    stream << endl;
+    for (size_t y=0;y<counts.size();y++) stream << counts[y] << " ";
+    stream << endl;
+    for (map<Haplotype, int>::const_iterator it=hb.haplotypes.begin();it!=hb.haplotypes.end();it++) stream << it->first.type << " ";
+    return stream;
+}
+
+// True (after incrementing its count) for the first haplotype whose seq.size() characters at offset seqStart-start() equal seq.
+bool HapBlock::hasHaplotype(const Haplotype & seq, uint32_t seqStart)
+{
+    const size_t offset=seqStart-start();
+    map<Haplotype, int>::iterator hap=haplotypes.begin();
+    while (hap!=haplotypes.end() && hap->first.compare(offset, seq.size(), seq)!=0) ++hap;
+    if (hap!=haplotypes.end()) hap->second++;
+    return hap!=haplotypes.end();
+}
+
+// Copies text into row starting at column col; characters that would fall beyond the
+// end of the row are dropped.
+static void putText(string & row, size_t col, const string & text)
+{
+    for (size_t l=0;l<text.size() && col+l<row.size();l++) row[col+l]=text[l];
+}
+
+// Draws the non-NULL blocks of hapBlocks side by side on stream.
+//   row 0           '|' at the first column of every block and 'X' at the first column
+//                   of the block that contains midPos (column 0 if there is none)
+//   sequence rows   per block the reference haplotype on top, the others below it in
+//                   order of decreasing frequency
+//   frequency rows  int(round(-log(freq))) of the same haplotypes in the same order
+// Nothing is written when hapBlocks contains no non-NULL block.
+void HapBlock::showVector(ostream &stream,const vector<HapBlock*> & hapBlocks,uint32_t midPos)
+{
+    const size_t nb=hapBlocks.size();
+    vector<size_t> length(nb,0), num(nb,0), pos(nb,0);
+    vector<HapBlock*> hbs(nb);
+    size_t y=0,x=0,c=0;
+    const size_t offset=20;
+    size_t indelPos=0;
+    for (x=0;x<nb;x++) if (hapBlocks[x]!=NULL){
+        pos[c]=offset+y;
+        if (midPos>=hapBlocks[x]->start() && midPos<=hapBlocks[x]->end()) indelPos=pos[c];
+        length[c]=hapBlocks[x]->length();
+        y+=length[c];
+        hbs[c]=hapBlocks[x];
+        num[c]=hbs[c]->size();
+        c++;
+    }
+    // Only the first c entries of pos/num/hbs are filled in; the remaining entries of
+    // hbs are NULL and must not be dereferenced.
+    if (c==0) return;
+
+    class SortFunc
+    {
+    public:
+        static bool sortFunc(const Haplotype & h1, const Haplotype & h2) { return h1.freq<h2.freq; };
+    };
+
+    // Per block: the non-reference haplotypes in ascending frequency followed by the
+    // reference, so that walking the vector backwards gives the display order.
+    vector<vector<Haplotype> > ordered(c);
+    vector<bool> hasRef(c,false);
+    size_t rows=0;
+    for (size_t i=0;i<c;i++) {
+        vector<Haplotype> & haps=ordered[i];
+        Haplotype refHap;
+        for (map<Haplotype, int>::const_iterator it=hbs[i]->haplotypes.begin();it!=hbs[i]->haplotypes.end();it++) {
+            if (it->first.type!=Haplotype::Ref) haps.push_back(it->first);
+            else { refHap=it->first; hasRef[i]=true; }
+        }
+        sort(haps.begin(),haps.end(), SortFunc::sortFunc);
+        // without a reference haplotype an empty placeholder keeps the top row free
+        haps.push_back(refHap);
+        rows=max(rows, max(num[i], haps.size()));
+    }
+
+    // one marker row, then `rows` sequence rows and `rows` frequency rows; sizing by the
+    // longest per-block list keeps every row index below inside the vector
+    vector<string> lines(rows*2+1,string(offset+y,' '));
+    // label of the row that holds the reference haplotypes
+    lines[1][1]='R'; lines[1][2]='E'; lines[1][3]='F';
+    for (size_t i=0;i<c;i++) {
+        const vector<Haplotype> & haps=ordered[i];
+        lines[0][pos[i]]='|';
+
+        size_t j=1;
+        for (int k=int(haps.size())-1;k>=0;k--) {
+            putText(lines[j], pos[i], haps[k].seq);
+            j++;
+        }
+
+        j=rows+1;
+        for (int k=int(haps.size())-1;k>=0;k--,j++) {
+            // the placeholder has freq 0, and -log(0) cannot be converted to int
+            if (!hasRef[i] && k==int(haps.size())-1) continue;
+            ostringstream os(ostringstream::out);
+            os << int(round(-log(haps[k].freq)));
+            putText(lines[j], pos[i], os.str());
+        }
+    }
+    // mark the block that contains midPos
+    lines[0][indelPos]='X';
+    for (size_t j=0;j<lines.size();j++) {
+        stream << lines[j] << endl;
+    }
+}
diff --git a/HapBlock.hpp b/HapBlock.hpp
new file mode 100644
index 0000000..884bafe
--- /dev/null
+++ b/HapBlock.hpp
@@ -0,0 +1,57 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef HAPBLOCK_HPP_
+#define HAPBLOCK_HPP_
+#include <stdint.h>
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "Haplotype.hpp"
+using namespace std;
+
+class HapBlock
+{
+public:
+ map<Haplotype, int> haplotypes; // haplotype -> integer count (insert() adds 1 per call)
+ bool operator<(const HapBlock & hb) const { return pos0<hb.pos0; }; // blocks are ordered by start position only
+ // A block spans the positions start()..end(), both included: length() is end()-start()+1.
+
+ HapBlock(const Haplotype & seq, uint32_t start);
+ HapBlock(const HapBlock & hb, uint32_t _start, uint32_t _len);
+ bool hasHaplotype(const Haplotype & seq, uint32_t seqStart);
+ uint32_t start() const { return pos0; };
+ uint32_t end() const { return pos1; };
+ uint32_t length() const { return end()-start()+1; };
+ size_t size() const { return haplotypes.size(); }; // number of distinct haplotypes
+ void insert(const Haplotype & seq);// { haplotypes[seq]++; }
+ vector<pair<Haplotype,int> > getHaplotypes();
+ void setFrequencies();
+ friend ostream &operator<<(ostream &stream, const HapBlock &hb);
+ static void showVector(ostream &stream,const vector<HapBlock*> & hapBlocks, uint32_t midPos);
+ void setType(int _type) { type=_type; };
+ int getType() const { return type; };
+
+ static const int NORMAL=0; // values for type
+ static const int INSERT=1;
+private:
+ uint32_t pos0, pos1; // returned by start() and end()
+ int type;
+};
+
+#endif /*HAPBLOCK_HPP_*/
diff --git a/Haplotype.hpp b/Haplotype.hpp
new file mode 100644
index 0000000..638f0a0
--- /dev/null
+++ b/Haplotype.hpp
@@ -0,0 +1,389 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef HAPLOTYPE_HPP_
+#define HAPLOTYPE_HPP_
+#include <stdint.h>
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <cmath>
+#include <map>
+#include <vector>
+#include "Variant.hpp"
+#include "MLAlignment.hpp"
+#include "foreach.hpp"
+#include <ext/hash_map>
+#include <set>
+//#include "Fast.hpp"
+using namespace std;
+using __gnu_cxx::hash;
+namespace std { using namespace __gnu_cxx; }
+
+const char NUCLEOTIDES[]={'A','T', 'G','C'}; // the four nucleotide letters, in the order A, T, G, C
+
+
+
+
+
+class Haplotype //: public string
+{ // A sequence (seq) annotated with a type, frequencies and the variants recorded for it.
+public:
+ // ContainsInDel means the haplotype contains a small non-zero length segment
+ // that was identified as an InDel from Cigar by WH alignment
+ //typedef enum { Normal, In, Del, HasIn, HasDel, HasInDel } Type;
+ typedef int Type; // holds the constants below, each of which is a distinct bit
+ static const int Ref=1;
+ static const int Normal=Ref<<1;
+ static const int In=Ref<<2;
+ static const int Del=Ref<<3;
+
+ // type and the frequency fields are mutable, so they can be assigned through a const
+ // Haplotype; operator< does not look at them, it compares seq only.
+ mutable Type type;
+ // nfreq is the product of the frequencies of haplotypes that are not indels
+ mutable double freq, conf, nfreq;
+ uint32_t pos;
+ string seq;
+ string indel; // if haplotype has indel for a given position
+ string align; // annotates for each base in the sequence this haplotype was aligned to whether it is equal to the reference=R, snp=S, deletion=D; insertions cannot be recorded this way
+ //vector<pair<string, double> > haps;
+ map<int, AlignedVariant > indels, snps;
+ MLAlignment ml;
+ // The following members forward to seq.
+ size_t size() const { return seq.size(); };
+ char & operator[](size_t idx) { return seq[idx]; };
+ const char & operator[](size_t idx) const { return seq[idx]; };
+ Haplotype & operator+=(char c) { seq+=c; return *this; };
+ bool operator<(const Haplotype & h) const { return seq<h.seq; }; // ordering is by sequence only
+
+ /*
+ bool operator<(const Haplotype & h) const
+ {
+ if (seq!=h.seq) {
+ if (type<h.type) return true;
+ else if (type==h.type) return seq<h.seq;
+ } else return seq<h.seq;
+ };
+ */
+
+
+ int compare ( size_t pos1, size_t n1, const Haplotype & h ) const { return seq.compare(pos1,n1,h.seq); };
+ Haplotype & insert ( size_t pos1, size_t n, char c ) { seq.insert(pos1,n,c); return *this; };
+ void reserve(size_t n) { seq.reserve(n); };
+ Haplotype & append(const string & str) { seq.append(str); return *this; };
+ // Copy of h whose sequence is h.seq.substr(pos0, n); all other fields are copied unchanged.
+ Haplotype(const Haplotype & h, size_t pos0, size_t n)
+ {
+ seq=h.seq.substr(pos0, n);
+ conf=h.conf;
+ freq=h.freq;
+ type=h.type;
+ nfreq=h.nfreq;
+ indel=h.indel;
+ align=h.align;
+ pos=h.pos;
+ snps=h.snps;
+ indels=h.indels;
+ ml=h.ml;
+ //haps=h.haps;
+ };
+ Haplotype() // type Normal; conf, freq, nfreq and pos are set to zero
+ {
+ type=Normal;
+ conf=0.0;
+ freq=0.0;
+ nfreq=0.0;
+ pos=0;
+
+ };
+ Haplotype(Type _type) // as the default constructor, but with the given type; Type is int, so this also converts from int
+ {
+ type=_type;
+ conf=0.0;
+ freq=0.0;
+ nfreq=0.0;
+ pos=0;
+ }
+ Haplotype(Type _type, const string & _seq ) // given type and sequence; numeric fields zero
+ {
+ seq=_seq;
+ type=_type;
+ conf=0.0;
+ freq=0.0;
+ nfreq=0.0;
+ pos=0;
+ }
+ Haplotype(const Haplotype &h) // member-by-member copy
+ {
+ seq=h.seq;
+ conf=h.conf;
+ freq=h.freq;
+ type=h.type;
+ nfreq=h.nfreq;
+ indel=h.indel;
+ align=h.align;
+ pos=h.pos;
+ snps=h.snps;
+ indels=h.indels;
+ ml=h.ml;
+ //haps=h.haps;
+ }
+ // Member-by-member assignment; assigning an object to itself does nothing.
+ Haplotype & operator=(const Haplotype & h)
+ {
+ if (&h!=this) {
+
+ seq=h.seq;
+ conf=h.conf;
+ freq=h.freq;
+ type=h.type;
+ nfreq=h.nfreq;
+ indel=h.indel;
+ pos=h.pos;
+ snps=h.snps;
+ indels=h.indels;
+ align=h.align;
+ ml=h.ml;
+ //haps=h.haps;
+ }
+ return *this;
+ }
+ // String of the entry stored in indels at relPos; without one, "*REF" when align[relPos] is 'R', else "R=>" followed by align[relPos].
+ string getIndel(int relPos) const
+ {
+ map<int, AlignedVariant>::const_iterator it=indels.find(relPos);
+ if (it==indels.end()) {
+ char a=align[relPos]; // not bounds-checked: relPos has to be a valid index into align
+ if (a=='R') return string("*REF"); else return string("R=>")+=a;
+ } else {
+ const AlignedVariant & av=it->second;
+ //if (av.getType()==Variant::SNP) throw string("Haplotype::getIndel error");
+ return av.getString();
+ }
+ }
+ // Same as getIndel, but looks the position up in snps.
+ string getSNP(int relPos) const
+ {
+ map<int, AlignedVariant>::const_iterator it=snps.find(relPos);
+ if (it==snps.end()) {
+ char a=align[relPos]; // not bounds-checked: relPos has to be a valid index into align
+ if (a=='R') return string("*REF"); else return string("R=>")+=a;
+ } else {
+ return it->second.getString();
+ }
+ }
+ Haplotype filtered() const // currently returns an unchanged copy; the filtering code below is commented out
+ {
+ /*
+ Haplotype hap=*this, newhap=*this;
+ newhap.seq.clear();
+ transform(hap.seq.begin(), hap.seq.end(), hap.seq.begin(), ::toupper);
+
+ for (size_t x=0;x<hap.seq.size();x++) {
+ if (hap.seq[x]!='_' && hap.seq[x]!='#') newhap+=hap.seq[x];
+ }
+ */
+ return *this;
+ }
+ // Adds an entry for rp to indels and to snps wherever none exists yet, labelled "*REF" or "R=>x" from align[rp].
+ void addRefVariant(int rp)
+ {
+ map<int, AlignedVariant>::const_iterator it;
+
+ // first do indels
+ int offset=0;
+ // get base position in haplotype of rp (relative position in reference sequence)
+
+ bool addVariant=true; // never set to false: the assignment in the loop below is commented out
+ it=indels.begin();
+ while (it!=indels.end() && it->first<=rp) {
+ if (it->second.getType()==AlignedVariant::DEL){
+ if (it->first+it->second.size()<=rp) {
+ offset-=it->second.size(); // deletion ends at or before rp
+ } else {
+ // deletion deleted rp from reference
+ //addVariant=false;
+ break;
+ }
+ }
+ if (it->second.getType()==AlignedVariant::INS) offset+=it->second.size();
+ it++;
+ }
+
+ if (addVariant) {
+ int readStart=rp+offset;
+ int readEnd=rp+offset;
+ int hapStart=rp;
+ int hapEnd=rp;
+
+ if (indels.find(rp)==indels.end()) {
+ // no indel at position relPos
+ string gt;
+ char a=align[rp];
+ if (a!='R') {
+ gt=string("R=>"); gt+=a;
+ } else gt=string("*REF");
+ indels[rp]=AlignedVariant(gt, hapStart, hapEnd, readStart, readEnd);
+ }
+
+ if (snps.find(rp)==snps.end()) {
+ // no snp at position relPos
+ string gt;
+ char a=align[rp];
+ if (a!='R') {
+ gt=string("R=>"); gt+=a;
+ } else gt=string("*REF");
+ snps[rp]=AlignedVariant(gt, hapStart, hapEnd, readStart, readEnd);
+ }
+ }
+ }
+
+ // Number of entries in indels whose type is INS or DEL.
+ int countIndels() const
+ {
+ int num = 0;
+ for (map<int, AlignedVariant>::const_iterator it = indels.begin();it!=indels.end();it++) {
+ if (it->second.getType() == Variant::INS || it->second.getType() == Variant::DEL) num++;
+ }
+ return num;
+ }
+ // Number of entries in snps that have type SNP and for which isRef() is false.
+ int countSNPs() const
+ {
+ int num = 0;
+ for (map<int, AlignedVariant>::const_iterator it = snps.begin();it!=snps.end();it++) {
+ if (it->second.getType() == Variant::SNP && !it->second.isRef()) num++;
+ }
+ return num;
+ }
+ /*
+ int getRefPos(int pos) const
+ {
+ // returns position of base in haplotype with respect to reference it was aligned to
+ if (!indels.size()) return pos; else {
+ int offset=0;
+ map<int, AlignedVariant>::const_iterator it=indels.begin();
+ while (it!=indels.end() && pos>it->second.getPos()) {
+ offset-=it->second.length;
+ it++;
+ }
+ return pos+offset;
+ }
+ }
+ */
+ // Writes type, sequence, length, nfreq, freq and indel on one line without a trailing newline.
+ friend ostream &operator<<(ostream &stream, const Haplotype &h)
+ {
+ stream << "type: " << h.type << " seq: " << h.seq << " len: " << h.size() << " nfreq: " << h.nfreq << " freq: " << h.freq << " indel: " << h.indel;
+ return stream;
+ }
+
+ /*
+ void printHaps() const
+ {
+ cout << "freq: " << nfreq << " length: " << seq.size() << endl;
+ for (size_t i=0;i<haps.size();i++) cout << "[" << i << " |" << haps[i].first << "|," << haps[i].second << "]";
+ cout << endl;
+
+ for (size_t i=0;i<haps.size();i++) cout << haps[i].first; cout << endl;
+ for (size_t i=0;i<haps.size();i++) {
+ cout << int(round(-log(haps[i].second)));
+ if (haps[i].first.size()>1) cout << string(haps[i].first.size()-1,' ');
+ }
+ cout << endl;
+ cout << seq << endl;
+
+ cout << endl;
+ }
+ */
+
+};
+
+
+// Index of the k-mers of one haplotype: every k-mer is packed into an unsigned int
+// (2 bits per base, see map_char) and mapped to the set of positions where it starts.
+class HapHash
+{
+public:
+    // Builds the index for k-mers of length _kmer over hap.seq.
+    // NOTE(review): the packing presumably limits _kmer to 1..16 -- confirm with callers.
+    HapHash(unsigned int _kmer, const Haplotype & hap )
+    {
+        kmer=_kmer;
+        // a shift by the full width of the type is undefined, so a key that fills the
+        // whole unsigned int gets an all-ones mask directly
+        const unsigned int keyBits=8*sizeof(unsigned int);
+        mask=(2*kmer>=keyBits) ? ~0u : ((1u<<(2*kmer))-1u);
+        hap_ptr=0; // not used by this class; set so that it never holds an indeterminate value
+        makeHash(hap);
+    }
+    unsigned int getKmer() const { return kmer; };
+    unsigned int getMask() const { return mask; };
+
+    // Start positions of the k-mer at seq[pos] in the indexed haplotype (empty set if absent).
+    const set<int> & lookup(const string & seq, int pos) const { return lookup(convert(seq,pos)); }
+    // Same lookup for an already packed k-mer.
+    inline const set<int> & lookup(unsigned int key) const
+    {
+        Hash::const_iterator it=hash.find(key);
+        if (it==hash.end()) return emptySet; else return it->second;
+    }
+    // Packs kmer bases starting at seq[pos] (base y goes to bits 2y and 2y+1); throws a std::string if seq is too short.
+    inline unsigned int convert(const string & seq, int pos) const
+    {
+        if (pos+kmer>seq.size()) throw string("HapHash string too short");
+        unsigned int v=0; // unsigned: shifting a 1 into the sign bit of an int is undefined
+        for (int x=pos, y=0;x<int(pos+kmer);x++,y++) v |= ( (unsigned int)(map_char(seq[x])) << (2*y) );
+        return v;
+    }
+    // Key of the k-mer obtained by dropping the first base of key and appending c.
+    inline unsigned int pushBack(const unsigned int & key, const char & c) const
+    {
+        return (key >> 2) | ( (unsigned int)(map_char(c)) << (2*(kmer-1)) );
+    }
+    // Writes every key with its positions to cout.
+    void print() {
+        for (Hash::const_iterator it=hash.begin();it!=hash.end();it++) {
+            cout << "hash[" << it->first << "]: ";
+            BOOST_FOREACH(int i, it->second) { cout << " " << i; }
+            cout << endl;
+        }
+    }
+    // 2-bit code of a base: A=0, C=1, G=2, T=3; any other character is coded like A.
+    inline int map_char(const char & c) const
+    {
+        // TODO do something with N's in reads?
+        if (c=='A') return 0; else if (c=='C') return 1; else if (c=='G') return 2; else if (c=='T') return 3; else return 0; //throw string("Haplotype/Read in hash has N's");
+    }
+    typedef hash_map<unsigned int, set<int> > Hash;
+
+protected:
+    unsigned int kmer;
+    unsigned int mask;
+    const Haplotype *hap_ptr;
+    Hash hash;
+    set<int> emptySet;
+
+    // Registers the start position of every k-mer of hap.seq, including the last one at size()-kmer.
+    void makeHash(const Haplotype & hap)
+    {
+        for (int x=0;x<=int(hap.size())-int(kmer);x++) hash[convert(hap.seq,x)].insert(x);
+    }
+};
+
+
+
+#endif /*HAPLOTYPE_HPP_*/
+
diff --git a/HaplotypeDistribution.cpp b/HaplotypeDistribution.cpp
new file mode 100644
index 0000000..910252b
--- /dev/null
+++ b/HaplotypeDistribution.cpp
@@ -0,0 +1,486 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <stdint.h>
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include "HaplotypeDistribution.hpp"
+
+using namespace std;
+
+
+
+
+HaplotypeDistribution::HaplotypeDistribution(uint32_t _midPos, const string & refSeq, uint32_t refSeqStart)
+    : pos0(0), pos1(0), midPos(_midPos)
+{
+    // Seed the distribution with the reference sequence, cut into chunks of
+    // four bases; the final chunk is shorter when the length is not a
+    // multiple of four.
+    const uint32_t chunkLen = 4;
+    size_t numChunks = refSeq.size()/chunkLen;
+    if (refSeq.size() % chunkLen != 0) numChunks++;
+
+    for (size_t chunk=0; chunk<numChunks; ++chunk) {
+        const size_t offset = chunk*chunkLen;
+        const uint32_t chunkStart = refSeqStart+offset;
+        Haplotype refHap(Haplotype::Ref, refSeq.substr(offset, chunkLen));
+        insertSeq(refHap, chunkStart);
+    }
+}
+
+
+ostream &operator<<(ostream &stream, const HaplotypeDistribution &hb)
+{
+    // Regular blocks first, numbered in storage order; NULL slots are skipped
+    // and do not consume a number.
+    size_t blockNo=0;
+    for (vector<HapBlock*>::const_iterator bit=hb.hapBlocks.begin(); bit!=hb.hapBlocks.end(); ++bit) {
+        if (*bit==NULL) continue;
+        stream << "HAPLOTYPE BLOCK " << blockNo++ << endl;
+        stream << **bit << endl;
+    }
+    // Then the insertion blocks, numbered again from zero.
+    size_t insNo=0;
+    for (map<int, HapBlock *>::const_iterator iit=hb.insertions.begin(); iit!=hb.insertions.end(); ++iit) {
+        stream << "INSERTION " << insNo++ << endl << *iit->second << endl;
+    }
+    return stream;
+}
+
+
+// Callback trampoline: data carries the HaplotypeDistribution that receives the read.
+int HaplotypeDistribution::fetchFuncInsertRead(const bam1_t *b, void *data)
+{
+    static_cast<HaplotypeDistribution*>(data)->insertRead(b);
+    return 0;
+}
+
+// Adds one aligned read to the distribution.
+//
+// The CIGAR is walked left to right. Match, soft-clip and insertion segments
+// contribute the read bases they cover; a deletion contributes a one-character
+// length code. Every non-empty segment is passed to insertSeq() together with
+// the reference position at which it starts. Hard clips, reference skips and
+// zero-length operations contribute nothing.
+//
+// b: the alignment record to add.
+//
+// Returns without doing anything when the BAM_FMUNMAP flag of the read is set.
+// Throws std::string on a CIGAR operation that is not handled here, or when an
+// operation other than an insertion or clip left the reference position unchanged.
+void HaplotypeDistribution::insertRead(const bam1_t* b)
+{
+    // NOTE(review): BAM_FMUNMAP is the mate-unmapped flag; BAM_FUNMAP may have been meant - confirm.
+    if ((b->core.flag & BAM_FMUNMAP) != 0) return;
+
+    // A deletion of n bases is stored as the single character '#'+n. The code
+    // that expands it again (HDIterator2) only recognises characters in the
+    // range ['#', 'A'), so n is capped at 29; '#'+30 would be a literal 'A'.
+    const int32_t maxDelCode = 29;
+
+    const bam1_core_t *c=&b->core;
+    uint32_t *cigar=bam1_cigar(b);
+    // index of the next unread base of the read
+    uint32_t l=0;
+    // reference position at which the current operation starts
+    uint32_t refPos = c->pos;
+    int lastop=-1;
+    // reference position at which the previous operation started
+    uint32_t lastPos=refPos;
+
+    for (uint32_t k = 0; k < c->n_cigar; ++k) {
+        const int op = cigar[k] & BAM_CIGAR_MASK;
+        const int32_t len=cigar[k] >> BAM_CIGAR_SHIFT;
+
+        Haplotype seq;
+        seq.reserve(len);
+        seq.conf=(double) c->qual; // this scales the confidence from the individual base calls with the mapping confidence
+
+        if (op==BAM_CINS || op==BAM_CMATCH || op==BAM_CSOFT_CLIP) {
+            // these operations consume read bases
+            for(int32_t x=0;x<len;x++) {
+                seq+=( bam_nt16_rev_table[ bam1_seqi(bam1_seq(b), l) ] );
+                seq.conf+=(double) bam1_qual(b)[l]; // base quality is on log10 scale
+                l++;
+            }
+        } else if (op==BAM_CDEL && len>0) {
+            int32_t codeLen=len;
+            if (codeLen>maxDelCode) {
+                cerr << string("Deletion is too long...") << endl;
+                // only the stored code is capped; refPos below still advances
+                // by the full length so later segments keep their position
+                codeLen=maxDelCode;
+            }
+            seq.seq.assign(1, char(int('#')+codeLen));
+        }
+
+        if (op==BAM_CINS) seq.type=Haplotype::In;
+        else if (op==BAM_CDEL) seq.type=Haplotype::Del;
+        else seq.type=Haplotype::Normal;
+
+        // now add it to the haplotype structure for this location
+        if (seq.size()) insertSeq(seq, refPos);
+
+        // check if previous seq was not an insert
+        if (lastop!=-1 && lastop!=BAM_CINS) {
+            if (lastPos==refPos && lastop!=BAM_CSOFT_CLIP && lastop!=BAM_CHARD_CLIP) {
+                throw string("Mag niet.");
+            }
+            // [lastPos, refPos) is the reference span of the previous operation:
+            // record an empty In-type haplotype at every known insertion site in it
+            for (uint32_t pos=lastPos;pos<refPos;pos++) {
+                map<int, HapBlock*>::iterator it=insertions.find(pos);
+                if (it!=insertions.end()) {
+                    (it->second)->insert(Haplotype(Haplotype::In));
+                }
+            }
+        }
+
+        // update position for the next cigar
+        lastPos=refPos;
+        if (op == BAM_CMATCH || op == BAM_CDEL || op==BAM_CREF_SKIP) {
+            refPos+=(uint32_t) len;
+        } else if (op!=BAM_CINS && op != BAM_CSOFT_CLIP && op != BAM_CHARD_CLIP) {
+            throw string("I don't know how to smoke this CIGAR");
+        }
+        lastop=op;
+    }
+}
+
+vector<Variant> HaplotypeDistribution::getIndelVariantsAtMidPos()
+{
+    // Insertions become "+<sequence>"; deletions become "-" followed by one
+    // 'R' per character of the stored deletion sequence.
+    vector<Variant> result;
+    for (set<Haplotype>::const_iterator hit=indelsAtMidPos.begin(); hit!=indelsAtMidPos.end(); ++hit) {
+        if (hit->type==Haplotype::In) result.push_back(Variant(string("+").append(hit->seq)));
+        else if (hit->type==Haplotype::Del) result.push_back(Variant(string("-").append(string(hit->seq.size(),'R'))));
+        else throw string("Unrecognized variant");
+    }
+    return result;
+}
+
+void HaplotypeDistribution::setFrequencies()
+{
+    // regular blocks (NULL slots are skipped)
+    for (vector<HapBlock*>::iterator bit=hapBlocks.begin(); bit!=hapBlocks.end(); ++bit) {
+        if (*bit==NULL) continue;
+        (*bit)->setFrequencies();
+    }
+
+    // insertion blocks
+    map<int, HapBlock*>::iterator iit=insertions.begin();
+    while (iit!=insertions.end()) {
+        HapBlock *insBlock=iit->second;
+        if (insBlock!=NULL) {
+            insBlock->setFrequencies();
+        }
+        ++iit;
+    }
+}
+// Adds seq to hb; throws std::string unless seq covers exactly the span of hb.
+void HaplotypeDistribution::updateBlock(HapBlock *hb, const Haplotype & seq, uint32_t seqStart)
+{
+    if (!(seq.size()==hb->length() && seqStart==hb->start())) throw string("updateBlock-seq mismatch.");
+    hb->insert(seq);
+}
+
+
+
+
+
+bool sortFunc(const HapBlock *a, const HapBlock *b)
+{
+    // Orders blocks by start position; NULL entries sort after every real block.
+    if (a==NULL) {
+        return false;
+    }
+    if (b==NULL) return true;
+    return a->start()<b->start();
+}
+
+// Consistency check: sorts the blocks (NULL slots last) and throws std::string if a
+// block ends before it starts or if neighbouring blocks overlap or leave a gap.
+void HaplotypeDistribution::check()
+{
+    sort(hapBlocks.begin(), hapBlocks.end(), sortFunc);
+    vector<HapBlock*>::iterator it=find(hapBlocks.begin(), hapBlocks.end(),(HapBlock *) NULL );
+    for (vector<HapBlock*>::iterator it2=it;it2!=hapBlocks.end();it2++) {
+        if (*it2!=NULL) throw string("Error: NULLs not consecutive");
+    }
+    // number of real blocks; the slots behind them are NULL and must not be dereferenced
+    const size_t numBlocks=size_t(it-hapBlocks.begin());
+    for (size_t x=0;x<numBlocks;x++) {
+        if (hapBlocks[x]->end()<hapBlocks[x]->start()) {
+            cout << "CHECK SMALLER HD:" << endl;
+            cout << *this << endl;
+            throw string("Blocks are smaller!");
+        }
+    }
+    for (size_t x=1;x<numBlocks;x++) {
+        // test for overlap first: an overlapping pair would otherwise always
+        // be reported as "not consecutive"
+        if (hapBlocks[x-1]->end()>=hapBlocks[x]->start()) {
+            cout << "CHECK HD:" << endl;
+            cout << *this << endl;
+            throw string("Blocks are overlapping!");
+        }
+        if (hapBlocks[x-1]->end()+1!=hapBlocks[x]->start()) {
+            cout << "CHECK CONSECUTIVE HD:" << endl;
+            cout << *this << endl;
+            throw string("Blocks are not consecutive!");
+        }
+    }
+}
+
+
+void HaplotypeDistribution::newBlock(HapBlock *hb)
+{
+    // Reuse the first free (NULL) slot if there is one, otherwise grow the vector.
+    vector<HapBlock*>::iterator slot=hapBlocks.begin();
+    while (slot!=hapBlocks.end() && *slot!=NULL) ++slot;
+    if (slot!=hapBlocks.end()) *slot=hb;
+    else hapBlocks.push_back(hb);
+
+    // Widen [pos0, pos1] to include the new block, then restore the sort order.
+    // NOTE(review): pos0 is unsigned and starts at 0 in the constructor, so the first test never fires - confirm intent.
+    if (hb->start()<pos0) pos0=hb->start();
+    if (hb->end()>pos1) pos1=hb->end();
+    sort(hapBlocks.begin(), hapBlocks.end(), sortFunc);
+}
+
+
+/*
+void HaplotypeDistribution::invalidateBlock(HapBlock *hb)
+{
+ for (uint32_t p=hb->start();p<=hb->end();p++) {
+ posToBlock[p-pos0]=-1;
+ }
+}
+*/
+void HaplotypeDistribution::deleteBlock(int idx)
+{
+    // Free the block, then fill the hole with the last entry of the vector,
+    // whose own slot is set to NULL in turn.
+    const int lastIdx=(int)hapBlocks.size()-1;
+    delete hapBlocks[idx];
+    hapBlocks[idx]=NULL;
+    if (lastIdx>0 && idx!=lastIdx) { hapBlocks[idx]=hapBlocks[lastIdx]; hapBlocks[lastIdx]=NULL; }
+}
+
+
+// Replaces block idx by up to three blocks so that seq lines up with one of them:
+//
+//   block  AAAABBBBCCCC
+//   seq        ****
+//
+// A (the part before seq) and C (the part after it) are only created when
+// they are non-empty; seq is counted in B.
+//
+// idx:      index into hapBlocks of the block to split
+// seq:      sequence to add; must lie completely inside that block
+// seqStart: reference position of the first element of seq
+// Throws std::string when seq is empty or reaches outside the block.
+void HaplotypeDistribution::splitBlock(int idx, const Haplotype & seq, uint32_t seqStart)
+{
+    if (seq.size()==0) throw string("Empty haplotype!");
+
+    HapBlock & whole=*hapBlocks[idx];
+    const uint32_t seqEnd=seqStart+seq.size()-1;
+    if (seqStart<whole.start()||seqStart+seq.size()-1>whole.end()) throw string("seq outside of block boundaries");
+
+    // lengths of the three pieces
+    const uint32_t lenA=seqStart-whole.start();
+    const uint32_t lenB=seq.size();
+    if (lenB==0) throw string("Empty sequence!"); // cannot trigger after the first check
+    const uint32_t lenC=(whole.end()==seqStart+seq.size()-1) ? 0 : whole.end()-seqEnd;
+
+    // Cut the pieces out while the original block still exists. Blocks do not
+    // overlap, so the pieces can simply be added to the vector afterwards.
+    HapBlock *left=NULL, *mid=NULL, *right=NULL;
+    if (lenA) left=new HapBlock(whole, whole.start(), lenA);
+    mid=new HapBlock(whole, whole.start()+lenA, lenB);
+    if (lenC) right=new HapBlock(whole, mid->end()+1, lenC);
+
+    deleteBlock(idx); // whole is gone from here on
+
+    // B is registered first and receives seq; then the flanks.
+    // NOTE(review): if updateBlock() throws, left and right are leaked.
+    newBlock(mid);
+    updateBlock(mid, seq, seqStart);
+
+    if (left!=NULL) newBlock(left);
+    if (right!=NULL) newBlock(right);
+}
+
+int HaplotypeDistribution::getFirstOverlappingBlock(uint32_t seqStart, uint32_t seqEnd) const
+{
+    // Scans in storage order and stops at the first NULL slot. Returns the index of
+    // the first block sharing a position with [seqStart, seqEnd], or -1 if there is none.
+    for (size_t i=0; i<hapBlocks.size() && hapBlocks[i]!=NULL; ++i) {
+        const HapBlock & cand=*hapBlocks[i];
+        if (!(cand.end()<seqStart || cand.start()>seqEnd)) return int(i);
+    }
+    return -1;
+}
+
+void HaplotypeDistribution::insertSeq(Haplotype & seq, uint32_t seqStart)
+{
+ // Adds seq, whose first element sits at reference position seqStart. Normal/Ref/Del sequences are merged into hapBlocks (cases A-D below, recursing on whatever is left over); In sequences go into the insertions map. Throws std::string for any other type.
+ if (seq.type==Haplotype::Normal || seq.type==Haplotype::Ref || seq.type==Haplotype::Del) {
+ // seqEnd is the last reference position covered by seq (inclusive)
+ uint32_t seqEnd = seqStart+(uint32_t) seq.size()-1;
+ int idx=getFirstOverlappingBlock(seqStart, seqEnd);
+ if (idx!=-1) {
+ // an existing block shares positions with seq
+ HapBlock & block=*hapBlocks[idx];
+ if (block.start()<seqStart) {
+ assert(block.end()>=seqStart);
+ if (seqEnd>block.end()) {
+ // case A: seq starts inside the block and runs past its end; the overlapping head goes into the block, the rest is inserted recursively (presumably Haplotype(seq, pos, len) is a sub-sequence - TODO confirm)
+ uint32_t overlap=block.end()-seqStart+1;
+ // block **********
+ // seq ********
+ Haplotype seqA(seq, 0, overlap);
+ splitBlock(idx, seqA, seqStart);
+ Haplotype seqB(seq, overlap, seq.size());
+ insertSeq(seqB, seqStart+overlap);
+ } else {
+ // case B: seq lies entirely inside the block
+ // block ***************
+ // seq *****
+ splitBlock(idx, seq, seqStart);
+ }
+ } else
+ {
+ // block.start() >=seqStart
+ if (block.end()>seqEnd) {
+ // block.start() >=seqStart && block.end()>seqEnd
+ // case C: seq starts at or before the block and ends inside it
+ // block ***********
+ // seq *******
+ uint32_t overlap=seqEnd-block.start()+1;
+ assert(overlap>0 && overlap<=seq.size());
+
+ Haplotype seqB(seq, seq.size()-overlap, overlap);
+
+ splitBlock(idx, seqB, block.start());
+ // cout << "seqB: " << seqB << endl;
+ // whatever of seq lies before the block becomes a block of its own
+ if (overlap<seq.size()) {
+ Haplotype seqA(seq, 0, seq.size()-overlap);
+ // cout << "seqA: " << seqA << endl;
+ newBlock(new HapBlock(seqA, seqStart));
+ }
+
+ // note that newBlock invalidates idx!
+
+ } else
+ {
+ // block.start() >=seqStart && block.end()<=seqEnd
+ // case D: the block lies entirely inside seq
+ // block ********* ***
+ // seq ******************
+ uint32_t lenA=block.start()-seqStart;
+ uint32_t lenB=block.end()-block.start()+1;
+ uint32_t lenC=seqEnd-block.end();
+ if (lenA) {
+ Haplotype seqA(seq, 0, lenA);
+ newBlock(new HapBlock(seqA, seqStart));
+ }
+ // the middle part coincides with the block and is counted there
+ assert(lenB>0);
+ Haplotype seqB(seq, lenA, lenB);
+ updateBlock(&block, seqB, seqStart+lenA);
+ // the part behind the block may meet further blocks, hence the recursion
+ if (lenC) {
+ Haplotype seqC(seq, lenA+lenB, lenC);
+ insertSeq(seqC, seqStart+lenA+lenB);
+ }
+ }
+ }
+
+ } else {
+ // no block overlaps seq: it becomes a new block
+ newBlock(new HapBlock(seq, seqStart));
+ }
+ }
+ else if (seq.type==Haplotype::In) { // insertion blocks are keyed by seqStart; a new site also receives an empty Ref haplotype
+ map<int, HapBlock *>::iterator it=insertions.find(seqStart);
+ if (it==insertions.end()) {
+ insertions[seqStart]=new HapBlock(seq, seqStart);
+ insertions[seqStart]->setType(HapBlock::INSERT);
+ insertions[seqStart]->insert(Haplotype(Haplotype::Ref));//, string(seq.size(),'0')));
+ } else
+ {
+ it->second->insert(seq);
+ }
+ }
+ else throw string("Cannot handle this case.");
+}
+
+size_t HaplotypeDistribution::getNumberOfHaplotypes(uint32_t start, uint32_t end) const
+{
+    // Product of the haplotype counts of every block and insertion site that
+    // touches [start, end]; the block scan stops at the first NULL slot.
+    size_t total=1;
+    for (size_t i=0; i<hapBlocks.size() && hapBlocks[i]!=NULL; ++i) {
+        const HapBlock & blk=*hapBlocks[i];
+        const bool outside = blk.end()<start || blk.start()>end;
+        if (!outside) total*=hapBlocks[i]->size();
+    }
+    map<int, HapBlock*>::const_iterator ins=insertions.begin();
+    for (; ins!=insertions.end(); ++ins) {
+        const int insPos=ins->first;
+        // we consider also haplotypes without the insertion
+        if (insPos>=(int) start && insPos<=(int)end) total*=(ins->second->size());
+    }
+    return total;
+}
+
+size_t HaplotypeDistribution::getNumberOfHaplotypes(uint32_t start, uint32_t end, double minFreq) const
+{
+    // Same product as the two-argument overload, except that within a block
+    // only In/Del haplotypes and haplotypes with freq above minFreq are counted.
+    size_t total=1;
+    for (size_t i=0; i<hapBlocks.size() && hapBlocks[i]!=NULL; ++i) {
+        const HapBlock & blk=*hapBlocks[i];
+        if (blk.end()<start || blk.start()>end) continue;
+        size_t kept=0;
+        map<Haplotype, int>::const_iterator hit=blk.haplotypes.begin();
+        for (; hit!=blk.haplotypes.end(); ++hit) {
+            const Haplotype & h=hit->first;
+            const bool isIndel = (h.type==Haplotype::In || h.type==Haplotype::Del);
+            if (isIndel || h.freq>minFreq) kept++;
+        }
+        total*=kept;
+    }
+    for (map<int, HapBlock*>::const_iterator ins=insertions.begin(); ins!=insertions.end(); ++ins) {
+        // we consider also haplotypes without the insertion
+        const bool inside = ins->first>=(int) start && ins->first<=(int)end;
+        if (inside) total*=(ins->second->size());
+    }
+    return total;
+}
+
+HaplotypeDistribution::~HaplotypeDistribution()
+{   // releases every stored block; deleting a NULL slot is a no-op
+    for (vector<HapBlock*>::iterator bit=hapBlocks.begin(); bit!=hapBlocks.end(); ++bit) delete *bit;
+    for (map<int,HapBlock*>::iterator iit=insertions.begin(); iit!=insertions.end(); ++iit) delete iit->second;
+}
diff --git a/HaplotypeDistribution.hpp b/HaplotypeDistribution.hpp
new file mode 100644
index 0000000..9432c0c
--- /dev/null
+++ b/HaplotypeDistribution.hpp
@@ -0,0 +1,498 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef HAPLOTYPEDISTRIBUTION_HPP_
+#define HAPLOTYPEDISTRIBUTION_HPP_
+#include <string>
+#include <assert.h>
+#include <iostream>
+#include <stdint.h>
+#include <vector>
+#include <list>
+#include <set>
+#include <string>
+#include <map>
+#include <cmath>
+#include "bam.h"
+#include "Haplotype.hpp"
+#include "HapBlock.hpp"
+#include "foreach.hpp"
+#include "VariantFile.hpp"
+using namespace std;
+
+
+// Haplotype blocks collected from a reference sequence and reads: blocks over reference positions live in hapBlocks, insertion blocks in insertions (keyed by position).
+// NOTE(review): the destructor deletes the stored HapBlock pointers, yet no copy constructor/assignment is declared - copying an instance looks unsafe; confirm it never happens.
+class HaplotypeDistribution
+{
+friend class HDIterator2;
+// methods
+public:
+ HaplotypeDistribution(uint32_t _midPos, const string & refSeq, uint32_t refSeqStart);
+ // adding data
+ void insertRead(const bam1_t *b);
+ static int fetchFuncInsertRead(const bam1_t *b, void *data); // forwards b to insertRead() of the object passed in data
+ pair<vector<Haplotype>, vector<double> > enumerateHaplotypes(double th);
+ friend ostream &operator<<(ostream &stream, const HaplotypeDistribution &hb);
+ void insertSeq(Haplotype & seq, uint32_t seqStart);
+ void check(); // throws std::string when the blocks are inconsistent
+ size_t getNumberOfHaplotypes(uint32_t start, uint32_t end) const;
+ size_t getNumberOfHaplotypes(uint32_t start, uint32_t end, double minFreq) const;
+ void setFrequencies(); // calls setFrequencies() on every block
+ ~HaplotypeDistribution();
+ set<Haplotype> getIndelsAtMidPos() const { return indelsAtMidPos; }; // returns a copy
+ vector<Variant> getIndelVariantsAtMidPos();
+protected:
+ void updateBlock(HapBlock *hb, const Haplotype & seq, uint32_t seqStart);
+ void newBlock(HapBlock *hb);
+ void deleteBlock(int idx);
+ void splitBlock(int idx, const Haplotype & seq, uint32_t seqStart);
+ int getFirstOverlappingBlock(uint32_t seqStart, uint32_t seqEnd) const;
+ // data
+ uint32_t len; // NOTE(review): never assigned in HaplotypeDistribution.cpp
+ uint32_t pos0, pos1, midPos; // pos0/pos1 are updated by newBlock(); midPos comes from the constructor
+ vector<HapBlock*> hapBlocks; // may contain NULL slots; newBlock() and check() sort them to the back
+ map<int, HapBlock*> insertions;
+ // NOTE(review): only filled from a branch of insertRead() that is currently disabled
+ set<Haplotype> indelsAtMidPos;
+
+};
+
+// Working copy of one block as used by HDIterator2: its haplotypes, span and type.
+class HDHapBlock
+{
+public:
+    HDHapBlock() : start(0), end(0), type(0) { }; // placeholder values until HDIterator2::setupBlocks() assigns the real ones
+    vector<Haplotype> haps;
+    uint32_t start, end;
+    int type;
+};
+
+class HDIterator2
+{
+public:
+ HDIterator2(const HaplotypeDistribution &hd, size_t maxHap, uint32_t pos, uint32_t left, uint32_t right, int _noIndelWindow=-1)
+     : logNumHap(0.0), _last(false), indelIdx(-1), indelOffs(-1),
+       noIndelWindow(_noIndelWindow), // noIndelWindow ignores indels around pos
+       midPos(pos),                   // variants will be added at position pos
+       hdPtr(&hd)
+ {
+     // indelIdx/indelOffs keep the value -1 when pos lies in none of the selected blocks.
+     // Select the blocks inside [left, right], prune them towards maxHap combinations, rewind.
+     setupBlocks(hd, pos, left, right);
+     setThresholds(maxHap);
+     init();
+ };
+
+ void init()
+ {
+     // Rewind to the first combination: every block back at its first
+     // haplotype, scratch sequence emptied, end marker cleared.
+     iter.assign(iter.size(), 0);
+     _last=false;
+     hap.seq.clear();
+ };
+ void operator++()
+ {
+     // Odometer step: a position that reaches its limit wraps to 0 and carries on; a wrap of the final one sets _last.
+     for (size_t pos=0; pos<iter.size(); ++pos) {
+         if (++iter[pos]!=max[pos]) return;
+         iter[pos]=0; if (pos+1==iter.size()) _last=true;
+     }
+ };
+ bool last() const { return _last; };
+ uint32_t start() const { return hapBlocks.front()->start(); };  // start of the first selected block
+ uint32_t end() const { return hapBlocks.back()->end(); };       // end of the last selected block
+ // Concatenation of each block's highest-frequency haplotype (the first one if none has freq>0).
+ Haplotype getMaxFreqHap() const
+ {
+     Haplotype maxh;
+     maxh.seq.clear();
+     maxh.freq=1.0; maxh.nfreq=1.0; // freq: product over all blocks; nfreq: only blocks with hasIndel==0
+     for (size_t x=0;x<hbs.size();x++) {
+         double mf=0.0;
+         size_t idx=0; // was left uninitialised when no haplotype of the block had freq>0
+         for (size_t y=0;y<hbs[x].haps.size();y++) if (hbs[x].haps[y].freq>mf) { idx=y; mf=hbs[x].haps[y].freq; };
+         maxh.freq*=mf;
+         if (!hasIndel[x]) maxh.nfreq*=mf;
+         if (!hbs[x].haps.empty()) maxh.append(hbs[x].haps[idx].seq); // an empty block has nothing to append
+     }
+     return maxh;
+ }
+ // Conversion to Haplotype is disabled: it always throws std::string("REIMPLEMENT"); the statements after the throw are never executed.
+ operator Haplotype()
+ {
+ throw string("REIMPLEMENT");
+ hap.seq.clear();
+ hap.freq=1.0;
+ hap.nfreq=1.0;
+ hap.type=Haplotype::Normal;
+ //hap.haps.clear();
+ for (size_t x=0;x<iter.size();x++) {
+ const Haplotype & h=hbs[x].haps[iter[x]];
+ // do not append deletions as they are codes by '#'
+ if (h.type==Haplotype::In || h.type==Haplotype::Normal) {
+ hap.append(h.seq);
+ }
+ if (h.seq.size()>0) {
+ hap.type|=h.type;
+ }
+
+ hap.freq*=h.freq;
+ if (hasIndel[x]==0) {
+ hap.nfreq*=h.freq;
+ }
+ //hap.haps.push_back(pair<string, double>(h.seq, h.freq));
+ }
+ return hap;
+ }
+ // Natural log of the number of haplotype combinations left after setThresholds().
+ double getLogNumHaps() const { return logNumHap; };
+ // Prints the selected blocks through HapBlock::showVector, passing midPos along.
+ friend ostream &operator<<(ostream &stream, const HDIterator2 & hdi)
+ {
+     // showVector takes a vector, so the list of block pointers is copied first
+     vector<HapBlock *> blocks(hdi.hapBlocks.begin(), hdi.hapBlocks.end());
+     HapBlock::showVector(stream, blocks, hdi.midPos);
+     return stream;
+ }
+
+ // Builds the candidate haplotypes for this window.
+ //
+ // Every combination of one haplotype per block is enumerated, the deletions
+ // encoded inside them ('#'+length characters) are carried out, and the
+ // candidate variants are then applied on top. The distinct results are
+ // returned in haps, ordered by Haplotype's operator<.
+ //
+ // haps:         output, cleared first.
+ // variants:     candidates; a variant is applied to a haplotype only if the
+ //               reference position getStartHap() is still present in it.
+ // print:        non-zero prints the candidate list to stdout.
+ // changeINStoN: insert a run of 'N' instead of the inserted sequence.
+ //
+ // Variants with getAddComb()==true are handled first and combinatorially:
+ // each is also applied to the haplotypes produced by earlier variants of
+ // that pass. The remaining variants are applied only to the haplotypes
+ // that exist when the second pass starts.
+ // Throws std::string("What's going on here?") when a NORMAL block haplotype
+ // without deletion code does not have the length of its block.
+ void generateHapsWithAlignedVariants(vector<Haplotype> & haps, const AlignedCandidates & variants, int print=0, bool changeINStoN=false)
+ {
+     haps.clear();
+     if (print) {
+         cout << "Variants: ";
+         BOOST_FOREACH(Variant var, variants.variants) {
+             cout << "[" << var.size() << " " << var.getSeq() << "]";
+         }
+         cout << endl;
+     }
+     vector <Haplotype> vecHap;        // haplotypes generated so far
+     // per haplotype: reference position of every base, -1 for inserted bases
+     vector <vector<int> > vecRefPos;
+
+     init();
+     while (!last()) {
+         hap.seq.clear();
+         hap.freq=1.0;
+         hap.nfreq=1.0;
+         hap.type=Haplotype::Normal;
+         vector<int> refPos;
+         for (size_t x=0;x<iter.size();x++) {
+             const Haplotype & h=hbs[x].haps[iter[x]];
+             int len = hbs[x].end-hbs[x].start+1;
+             if (hbs[x].type == HapBlock::NORMAL) {
+                 int p = hbs[x].start;
+                 bool hasDel = false;
+                 for (size_t y=0;y<h.seq.size();y++) {
+                     int c=int(h.seq[y]);
+                     if (c>=35 && c<65) { hasDel = true; }
+                     refPos.push_back(p);
+                     p++;
+                 }
+                 if (hasDel == false && int(h.seq.size())!=len) throw string("What's going on here?");
+             } else if (hbs[x].type == HapBlock::INSERT) {
+                 for (size_t y=0;y<h.seq.size();y++) {
+                     refPos.push_back(-1);
+                 }
+             }
+             hap.append(h.seq);
+             hap.freq*=h.freq;
+         }
+
+         // effectuate deletions at positions outside midPos
+         size_t y=0;
+         while (y<hap.size()) {
+             int c=int(hap[y]);
+             if (c>=35 && c<65) {
+                 int len=c-int('#');
+                 // Always consume at least the code character itself: with a
+                 // zero length code nothing was erased and y never advanced,
+                 // so the loop did not terminate.
+                 if (len<1) len=1;
+                 if (len>int(hap.size()-y)) len=hap.size()-y;
+                 hap.seq.erase(y,len);
+                 refPos.erase(refPos.begin()+y,refPos.begin()+y+len);
+             } else y++;
+         }
+         vecHap.push_back(hap);
+         vecRefPos.push_back(refPos);
+         ++(*this);
+     }
+
+     // first add variants combinatorially, then add variants to the set of combinatorially generated haplotypes
+     for (int ac = 1;ac>=0;ac--) {
+         const bool addComb = (ac==1);
+         // number of haplotypes the current variant is applied to
+         size_t numHap = vecHap.size();
+         BOOST_FOREACH(AlignedVariant var, variants.variants) {
+             if (addComb) {
+                 numHap = vecHap.size();
+             }
+             if (var.getAddComb()==addComb) {
+                 for (size_t h=0;h<numHap;h++) {
+                     // work on copies: vecHap/vecRefPos grow inside this loop
+                     vector<int> refPos = vecRefPos[h];
+                     vector<int>::iterator it = find(refPos.begin(), refPos.end(), var.getStartHap());
+                     if (it==refPos.end()) continue;
+                     Haplotype _hap=vecHap[h];
+                     bool changed=false;
+                     const size_t idx = distance(refPos.begin(), it);
+                     if (var.getType()==Variant::DEL) {
+                         // deletion; clamp it to the end of the haplotype so the
+                         // erase on refPos cannot run past the vector (the string
+                         // erase clamps by itself)
+                         size_t delLen = var.size();
+                         if (delLen>refPos.size()-idx) delLen=refPos.size()-idx;
+                         _hap.seq.erase(idx, delLen);
+                         refPos.erase(refPos.begin()+idx, refPos.begin()+idx+delLen);
+                         changed=true;
+                     } else if (var.getType()==Variant::INS) {
+                         // insertion
+                         if (changeINStoN) {
+                             _hap.seq.insert(idx, string(var.getSeq().size(), 'N'));
+                         } else {
+                             _hap.seq.insert(idx, var.getSeq());
+                         }
+                         refPos.insert(refPos.begin()+idx, (size_t) var.size(), -1);
+                         changed=true;
+                     } else if (var.getType()==Variant::SNP) {
+                         // snp; index 3 of the variant string presumably holds the new base - TODO confirm format
+                         const string & seq=var.getSeq();
+                         const char nuc=seq[3];
+                         if (_hap.seq[idx]!=nuc) {
+                             _hap.seq[idx]=nuc;
+                             changed=true;
+                         }
+                     }
+                     if (changed) {
+                         vecHap.push_back(_hap);
+                         vecRefPos.push_back(refPos);
+                     }
+                 }
+             }
+         }
+     }
+     // a set removes duplicates and fixes the order of the result
+     set <Haplotype> setHap(vecHap.begin(), vecHap.end());
+     haps.assign(setHap.begin(), setHap.end());
+ }
+
+protected:
+ // Prepares the iterator for the window [left, right] of hd.
+ //
+ // 1. Every block of hd that lies completely inside the window is selected.
+ // 2. The insertion blocks of hd starting at or after left are placed in
+ //    front of the first selected block that starts at or after them.
+ // 3. The haplotypes of the selected blocks are copied into hbs, and the
+ //    block containing pos is remembered in indelIdx/indelOffs.
+ //
+ // Throws std::string when neighbouring blocks of hd overlap, when a selected
+ // block does not start right after its predecessor, or when nothing was
+ // selected. Asserts that every selected block contains a Ref haplotype.
+ void setupBlocks(const HaplotypeDistribution &hd, uint32_t pos, uint32_t left, uint32_t right)
+ {
+     for (size_t x=0;x<hd.hapBlocks.size();x++) if (hd.hapBlocks[x]!=NULL) {
+         HapBlock *cur=hd.hapBlocks[x];
+         // x==0 has no predecessor; the consecutive test below used to read
+         // hd.hapBlocks[x-1] even then, i.e. an out-of-range element
+         HapBlock *prev=(x>0) ? hd.hapBlocks[x-1] : NULL;
+         if (prev!=NULL && prev->end()>cur->start()) {
+             cout << prev->end() << " " << cur->start() << endl;
+             cout << "HD: " << endl << hd << endl;
+             throw string("Blocks are overlapping.");
+         }
+
+         if (cur->start()>=left && cur->end()<=right) {
+             if (prev!=NULL && prev->end()+1!=cur->start()) {
+                 cout << "NOT CONSECUTIVE" << endl;
+                 cout << prev->end() << " " << cur->start() << endl;
+                 cout << "HD: " << endl << hd << endl;
+                 throw string("Blocks are not consecutive.");
+             }
+             hapBlocks.push_back(cur);
+         }
+     }
+
+     // insertions: hd.insertions is ordered by position, so the search for the
+     // next one can resume at the block found for the previous one
+     list<HapBlock*>::iterator lit=hapBlocks.begin();
+     for (map<int, HapBlock*>::const_iterator it=hd.insertions.begin();it!=hd.insertions.end();it++) {
+         if (it->second->start()>=left) {
+             for (list<HapBlock*>::iterator lit2=lit;lit2!=hapBlocks.end();lit2++) {
+                 if (int((*lit2)->start())>=it->first) {
+                     hapBlocks.insert(lit2, it->second);
+                     lit=lit2;
+                     break;
+                 }
+             }
+         }
+     }
+
+     // copy (one HDHapBlock per selected block, in list order)
+     hbs.resize(hapBlocks.size());
+     hasIndel.resize(hapBlocks.size());
+     int x=0;
+     for (lit=hapBlocks.begin();lit!=hapBlocks.end();lit++,x++) {
+         uint32_t bs=(*lit)->start();
+         uint32_t be=(*lit)->end();
+         if (pos>=bs && pos<=be) {
+             indelIdx=x;
+             indelOffs=pos-bs;
+         }
+
+         hasIndel[x]=0;
+         for (map<Haplotype,int>::const_iterator it=(*lit)->haplotypes.begin();it!=(*lit)->haplotypes.end();it++) {
+             hbs[x].haps.push_back(it->first);
+         }
+         hbs[x].start=bs;
+         hbs[x].end=be;
+         hbs[x].type=(*lit)->getType();
+         // an insertion block covers no reference position: give it an empty span
+         if (hbs[x].type==HapBlock::INSERT) hbs[x].end=hbs[x].start-1;
+
+         // the reference haplotype must be present in every block
+         bool reffound=false;
+         for (size_t y=0;y<hbs[x].haps.size();y++) {
+             if (hbs[x].haps[y].type==Haplotype::Ref) reffound=true;
+         }
+         if (!reffound) {
+             cout << **lit << endl;
+         }
+         assert(reffound==true);
+     }
+
+     if (hbs.size() == 0) {
+         throw string("Not enough blocks.");
+     }
+     // It is not treated as an error when pos lies in none of the selected
+     // blocks; indelIdx/indelOffs are simply left untouched in that case.
+ };
+
+ void setThresholds(size_t maxHap)
+ {
+     // Prunes the per-block haplotype lists until the number of combinations
+     // (the product of the list sizes) is at most max(maxHap, 1), or until
+     // nothing more can be removed. Each round removes one non-reference
+     // haplotype: the one with the lowest frequency over all blocks that still
+     // hold more than one haplotype. Sizes are compared on a log scale.
+     // Afterwards max/iter are sized and logNumHap is set.
+     // Throws std::string if a block ends up without reference haplotype.
+     //
+     // hasIndel is currently set to zero for all blocks, because HaplotypeDistribution
+     // does not includes indels at midPos
+
+     const size_t nBlocks=hbs.size();
+     // lowest[b]: smallest non-Ref frequency in block b (2.0 when the block has at most one haplotype)
+     vector<double> lowest(nBlocks,0.0);
+     // removable[b]: cleared once block b is down to a single haplotype
+     vector<int> removable(nBlocks,1);
+
+     double logNH=0.0;
+     for (size_t b=0;b<nBlocks;b++) {
+         logNH+=log(double(hbs[b].haps.size()));
+     }
+
+     const double logMinHap=0.0;
+     double logMH=log(double(maxHap));
+     if (logMH<logMinHap) logMH=logMinHap;
+
+     // keep removing haplotypes until we have the desired number of haplotypes
+     bool erased=true;
+     while (logNH>logMH && erased) {
+         erased=false;
+         for (size_t b=0;b<nBlocks;b++) {
+             const vector<Haplotype> & cand=hbs[b].haps;
+             double mf=2.0;
+             for (size_t k=0;k<cand.size();k++) {
+                 if (cand[k].type!=Haplotype::Ref && cand[k].freq<mf) mf=cand[k].freq;
+             }
+             if (cand.size()<=1) { lowest[b]=2.0; removable[b]=0; } else lowest[b]=mf;
+         }
+
+         vector<double>::iterator mel=min_element(lowest.begin(), lowest.end());
+         assert(mel!=lowest.end());
+         const size_t victim=distance(lowest.begin(),mel);
+         if (removable[victim]==0) break;
+
+         // erase the first non-Ref haplotype of that block at or below the minimum
+         vector<Haplotype> & pool=hbs[victim].haps;
+         for (vector<Haplotype>::iterator it=pool.begin();it!=pool.end();++it) {
+             if (it->type==Haplotype::Ref || !(it->freq<=*mel)) continue;
+             pool.erase(it);
+             erased=true;
+             break;
+         }
+
+         logNH=0.0;
+         for (size_t b=0;b<nBlocks;b++) {
+             logNH+=log(double(hbs[b].haps.size()));
+         }
+     }
+
+     // one counter and one limit per block for operator++
+     max.resize(nBlocks,0);
+     iter.resize(nBlocks,0);
+     for (size_t b=0;b<nBlocks;b++) max[b]=hbs[b].haps.size();
+
+     logNumHap=logNH;
+
+     // check if we still have the reference sequence in every block
+     for (size_t b=0;b<nBlocks;b++) {
+         bool reffound=false;
+         for (size_t y=0;y<hbs[b].haps.size();y++) {
+             if (hbs[b].haps[y].type==Haplotype::Ref) reffound=true;
+         }
+         if (!reffound) {
+             cout << "x: " << b << endl;
+             throw string("Cannot find reference sequence.");
+         }
+     }
+ }
+
+ double logNumHap; // log of the number of haplotype combinations, set by setThresholds()
+ bool _last; // set by operator++ when the counter of the last block wraps around
+ vector<int > iter, max; // per block: index of the current haplotype / number of haplotypes
+ vector<size_t> hasIndel; // per block; setupBlocks() sets every entry to 0
+ list<HapBlock *> hapBlocks; // selected blocks, copied as pointers from the HaplotypeDistribution
+ Haplotype hap; // scratch haplotype reused while enumerating
+ vector<HDHapBlock > hbs; // working copies of the selected blocks, same order as hapBlocks
+ int indelIdx, indelOffs, noIndelWindow; // block containing pos and offset of pos within it (setupBlocks); window from the constructor
+ uint32_t midPos; // the pos argument of the constructor
+
+ typedef list<HapBlock*>::iterator HBIt;
+ const HaplotypeDistribution *hdPtr; // the distribution passed to the constructor
+};
+
+#endif
diff --git a/Library.hpp b/Library.hpp
new file mode 100644
index 0000000..d4c9731
--- /dev/null
+++ b/Library.hpp
@@ -0,0 +1,258 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Library.hpp
+ *
+ * Created on: Aug 27, 2009
+ * Author: caa
+ */
+#ifndef LIBRARY_HPP
+#define LIBRARY_HPP
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <string>
+#include <boost/tuple/tuple.hpp>
+#include <string>
+#include "bam.h"
+#include "Utils.hpp"
+#include "StringHash.hpp"
+
+using namespace std;
+
+// Insert-size distribution of one library, built from a histogram in which
+// counts[i] is the count observed for insert size i.
+class Library
+{
+public:
+    Library() : modeInsertSize(0), maxins(0), ninetyfifth_pct_prob(0.0) { } // empty library
+    // type 0: flat distribution over insert sizes 0..1999; other types throw std::string.
+    Library(int type) : modeInsertSize(0), maxins(0), ninetyfifth_pct_prob(0.0)
+    {
+        if (type == 0) {
+            maxins = 2000;
+            vector<double> counts = vector<double>(maxins, 1.0);
+            calcProb(counts);
+        } else throw string("Library type not recognized.");
+    }
+    Library(const vector<double> & counts) : modeInsertSize(0), maxins(0), ninetyfifth_pct_prob(0.0)
+    {
+        // read library from histogram file
+        if (counts.size() == 0) cout << "HUH LIB" << endl;
+        calcProb(counts);
+    }
+    int getMaxInsertSize() const { return maxins; };
+    int getModus() const { return modeInsertSize; };
+    // Probability of insert size |x|; sizes beyond the table use its last
+    // entry. An empty library returns the floor value used by calcProb().
+    double getProb(int x) const {
+        if (probs.empty()) return 1e-10;
+        if (x<0) x = -x;
+        if (x>=maxins) x = maxins-1;
+        return probs[x];
+    }
+    double getNinetyFifthPctProb() const { return ninetyfifth_pct_prob; };
+    void print() const
+    {
+        for (size_t x=0;x<probs.size();x++) {
+            cout << " " << probs[x];
+            if (log(probs[x])>0) cout << "LOG>0: " << x << endl;
+        }
+        cout << endl;
+    }
+protected:
+    int modeInsertSize;
+    // Turns the histogram into probabilities.
+    //  - The table is cut at 25 times the insert size with the highest count
+    //    (the last such size on ties) and never exceeds counts.size().
+    //  - Probabilities are floored at 1e-10.
+    //  - ninetyfifth_pct_prob is the entry at which the probabilities, added
+    //    from the most to the least likely, first sum to more than 0.95.
+    void calcProb(const vector<double> & counts)
+    {
+        int max_isize = 2000;
+        // a double: as an int this truncated fractional counts and picked the wrong maximum
+        double max_count = -1.0;
+        for (int s=0;s<int(counts.size());s++) {
+            if (counts[s]>=max_count) {
+                max_count = counts[s];
+                max_isize = s;
+            }
+        }
+        maxins = 25*max_isize;
+        if (maxins>int(counts.size())) { maxins=int(counts.size()); }
+        probs=vector<double>(maxins, 0.0);
+        double z=0.0, max=-1.0;
+        modeInsertSize=0;
+        for (int d=0;d<maxins;d++) {
+            probs[d] = counts[d];
+            if (probs[d]>max) {
+                max=probs[d];
+                modeInsertSize=d;
+            }
+            z+=probs[d];
+        }
+        for (int d=0;d<maxins;d++) {
+            if (z>0.0) probs[d] /=z; // all-zero histogram: no 0/0, the floor below applies
+            if (probs[d]<1e-10) probs[d]=1e-10;
+        }
+
+        sortProbs = probs;
+        std::sort(sortProbs.begin(), sortProbs.end());
+        // fallback for tables whose sum never exceeds 0.95 (the value used to stay unset)
+        ninetyfifth_pct_prob = sortProbs.empty() ? 0.0 : sortProbs.front();
+        double sum = 0.0;
+        for (int x=int(sortProbs.size())-1;x>=0;x--) {
+            sum+=sortProbs[x];
+            if (sum>0.95) {
+                ninetyfifth_pct_prob = sortProbs[x];
+                break;
+            }
+        }
+        cout << "max: " << max/z << " ninetyfifth_pct_prob: " << ninetyfifth_pct_prob << endl;
+    }
+    vector<double> probs, sortProbs;
+    int maxins;
+    double ninetyfifth_pct_prob;
+};
+
+// Insert-size libraries keyed by name. A flat library called "single_end"
+// (Library(0)) is always present.
+class LibraryCollection : public string_hash<Library>
+{
+public:
+    LibraryCollection()
+    {
+        (*this)["single_end"]=Library(0);
+    }
+
+    // Loads insert-size histograms from a text file laid out as
+    //
+    //     #LIB <name>
+    //     0 <count>
+    //     1 <count>
+    //     ...
+    //
+    // with one such section per library. Within a section the insert sizes
+    // must start at 0 and increase by one per line, and counts must not be
+    // negative. Reading stops at the first empty line or at end of file.
+    // Data lines that precede the first header are discarded.
+    //
+    // Throws std::string when the file cannot be opened, a header lacks a
+    // name, a name is already in the collection, or a data line breaks the
+    // rules above.
+    void addFromFile(const string & fileName)
+    {
+        ifstream fin;
+        fin.open(fileName.c_str());
+        if (!fin.is_open()) throw string("Cannot open library file ").append(fileName);
+
+        int numLibs = 0;
+        int numLines = 0;
+        vector<double> counts;   // histogram of the section being read
+        string libName;          // name of that section; empty before the first header
+        int prev=-1;             // last insert size seen in that section
+        string line;
+        while (getline(fin, line)) {
+            numLines++;
+            if (line.empty()) {
+                break;
+            }
+            istringstream is(line);
+            string isize_str, count_str;
+            int isize=-1;
+            double count=-1;
+            is >> isize_str;
+            if (isize_str == "#LIB") {
+                // a header ends the section collected so far
+                if (counts.size()>0 && !libName.empty()) {
+                    cout << "Storing library: " << libName << endl;
+                    storeLibrary(libName, counts);
+                    cout << "Number of inserts: " << counts.size() << endl;
+                    numLibs++;
+                }
+                // always start the new section from scratch, so counts read
+                // before the first header cannot leak into it
+                counts.clear();
+                prev=-1;
+
+                string label;
+                is >> label;
+                libName = label;
+                if (label.empty()) {
+                    cerr << "Cannot read library name in line " << numLines << " of " << fileName << endl;
+                    throw string("Cannot read library name ");
+                }
+                continue;
+            }
+
+            if (!from_string<int>(isize, isize_str, std::dec)) { cerr << "Error reading from library file" << endl; }
+            is >> count_str; if (!from_string<double>(count, count_str, std::dec)) { cerr << "Error reading from library file" << endl; }
+
+            if (isize!=prev+1) {
+                cout << "isize: " << isize << " prev: " << prev << endl;
+                cerr << "Library insert sizes must be consecutive" << endl;
+                throw string("Library error.");
+            }
+            if (count<0) {
+                cerr << "Library insert size count is negative.." << endl;
+                throw string("Library error.");
+            }
+
+            counts.push_back(count);
+            prev=isize;
+        }
+
+        // store last library; without a header or without counts there is
+        // nothing to store (an unnamed or empty library used to be added here,
+        // which also kept the warning below from ever being printed)
+        if (counts.size()>0 && !libName.empty()) {
+            storeLibrary(libName, counts);
+            numLibs++;
+            cout << "Library " << libName << " loaded with " << counts.size() << " insert sizes." << endl;
+        }
+
+        if (numLibs==0) {
+            cerr << "Could not find any libraries. Are the headers specified correctly?" << endl;
+        }
+
+        fin.close();
+    }
+
+    double getMaxInsertSize() const
+    {
+        double max = -HUGE_VAL;
+        for (LibraryCollection::const_iterator it = this->begin(); it!= this->end(); it++) {
+            if (it->second.getMaxInsertSize()>max) max = it->second.getMaxInsertSize();
+        }
+        return max;
+    }
+    //const bam_header_t *getBamHeader() const { return bamHeader; }
+protected:
+    //const bam_header_t *bamHeader;
+    // Adds counts under name; throws std::string if the name is taken.
+    void storeLibrary(const string & name, const vector<double> & counts)
+    {
+        if (this->find(name) != this->end()) {
+            cerr << "Error: libName: " << name << endl;
+            cerr << "Duplicate library IDs." << endl;
+            throw string("Library error");
+        }
+        (*this)[name]=Library(counts);
+    }
+};
+
+#endif
diff --git a/MLAlignment.hpp b/MLAlignment.hpp
new file mode 100644
index 0000000..6618331
--- /dev/null
+++ b/MLAlignment.hpp
@@ -0,0 +1,78 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * MLAlignment.hpp
+ *
+ * Created on: Apr 2, 2009
+ * Author: caa
+ */
+
+#ifndef MLALIGNMENT_HPP_
+#define MLALIGNMENT_HPP_
+#include <vector>
+#include <string>
+#include <map>
+#include "Variant.hpp"
+/**
+ * Result of aligning one read against one candidate haplotype.
+ *
+ * The object is a plain value type: it is returned and copied by value by the
+ * observation models, so every scalar member is given a defined value by the
+ * constructor (copying an indeterminate value is undefined behaviour).
+ * Converting an MLAlignment to double yields its log-likelihood (ll).
+ */
+class MLAlignment
+{
+public:
+    // Negative marker codes.
+    // NOTE(review): presumably stored in hpos for read bases that are inserted,
+    // deleted, or left/right of the haplotype -- confirm against the callers.
+    static const int INS=-1;
+    static const int DEL=-2;
+    static const int LO=-3;
+    static const int RO=-4;
+
+    /// Creates an empty alignment: positions are -1, likelihoods and counters are 0.
+    MLAlignment()
+        : relPos(-1),
+          firstBase(-1), lastBase(-1),
+          ll(0.0), llOn(0.0), llOff(0.0),
+          offHap(false), offHapHMQ(false),
+          hl(-1), hr(-1),
+          numIndels(0), numMismatch(0),
+          nBQT(0), nmmBQT(0),
+          mLogBQ(0.0),
+          nMMLeft(0), nMMRight(0)
+    {
+    }
+
+    int relPos; // relative position of read wrt haplotype
+
+    int firstBase, lastBase; // first and last base of haplotype covered by the read
+    std::map<int, AlignedVariant> indels, snps;
+
+    std::map<int, bool> hapIndelCovered, hapSNPCovered; // indels and snps in the _haplotype_ covered by the read
+
+    double ll; // loglikelihood
+    double llOn, llOff; // without priors/mapping qualities taken into account
+    // read is placed outside the haplotype window, using the read's own mapping
+    // quality and an artificially high mapping quality respectively
+    bool offHap, offHapHMQ;
+    int hl, hr; // left and rightmost base on haplotype covered by the read
+    int numIndels, numMismatch;
+    int nBQT, nmmBQT; // number of aligned bases and number mismatching above threshold
+    double mLogBQ; // mean log BaseQuality
+
+    int nMMLeft, nMMRight;
+
+    std::string align;
+    std::vector<int> hpos;
+
+    /// Implicit conversion to the log-likelihood, so alignments can be compared as numbers.
+    operator double() const { return ll; }
+
+    /// Writes relPos, offHap (as 0/1) and ll on one line to standard output.
+    void print() const
+    {
+        std::cout << "relPos: " << relPos << " offHap: " << (int) offHap << " ll: " << ll << std::endl;
+    }
+};
+
+#endif /* MLALIGNMENT_HPP_ */
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f9237c4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+# Build rules for the dindel executable.
+#
+# Directory searched for the samtools headers (-I) and for libbam (-L).
+# NOTE(review): site-specific absolute path; override on the command line,
+# e.g. `make SAMTOOLDIR=/path/to/samtools/`.
+SAMTOOLDIR=/nfs/users/nfs_c/caa/source/samtools/
+# Directory added to the include path for the SeqAn headers.
+SEQANDIR=seqan_library/
+
+# -DNDEBUG compiles out every assert() in the sources.
+CPPFLAGS= -DNDEBUG -D_IOLIB=2 -DMINREADS=2 -DDINDEL
+CXXFLAGS= -I$(SAMTOOLDIR) -I$(SEQANDIR) -I./ -Wno-deprecated -O3
+# Fully static link against libbam, zlib and boost_program_options.
+LDFLAGS= -L$(SAMTOOLDIR) -lbam -lz -lboost_program_options -static
+
+SRCSDINDEL=DInDel.cpp HapBlock.cpp HaplotypeDistribution.cpp ObservationModelFB.cpp GetCandidates.cpp Faster.cpp
+# One object file per source; objects are built by make's implicit %.o: %.cpp rule.
+OBJSDINDEL=$(SRCSDINDEL:%.cpp=%.o)
+
+# Link step. The headers are prerequisites of the link target only, so editing
+# a header relinks the binary but does not recompile the objects that include it.
+# NOTE(review): DINDELFLAGS is not defined in this Makefile; it expands to
+# nothing unless supplied by the environment or the command line.
+dindel:$(OBJSDINDEL) Read.hpp DInDel.hpp HapBlock.hpp Haplotype.hpp HaplotypeDistribution.hpp MyBam.hpp GetCandidates.hpp Variant.hpp Fasta.hpp OutputData.hpp MLAlignment.hpp ObservationModelSeqAn.hpp VariantFile.hpp ReadIndelErrorModel.hpp Library.hpp Faster.hpp
+ $(CXX) -o $@ $(CXXFLAGS) $(DINDELFLAGS) $(OBJSDINDEL) $(LDFLAGS)
+
+# Removes the object files only; the dindel binary itself is left in place.
+# NOTE(review): OBJSCOMPAREVARIANTS and OBJSMAKEGLF are not defined in this
+# Makefile and expand to nothing.
+clean:
+ rm -f $(OBJSDINDEL) $(OBJSCOMPAREVARIANTS) $(OBJSMAKEGLF)
diff --git a/MyBam.hpp b/MyBam.hpp
new file mode 100644
index 0000000..e15ad2f
--- /dev/null
+++ b/MyBam.hpp
@@ -0,0 +1,98 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * MyBam.hpp
+ *
+ * Created on: Aug 27, 2009
+ * Author: caa
+ */
+#ifndef MYBAM_HPP
+#define MYBAM_HPP
+#include <stdlib.h>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <map>
+#include <boost/tuple/tuple.hpp>
+#include "faidx.h"
+#include "bam.h"
+using namespace std;
+
+/**
+ * Owning wrapper around a BAM file opened through the samtools C API.
+ *
+ * Holds the file handle (bf), the header (bh) and the index (idx) and releases
+ * all three in destroy() / the destructor. Copying never shares handles: both
+ * the copy constructor and the copy assignment operator re-open the file named
+ * by the source object, so each MyBam owns its own handles.
+ *
+ * All failures are reported by throwing std::string.
+ */
+class MyBam
+{
+public:
+    /// Creates an object that owns nothing; the handles must not be used.
+    MyBam() { initialized=false; bf=0; bh=0; idx=0; }
+
+    /// Opens bamFile, reads its header and loads its index.
+    /// @throws string if the file cannot be opened or its header cannot be read.
+    MyBam(const string &bamFile)
+    {
+        // load bam file
+        initialized=false;
+        bf=0; bh=0; idx=0;
+        init(bamFile);
+    }
+
+    /// Copy by re-opening the source's file (handles are never shared).
+    /// @throws string if the file cannot be opened or its header cannot be read.
+    MyBam(const MyBam & myBam)
+    {
+        initialized=false;
+        bf=0; bh=0; idx=0;
+        init(myBam.fileName);
+    }
+
+    /// Copy assignment, consistent with the copy constructor: releases the
+    /// handles currently held and re-opens the source's file. Without this
+    /// operator the compiler-generated one would copy the raw handles and both
+    /// objects would close/destroy them.
+    /// @throws string if the file cannot be opened or its header cannot be read;
+    ///         this object is then left holding nothing.
+    MyBam & operator=(const MyBam & myBam)
+    {
+        if (this!=&myBam) init(myBam.fileName);
+        return *this;
+    }
+
+    /// Maps a reference sequence name from the BAM header to its target id.
+    /// @throws string("Cannot find ID!") if the name is not in the header.
+    int getTID(const string & str) const
+    {
+        map<string, int>::const_iterator it=strToTID.find(str);
+        if (it==strToTID.end()) {
+            throw string("Cannot find ID!");
+        } else return it->second;
+    }
+
+    /// Releases file, header and index if they are held; safe to call repeatedly.
+    void destroy()
+    {
+        if (initialized) {
+            bam_close(bf);
+            bam_header_destroy(bh);
+            // bam_index_load() may have failed, in which case there is no index to free
+            if (idx) bam_index_destroy(idx);
+        }
+        // never leave dangling handles behind
+        bf=0;
+        bh=0;
+        idx=0;
+        initialized=false;
+    }
+
+    ~MyBam()
+    {
+        destroy();
+    }
+
+    bamFile bf;
+    bam_header_t *bh;
+    bam_index_t *idx; // NOTE(review): may be null when no index could be loaded -- callers should check
+    string fileName;
+ private:
+    /// (Re)opens _fileName: releases anything held, then opens the file, reads
+    /// the header, rebuilds the name -> target-id map and loads the index.
+    void init(const string & _fileName)
+    {
+        destroy();
+        strToTID.clear(); // drop entries from a previously opened file
+        fileName=_fileName;
+        bf=bam_open(fileName.c_str(),"r");
+        if (!bf) throw string("Cannot open BAM file.");
+        bh=bam_header_read(bf);
+        if (!bh) {
+            // not a valid BAM file: close the handle before reporting, nothing else is held yet
+            bam_close(bf);
+            bf=0;
+            throw string("Cannot read BAM header.");
+        }
+        for (int nt=0;nt<bh->n_targets;nt++) {
+            strToTID[string(bh->target_name[nt])]=nt;
+        }
+
+        idx=bam_index_load(fileName.c_str());
+        initialized=true;
+    }
+    bool initialized;
+    map<string, int> strToTID;
+};
+
+
+#endif
diff --git a/ObservationModel.hpp b/ObservationModel.hpp
new file mode 100644
index 0000000..54e4c1d
--- /dev/null
+++ b/ObservationModel.hpp
@@ -0,0 +1,103 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * ObservationModel.hpp
+ *
+ * Created on: Apr 2, 2009
+ * Author: caa
+ */
+
+#ifndef OBSERVATIONMODEL_HPP_
+#define OBSERVATIONMODEL_HPP_
+#include <iostream>
+#include <string>
+
+using namespace std;
+/**
+ * Tunable parameters of the read/haplotype observation models.
+ *
+ * Every field is public and is given its default by setDefaultValues(), which
+ * both constructors call. Only the model type is chosen at construction.
+ */
+class ObservationModelParameters
+{
+public:
+    /// Default parameters with the "probabilistic" model.
+    ObservationModelParameters() {
+        modelType="probabilistic";
+        setDefaultValues();
+    }
+
+    /// Default parameters with the given model type.
+    /// @throws string("Model not supported.") unless modelType is
+    ///         "threshold" or "probabilistic".
+    ObservationModelParameters(const string & modelType) {
+        if (modelType!="threshold" && modelType!="probabilistic") throw string("Model not supported.");
+        this->modelType=modelType;
+        setDefaultValues();
+    }
+
+    /// Resets every parameter except modelType to its built-in default.
+    void setDefaultValues()
+    {
+        // probabilities and quality thresholds
+        pError=1e-4;
+        pMut=1e-4;
+        pFirstgLO=0.01;
+        scaleErr=0.95;
+        baseQualThreshold=0.995;
+        checkBaseQualThreshold = 0.95;
+        fixedBaseQual=0.99;
+        mapQualThreshold=100.0;
+        capMapQualFast=40.0;
+
+        // indel model
+        indelDist="exponential";
+        numIndels=1;
+        maxLengthIndel=10;
+        maxLengthDel=maxLengthIndel;
+
+        // alignment / search limits
+        numE=3;
+        minOverlap=0;
+        padCover=5;
+        maxMismatch=1;
+        maxTryHash=5;
+
+        // -1 means "not forced"; the models then choose bMid themselves
+        bMid=-1;
+        forceReadOnHaplotype=false;
+        mapUnmappedReads = false;
+    }
+
+    /// Writes the parameters to standard output, one tab-indented line per group.
+    void print()
+    {
+        cout << "\tmodelType: " << modelType << endl;
+        cout << "\tmaxLengthIndel: " << maxLengthIndel
+             << " pError: " << pError << endl;
+        cout << "\tbaseQualThreshold: " << baseQualThreshold
+             << " fixedBaseQual: " << fixedBaseQual << endl;
+        cout << "\tmapQualThreshold: " << mapQualThreshold << endl;
+        cout << "\tcapMapQualFast: " << capMapQualFast << endl;
+        cout << "\tminOverlap: " << minOverlap << endl;
+        cout << "\tscaleError: " << scaleErr << endl;
+        cout << "\tnumE: " << numE << endl;
+        cout << "\tpMut: " << pMut << endl;
+        cout << "\tnumIndels: " << numIndels << endl;
+        cout << "\tindelDistribution: " << indelDist << endl;
+        cout << "\tmaxLengthDel: " << maxLengthDel
+             << " pError: " << pError << endl;
+        cout << "\tpFirstgLO: " << pFirstgLO << endl;
+        cout << "\tpadCover: " << padCover << endl;
+        cout << "\tmaxMismatch: " << maxMismatch << endl;
+        cout << "\tmaxTryHash: " << maxTryHash << endl;
+        cout << "\tcheckBaseQualThreshold: " << checkBaseQualThreshold << endl;
+        cout << "\tmapUnmappedReads: " << mapUnmappedReads << endl;
+    }
+
+    double pError, baseQualThreshold, fixedBaseQual, mapQualThreshold, capMapQualFast, scaleErr, pMut;
+    int maxLengthIndel, numE, minOverlap, numIndels, bMid;
+    double checkBaseQualThreshold;
+
+    string modelType, indelDist;
+    int maxLengthDel,maxTryHash;
+    bool forceReadOnHaplotype, mapUnmappedReads;
+    double pFirstgLO;
+    int padCover, maxMismatch;
+};
+
+
+
+#endif /* OBSERVATIONMODEL_HPP_ */
diff --git a/ObservationModelFB.cpp b/ObservationModelFB.cpp
new file mode 100644
index 0000000..189b3c3
--- /dev/null
+++ b/ObservationModelFB.cpp
@@ -0,0 +1,1829 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <vector>
+#include <cmath>
+#include <set>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include "ObservationModelFB.hpp"
+#include "Haplotype.hpp"
+#include "Read.hpp"
+#include "ReadIndelErrorModel.hpp"
+using namespace std;
+//#define DEBUGHMM
+
+/**
+ * Creates the observation model for one read against one haplotype.
+ *
+ * Copies the read and the parameters into the object and delegates all further
+ * set-up (state flags, choice of bMid) to Init().
+ *
+ * @param _hap     candidate haplotype; Init() stores _hap.filtered()
+ * @param r        read to be aligned
+ * @param hapStart start coordinate of the haplotype, compared against the read's position in Init()
+ * @param _params  model parameters (stored as a member; Init() may modify the stored copy)
+ * @throws string  whatever Init() throws
+ */
+ObservationModelFB::ObservationModelFB(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & _params) : read(r), params(_params)
+{
+
+ Init(_hap, hapStart);
+
+}
+
+/**
+ * Shared initialisation: stores the filtered haplotype, clears every
+ * "already computed" flag and chooses bMid, the read base around which the
+ * forward/backward passes are organised.
+ *
+ * bMid is
+ *  - the middle base of the read when the read is unmapped or does not overlap
+ *    the haplotype at all (in the latter case params.baseQualThreshold is also
+ *    set to 0.0),
+ *  - otherwise the read base lying at the centre of the read/haplotype overlap,
+ *  - overridden by params.bMid when that is not -1,
+ * and is finally clamped into [0, read.size()-1]; a clamp prints "BMIDERROR".
+ *
+ * @param _hap     haplotype whose filtered() form becomes the model haplotype
+ * @param hapStart start coordinate of the haplotype
+ * @throws string("hapSize error.") if params.maxLengthDel exceeds the haplotype length
+ */
+void ObservationModelFB::Init(const Haplotype & _hap, uint32_t hapStart)
+{
+    hap=_hap.filtered();
+
+    // nothing has been allocated or computed for this haplotype yet
+    memAllocated=false;
+    HMMInitialized=false;
+    HMMConsistent=false;
+    obsInitialized=false;
+    forwardDone=false;
+    backwardDone=false;
+    likelihoodComputed=false;
+    makeObsVector=false;
+    marginalsComputed=false;
+
+    if ((int) hap.size()<params.maxLengthDel) throw string("hapSize error.");
+
+    const uint32_t hapEnd=hapStart+hap.size();
+    const uint32_t mReadStart=uint32_t(read.posStat.first);
+    const uint32_t readEnd=mReadStart+uint32_t(read.size())-1;
+
+    if (read.isUnmapped()) {
+        bMid = int ( read.size()/2 );
+    } else if (mReadStart>hapEnd || readEnd<hapStart) {
+        // read lies entirely right or left of the haplotype window
+        bMid=int(read.size()/2);
+        params.baseQualThreshold=0.0;
+    } else {
+        // centre of the overlap between read and haplotype, as an index into the read
+        const uint32_t olStart=(mReadStart<hapStart)?hapStart:mReadStart;
+        const uint32_t olEnd=(readEnd<hapEnd)?readEnd:hapEnd;
+        const int mid=(int(olEnd)-int(olStart))/2+int(olStart);
+        bMid=mid-int(mReadStart);
+    }
+
+    // an explicitly configured bMid wins over the computed one
+    if (params.bMid!=-1) bMid=params.bMid;
+
+    if (bMid<0) {
+        cout << "BMIDERROR" << endl;
+        bMid=0;
+    }
+    if (bMid>=int(read.size())) {
+        cout << "BMIDERROR" << endl;
+        bMid=int(read.size())-1;
+    }
+
+    this->hapStart=hapStart;
+}
+
+
+/**
+ * Replaces the model haplotype by newHap.filtered() without re-allocating.
+ *
+ * The new (filtered) haplotype must be exactly as long as the current one.
+ * Observation potentials, messages and the likelihood are marked stale so they
+ * are recomputed on the next request.
+ *
+ * @throws string if the lengths differ (both haplotypes are printed to stdout first)
+ */
+void ObservationModelFB::changeHaplotype(const Haplotype & newHap)
+{
+    const bool sameLength=(hap.size()==newHap.filtered().size());
+    if (!sameLength) {
+        cout << "hap: " << hap << " newHap: " << newHap << endl;
+        throw string("New haplotype must have same length as old haplotype.");
+    }
+
+    hap=newHap.filtered();
+
+    // everything derived from the previous haplotype sequence is now invalid
+    likelihoodComputed=false;
+    backwardDone=false;
+    forwardDone=false;
+    HMMConsistent=false;
+    obsInitialized=false;
+}
+
+/**
+ * Intended to compute the read likelihood from the messages meeting at bMid.
+ *
+ * NOTE: the function is deliberately disabled -- its first statement throws
+ * unconditionally, so none of the code below it ever runs. The remaining body
+ * is kept as reference. As written it would sum exp(alpha+obs+beta) over all
+ * 2*numS states of slice bMid into logLikelihood, the same sum with prior[x]
+ * subtracted into logLikelihoodNoPrior, and split the prior-free sum into
+ * likOffHap[0] (states whose index modulo numS is 0) and likOffHap[1] (states
+ * whose index modulo numS is neither 0 nor ROState), then store the logs in ml.
+ *
+ * @throws string always
+ */
+void ObservationModelFB::calcLikelihoodFromLastSlice()
+{
+ throw string("CHANGE ME! PRIOR NOT CALCULATED IN RIGHT PLACE");
+ // ---- unreachable from here on ----
+ if (likelihoodComputed) return;
+ double *alpha_l=alpha[bMid];
+ double *beta_l=beta[bMid];
+ double *obs_l=obs[bMid];
+ logLikelihood=0.0;
+ logLikelihoodNoPrior=0.0;
+ likOffHap.resize(2);
+ likOffHap[0]=0.0;
+ likOffHap[1]=0.0;
+
+ // y always equals x; max/maxidx track the most probable single state
+ int y=0;
+ double max=0.0;
+ int maxidx=0;
+ for (int x=0;x<2*numS;x++, y++) {
+
+ double v=alpha_l[y]+obs_l[y]+beta_l[y];
+ double mar=exp(v);
+ if (mar>max) {
+ max=mar;
+ maxidx=x;
+ }
+ // accumulated in the probability domain; converted to log after the loop
+ logLikelihood+=mar;
+ v=alpha_l[y]+obs_l[y]+beta_l[y]-prior[x];
+ double marnp=exp(v);
+ logLikelihoodNoPrior+=marnp;
+ if ((x%numS)==0) likOffHap[0]+=marnp; else if ((x%numS)!=ROState) likOffHap[1]+=marnp;
+ }
+ logLikelihood=log(logLikelihood);
+ logLikelihoodNoPrior=log(logLikelihoodNoPrior);
+ likOffHap[0]=log(likOffHap[0]);
+ likOffHap[1]=log(likOffHap[1]);
+
+ ml.ll=logLikelihood;
+ ml.llOff=likOffHap[0];
+ ml.llOn=likOffHap[1];
+ // most probable state is left or right of the haplotype
+ if ((maxidx%numS)==0 || (maxidx%numS)==ROState) {
+ ml.offHapHMQ=true;
+ ml.offHap=true;
+ }
+#ifdef DEBUGHMM
+ //cout << "calcLikelihoodFromLastSlice(): " << logLikelihood << endl;
+ //cout << "here: " << scientific << setprecision(10) << log(likOffHap[0]*exp(prior[0])+likOffHap[1]*exp(prior[1])) << " " << logLikelihood << endl;
+#endif
+ likelihoodComputed=true;
+}
+
+/**
+ * Runs the full likelihood pipeline and returns the resulting alignment
+ * summary (ml): HMM set-up, observation potentials, the messages towards
+ * bMid, then the likelihood at bMid. Each step is a no-op if already done.
+ *
+ * NOTE(review): calcLikelihoodFromLastSlice() as defined in this file throws
+ * unconditionally, so unless that call is dispatched to an override (the
+ * class declaration is not visible here) this function always throws.
+ *
+ * @return a copy of ml
+ * @throws string propagated from the steps it calls
+ */
+MLAlignment ObservationModelFB::calcLikelihood()
+{
+
+ initHMM();
+ setupReadObservationPotentials();
+ computeForwardMessages();
+ calcLikelihoodFromLastSlice();
+
+ return ml;
+}
+
+
+/**
+ * Precomputes the log transition probabilities used by the message passing.
+ *
+ * logPTrans[y], y = 1..numT-1, is the log probability of advancing y haplotype
+ * positions between consecutive read bases: y = 1 (no skipped base) gets
+ * 1 - pError, and the larger steps share pError in proportion to
+ * exp(-(y-1)). Entry 0 is not assigned here. Also sets the log probabilities
+ * for staying in / leaving the left-of-haplotype state and for entering /
+ * continuing / leaving an insertion.
+ */
+void ObservationModelFB::setupTransitionProbs()
+{
+    // left-of-haplotype state: stay off, or step onto the first haplotype base
+    logpLOgLO=log(1.0-params.pFirstgLO);
+    logpFirstgLO=log(params.pFirstgLO);
+
+    numT=params.maxLengthDel+2;
+    logPTrans.resize(numT);
+
+    // step of exactly one base: the normal, error-free extension
+    logPTrans[1]=log(1.0-params.pError);
+
+    // longer steps: unnormalised weight exp(-(step-1)) ...
+    double norm=0.0;
+    for (int step=2;step<numT;step++) {
+        const double logWeight=-fabs(1.0-double(step));
+        logPTrans[step]=logWeight;
+        norm+=exp(logWeight);
+    }
+    // ... rescaled so that together they sum to pError
+    norm=log(norm/params.pError);
+    for (int step=2;step<numT;step++) logPTrans[step]-=norm;
+
+    // check norm
+    norm=0.0;
+    for (int step=1;step<numT;step++) norm+=exp(logPTrans[step]);
+    assert(fabs(norm-1.0)<1e-15);
+
+    // insertion sub-model
+    logpInsgIns=-1.0;
+    logpNoInsgIns=log(1.0-exp(logpInsgIns));
+    logpInsgNoIns=log(params.pError);
+    logpNoInsgNoIns=log(1-params.pError);
+}
+
+
+/**
+ * Fills obs[b][state] with the log observation potential of read base b in
+ * every state. No-op if the potentials are already initialised.
+ *
+ * Each row of obs has 2*numS entries: the first numS are the "no insertion"
+ * states, the next numS the "insertion" states. Within each half, index 0 is
+ * left of the haplotype, 1..hapSize are haplotype positions and hapSize+1 is
+ * right of the haplotype. With pr = base quality * (1 - pMut):
+ *  - a match (or haplotype 'N'), any off-haplotype state and any insertion
+ *    state score log(.25 + .75*pr);
+ *  - a mismatch in a no-insertion state scores log(.75 + 1e-10 - .75*pr).
+ *
+ * @throws string("Unsupported observation model.") unless params.modelType is "probabilistic"
+ */
+void ObservationModelFB::setupReadObservationPotentials()
+{
+    if (obsInitialized) return;
+    if (params.modelType!="probabilistic") throw string("Unsupported observation model.");
+
+    for (int base=0;base<readSize;base++) {
+        const double rq=read.qual[base];
+        const char nuc=read.seq[base];
+        const double pr=rq*(1.0-params.pMut);
+        const double eq=log(.25+.75*pr);
+        const double uq=log(.75+1e-10-.75*pr);
+
+        double *noIns=obs[base];
+        double *ins=noIns+numS;
+
+        // left and right of the haplotype
+        ins[0]=eq;
+        ins[hapSize+1]=eq;
+        noIns[0]=eq;
+        noIns[hapSize+1]=eq;
+
+        for (int state=1;state<=hapSize;state++) {
+            // given an insertion in the read assume match to prevent favoring
+            // of the insertion based on low base qualities
+            ins[state]=eq;
+            noIns[state]=(hap[state-1] == 'N' || hap[state-1]==nuc) ? eq : uq;
+        }
+    }
+
+    if (params.forceReadOnHaplotype) forceOnHap();
+
+    obsInitialized=true;
+}
+
+/**
+ * Builds the log prior over the 2*numS states of read base bMid.
+ *
+ * @param _prior  output; replaced by a vector of 2*numS log priors
+ *                (first numS: no-insertion states, next numS: insertion states)
+ * @param mapQual used as the probability that the read's mapping is correct:
+ *                1 - mapQual is the prior of being left of the haplotype
+ *
+ * The off-haplotype probability 1 - mapQual is floored so that its Phred value
+ * -10*log10(.) never exceeds params.mapQualThreshold. The right-of-haplotype
+ * state gets a fixed log prior of -100. When params.mapUnmappedReads is set
+ * and the read is paired with a mapped mate on the same target, each position
+ * additionally receives the log probability of the implied insert size from
+ * the read's library; otherwise that term is 0.
+ */
+void ObservationModelFB::computeBMidPrior(vector<double> & _prior, double mapQual)
+{
+ double pOffFirst;
+ double mq=1.0-mapQual;
+ // cap the confidence: never let the off-haplotype probability drop below the threshold's equivalent
+ if (-10.0*log10(mq)>params.mapQualThreshold) {
+ mq=pow(10.0,-params.mapQualThreshold/10.0);
+ }
+ pOffFirst=mq;
+
+ _prior=vector<double>(2*numS,0.0);
+ // per-position log insert-size term; stays 0 (no effect) unless filled in below
+ vector<double> pinsert = vector<double>(numS,0.0);
+ if (params.mapUnmappedReads && read.isPaired()) {
+ //cout << "read: " << bam1_qname(read.getBam()) << " read.pos: " << read.pos << " read.matePos: " << read.matePos << " read.mateLen: " << read.mateLen << " read.isUnmapped: " << read.isUnmapped() << " mateUnmapped: " << read.mateIsUnmapped() << " bmid: " << bMid << " hapStart: " << hapStart << " read.tid: " << read.getBam()->core.tid << " read.mtid: " << read.getBam()->core.mtid << endl;
+ //
+
+ // only usable when the mate is mapped, its length is known and both mates are on the same target
+ if (!read.mateIsUnmapped() && read.mateLen != -1 && read.getBam()->core.tid == read.getBam()->core.mtid) {
+ // NOTE(review): the type of the hapStart member is not visible here; if it is
+ // unsigned, the argument of abs() is evaluated in unsigned arithmetic -- confirm.
+ if (read.mateIsReverse()) {
+ for ( int x=1;x<hapSize+1;x++) pinsert[x] = log(read.getLibrary().getProb(abs(hapStart+x-bMid-int(read.matePos+read.mateLen))));
+ } else {
+ for ( int x=1;x<hapSize+1;x++) pinsert[x] = log(read.getLibrary().getProb(abs(hapStart+x+readSize-bMid-int(read.matePos))));
+ }
+ // off-haplotype placement uses the library's fixed 95th-percentile probability
+ pinsert[0] = log(read.getLibrary().getNinetyFifthPctProb());
+ // for (int x=0;x<hapSize+1;x++) cout << " " << pinsert[x];
+ // cout << endl;
+ }
+
+ }
+
+ // i==0: no-insertion half, i==1: insertion half
+ for (size_t i=0;i<2;i++) {
+ double logpIns=(i==1)?(logpInsgNoIns):log(1.0-exp(logpInsgNoIns));
+ _prior[i*numS+0]=log(pOffFirst)+logpIns+pinsert[0];
+ _prior[i*numS+ROState]=-100.0;
+ for (int x=1;x<hapSize+1;x++) {
+ _prior[i*numS+x]=pinsert[x]+log((1.0-pOffFirst))+logpIns;
+ }
+ }
+
+}
+
+/**
+ * Makes the off-haplotype states practically impossible for every read base by
+ * overwriting their log observation potential with -1000.0. Affects the
+ * left-of-haplotype (0) and right-of-haplotype (ROState) states in both the
+ * no-insertion half and the insertion half (offset numS) of each row of obs.
+ */
+void ObservationModelFB::forceOnHap()
+{
+    const double blocked=-1000.0;
+    for (int base=0;base<readSize;base++) {
+        double *row=obs[base];
+        row[0]=blocked;
+        row[ROState]=blocked;
+        row[numS]=blocked;
+        row[ROState+numS]=blocked;
+    }
+}
+
+/**
+ * Placeholder: not implemented.
+ *
+ * The commented-out line shows the intended behaviour, copying the four
+ * obsVector entries of read base b into vec.
+ *
+ * @throws string("Not implemented") always
+ */
+void ObservationModelFB::getObsVector(int b, double *vec) const
+{
+ throw string("Not implemented");
+// for (int y=0;y<4;y++) vec[y]=obsVector[(b<<2)+y];
+}
+
+/**
+ * One-time HMM set-up; does nothing when already initialised.
+ *
+ * Derives the state-space dimensions from the haplotype and the read
+ * (numS = hapSize + 2 states per insertion flag, ROState = hapSize + 1),
+ * allocates the message tables, sets the boundary messages alpha[0] and
+ * beta[readSize-1] to log(1) = 0, clears all "computed" flags and precomputes
+ * the transition probabilities.
+ */
+void ObservationModelFB::initHMM()
+{
+    if (HMMInitialized) return;
+
+    readSize=read.seq.size();
+    hapSize=hap.size();
+    ROState=hapSize+1;
+    numS=hapSize+2;
+
+    allocateMemory();
+
+    // boundary conditions at both ends of the read
+    double *firstAlpha=alpha[0];
+    double *lastBeta=beta[readSize-1];
+    for (int state=0;state<2*numS;state++) {
+        firstAlpha[state]=0.0;
+        lastBeta[state]=0.0;
+    }
+
+    HMMInitialized=true;
+
+    // no message or marginal is valid yet
+    marginalsComputed=false;
+    likelihoodComputed=false;
+    backwardDone=false;
+    forwardDone=false;
+    HMMConsistent=false;
+
+    setupTransitionProbs();
+}
+
+/**
+ * Allocates the per-base tables; does nothing when memory is already allocated.
+ *
+ * For each of the readSize read bases one array of 2*numS doubles is appended
+ * to mar, obs, alpha and beta, and one array of numS doubles to xmar. When
+ * makeObsVector is set, obsVector gets 4*readSize doubles. The arrays are
+ * released by deleteMemory().
+ */
+void ObservationModelFB::allocateMemory()
+{
+    if (memAllocated) return; //throw string("Memory already allocated.");
+
+    const int statesPerBase=numS*2;
+
+    mar.reserve(readSize);
+    obs.reserve(readSize);
+    alpha.reserve(readSize);
+    beta.reserve(readSize);
+
+    int base=0;
+    while (base<readSize) {
+        mar.push_back(new double[statesPerBase]);
+        obs.push_back(new double[statesPerBase]);
+        alpha.push_back(new double[statesPerBase]);
+        beta.push_back(new double[statesPerBase]);
+        xmar.push_back(new double[numS]);
+        base++;
+    }
+
+    if (makeObsVector) {
+        obsVector=new double[4*readSize];
+    }
+    memAllocated=true;
+}
+
+/**
+ * Releases everything allocated by allocateMemory(); does nothing when no
+ * memory is allocated.
+ *
+ * Besides freeing the arrays, the pointer containers are emptied. Otherwise
+ * they would keep the dangling pointers, and a later allocateMemory() would
+ * append the fresh arrays behind them, so that indexing from 0 would touch
+ * freed memory.
+ */
+void ObservationModelFB::deleteMemory()
+{
+    if (!memAllocated) return;
+
+    for (int b=0;b<readSize;b++) {
+        delete[] mar[b];
+        delete[] obs[b];
+        delete[] alpha[b];
+        delete[] beta[b];
+        delete[] xmar[b];
+    }
+    // forget the freed pointers so the containers are empty for the next allocation
+    mar.clear();
+    obs.clear();
+    alpha.clear();
+    beta.clear();
+    xmar.clear();
+
+    if (makeObsVector) {
+        delete[] obsVector;
+        obsVector=0;
+    }
+    memAllocated=false;
+}
+
+
+
+/**
+ * One message-passing step in which the haplotype position can only stay or
+ * increase ("scatter" form: each source state adds its mass to its targets).
+ *
+ * @param alpha_l    output message, 2*numS log values, fully overwritten
+ * @param alpha_l_1  incoming message of the neighbouring read base (log domain)
+ * @param obs_l_1    log observation potentials of that neighbouring base
+ *
+ * State layout of all three arrays: indices [0,numS) are the no-insertion
+ * states, [numS,2*numS) the insertion states; within each half 0 is left of
+ * the haplotype, 1..hapSize are haplotype positions and ROState is right of
+ * the haplotype. Contributions are summed in the probability domain (exp of
+ * the log terms) and converted back with log() at the end, so a state that
+ * receives no mass ends up as log(0).
+ */
+void ObservationModelFB::passMessageOneInc(double *alpha_l, const double *alpha_l_1, const double *obs_l_1)
+{
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ for (int x=0;x<2*numS;x++) alpha_l[x]=0.0;
+
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ // left-of-haplotype either stays left or steps onto the first haplotype base
+ alpha_l[0]+=( exp(obs_l_1[0]+alpha_l_1[0]+logpLOgLO+logpNoInsgNoIns) );
+ alpha_l[1]+=( exp(obs_l_1[0]+alpha_l_1[0]+logpFirstgLO+logpNoInsgNoIns ) );
+ // on-haplotype: advance by y = 1..numT-1 positions; running past the end lands in ROState
+ for (int x=1;x<=hapSize;x++ ) {
+ double tmp=obs_l_1[x]+alpha_l_1[x]+logpNoInsgNoIns;
+ for (int y=1;y<numT;y++) {
+ int newx=x+y;
+ if (newx>hapSize) newx=ROState;
+ alpha_l[newx]+=exp(tmp+logPTrans[y]);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ alpha_l[ROState]+=exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpNoInsgNoIns);
+
+ // x^l, i^l=0 = > x^{l+1}=x^l, i^{l+1}=1
+ // entering an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ alpha_l[numS+x]+=exp(obs_l_1[x]+alpha_l_1[x]+logpInsgNoIns);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ // continuing an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ alpha_l[numS+x]+=exp(obs_l_1[numS+x]+alpha_l_1[numS+x]+logpInsgIns);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l+1, i^{l+1}=0
+ // leaving an insertion advances one position (clamped at ROState); from state 0 it stays at 0
+ alpha_l[0]+=exp(obs_l_1[numS+0]+alpha_l_1[numS+0]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ for (int x=1;x<=hapSize+1;x++ ) {
+ int newx=x+1; if (newx>ROState) newx=ROState;
+ alpha_l[newx]+=exp(obs_l_1[numS+x]+alpha_l_1[numS+x]+logpNoInsgIns);
+ }
+
+
+ // convert back to log
+
+ for (int x=0;x<2*numS;x++) alpha_l[x]=log(alpha_l[x]);
+}
+
+// doing a pass for a P(X_{l+1}|X_l) potential where the next state is lower than current state
+
+/**
+ * One message-passing step in which the haplotype position can only stay or
+ * decrease; the mirror image of passMessageOneInc ("scatter" form).
+ *
+ * @param alpha_l    output message, 2*numS log values, fully overwritten
+ * @param alpha_l_1  incoming message of the neighbouring read base (log domain)
+ * @param obs_l_1    log observation potentials of that neighbouring base
+ *
+ * Same state layout as passMessageOneInc. Here ROState plays the role of the
+ * entry state (stay right of the haplotype, or step onto the last haplotype
+ * base hapSize) and state 0 is where steps running past the start end up.
+ * Sums are formed in the probability domain and converted back with log().
+ */
+void ObservationModelFB::passMessageOneDec(double *alpha_l, const double *alpha_l_1,const double *obs_l_1)
+{
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ for (int x=0;x<2*numS;x++) alpha_l[x]=0.0;
+
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ // right-of-haplotype either stays right or steps onto the last haplotype base
+ alpha_l[ROState]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpLOgLO+logpNoInsgNoIns) );
+ alpha_l[hapSize]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpFirstgLO+logpNoInsgNoIns ) );
+ // on-haplotype: move back by y = 1..numT-1 positions, clamped at state 0
+ for (int x=1;x<=hapSize;x++ ) {
+ double tmp=obs_l_1[x]+alpha_l_1[x]+logpNoInsgNoIns;
+ for (int y=1;y<numT;y++) {
+ int newx=x-y;
+ if (newx<0) newx=0;
+ alpha_l[newx]+=exp(tmp+logPTrans[y]);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ // (in this decreasing variant the self-transition is applied to state 0)
+ alpha_l[0]+=exp(obs_l_1[0]+alpha_l_1[0]+logpNoInsgNoIns);
+
+ // x^l, i^l=0 = > x^{l+1}=x^l-1, i^{l+1}=1
+ // entering an insertion moves back one position (clamped at 0)
+ alpha_l[numS+ROState]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpLOgLO+logpInsgNoIns ) );
+ alpha_l[numS+hapSize]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpFirstgLO+logpInsgNoIns ) );
+ for (int x=0;x<=hapSize;x++ ) {
+ int newx=x-1; if (newx<0) newx=0;
+ alpha_l[numS+newx]+=exp(obs_l_1[x]+alpha_l_1[x]+logpInsgNoIns);
+ }
+
+
+ /*
+ for (int x=0;x<=hapSize+1;x++ ) {
+ alpha_l[numS+x]+=exp(obs_l_1[x]+alpha_l_1[x]+logpInsgNoIns);
+ }
+ */
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ // continuing an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ alpha_l[numS+x]+=exp(obs_l_1[numS+x]+alpha_l_1[numS+x]+logpInsgIns);
+ }
+
+
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=0
+ // leaving an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ alpha_l[x]+=exp(obs_l_1[numS+x]+alpha_l_1[numS+x]+logpNoInsgIns);
+ }
+
+ /*
+ alpha_l[ROState]+=exp(obs_l_1[numS+ROState]+alpha_l_1[numS+ROState]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ for (int x=0;x<=hapSize;x++ ) {
+ int newx=x-1; if (newx<0) newx=0;
+ alpha_l[newx]+=exp(obs_l_1[numS+x]+alpha_l_1[numS+x]+logpNoInsgIns);
+ }
+ */
+
+
+ // convert back to log
+ for (int x=0;x<2*numS;x++) alpha_l[x]=log(alpha_l[x]);
+}
+
+/**
+ * Message-passing step for the "position increases" transition in "gather"
+ * form: each entry of the output sums over the states reachable from it.
+ * It uses the same transition terms as passMessageOneInc, read in the
+ * opposite direction.
+ *
+ * @param beta_l    output message, 2*numS log values; every entry is assigned
+ * @param beta_l_1  incoming message of the neighbouring read base (log domain)
+ * @param obs_l_1   log observation potentials of that neighbouring base
+ *
+ * State layout: [0,numS) no-insertion, [numS,2*numS) insertion; within each
+ * half 0 = left of haplotype, 1..hapSize = haplotype positions, ROState =
+ * right of haplotype. Sums are formed in the probability domain and converted
+ * back with log() at the end.
+ */
+void ObservationModelFB::passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1)
+{
+
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ // from left-of-haplotype: stay left, or step onto the first haplotype base
+ beta_l[0]=( exp(obs_l_1[0]+beta_l_1[0]+logpLOgLO+logpNoInsgNoIns) ) + ( exp(obs_l_1[1]+beta_l_1[1]+logpFirstgLO+logpNoInsgNoIns ) );
+ // from haplotype position x: advance by y = 1..numT-1, running past the end lands in ROState
+ for (int x=1;x<=hapSize;x++ ) {
+ // double tmp=beta_l_1[x]+logpNoInsgNoIns;
+ beta_l[x]=0.0;
+ for (int y=1;y<numT;y++) {
+ int newx=x+y;
+ if (newx>hapSize) newx=ROState;
+ beta_l[x]+=exp(logPTrans[y]+logpNoInsgNoIns+beta_l_1[newx]+obs_l_1[newx]);
+ }
+ }
+
+ // RO -> RO pROgRO=1.0;
+ beta_l[ROState]=exp(obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgNoIns);
+
+ //
+ // x^l, i^l=0 = > x^{l+1}=x^l, i^{l+1}=1
+ // entering an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ beta_l[x]+=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=1
+ // this loop assigns (not adds): it initialises the insertion half of the output
+ for (int x=0;x<=hapSize+1;x++ ) {
+ beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l+1, i^{l+1}=0
+ // leaving an insertion advances one position (clamped at ROState); from state 0 it stays at 0
+ beta_l[0+numS]+=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ for (int x=1;x<=hapSize+1;x++ ) {
+ int newx=x+1; if (newx>ROState) newx=ROState;
+ beta_l[numS+x]+=exp(obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns);
+ }
+
+
+ // convert back to log
+
+ for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+
+
+
+/**
+ * Message-passing step for the "position decreases" transition in "gather"
+ * form: each entry of the output sums over the states reachable from it.
+ * It uses the same transition terms as passMessageOneDec, read in the
+ * opposite direction (the relevant part of that function is quoted in the
+ * comment block below).
+ *
+ * @param beta_l    output message, 2*numS log values; every entry is assigned
+ * @param beta_l_1  incoming message of the neighbouring read base (log domain)
+ * @param obs_l_1   log observation potentials of that neighbouring base
+ *
+ * State layout: [0,numS) no-insertion, [numS,2*numS) insertion; within each
+ * half 0 = left of haplotype, 1..hapSize = haplotype positions, ROState =
+ * right of haplotype. Sums are formed in the probability domain and converted
+ * back with log() at the end.
+ */
+void ObservationModelFB::passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1)
+{
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ //for (int x=0;x<2*numS;x++) beta_l[x]=0.0;
+
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ // from right-of-haplotype: stay right, or step onto the last haplotype base
+ beta_l[ROState]=( exp(obs_l_1[ROState]+beta_l_1[ROState]+logpLOgLO+logpNoInsgNoIns) )+( exp(obs_l_1[hapSize]+beta_l_1[hapSize]+logpFirstgLO+logpNoInsgNoIns ) );
+ // from haplotype position x: move back by y = 1..numT-1, clamped at state 0
+ for (int x=1;x<=hapSize;x++ ) {
+ beta_l[x]=0.0;
+ for (int y=1;y<numT;y++) {
+ int newx=x-y;
+ if (newx<0) newx=0;
+ beta_l[x]+=exp(obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ // (in this decreasing variant the self-transition is applied to state 0)
+ beta_l[0]=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns);
+
+ // x^l, i^l=0 = > x^{l+1}=x^l-1, i^{l+1}=1
+ // entering an insertion moves back one position (clamped at 0)
+ beta_l[ROState]+=(exp(obs_l_1[numS+ROState]+beta_l_1[numS+ROState]+logpLOgLO+logpInsgNoIns)+exp(obs_l_1[numS+hapSize]+beta_l_1[numS+hapSize]+logpFirstgLO+logpInsgNoIns)); // cannot go from insertion on to the haplotype
+ for (int x=0;x<=hapSize;x++) {
+ int newx=x-1; if (newx<0) newx=0;
+ beta_l[x]+=exp(obs_l_1[numS+newx]+beta_l_1[numS+newx]+logpInsgNoIns);
+ }
+
+ /*
+ * from forward OneDec
+ alpha_l[numS+ROState]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpLOgLO+logpInsgNoIns ) );
+ alpha_l[numS+hapSize]+=( exp(obs_l_1[ROState]+alpha_l_1[ROState]+logpFirstgLO+logpInsgNoIns ) );
+ for (int x=0;x<=hapSize;x++ ) {
+ int newx=x-1; if (newx<0) newx=0;
+ alpha_l[numS+newx]+=exp(obs_l_1[x]+alpha_l_1[x]+logpInsgNoIns);
+ }
+ */
+ /*
+ for (int x=0;x<=hapSize+1;x++ ) {
+ beta_l[x]+=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns);
+ }
+ */
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ // this loop assigns (not adds): it initialises the insertion half of the output
+ for (int x=0;x<=hapSize+1;x++ ) {
+ beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=0
+ // leaving an insertion keeps the haplotype position
+ for (int x=0;x<=hapSize+1;x++ ) {
+ beta_l[numS+x]+=exp(obs_l_1[x]+beta_l_1[x]+logpNoInsgIns);
+ }
+ // convert back to log
+ for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+
+
+/**
+ * Computes the messages that flow from both ends of the read towards bMid;
+ * does nothing when they are already available.
+ *
+ * Fills alpha[1..bMid] (each from its left neighbour) and
+ * beta[bMid..readSize-2] (each from its right neighbour). alpha[0] and
+ * beta[readSize-1] are the boundary messages and are not modified here.
+ */
+void ObservationModelFB::computeForwardMessages()
+{
+    if (forwardDone) return;
+
+    // left end -> bMid
+    for (int src=0;src<bMid;src++) {
+        passMessageTwoDec(alpha[src+1], alpha[src], obs[src]);
+    }
+
+    // right end -> bMid
+    for (int dst=readSize-2;dst>=bMid;dst--) {
+        passMessageTwoInc(beta[dst], beta[dst+1], obs[dst+1]);
+    }
+
+    forwardDone=true;
+}
+
+/**
+ * Computes the messages that flow from bMid outwards to both ends of the
+ * read; does nothing when they are already available.
+ *
+ * Fills beta[bMid-1..0] (each from its right neighbour) and
+ * alpha[bMid+1..readSize-1] (each from its left neighbour).
+ */
+void ObservationModelFB::computeBackwardMessages()
+{
+    if (backwardDone) return;
+
+    // bMid -> left end
+    for (int dst=bMid-1;dst>=0;dst--) {
+        passMessageOneDec(beta[dst], beta[dst+1], obs[dst+1]);
+    }
+
+    // bMid -> right end
+    for (int src=bMid;src+1<readSize;src++) {
+        passMessageOneInc(alpha[src+1], alpha[src], obs[src]);
+    }
+
+    backwardDone=true;
+}
+
+/// True when v is not a usable number, i.e. NaN or +/- infinity.
+bool ObservationModelFB::_badValue(double v)
+{
+    return isnan(v) || isinf(v);
+}
+
+/**
+ * Scans the marginals and both message tables for NaN or infinite entries.
+ *
+ * Only the first hapSize+2 entries of each row are inspected.
+ * NOTE(review): rows hold 2*numS entries, so the insertion half is not
+ * checked -- confirm whether that is intended.
+ *
+ * @return true as soon as one bad value is found, false otherwise
+ */
+bool ObservationModelFB::hasErrors()
+{
+    const int nChecked=hapSize+2;
+    for (int base=0;base<readSize;base++) {
+        const double *marRow=mar[base];
+        const double *alphaRow=alpha[base];
+        const double *betaRow=beta[base];
+        for (int state=0;state<nChecked;state++) {
+            if (_badValue(marRow[state])) return true;
+            if (_badValue(alphaRow[state])) return true;
+            if (_badValue(betaRow[state])) return true;
+        }
+    }
+    return false;
+}
+
+
+/**
+ * Computes the normalised posterior marginal of every state for every read
+ * base into mar; does nothing when the marginals are already available.
+ *
+ * Runs any missing prerequisite step first (HMM set-up, observation
+ * potentials, both message passes). For each base, mar[l][x] is
+ * exp(alpha + beta + obs) divided by the sum over all 2*numS states of that
+ * base. On completion the HMM is flagged consistent, and the likelihood and
+ * marginals are flagged as computed.
+ *
+ * With DEBUGHMM defined, the log of each base's normalising sum is printed
+ * and "inconsistent" is reported when those values do not agree to three
+ * decimals across bases.
+ */
+void ObservationModelFB::computeMarginals()
+{
+ // also normalizes
+ if (marginalsComputed) return;
+#ifdef DEBUGHMM
+ vector<double> logL(readSize);
+ cout << "log-likelihoods: ";
+#endif
+ if (!HMMInitialized) initHMM();
+ if (!obsInitialized) setupReadObservationPotentials();
+ if (!forwardDone) computeForwardMessages();
+ // if (!likelihoodComputed) calcLikelihoodFromLastSlice();
+ if (!backwardDone) computeBackwardMessages();
+
+ for (int l=0;l<readSize;l++) {
+ // unnormalised posterior of each state, then normalise over the slice
+ double sum=0.0;
+ for (int x=0;x<2*numS;x++) {
+ mar[l][x]=exp(alpha[l][x]+beta[l][x]+obs[l][x]);
+ sum+=mar[l][x];
+ }
+ for (int x=0;x<2*numS;x++) mar[l][x]/=sum;
+ //if (l==0) logLikelihood=log(sum);
+#ifdef DEBUGHMM
+ logL[l]=log(sum);
+ cout << "[" << l << "," << scientific << setprecision(10) << logL[l] << "]";
+
+#endif
+ }
+#ifdef DEBUGHMM
+ cout << "end" << endl;
+#endif
+ HMMConsistent=true;
+ // NOTE(review): logLikelihood itself is not assigned in this function (the
+ // assignment above is commented out) although the flag is set -- confirm.
+ likelihoodComputed=true;
+ marginalsComputed=true;
+#ifdef DEBUGHMM
+ // the per-base normalising sums should all be equal; compare them at 1e-3 resolution
+ set<int> sl;
+ for (int l=0;l<readSize;l++) sl.insert(int(logL[l]*1000.0));
+ if (sl.size()!=1) {
+ //printMarginals();
+ cout << "inconsistent" << endl;
+ //throw string("HMM inconsistent!");
+ }
+#endif
+}
+
+/**
+ * Computes, for every read base, the marginal over haplotype position alone:
+ * xmar[b][x] is the sum of mar[b] over the no-insertion and insertion variants
+ * of state x. The full marginals are computed first when not yet available.
+ *
+ * Side effect: sets the precision of cout to 2.
+ */
+void ObservationModelFB::computeXMarginals()
+{
+    if (!marginalsComputed) computeMarginals();
+
+    for (int base=0;base<readSize;base++) {
+        const double *full=mar[base];
+        double *positionOnly=xmar[base];
+        for (int x=0;x<numS;x++) {
+            // add up both insertion flags of the same position
+            double total=0.0;
+            for (int ins=0;ins<2;ins++) total+=full[ins*numS+x];
+            positionOnly[x]=total;
+        }
+    }
+    cout.precision(2);
+}
+
+/**
+ * Dumps a per-base table of 2*numS values to standard output.
+ *
+ * For each base a header line is printed, followed by one line for the
+ * no-insertion half ("ins: 0") and one for the insertion half ("ins: 1").
+ * Every value is formatted in fixed notation and cut to its first five
+ * characters.
+ *
+ * Side effect: sets the precision of cout to 2.
+ *
+ * @param pot one array of at least 2*numS doubles per read base
+ */
+void ObservationModelFB::printMarginalsInt( const vector<double*> & pot)
+{
+    cout.precision(2);
+
+    for (size_t base=0;base<pot.size();base++) {
+        cout << "base[" << base << "]: " << endl;
+        double *row=pot[base];
+        for (int ins=0;ins<2;ins++) {
+            cout << " ins: " << ins << " ";
+            for (int x=0;x<numS;x++) {
+                stringstream formatted;
+                formatted << fixed << (row[ins*numS+x]);
+                cout << formatted.str().substr(0,5) << " ";
+            }
+            cout << endl;
+        }
+    }
+}
+
+/**
+ * Debug dump to standard output: the read and haplotype sequences, the stored
+ * logLikelihood, and the obs, alpha and mar tables (via printMarginalsInt).
+ * The beta table is not printed by the active code.
+ *
+ * Prints whatever the tables currently hold; it does not trigger any
+ * computation itself.
+ */
+void ObservationModelFB::printMarginals()
+{
+ cout << "read: " << read.seq << " hap: " << hap.seq << endl;
+ cout << "logLikelihood: " << logLikelihood << endl;
+
+ /*
+ cout << "obs: " << endl;
+ printMarginalsInt(obs);
+ cout << "alpha: " << endl;
+ printMarginalsInt(alpha);
+ cout << "beta: " << endl;
+ printMarginalsInt(beta);
+ */
+
+ cout << "obs: " << endl;
+ printMarginalsInt(obs);
+ cout << "alpha: " << endl;
+ printMarginalsInt(alpha);
+ cout << "mar: " << endl;
+ printMarginalsInt(mar);
+}
+
+
+/**
+ * Prints bMid and the transition probabilities exp(logPTrans[0..numT-1]) to
+ * standard output.
+ *
+ * NOTE(review): entry 0 is printed as well, although setupTransitionProbs()
+ * only assigns entries 1..numT-1.
+ */
+void ObservationModelFB::printStatistics()
+{
+    cout << "bMid: " << bMid << endl;
+
+    cout << "pTrans: ";
+    for (int step=0;step<numT;step++) {
+        cout << exp(logPTrans[step]) << " ";
+    }
+    cout << endl;
+}
+
+/**
+ * Prints a one-line, human-readable picture of how the read aligns to the
+ * haplotype, for display underneath a haplotype printed at screen column
+ * hapScrPos.
+ *
+ * Every read base is assigned the haplotype state with the largest position
+ * marginal (xmar). Bases placed left of the haplotype are written in front of
+ * it, bases placed right of it after it, and the others at their haplotype
+ * position. Bases whose quality is below params.baseQualThreshold are printed
+ * in lower case. If more bases fall left of the haplotype than there is room
+ * for, the leading bases of the read are skipped.
+ *
+ * The line starts with three fields: whether the assigned positions increase
+ * by exactly one from base to base (0/1), the stored logLikelihood, and
+ * whether base bMid was placed on the haplotype (0/1).
+ *
+ * Marginals are computed first when they are not yet available.
+ *
+ * @param hapScrPos screen column at which the haplotype starts
+ */
+void ObservationModelFB::printAlignment(size_t hapScrPos)
+{
+    if (!HMMConsistent) computeMarginals();
+    computeXMarginals();
+
+    // for every base determine most likely position
+    vector<int> maxIdx(readSize, 0);
+    bool isIncreasing=true;
+    for (int b=0;b<readSize;b++) {
+        const double *m=xmar[b];
+        double best=-HUGE_VAL;
+        // Start from state 0 so the index is defined even when no marginal
+        // compares greater than -HUGE_VAL (e.g. a row of NaNs).
+        int idx=0;
+        for (int s=0;s<hapSize+2;s++) {
+            if (m[s]>best) { best=m[s]; idx=s; }
+        }
+        maxIdx[b]=idx;
+        // on-haplotype bases must follow their predecessor by exactly one position
+        if (b && maxIdx[b]!=0 && maxIdx[b]!=ROState) if (maxIdx[b]-maxIdx[b-1]!=1) isIncreasing=false;
+    }
+
+    // number of bases left and right off the haplotype
+    size_t nLeft=0, nRight=0;
+    for (int b=0;b<readSize;b++) {
+        if (maxIdx[b]==0) nLeft++; else if (maxIdx[b]==ROState) nRight++;
+    }
+
+    // not enough room in front of the haplotype: drop leading read bases
+    size_t rskip=0;
+    if (nLeft>hapScrPos) { rskip=1+nLeft-hapScrPos; nLeft=hapScrPos; }
+
+    const size_t offset=hapScrPos-nLeft;
+
+    // print aligned read
+    string readString(nLeft+nRight+hapSize+readSize,' ');
+    size_t idxL=0, idxR=0;
+    for (int b=rskip;b<readSize;b++) {
+        char nuc=read.seq[b];
+        if (read.qual[b]<params.baseQualThreshold) { nuc=::tolower(nuc); }
+        if (maxIdx[b]==0) readString[idxL++]=nuc;
+        else if (maxIdx[b]==ROState) readString[nLeft+hapSize+idxR++]=nuc;
+        else readString[nLeft+maxIdx[b]-1]=nuc;
+    }
+
+    ostringstream os(ostringstream::out);
+    os << isIncreasing << " " << logLikelihood << " " << (maxIdx[bMid]!=0 && maxIdx[bMid] !=ROState) << " ";
+
+    // The prefix occupies screen columns too: pad up to the read's start
+    // column, or cut the front of the read string when the prefix is longer.
+    const size_t s=os.str().size();
+    size_t rs,ds;
+    if (offset>=s) {
+        rs=0;
+        ds=offset-s;
+    } else {
+        rs=s-offset;
+        ds=0;
+    }
+
+    cout << os.str() << string(ds,' ') << string(readString, rs, readString.size()) << endl;
+}
+
+ObservationModelFB::~ObservationModelFB()
+{
+ deleteMemory(); // release the buffers this model owns via deleteMemory()
+}
+
+ObservationModelFBMax::ObservationModelFBMax(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & _params)
+{
+ // Copy the read and the model parameters into the members first;
+ // Init() is called only once both are in place.
+ this->read=r;
+ this->params=_params;
+ Init(_hap, hapStart);
+}
+
+
+inline void ObservationModelFBMax::updateMax(double & destValue, int & destIdx, const double newValue, const int newIdx)
+{
+ // Max-product update of a (score, back-pointer) pair.
+ // The candidate wins when it beats the current score by more than EPS, or when it
+ // lies within [destValue, destValue+1e-5] and carries a smaller state index.
+ const bool clearlyBetter = newValue>destValue+EPS;
+ const bool tieWithLowerIdx = !clearlyBetter && newValue>=destValue && newValue<=destValue+1e-5 && destIdx>newIdx;
+ if (clearlyBetter || tieWithLowerIdx) {
+   destValue=newValue;
+   destIdx=newIdx;
+ }
+}
+
+
+// note that max-product works completely in the log-domain
+void ObservationModelFBMax::passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1, int *bt_l)
+{ // Max-product step: beta_l[s] = max over candidate states s' of obs_l_1[s']+beta_l_1[s']+log transition term; bt_l[s] records the chosen s'.
+ // State layout used below: x in 0..hapSize+1 are non-insertion states, numS+x is the insertion state at x (ROState presumably == hapSize+1 - confirm).
+ // x^l, i^l=0 => x^{l+1}, i^{l+1}=0
+ //beta_l[0]=( exp(obs_l_1[0]+beta_l_1[0]+logpLOgLO+logpNoInsgNoIns) ) + ( exp(obs_l_1[1]+beta_l_1[1]+logpFirstgLO+logpNoInsgNoIns ) );
+ beta_l[0]=-HUGE_VAL;
+ updateMax(beta_l[0], bt_l[0], obs_l_1[0]+beta_l_1[0]+logpLOgLO+logpNoInsgNoIns, 0);
+ updateMax(beta_l[0], bt_l[0], obs_l_1[1]+beta_l_1[1]+logpFirstgLO+logpNoInsgNoIns, 1);
+ // haplotype states: move forward by y positions (y=1..numT-1); targets beyond hapSize collapse onto ROState
+ for (int x=1;x<=hapSize;x++ ) {
+ // double tmp=beta_l_1[x]+logpNoInsgNoIns;
+ beta_l[x]=-HUGE_VAL;
+ for (int y=1;y<numT;y++) {
+ int newx=x+y;
+ if (newx>hapSize) newx=ROState;
+ //beta_l[x]+=exp(logPTrans[y]+logpNoInsgNoIns+beta_l_1[newx]+obs_l_1[newx]);
+ updateMax(beta_l[x], bt_l[x], logPTrans[y]+logpNoInsgNoIns+beta_l_1[newx]+obs_l_1[newx], newx);
+ }
+ }
+
+ // RO -> RO pROgRO=1.0;
+ //beta_l[ROState]=exp(obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgNoIns);
+ beta_l[ROState]=-HUGE_VAL;
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgNoIns, ROState);
+
+ //
+ // x^l, i^l=0 = > x^{l+1}=x^l, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ //beta_l[x]+=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns, numS+x);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ //beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ beta_l[numS+x]=obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns; // plain assignment: this initialises the insertion entries
+ bt_l[numS+x]=numS+x;
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l+1, i^{l+1}=0
+ //beta_l[0+numS]+=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ updateMax(beta_l[0+numS], bt_l[0+numS], obs_l_1[0]+beta_l_1[0]+logpNoInsgIns, 0);
+ for (int x=1;x<=hapSize+1;x++ ) {
+ int newx=x+1; if (newx>ROState) newx=ROState;
+ //beta_l[numS+x]+=exp(obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns);
+ updateMax(beta_l[numS+x], bt_l[numS+x], obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns, newx);
+ }
+
+
+ // convert back to log
+
+ // for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+
+
+
+
+/*
+void ObservationModelFBMax::passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1, int *bt_l)
+{
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ //for (int x=0;x<2*numS;x++) beta_l[x]=0.0;
+
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ //beta_l[ROState]=( exp(obs_l_1[ROState]+beta_l_1[ROState]+logpLOgLO+logpNoInsgNoIns) )+( exp(obs_l_1[hapSize]+beta_l_1[hapSize]+logpFirstgLO+logpNoInsgNoIns ) );
+
+ beta_l[ROState]=-HUGE_VAL;
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[ROState]+beta_l_1[ROState]+logpLOgLO+logpNoInsgNoIns, ROState);
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[hapSize]+beta_l_1[hapSize]+logpFirstgLO+logpNoInsgNoIns, hapSize);
+
+
+ for (int x=1;x<=hapSize;x++ ) {
+ beta_l[x]=-HUGE_VAL;
+ for (int y=1;y<numT;y++) {
+ int newx=x-y;
+ if (newx<0) newx=0;
+ //beta_l[x]+=exp(obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns, newx);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ //beta_l[0]=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns);
+ beta_l[0]=obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns;
+ bt_l[0]=0;
+
+ // x^l, i^l=0 = > x^{l+1}=x^l, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ //beta_l[x]+=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns, numS+x);
+ }
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ //beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ beta_l[numS+x]=obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns;
+ bt_l[numS+x]=numS+x;
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l+1, i^{l+1}=0
+ //beta_l[numS+ROState]+=exp(obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ updateMax(beta_l[numS+ROState], bt_l[numS+ROState], obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgIns, ROState);
+
+ for (int x=0;x<=hapSize;x++ ) {
+ int newx=x-1; if (newx<0) newx=0;
+ //beta_l[numS+x]+=exp(obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns);
+ updateMax(beta_l[numS+x], bt_l[numS+x],obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns, newx );
+ }
+ // convert back to log
+ // for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+*/
+void ObservationModelFBMax::passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1, int *bt_l)
+{ // Mirror image of passMessageTwoInc: candidates lie at lower haplotype positions (newx=x-y); bt_l[s] records the chosen candidate.
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ // computeForwardMessages() calls this with (alpha[b], alpha[b-1], obs[b-1], btf[b]), so "beta" here is the alpha chain.
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ beta_l[ROState]=-HUGE_VAL;
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[ROState]+beta_l_1[ROState]+logpLOgLO+logpNoInsgNoIns, ROState);
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[hapSize]+beta_l_1[hapSize]+logpFirstgLO+logpNoInsgNoIns, hapSize);
+ // haplotype states: candidates are y positions lower (y=1..numT-1); negative targets collapse onto state 0
+ for (int x=1;x<=hapSize;x++ ) {
+ beta_l[x]=-HUGE_VAL;
+ for (int y=1;y<numT;y++) {
+ int newx=x-y;
+ if (newx<0) newx=0;
+ //beta_l[x]+=exp(obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns, newx);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ //beta_l[0]=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns);
+ beta_l[0]=obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns; // state 0 starts from itself; the insertion candidate is added by the x=0 pass of the loop below
+ bt_l[0]=0;
+
+
+ // x^l, i^l=0 = > x^{l+1}=x^l-1, i^{l+1}=1
+ //beta_l[ROState]+=(exp(obs_l_1[numS+ROState]+beta_l_1[numS+ROState]+logpLOgLO+logpInsgNoIns)+exp(obs_l_1[numS+hapSize]+beta_l_1[numS+hapSize]+logpFirstgLO+logpInsgNoIns)); // cannot go from insertion on to the haplotype
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[numS+ROState]+beta_l_1[numS+ROState]+logpLOgLO+logpInsgNoIns, numS+ROState);
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[numS+hapSize]+beta_l_1[numS+hapSize]+logpFirstgLO+logpInsgNoIns, numS+hapSize);
+
+ for (int x=0;x<=hapSize;x++) {
+ int newx=x-1; if (newx<0) newx=0;
+ //beta_l[x]+=exp(obs_l_1[numS+newx]+beta_l_1[numS+newx]+logpInsgNoIns);
+ updateMax(beta_l[x], bt_l[x],obs_l_1[numS+newx]+beta_l_1[numS+newx]+logpInsgNoIns, numS+newx);
+ }
+
+
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ // beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ beta_l[numS+x]=obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns; // plain assignment: this initialises the insertion entries
+ bt_l[numS+x]=numS+x;
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=0
+ for (int x=0;x<=hapSize+1;x++ ) {
+ // beta_l[numS+x]+=exp(obs_l_1[x]+beta_l_1[x]+logpNoInsgIns);
+ updateMax(beta_l[numS+x], bt_l[numS+x],obs_l_1[x]+beta_l_1[x]+logpNoInsgIns, x);
+ }
+ // convert back to log
+ //for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+
+
+void ObservationModelFBMax::runHMM()
+{ // Runs the whole max-product pipeline once; returns immediately while HMMConsistent is set.
+ if (!HMMConsistent) {
+   initHMM(); setupReadObservationPotentials();
+   computeForwardMessages();
+   calcLikelihoodFromLastSlice();
+   computeMAPState();
+   HMMConsistent=true;
+ }
+}
+
+MLAlignment ObservationModelFBMax::calcLikelihood()
+{ // Runs the HMM if needed, rebuilds the variant report stored in ml, and returns ml by value.
+ runHMM();
+ reportVariants(); // note: reportVariants() calls runHMM() itself as well
+ return ml;
+}
+
+void ObservationModelFBMax::calcLikelihoodFromLastSlice()
+{
+ // Joins the alpha and beta max-product messages at read base bMid.
+ //  * logLikelihood and the RMQ state are maximised under the prior computed from read.mapQual;
+ //  * mapState[bMid] is maximised under the prior computed with mapping quality 1-1e-10.
+ // The results are published in ml (ll, offHap, offHapHMQ, llOff, llOn).
+ // Nothing is recomputed when likelihoodComputed is already set.
+ if (likelihoodComputed) return;
+
+ const double *fwd=alpha[bMid];
+ const double *bwd=beta[bMid];
+ const double *emit=obs[bMid];
+
+ logLikelihood=-HUGE_VAL;
+ logLikelihoodNoPrior=0.0;
+ likOffHap.resize(2);
+ likOffHap[0]=-HUGE_VAL; // best score over states with state%numS==0
+ likOffHap[1]=-HUGE_VAL; // best score over states that are neither state%numS==0 nor ROState
+
+ int bestStateRMQ=0;
+ double bestScoreHMQ=-HUGE_VAL;
+
+ vector<double> priorRMQ, priorHMQ;
+ computeBMidPrior(priorRMQ, read.mapQual);
+ computeBMidPrior(priorHMQ, 1.0-1e-10);
+
+ const int totalStates=2*numS;
+ for (int st=0; st<totalStates; ++st) {
+   const int hapState=st%numS; // position part of the state, insertion offset stripped
+
+   // score under the read's own mapping quality
+   const double scoreRMQ=fwd[st]+emit[st]+bwd[st]+priorRMQ[st];
+   if (scoreRMQ>logLikelihood+EPS) {
+     logLikelihood=scoreRMQ;
+     bestStateRMQ=st;
+   }
+
+   // ROState contributes to neither entry of likOffHap
+   if (hapState==0) {
+     if (scoreRMQ>likOffHap[0]) likOffHap[0]=scoreRMQ;
+   } else if (hapState!=ROState) {
+     if (scoreRMQ>likOffHap[1]) likOffHap[1]=scoreRMQ;
+   }
+
+   // score under the near-certain mapping quality; its argmax is where computeMAPState() starts
+   const double scoreHMQ=fwd[st]+emit[st]+bwd[st]+priorHMQ[st];
+   if (scoreHMQ>bestScoreHMQ+EPS) {
+     bestScoreHMQ=scoreHMQ;
+     mapState[bMid]=st;
+   }
+ }
+
+ // publish the results
+ ml.ll=logLikelihood;
+
+ const int hmqHapState=mapState[bMid]%numS;
+ ml.offHapHMQ=(hmqHapState==0 || hmqHapState==ROState);
+
+ const int rmqHapState=bestStateRMQ%numS;
+ ml.offHap=(rmqHapState==0 || rmqHapState==ROState);
+
+ ml.llOff=likOffHap[0];
+ ml.llOn=likOffHap[1];
+
+#ifdef DEBUGHMM
+ //cout << "calcLikelihoodFromLastSlice(): " << logLikelihood << endl;
+ //cout << "here: " << scientific << setprecision(10) << log(likOffHap[0]*exp(prior[0])+likOffHap[1]*exp(prior[1])) << " " << logLikelihood << endl;
+#endif
+ likelihoodComputed=true;
+}
+
+
+
+void ObservationModelFBMax::computeMAPState()
+{
+ // Recover the MAP state path by following the stored back-pointers outwards
+ // from mapState[bMid], which must already hold the best state at bMid.
+
+ // towards the start of the read: btf[b] maps the state of base b to that of base b-1
+ for (int b=bMid-1; b>=0; --b) {
+   mapState[b]=btf[b+1][mapState[b+1]];
+ }
+
+ // towards the end of the read: btb[b] maps the state of base b to that of base b+1
+ for (int b=bMid+1; b<readSize; ++b) {
+   mapState[b]=btb[b-1][mapState[b-1]];
+ }
+
+ //cout << "mapState: "; for (int b=0;b<readSize;b++) cout << " " << mapState[b]; cout << endl;
+
+}
+
+/*
+void ObservationModelFBMax::reportVariants(map<int, ReportVariant > & indels, map<int, ReportVariant > & snps, string & align)
+{
+ runHMM();
+
+ align=string(hapSize, 'R');
+ indels.clear();
+ snps.clear();
+
+ ml.firstBase=-1;
+ ml.lastBase=-1;
+
+ int b=0;
+ while (b<readSize) {
+ // only report variants for bases that are on the haplotype
+ int s=mapState[b];
+ if ( (s%numS)>0 && (s%numS)<=hapSize ) {
+ if (s>=numS) { // insertion
+ int pos=(s%numS)-1; // position of insertion wrt haplotype
+ int len=0; // length of insertion
+ int rpos=b; // start base of insertion in read
+ while (b<readSize && mapState[b]>=numS) {
+ b++;
+ len++;
+ }
+ indels[pos]=ReportVariant(len, read.seq.seq.substr(rpos, len), b-len);
+ indels[pos]=ReportVariant()
+ } else {
+ // update firstBase and lastBase
+ if (ml.firstBase==-1) ml.firstBase=s-1; else if (s-1<ml.firstBase) ml.firstBase=s-1;
+ if (ml.lastBase==-1) ml.lastBase=s-1; else if (s-1>ml.lastBase) ml.lastBase=s-1;
+
+
+ // check for SNP
+ if (read.seq[b]!=hap.seq[s-1]) {
+ string snp;
+ snp+=hap.seq[s-1];
+ snp.append("=>");
+ snp+=read.seq[b];
+ snps[s-1]=(ReportVariant(0,snp, b));
+ align[s-1]=read.seq[b];
+ }
+ // check for deletion
+ if (b<readSize-1) {
+ int ns=mapState[b+1];
+ if (ns<numS && ns-s>1) { // make sure next state is not an insertion..
+ int pos=s+1-1;
+ int len=-(ns-s-1);
+ indels[pos]=ReportVariant(len, hap.seq.substr(pos, -len), b);
+ for (int y=pos;y<-len+pos;y++) align[y]='D';
+ }
+ }
+
+ }
+
+ }
+
+ b++;
+ }
+
+
+}
+*/
+/*
+void getUniqueCoordinates(const Haplotype & hap, const Read & read, AlignedVariant & av)
+{
+ int rightFlankHap, leftFlankHap,rightFlankRead, leftFlankRead;
+
+ if (av.getType()==Variant::INS || av.getType() == Variant::DEL) {
+ const string & seq = av.getSeq();
+ int l = seq.size();
+
+ if (1) {
+
+
+ int p = av.getStartHap();
+ cout << "startHap: " << p << endl;
+ cout << "startRead: " << av.getStartRead() << endl;
+
+ while (p+l<=hap.seq.size()) {
+ string ss = hap.seq.substr(p,l);
+ if (ss!=seq) {
+
+ break;
+ }
+ p+=l;
+ }
+
+ rightFlankHap = p;
+
+ p = av.getStartHap()-l;
+
+ while (p>=0) {
+ string ss = hap.seq.substr(p,l);
+ if (ss!=seq) {
+ p+=l;
+ break;
+ }
+ p-=l;
+ }
+
+ leftFlankHap = p-1;
+ if (leftFlankHap<0) leftFlankHap = 0;
+
+ cout << "leftFlankHap: " << leftFlankHap << " rightFlankHap: " << rightFlankHap << endl;
+ }
+
+ // do read flanks
+
+ if (av.getType() == Variant::INS) {
+ int p = av.getStartRead();
+ cout << "startRead: " << p << endl;
+ while (p+l<=read.seq.size()) {
+ string ss = read.seq.seq.substr(p,l);
+ if (ss!=seq) {
+
+ break;
+ }
+ p+=l;
+ }
+
+ rightFlankRead = p;
+
+ p = av.getStartRead()-l;
+
+ while (p>=0) {
+ string ss = read.seq.seq.substr(p,l);
+ if (ss!=seq) {
+ p+=l;
+ break;
+ }
+ p-=l;
+ }
+
+ int leftFlankRead = p-1;
+ if (leftFlankRead<0) leftFlankRead = 0;
+
+ cout << "leftFlankRead: " << leftFlankRead << " rightFlankRead: " << rightFlankRead << endl;
+
+
+
+ } else if (av.getType() == Variant::DEL) {
+ int p = av.getStartRead()+1;
+ cout << "startRead: " << p << endl;
+ while (p+l<=read.seq.size()) {
+ string ss = read.seq.seq.substr(p,l);
+ if (ss!=seq) {
+
+ break;
+ }
+ p+=l;
+ }
+
+ rightFlankRead = p;
+
+ p = av.getStartRead()+1-l;
+
+ while (p>=0) {
+ string ss = read.seq.seq.substr(p,l);
+ if (ss!=seq) {
+ p+=l;
+ break;
+ }
+ p-=l;
+ }
+
+ int leftFlankRead = p-1;
+ if (leftFlankRead<0) leftFlankRead = 0;
+
+ cout << "leftFlankRead: " << leftFlankRead << " rightFlankRead: " << rightFlankRead << endl;
+
+
+
+ }
+
+
+ }
+
+
+
+}
+*/
+
+
+void ObservationModelFBMax::reportVariants()
+{ // Walks the MAP state path (mapState) and fills ml: per-base haplotype positions, SNPs, insertions, deletions, mismatch counters and coverage flags.
+ runHMM();
+ // reset every field of ml that is rebuilt below
+ ml.align=string(hapSize, 'R');
+ ml.indels.clear();
+ ml.snps.clear();
+
+ ml.firstBase=-1;
+ ml.lastBase=-1;
+ ml.hapIndelCovered.clear();
+ ml.hapSNPCovered.clear();
+ ml.hpos.clear();
+ ml.hpos.resize(readSize);
+
+ ml.nBQT=0;
+ ml.nmmBQT=0;
+ ml.mLogBQ=0.0;
+ ml.nMMRight=0;
+ ml.nMMLeft=0;
+ ml.numIndels = 0;
+ ml.numMismatch = 0;
+
+ int b=0;
+ while (b<readSize) {
+ // only report variants for bases that are on the haplotype
+ int s=mapState[b];
+ if ( (s%numS)>0 && (s%numS)<=hapSize ) {
+ if (s>=numS) { // insertion
+ int pos=(s%numS)-1+1; // position of insertion wrt haplotype MAINTAIN CONVENTION OF INSERTION BEFORE BASE X
+ int len=0; // length of insertion
+ int rpos=b; // start base of insertion in read
+ while (b<readSize && mapState[b]>=numS) { // consume the whole run of insertion states
+ ml.hpos[b]=MLAlignment::INS;
+ b++;
+ len++;
+ }
+ int readStart=rpos;
+ int readEnd=b-1;
+ int hapStart=pos;
+ int hapEnd=pos;
+ string seq=read.seq.seq.substr(rpos,len);
+ ml.indels[pos]=AlignedVariant(string("+").append(seq), hapStart, hapEnd, readStart, readEnd);
+ ml.numIndels++;
+ b--; // step back one base: the b++ at the bottom of the loop then lands on the first base after the insertion
+ //getFlankingCoordinatesBetter(this->hap, this->read, ml.indels[pos]);
+
+ } else {
+ ml.hpos[b]=s-1;
+ // update firstBase and lastBase
+ if (ml.firstBase==-1) ml.firstBase=s-1; else if (s-1<ml.firstBase) ml.firstBase=s-1;
+ if (ml.lastBase==-1) ml.lastBase=s-1; else if (s-1>ml.lastBase) ml.lastBase=s-1;
+
+ if (read.qual[b]>params.checkBaseQualThreshold){
+ ml.nBQT++;
+ ml.mLogBQ+=log10(1.0-read.qual[b]);
+ }
+
+ // check for SNP
+ if (read.seq[b]!=hap.seq[s-1]) {
+ string snp;
+ snp+=hap.seq[s-1];
+ snp.append("=>");
+ snp+=read.seq[b];
+ int readStart=b;
+ int readEnd=b;
+ int hapStart=s-1;
+ int hapEnd=s-1;
+
+ if (read.qual[b]>params.checkBaseQualThreshold) {
+ ml.nmmBQT++;
+
+ }
+
+ if (b<6) ml.nMMLeft++;
+ if (b>readSize-6) ml.nMMRight++;
+ // NOTE(review): b<6 covers the first six bases, b>readSize-6 only the last five - confirm the asymmetry is intended
+
+ if (read.qual[b]>0.95) ml.numMismatch++;
+
+
+
+ ml.snps[s-1]=AlignedVariant(snp,hapStart, hapEnd, readStart, readEnd);
+ ml.align[s-1]=read.seq[b];
+ }
+ // check for deletion
+ if (b<readSize-1) {
+ int ns=mapState[b+1];
+ if (ns<numS && ns-s>1) { // make sure next state is not an insertion..
+ int pos=s+1-1;
+ int len=-(ns-s-1); // negative: minus the number of skipped haplotype positions
+ //indels[pos]=ReportVariant(len, hap.seq.substr(pos, -len), b);
+
+ for (int y=pos;y<-len+pos;y++) ml.align[y]='D';
+ int readStart=b;
+ int readEnd=b+1;
+ int hapStart=pos;
+ int hapEnd=pos-len-1;
+ string seq=hap.seq.substr(pos,-len);
+ ml.indels[pos]=AlignedVariant(string("-").append(seq), hapStart, hapEnd, readStart, readEnd);
+ //getFlankingCoordinatesBetter(this->hap, this->read, ml.indels[pos]);
+ ml.numIndels++;
+ }
+ }
+
+ }
+
+ } else {// off the haplotype: the base lies left (LO) or right (RO) of it
+ if (s%numS==0) ml.hpos[b]=MLAlignment::LO; else ml.hpos[b]=MLAlignment::RO;
+
+ }
+ b++;
+ }
+ // flag, for every indel and SNP of the haplotype, whether [firstBase,lastBase] covers it according to isCovered() with padCover
+ for (map<int,AlignedVariant>::const_iterator it=hap.indels.begin();it!=hap.indels.end();it++) {
+ const AlignedVariant & av=it->second;
+ if (av.isCovered(params.padCover, ml.firstBase, ml.lastBase)) ml.hapIndelCovered[it->first]=true; else ml.hapIndelCovered[it->first]=false;
+ }
+ for (map<int,AlignedVariant>::const_iterator it=hap.snps.begin();it!=hap.snps.end();it++) {
+ const AlignedVariant & av=it->second;
+ if (av.isCovered(params.padCover, ml.firstBase, ml.lastBase)) ml.hapSNPCovered[it->first]=true; else ml.hapSNPCovered[it->first]=false;
+ }
+
+
+}
+
+void ObservationModelFBMax::printAlignment(size_t hapScrPos)
+{ // Prints a text rendering of the MAP alignment to stdout; hapScrPos is the screen column meant for the first haplotype base.
+ // count how many bases in the read are left of the haplotype
+ calcLikelihood();
+
+ /*
+ for (int b=0;b<readSize;b++) {
+ int s=mapState[b];
+ //cout << "[" << b << " " << read.seq.seq[b] << " " << hap.seq[(s%numS)-1] << " " << s << "]";
+
+ }
+ */
+ // leftHap/rightHap: bases off either end; rhap: one slot per haplotype position; ins: inserted bases as "[pos seq]" groups
+ string leftHap, rightHap;
+ string rhap(hap.size(),' ');
+ string ins;
+
+ bool insact=false; // true while inside an open "[...]" insertion group
+ int b=0;
+ while (b<readSize) {
+ // only report variants for bases that are on the haplotype
+ int s=mapState[b];
+ char nuc=read.seq.seq[b];
+ if (s%numS==0) {
+ //
+ leftHap+=nuc;
+ } else if ( (s%numS)>0 && (s%numS)<=hapSize ) {
+ if (s>=numS) { // insertion
+ if (!insact) {
+ insact=true;
+ ins+='[';
+ stringstream os; os << (s%numS);
+ ins.append(os.str());
+ ins+=' ';
+ }
+
+ ins+=nuc;
+
+ } else {
+ if (insact) ins+=']';
+ insact=false;
+ rhap[s-1]=nuc;
+ // a jump of more than one state to the next base is a deletion: mark the skipped positions with '_'
+ if (b<readSize-1) {
+ int ns=mapState[b+1];
+ if (ns<numS && ns-s>1) {
+ int len=ns-s-1;
+ rhap.replace(s, len, string(len,'_'));
+ }
+
+
+ }
+
+
+ }
+
+ } else {
+ rightHap+=nuc;
+ }
+ b++;
+ }
+ if (insact) ins+=']';
+ // prefix: read size, offHap flag, number of indels, first/last covered base, log-likelihood, then one 0/1 per haplotype indel
+ stringstream os;
+ os << readSize << " " << ml.offHap << " " << ml.indels.size() << " " << ml.firstBase << " " << ml.lastBase << " " << logLikelihood << " ";
+ for (map<int,AlignedVariant>::const_iterator it=hap.indels.begin();it!=hap.indels.end();it++) {
+ if (ml.hapIndelCovered[it->first]) os << "1 "; else os << "0 ";
+ }
+ string prefix=os.str();
+
+ int leftHapSpace=int(hapScrPos)-int(prefix.size());
+ if (leftHapSpace<0) leftHapSpace=0;
+ // right-align the left-of-haplotype bases inside the space left after the prefix; keep only the last leftHapSpace bases if they do not fit
+ string prLeftHap=string(leftHapSpace,' ');
+
+ if (int(leftHap.size())>leftHapSpace) {
+ prLeftHap=leftHap.substr(leftHap.size()-leftHapSpace, leftHapSpace);
+ } else if (leftHap.size()>0) {
+ prLeftHap.replace(leftHapSpace-leftHap.size(), leftHap.size(), leftHap);
+ }
+
+ cout << prefix<<prLeftHap<<rhap<<rightHap << " " << ins << " read: " << read.seq.seq << endl;
+
+ // second output line: the keys of hap.indels
+ for (map<int,AlignedVariant>::const_iterator it=hap.indels.begin();it!=hap.indels.end();it++) {
+ cout << " " << it->first;
+ }
+ cout << endl;
+}
+
+
+
+void ObservationModelFBMax::computeForwardMessages()
+{
+ // Both message chains meet at bMid: alpha runs from the first base up to bMid,
+ // beta runs from the last base down to bMid. Back-pointers go to btf and btb.
+ if (!forwardDone) {
+   for (int cur=1, prev=0; cur<=bMid; ++cur, ++prev)
+     passMessageTwoDec(alpha[cur], alpha[prev], obs[prev], btf[cur]);
+
+   for (int prev=readSize-1; prev>bMid; --prev)
+     passMessageTwoInc(beta[prev-1], beta[prev], obs[prev], btb[prev-1]);
+   forwardDone=true;
+ }
+}
+
+void ObservationModelFBMax::computeBackwardMessages()
+{
+ // no backward messages for this model: computeForwardMessages() already fills both the alpha and the beta chain
+ backwardDone=true;
+}
+
+void ObservationModelFBMax::allocateMemory()
+{
+ // Allocates, for every read base, the obs/alpha/beta score rows (2*numS doubles each)
+ // plus the back-pointer rows: btf only for bases up to bMid, btb only from bMid on;
+ // the remaining btf/btb slots hold NULL. Does nothing when memory is already allocated.
+ if (memAllocated) return; //throw string("Memory already allocated.");
+
+ const int rowLen=numS*2;
+ mapState.resize(readSize, 0);
+
+ obs.reserve(readSize);
+ alpha.reserve(readSize);
+ beta.reserve(readSize);
+ btf.reserve(readSize);
+ btb.reserve(readSize);
+
+ for (int base=0; base<readSize; ++base) {
+   obs.push_back(new double[rowLen]);
+   alpha.push_back(new double[rowLen]);
+   beta.push_back(new double[rowLen]);
+   btf.push_back(base<=bMid ? new int[rowLen] : static_cast<int*>(NULL));
+   btb.push_back(base>=bMid ? new int[rowLen] : static_cast<int*>(NULL));
+ }
+ if (makeObsVector) obsVector=new double[4*readSize];
+ memAllocated=true;
+}
+
+void ObservationModelFBMax::deleteMemory()
+{
+ // Frees everything allocateMemory() created. The pointer vectors are emptied as
+ // well: allocateMemory() appends with push_back, so stale entries left behind
+ // here would sit in front of the new rows after a re-allocation.
+ if (!memAllocated) return;
+ for (int b=0;b<readSize;b++) {
+   delete[] obs[b]; delete[] alpha[b]; delete[] beta[b];
+   delete[] btf[b]; delete[] btb[b]; // NULL slots are fine: delete[] on a null pointer does nothing
+ }
+ obs.clear(); alpha.clear(); beta.clear(); btf.clear(); btb.clear();
+ if (makeObsVector) delete[] obsVector;
+ memAllocated=false;
+}
+
+ObservationModelFBMaxErr::ObservationModelFBMaxErr(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & _params)
+{
+ // Copy the read and the parameters into the members before handing over to Init().
+ this->read=r;
+ this->params=_params;
+ Init(_hap, hapStart);
+}
+
+
+
+void ObservationModelFBMaxErr::setupTransitionProbs()
+{ // Sets the log transition terms of the model and the per-state, homopolymer-length-dependent error probabilities.
+ logpLOgLO=log(1.0-params.pFirstgLO);
+ logpFirstgLO=log(params.pFirstgLO);
+ // logPTrans[y], y=1..numT-1: y==1 is the regular one-base step, y>=2 are the longer jumps
+ numT=params.maxLengthDel+2;
+ logPTrans.resize(numT);
+ // maxT is the transition which corresponds to a normal-operation base extension
+ logPTrans[1]=log(1.0-params.pError);
+ double norm=0.0;
+ for (int x=2;x<numT;x++) { // longer jumps decay as exp(-(x-1)) before normalisation
+ double p=-fabs(1.0-double(x));
+ logPTrans[x]=p;
+ norm+=exp(p);
+ }
+ norm=log(norm/params.pError); // rescale so that the jump probabilities for x>=2 sum to pError
+ for (int x=2;x<numT;x++) logPTrans[x]-=norm;
+
+ // check norm
+ norm=0.0;
+ for (int x=1;x<numT;x++) norm+=exp(logPTrans[x]);
+ assert(fabs(norm-1.0)<1e-12); // was 1e-15, only a few ulps of 1.0: exp/log rounding alone could trip it
+
+ logpInsgIns=-.5 ;
+ logpNoInsgIns=log(1.0-exp(logpInsgIns));
+ logpInsgNoIns=log(params.pError);
+ logpNoInsgNoIns=log(1-params.pError);
+ /*
+ cout << "logpInsgIns: " << logpInsgIns << endl;
+ cout << "logpNoInsgIns: " << logpNoInsgIns << endl;
+ cout << "logpInsgNoIns: " << logpInsgNoIns << endl;
+ cout << "logpNoInsgNoIns: " << logpNoInsgNoIns << endl;
+ */
+
+ // determine base-specific error probabilities
+ ReadIndelErrorModel riem;
+ // default for every state: error probability 1e-5
+ logProbError = vector<double>(hapSize+2,log(1e-5));
+ logProbNoError = vector<double>(hapSize+2,log(1-1e-5));
+
+ // len tracks the length of the current homopolymer run while scanning the haplotype
+ int len=1;
+ double perr=riem.getViterbiHPError(1);
+ logProbError[1]=log(perr);
+ logProbNoError[1]=log(1.0-perr);
+
+ // NOTE X = ( LO, 0, 1, 2, 3, .. )
+ for (int b=1;b<hapSize;b++) {
+ if (hap.seq[b]==hap.seq[b-1]) {
+ len++;
+ } else {
+ perr=riem.getViterbiHPError(len); // a run just ended at haplotype base b-1: store its error probability at index b
+// cout << "len: " << len << " perr: " << perr << endl;
+ logProbError[b]=log(perr);
+ logProbNoError[b]=log(1.0-perr);
+ len=1;
+ }
+ //cout << "hap[" << b << "]: " << len << " " << logProbError[b+1] << endl;
+ }
+ perr=riem.getViterbiHPError(len);
+ // cout << "len: " << len << " perr: " << perr << endl;
+ logProbError[hapSize-1]=log(perr);
+ logProbNoError[hapSize-1]=log(1.0-perr);
+ // NOTE(review): inside the loop a run ending at base b-1 is stored at index b, which would put the final run at index hapSize, not hapSize-1 - confirm.
+ // NOTE(review): hapSize==0 would make the index above -1; presumably callers never pass an empty haplotype - verify.
+ /*
+ cout << "logProbError: " << endl;
+ for (int x=0;x<=hapSize+1;x++) {
+ cout << "x: " << x << " " << ((x>0&&x<=hapSize)?hap.seq[x-1]:'N') << " " << logProbError[x] << " " << logProbNoError[x] <<endl;
+ }
+ */
+
+}
+
+void ObservationModelFBMaxErr::passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1, int *bt_l)
+{ // b-1 b
+ // Max-product step like ObservationModelFBMax::passMessageTwoInc, but haplotype moves and insertion starts use the per-state logProbError/logProbNoError tables.
+ // x^l, i^l=0 => x^{l+1}, i^{l+1}=0
+ //beta_l[0]=( exp(obs_l_1[0]+beta_l_1[0]+logpLOgLO+logpNoInsgNoIns) ) + ( exp(obs_l_1[1]+beta_l_1[1]+logpFirstgLO+logpNoInsgNoIns ) );
+ beta_l[0]=-HUGE_VAL;
+ updateMax(beta_l[0], bt_l[0], obs_l_1[0]+beta_l_1[0]+logpLOgLO+logpNoInsgNoIns, 0);
+ updateMax(beta_l[0], bt_l[0], obs_l_1[1]+beta_l_1[1]+logpFirstgLO+logpNoInsgNoIns, 1);
+ // haplotype states: a one-base step (y==1) costs lpn twice; a longer jump costs lpt+(y-1)*logpInsgIns plus lpn, all taken at the target state
+ for (int x=1;x<=hapSize;x++ ) {
+ // double tmp=beta_l_1[x]+logpNoInsgNoIns;
+ beta_l[x]=-HUGE_VAL;
+ for (int y=1;y<numT;y++) {
+ int newx=x+y;
+ if (newx>hapSize) newx=ROState;
+ double lpn=logProbNoError[newx];
+ double lpt=logProbError[newx];
+ double lp=(y==1)?lpn:(lpt+double(y-1)*logpInsgIns);
+
+ //beta_l[x]+=exp(logPTrans[y]+logpNoInsgNoIns+beta_l_1[newx]+obs_l_1[newx]);
+ updateMax(beta_l[x], bt_l[x], lp+lpn+beta_l_1[newx]+obs_l_1[newx], newx);
+ }
+ }
+
+ // RO -> RO pROgRO=1.0;
+ //beta_l[ROState]=exp(obs_l_1[ROState]+beta_l_1[ROState]+logpNoInsgNoIns);
+ beta_l[ROState]=-HUGE_VAL;
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[ROState]+beta_l_1[ROState]+logProbNoError[ROState], ROState);
+
+ //
+ // x^l, i^l=0 = > x^{l+1}=x^l, i^{l+1}=1
+ for (int x=0;x<=hapSize;x++ ) {
+ //beta_l[x]+=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[numS+x]+beta_l_1[numS+x]+logProbError[x+1], numS+x);
+ }
+ int x=hapSize+1; updateMax(beta_l[x], bt_l[x], obs_l_1[numS+x]+beta_l_1[numS+x], numS+x);
+ // the line above handles state hapSize+1 separately: its insertion candidate carries no transition term
+
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ //beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ beta_l[numS+x]=obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns; // plain assignment: this initialises the insertion entries
+ bt_l[numS+x]=numS+x;
+ }
+
+ // x^l, i^l=1 = > x^{l+1}=x^l+1, i^{l+1}=0
+ //beta_l[0+numS]+=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgIns); // cannot go from insertion on to the haplotype
+ updateMax(beta_l[0+numS], bt_l[0+numS], obs_l_1[0]+beta_l_1[0]+logpNoInsgIns, 0);
+ for (int x=1;x<=hapSize+1;x++ ) {
+ int newx=x+1; if (newx>ROState) newx=ROState;
+ //beta_l[numS+x]+=exp(obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns);
+ updateMax(beta_l[numS+x], bt_l[numS+x], obs_l_1[newx]+beta_l_1[newx]+logpNoInsgIns, newx);
+ }
+
+
+ // convert back to log
+
+ // for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
+
+void ObservationModelFBMaxErr::passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1, int *bt_l)
+{ // b b-1
+ // P(x^l=x^{l-1}+d|x^{l-1})=(10^{logpSkip*d})
+ // Decreasing-direction max-product step using the per-state error tables: here lpt/lpn are taken at the current state x, not at the candidate newx.
+ // x^l, i^l=0 = > x^{l+1}, i^{l+1}=0
+ beta_l[ROState]=-HUGE_VAL;
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[ROState]+beta_l_1[ROState]+logpLOgLO+logpNoInsgNoIns, ROState);
+ updateMax(beta_l[ROState], bt_l[ROState], obs_l_1[hapSize]+beta_l_1[hapSize]+logpFirstgLO+logpNoInsgNoIns, hapSize);
+ // haplotype states: candidates are y positions lower (y=1..numT-1); negative targets collapse onto state 0
+ for (int x=1;x<=hapSize;x++ ) {
+ beta_l[x]=-HUGE_VAL;
+ double lpt = logProbError[x];
+ double lpn = logProbNoError[x];
+ for (int y=1;y<numT;y++) {
+ int newx=x-y;
+ if (newx<0) newx=0;
+ double lp=(y==1)?lpn:(lpt+double(y-1)*logpInsgIns);
+ //beta_l[x]+=exp(obs_l_1[newx]+logPTrans[y]+beta_l_1[newx]+logpNoInsgNoIns);
+ updateMax(beta_l[x], bt_l[x], obs_l_1[newx]+lp+beta_l_1[newx]+lpn, newx);
+ }
+ }
+ // RO -> RO pROgRO=1.0;
+ //beta_l[0]=exp(obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns);
+ beta_l[0]=obs_l_1[0]+beta_l_1[0]+logpNoInsgNoIns; // state 0 only ever continues from itself in this variant
+ bt_l[0]=0;
+
+
+ // x^l, i^l=0 = > x^{l+1}=x^l-1, i^{l+1}=1
+ //beta_l[ROState]+=(exp(obs_l_1[numS+ROState]+beta_l_1[numS+ROState]+logpLOgLO+logpInsgNoIns)+exp(obs_l_1[numS+hapSize]+beta_l_1[numS+hapSize]+logpFirstgLO+logpInsgNoIns)); // cannot go from insertion on to the haplotype
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[numS+ROState]+beta_l_1[numS+ROState]+logpLOgLO+logProbError[ROState], numS+ROState);
+ updateMax(beta_l[ROState], bt_l[ROState],obs_l_1[numS+hapSize]+beta_l_1[numS+hapSize]+logpFirstgLO+logProbError[hapSize], numS+hapSize);
+ // this loop starts at x=1, so newx=x-1 is never negative and the clamp below cannot trigger
+ for (int x=1;x<=hapSize;x++) {
+ int newx=x-1; if (newx<0) newx=0;
+ //beta_l[x]+=exp(obs_l_1[numS+newx]+beta_l_1[numS+newx]+logpInsgNoIns);
+ updateMax(beta_l[x], bt_l[x],obs_l_1[numS+newx]+beta_l_1[numS+newx]+logProbError[x], numS+newx);
+ }
+
+
+
+ // x^l, i^l=1 = > x^{l+1}, i^{l+1}=1
+ for (int x=0;x<=hapSize+1;x++ ) {
+ // beta_l[numS+x]=exp(obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns);
+ beta_l[numS+x]=obs_l_1[numS+x]+beta_l_1[numS+x]+logpInsgIns; // plain assignment: this initialises the insertion entries
+ bt_l[numS+x]=numS+x;
+ }
+ // this loop starts at x=1, so the insertion state numS+0 keeps only its insertion-to-insertion value from above
+ // x^l, i^l=1 = > x^{l+1}=x^l, i^{l+1}=0
+ for (int x=1;x<=hapSize+1;x++ ) {
+ // beta_l[numS+x]+=exp(obs_l_1[x]+beta_l_1[x]+logpNoInsgIns);
+ updateMax(beta_l[numS+x], bt_l[numS+x],obs_l_1[x]+beta_l_1[x]+logpNoInsgIns, x);
+ }
+ // convert back to log
+ //for (int x=0;x<2*numS;x++) beta_l[x]=log(beta_l[x]);
+}
diff --git a/ObservationModelFB.hpp b/ObservationModelFB.hpp
new file mode 100644
index 0000000..d995cc5
--- /dev/null
+++ b/ObservationModelFB.hpp
@@ -0,0 +1,169 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OBSERVATIONMODELFB_HPP_
+#define OBSERVATIONMODELFB_HPP_
+#include <vector>
+#include "Haplotype.hpp"
+#include "Read.hpp"
+#include "MLAlignment.hpp"
+#include "ObservationModel.hpp"
+using namespace std;
+
+const double LOGTINY=-100.0; // NOTE(review): presumably a floor for log-values; no use is visible in this header
+const double EPS=1e-10; // NOTE(review): presumably a numerical tolerance; no use is visible in this header
+// simple HMM inference algorithm for observation of single read given from a true underlying haplotype
+class ObservationModelFB
+{
+public:
+ ObservationModelFB() {}; // empty body: the built-in members below (flags, sizes, raw pointers) are not initialized here
+ virtual ~ObservationModelFB();
+ MLAlignment calcLikelihood();
+ double getLogLikelihood() { calcLikelihood(); return ml.ll; }; // calls calcLikelihood() on every invocation, then returns ml.ll
+ double* getMarginal(int readBase) { assert(readBase<readSize); return mar[readBase]; }; // asserts only the upper bound; returns the stored pointer mar[readBase]
+ void getObsVector(int b, double *vec) const;
+ /*!
+ * @abstract Change haplotype used for likelihood computation, which
+ * can be useful in the EM algorithm
+ * @params newHap new haplotype
+ * @discussion newHap must have same length as previous haplotype
+ */
+ void changeHaplotype(const Haplotype & newHap);
+ void printMarginals();
+ void computeMarginals();
+ void computeXMarginals();
+ void printAlignment(size_t hapScrPos);
+ void printStatistics();
+ vector<double> getOffHapLik() const { return likOffHap; }; // returns a copy of likOffHap
+ const MLAlignment & getMLAlignment() const { return ml; } // returns ml as stored; does not call calcLikelihood()
+
+protected:
+ virtual void Init(const Haplotype & _hap, uint32_t hapStart);
+ void forceOnHap();
+ void setupReadObservationPotentials();
+ virtual void setupTransitionProbs();
+ virtual void initHMM();
+ //void runHMM();
+ virtual void allocateMemory();
+ virtual void deleteMemory();
+ virtual void computeBMidPrior(vector<double> & _prior, double mapQual);
+ // message-passing steps: the One* pair is non-virtual, the Two* pair is virtual
+ void passMessageOneInc(double *alpha_l, const double *alpha_l_1, const double *obs_l_1);
+ void passMessageOneDec(double *alpha_l, const double *alpha_l_1, const double *obs_l_1);
+ virtual void passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1);
+ virtual void passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1);
+
+ virtual void computeForwardMessages();
+ void computeBackwardMessages(); // non-virtual, unlike computeForwardMessages()
+ virtual void calcLikelihoodFromLastSlice();
+ void printMarginalsInt(const vector<double*> & pot);
+ bool hasErrors();
+ bool _badValue(double v);
+
+ // maximum number of bases that may be skipped by the sequencer
+ // exponential decay rate of skip probability
+ // NOTE(review): the two comment lines above are not followed by matching member declarations
+
+ double logLikelihood, logLikelihoodNoPrior;
+ MLAlignment ml;
+
+ Haplotype hap;
+ Read read;
+ vector<double> prior;
+ vector<double> logPTrans;
+ vector<double> likOffHap;
+ vector<double> priorOffHap;
+
+
+ // boolean flags; the names suggest they track which computation stages have run (TODO confirm in the .cpp)
+ bool obsInitialized, memAllocated, HMMInitialized, HMMConsistent, likelihoodComputed, forwardDone, backwardDone, marginalsComputed;
+
+ bool makeObsVector;
+
+ // potentials are stored as log-values
+ // observation potentials
+ vector<double*> obs;
+
+ // posterior marginals given _hap
+ vector<double*> mar, xmar;
+ // forward and backward messages
+ vector<double*> alpha, beta;
+ double *obsVector;
+
+ vector<double> logProbError, logProbNoError;
+
+ // structure of read-base variable
+
+ // {LeftOfHaplotype, Hap_1, Hap_2, Hap_3, ..., Hap_length, RightOfHaplotype}
+
+ // HMM internal variables
+
+ int hapSize, readSize, ROState, bMid, hapStart, numT, numS;
+
+ double logpLOgLO, logpFirstgLO;
+ double logpInsgIns, logpInsgNoIns, logpNoInsgNoIns, logpNoInsgIns;
+
+public:
+ // second public section: the parameter object and the constructor that takes all inputs
+ ObservationModelParameters params;
+ ObservationModelFB(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & params);
+
+
+};
+
+class ObservationModelFBMax : public ObservationModelFB
+{
+public:
+ void printMarginals() { throw string("Not possible for this model"); } // the three marginal methods always throw a std::string
+ void computeMarginals() { throw string("Not possible for this model"); } // they hide the base versions, which are not virtual
+ void computeXMarginals() { throw string("Not possible for this model"); }
+ void reportVariants();
+ void printAlignment(size_t hapScrPos); // hides the non-virtual base printAlignment()
+ MLAlignment calcLikelihood(); // hides the non-virtual base calcLikelihood()
+ ObservationModelFBMax(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & params);
+ ObservationModelFBMax() {};
+ // the destructor calls deleteMemory(); inside this destructor the call resolves to this class's version
+ ~ObservationModelFBMax() { deleteMemory(); };
+ vector<int> getMapState() { runHMM(); return mapState; }; // calls runHMM() first, then returns a copy of mapState
+protected:
+ void passMessageOneInc(double *alpha_l, const double *alpha_l_1, const double *obs_l_1); // hides the non-virtual base version
+ void passMessageOneDec(double *alpha_l, const double *alpha_l_1, const double *obs_l_1); // hides the non-virtual base version
+ virtual void passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1, int * bt_l); // extra bt_l parameter: a new virtual, not an override of the 3-argument base one
+ virtual void passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1, int * bt_l); // same: overloads rather than overrides
+ void allocateMemory(); // overrides the virtual base function
+ void deleteMemory(); // overrides the virtual base function
+ void runHMM();
+ void computeForwardMessages(); // overrides the virtual base function
+ void computeBackwardMessages(); // hides the non-virtual base version
+ void calcLikelihoodFromLastSlice(); // overrides the virtual base function
+ void computeMAPState();
+ inline void updateMax(double & destValue, int & destIdx, const double newValue, const int newIdx);
+ vector<int *> btf, btb; // NOTE(review): presumably back-trace index arrays for the forward and backward passes; confirm in the .cpp
+ vector<int> mapState; // MAP state for HMM
+ //static double EPS=1e-10;
+};
+
+class ObservationModelFBMaxErr : public ObservationModelFBMax
+{
+public:
+ ObservationModelFBMaxErr(const Haplotype & _hap, const Read & r, uint32_t hapStart, const ObservationModelParameters & params);
+ // this class only replaces the transition setup and the two 4-argument message-passing steps
+protected:
+ void setupTransitionProbs(); // overrides the virtual declared in ObservationModelFB
+ void passMessageTwoInc(double *beta_l, const double *beta_l_1,const double *obs_l_1, int * bt_l); // overrides the 4-argument virtual of ObservationModelFBMax
+ void passMessageTwoDec(double *beta_l, const double *beta_l_1,const double *obs_l_1, int * bt_l); // overrides the 4-argument virtual of ObservationModelFBMax
+};
+
+
+#endif /*ObservationModelFB_HPP_*/
diff --git a/ObservationModelSeqAn.hpp b/ObservationModelSeqAn.hpp
new file mode 100644
index 0000000..99a1a2a
--- /dev/null
+++ b/ObservationModelSeqAn.hpp
@@ -0,0 +1,377 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * ObservationModelSeqAn.hpp
+ *
+ * Created on: Sep 4, 2009
+ * Author: caa
+ */
+
+#ifndef OBSERVATIONMODELSEQAN_HPP_
+#define OBSERVATIONMODELSEQAN_HPP_
+
+#include <seqan/align.h>
+#include <seqan/align/align_base.h>
+#include <seqan/graph_align.h>
+#include "Haplotype.hpp"
+#include "Read.hpp"
+#include "ObservationModel.hpp"
+#include "MLAlignment.hpp"
+using namespace seqan;
+using namespace std;
+
+const int DEBUGCONVERTALIGNMENT = 0; // non-zero enables the debug printing that this header guards with it
+// Helper that computes flanking coordinates of an aligned variant on the haplotype and on the read.
+class Realign {
+public:
+    void getFlankingCoordinatesBetter(const Haplotype & hap, const Read & read, AlignedVariant & av)
+    {
+        int rightFlankHap, leftFlankHap,rightFlankRead, leftFlankRead;
+        //cout << "variant: " << av.getString() << endl;
+        //cout << "startHap: " << av.getStartHap() << endl;
+        //cout << "startRead: " << av.getStartRead() << endl;
+        // For an indel, every haplotype position where an indel of the same length produces the
+        // same sequence widens the flanks. The four results are stored through av.setFlanking().
+        if (av.getType()==Variant::DEL) {
+            const string & seq = av.getSeq();
+            int l = seq.size();
+
+            string origSeq = hap.seq;
+
+            int sh = av.getStartHap();
+            origSeq.erase(sh, l); // haplotype with the deletion applied at its reported start
+            leftFlankHap = sh-1;
+            rightFlankHap = sh+l;
+            int newpos = sh;
+            for (int x=sh-1;x>0;x--) {
+                string newseq = hap.seq;
+                newseq.erase(x, l);
+                if (newseq == origSeq) {
+                    leftFlankHap = x-1;
+                    newpos = x;
+                }
+            }
+            if (leftFlankHap<=0) leftFlankHap = 0;
+            for (int x=sh+1;x<int(hap.seq.size()-l);x++) {
+                string newseq = hap.seq;
+                newseq.erase(x, l);
+                if (newseq == origSeq) {
+                    rightFlankHap = x+l;
+                    newpos = x;
+                }
+            }
+            // Clamp both read flanks into the read; the right-hand clamp used to overwrite leftFlankRead.
+            leftFlankRead = av.getStartRead()- (sh-leftFlankHap)+1; if (leftFlankRead<0) leftFlankRead = 0;
+            rightFlankRead = av.getStartRead()+1 + (rightFlankHap-sh-l); if (rightFlankRead>=int(read.seq.size())) rightFlankRead = int(read.seq.size())-1;
+
+
+
+            //cout << "leftFlankHap: " << leftFlankHap << " rightFlankHap: " << rightFlankHap << endl;
+            //cout << "leftFlankRead: " << leftFlankRead << " rightFlankRead: " << rightFlankRead << endl;
+        } else if (av.getType()==Variant::INS) {
+            const string & seq = av.getSeq();
+            int l = seq.size();
+            string newiseq;
+            string origSeq = hap.seq;
+
+            int sh = av.getStartHap();
+            origSeq.insert(sh, seq); // haplotype with the insertion applied at its reported start
+            leftFlankHap = sh-1;
+            rightFlankHap = sh;
+            int newpos = sh;
+            for (int x=sh-1;x>0;x--) {
+                string newseq = hap.seq;
+                string iseq = origSeq.substr(x, l);
+                newseq.insert(x, iseq);
+                int eq = 0;
+                if (newseq == origSeq) {
+                    leftFlankHap = x-1;
+                    eq = 1;
+                    newpos = x;
+                    newiseq = iseq;
+                }
+                //cout << "x: " << x << " iseq: " << iseq << " eq: " << eq << endl;
+
+            }
+            if (leftFlankHap<=0) leftFlankHap = 0;
+            for (int x=sh+1;x<int(hap.seq.size()-l);x++) {
+                string newseq = hap.seq;
+                string iseq = origSeq.substr(x, l);
+                newseq.insert(x, iseq);
+                int eq=0;
+                if (newseq == origSeq) {
+                    rightFlankHap = x;
+                    eq=1;
+                    newpos = x;
+                }
+                //cout << "x: " << x << " iseq: " << iseq << " eq: " << eq << endl;
+
+            }
+            // Clamp both read flanks into the read; the right-hand clamp used to overwrite leftFlankRead.
+            leftFlankRead = av.getStartRead()- (sh-leftFlankHap)+1; if (leftFlankRead<0) leftFlankRead = 0;
+            rightFlankRead = av.getStartRead()+l + (rightFlankHap-sh); if (rightFlankRead>=int(read.seq.size())) rightFlankRead = int(read.seq.size())-1;
+
+
+
+            //cout << "leftFlankHap: " << leftFlankHap << " rightFlankHap: " << rightFlankHap << endl;
+            //cout << "leftFlankRead: " << leftFlankRead << " rightFlankRead: " << rightFlankRead << endl;
+            //cout << "newiseq: " << newiseq << endl;
+        } else {
+            leftFlankRead = av.getStartRead()-1; if (leftFlankRead<0) leftFlankRead = 0;
+            rightFlankRead = av.getStartRead()+1; if (rightFlankRead>=int(read.seq.size())) rightFlankRead = int(read.seq.size())-1; // clamp the right flank, not the left
+            leftFlankHap = av.getStartHap()-1; if (leftFlankHap<0) leftFlankHap = 0;
+            rightFlankHap = av.getStartHap()+1; if (rightFlankHap>=int(hap.seq.size())) rightFlankHap = int(hap.seq.size())-1; // clamp the right flank, not the left
+        }
+        av.setFlanking(leftFlankHap, rightFlankHap, leftFlankRead, rightFlankRead);
+    }
+};
+
+
+template <typename TSource, typename TSpec>
+inline void
+convertAlignment(
+    Align<TSource, TSpec> const & source, MLAlignment & ml, int hlen, int rlen, const Haplotype & hap, const Read & read)
+{
+    typedef Align<TSource, TSpec> const TAlign;
+    typedef typename Row<TAlign>::Type TRow;
+    typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
+    typedef typename Position<TAlign>::Type TPosition;
+    // Fills ml from the two-row alignment in source: row 0 is walked as the haplotype, row 1 as the read.
+    TPosition begin_ = beginPosition(cols(source));
+    TPosition end_ = endPosition(cols(source));
+    // The gap-scanning loops below test the column bound before they call isGap().
+    Realign realign;
+
+    if (DEBUGCONVERTALIGNMENT) cout << "begin_ " << begin_ << " end_ " << end_ << endl;
+
+
+    ml.relPos = 0;
+    bool fbfound=false;
+
+    TRow& row_ = row(source, 0);
+    typedef typename Iterator<typename Row<TAlign>::Type const, Standard>::Type TIter;
+    TIter begin1_ = iter(row_, begin_);
+    TIter end1_ = iter(row_, end_);
+
+    // ml.align holds one character per haplotype base, ml.hpos one entry per read base.
+    ml.align=string(hlen,'R');
+    ml.hpos=vector<int>(rlen,MLAlignment::LO);
+
+
+    int b=0;
+    int hs=0; // relative start of haplo
+
+    int rb=0;
+    // Leading columns where the haplotype row is gapped: read bases there are marked LO.
+    while (begin_+b<end_ && isGap(row(source,0), begin_+b)) {
+        ml.relPos--;
+        if (!isGap(row(source,1),begin_+b)) {
+            ml.hpos[rb]=MLAlignment::LO;
+            rb++;
+        }
+        ++b;
+    }
+    hs=b;
+
+    if (DEBUGCONVERTALIGNMENT) cout << "relpos: " << ml.relPos << " hs: " << hs << endl;
+
+    int hb=0; // number of haplotype bases
+
+    while (begin_+b<end_ && rb<rlen) {
+        if (DEBUGCONVERTALIGNMENT) cout << "b: " << b << " hb: " << hb << endl;
+        if (isGap(row(source,0), begin_+b)) {
+            if (hb<hlen) {
+                // insertion
+                string seq("+");
+                TIter it=iter(row(source,1), begin_+b);
+                while (begin_+b<end_ && isGap(row(source,0),begin_+b)) {
+                    seq+=convert<char>(*it);
+                    ml.hpos[rb]=MLAlignment::INS;
+                    ++b;
+                    ++it;
+                    ++rb;
+                }
+                if (DEBUGCONVERTALIGNMENT) cout << "insertion: " << hb << " seq: " << seq << " readpos: " << rb-1 << " - " << rb-seq.size()+1 << endl;
+                AlignedVariant av(seq, hb,hb, rb-seq.size()+1, rb-1);
+                realign.getFlankingCoordinatesBetter(hap, read, av);
+                ml.indels[hb]=av;
+            } else {
+                ml.hpos[rb]=MLAlignment::RO;
+                ++rb;
+                ++b;
+            }
+
+        } else {
+            if (!isGap( row(source,1), begin_+b)) {
+                if (!fbfound) {
+                    fbfound=true;
+                    ml.firstBase=hb;
+                }
+                if (row(source,1)[begin_+b]!=row(source,0)[begin_+b]) {
+                    // SNP
+                    string snp("X=>X");
+                    snp[0]=convert<char>(row(source,0)[begin_+b]);
+                    snp[3]=convert<char>(row(source,1)[begin_+b]);
+
+                    if (DEBUGCONVERTALIGNMENT) cout << "SNP: " << hb << " " << convert<char>(row(source,0)[begin_+b]) << "=>" << convert<char>(row(source,1)[begin_+b]) << endl;
+                    ml.snps[hb]=AlignedVariant(snp, hb, hb, rb,rb);
+                    realign.getFlankingCoordinatesBetter(hap, read, ml.snps[hb]);
+                    ml.align[hb]=snp[3];
+                }
+                ml.hpos[rb]=hb;
+                ++rb;
+                ++b;
+                ++hb;
+            } else {
+                // deletion
+                string seq("-");
+                TIter it=iter(row(source,0), begin_+b);
+                int len=0;
+                while (begin_+b<end_ && isGap(row(source,1),begin_+b)) {
+                    seq+=convert<char>(*it);
+                    ml.align[hb]='D';
+                    ++b;
+                    ++it;
+                    ++hb;
+                    ++len;
+                }
+                if (fbfound) {
+                    ml.indels[hb-len]=AlignedVariant(seq, hb-len,hb-1, rb-1,rb);
+                    realign.getFlankingCoordinatesBetter(hap, read, ml.indels[hb-len]);
+                    if (DEBUGCONVERTALIGNMENT) cout << "deletion: " << hb-len << " - " << hb-1 << " seq: " << seq << " readpos: " << rb-1 << " - " << rb << endl;
+                    // a deletion seen before the first aligned read base (fbfound false) is not recorded
+                }
+
+
+            }
+
+        }
+    }
+    ml.lastBase=hb;
+    // NOTE(review): ml.firstBase is only assigned here once an aligned base pair has been seen
+    if (DEBUGCONVERTALIGNMENT) {
+        cout << "mfb: " << ml.firstBase << " " << ml.lastBase << endl;
+        for (int r=0;r<rlen;r++) cout << "[" << r << "," << ml.hpos[r] << "]"; cout << endl;
+    }
+
+}
+
+/*
+int main()
+{
+
+
+
+ // check
+ // seqan::DnaString _refSeq("ATGGCGTGACTGATCCTATCCCCGTT");
+ // seqan::DnaString _hapSeq("TTATATGGCGTG");
+
+ //seqan::DnaString _refSeq("ATGGCGTGACTGATCCTATCGTCGTT");
+ //seqan::DnaString _hapSeq("CCCGGTGACTCC");
+
+ seqan::DnaString _refSeq("ATGGCGTGACTGATCCTATCGTCGTT");
+ seqan::DnaString _hapSeq("CTATCGTCTGTAGGTGTCCT");
+
+
+ seqan::Score<int> score(-1, -460, -100,-960);
+
+ seqan::Align<seqan::DnaString, seqan::ArrayGaps> align;
+ seqan::resize(seqan::rows(align), 2);
+ seqan::assignSource(seqan::row(align, 0), _refSeq);
+ seqan::assignSource(seqan::row(align, 1), _hapSeq);
+ cout << "Score = " << seqan::globalAlignment(align, score) << endl;
+ cout << align << endl;
+
+ MLAlignment ml;
+
+ convertAlignment(align,ml, length(_refSeq),length(_hapSeq));
+
+}
+*/
+
+class ObservationModelSeqAn
+{
+public:
+    ObservationModelSeqAn(const Haplotype & _hap, const Read & r, uint32_t _hapStart, const ObservationModelParameters & _params, const seqan::Score<int> & _score)
+    {
+        score =_score;
+        hap_ptr = &_hap;
+        read_ptr = &r;
+        hapStart=_hapStart;
+        params=_params;
+        aligned=false;
+        // Only the addresses of _hap and r are stored, so both objects must outlive this model.
+        // cout << "hap.seq: " << _hap.seq << endl;
+        // cout << "read: " << r.seq.seq << endl;
+
+    }
+    // Aligns the read against the haplotype once; later calls return immediately.
+    void align()
+    {
+        if (aligned) return;
+        seqan::DnaString _hapSeq(hap_ptr->seq);
+        seqan::DnaString _readSeq(read_ptr->seq.seq);
+        alignResult=MyAlign();
+
+        seqan::resize(seqan::rows(alignResult), 2);
+        seqan::assignSource(seqan::row(alignResult, 0), _hapSeq);
+        seqan::assignSource(seqan::row(alignResult, 1), _readSeq);
+
+        // globalAlignment() returns the alignment score (an int for Score<int>); convert it
+        // directly rather than printing it to a stringstream and parsing it back with atof().
+        ml.ll = static_cast<double>(seqan::globalAlignment(alignResult, score));
+
+
+        if (DEBUGCONVERTALIGNMENT) {
+            cout << alignResult << endl;
+        }
+
+
+        convertAlignment(alignResult, ml, length(_hapSeq), length(_readSeq), *hap_ptr, *read_ptr);
+
+        reportVariants();
+        aligned=true;
+    }
+    const MLAlignment & getMLAlignment() { align(); return ml; } // runs align() on first use
+
+protected:
+    void reportVariants() // records, per haplotype indel and SNP, the result of isCovered() for the aligned range
+    {
+        const Haplotype & hap = *hap_ptr;
+        for (map<int,AlignedVariant>::const_iterator it=hap.indels.begin();it!=hap.indels.end();it++) {
+            const AlignedVariant & av=it->second;
+            if (av.isCovered(params.padCover, ml.firstBase, ml.lastBase)) ml.hapIndelCovered[it->first]=true; else ml.hapIndelCovered[it->first]=false;
+        }
+        for (map<int,AlignedVariant>::const_iterator it=hap.snps.begin();it!=hap.snps.end();it++) {
+            const AlignedVariant & av=it->second;
+            if (av.isCovered(params.padCover, ml.firstBase, ml.lastBase)) ml.hapSNPCovered[it->first]=true; else ml.hapSNPCovered[it->first]=false;
+        }
+    }
+
+    typedef seqan::Align<seqan::DnaString, seqan::ArrayGaps> MyAlign;
+    MyAlign alignResult;
+    ObservationModelParameters params;
+    const Haplotype *hap_ptr; // not owned
+    const Read *read_ptr; // not owned
+    uint32_t hapStart; // stored by the constructor; not read anywhere in this class
+
+
+    MLAlignment ml;
+    seqan::Score<int> score;
+    bool aligned; // true once align() has completed
+
+};
+
+#endif /* OBSERVATIONMODELSEQAN_HPP_ */
diff --git a/OutputData.hpp b/OutputData.hpp
new file mode 100644
index 0000000..ae693eb
--- /dev/null
+++ b/OutputData.hpp
@@ -0,0 +1,121 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * OutputData.hpp
+ *
+ * Created on: Sep 7, 2009
+ * Author: caa
+ */
+
+#ifndef OUTPUTDATA_HPP_
+#define OUTPUTDATA_HPP_
+#include <vector>
+#include "StringHash.hpp"
+#include <string>
+#include <iostream>
+#include "foreach.hpp"
+using namespace std;
+using namespace boost;
+
+class OutputData
+{
+    friend class Line;
+public:
+    // Writes to _out; no columns are registered yet.
+    OutputData(ostream &_out) : out(&_out), numLines(0) {}
+    // Writes to _out and registers every entry of _columnLabels, in order, as a column.
+    OutputData(ostream & _out, const vector<string> & _columnLabels) : out(&_out), numLines(0)
+    {
+        BOOST_FOREACH(const string & name, _columnLabels) (*this)(name);
+    }
+    // Registers label as the next column; throws a string when the label already exists.
+    OutputData & operator() (const string & label)
+    {
+        if (labelToColumn.find(label) != labelToColumn.end()) throw string("Duplicate label ").append(label);
+        const int column = int(labels.size());
+        labelToColumn[label] = column;
+        labels.push_back(label);
+        return *this;
+    }
+    // Returns the column labels in registration order, separated by " ".
+    string headerString() const
+    {
+        stringstream joined;
+        for (size_t i=0; i<labels.size(); ++i) {
+            if (i>0) joined << " ";
+            joined << labels[i];
+        }
+        return joined.str();
+    }
+    // Writes x and a line break to the output stream.
+    template<class T> void outputLine(T x)
+    {
+        (*out) << x << endl;
+    }
+    // One output row: a string cell per registered column, each starting as "NA".
+    class Line
+    {
+    public:
+        Line(const OutputData & od)
+            : lineData(od.labelToColumn.size(),"NA"), labelToColumnPtr(&od.labelToColumn) {}
+        // Returns the cell of columnLabel; throws a string for an unknown label.
+        string get(const string & columnLabel) const
+        {
+            string_hash<int>::const_iterator pos=labelToColumnPtr->find(columnLabel);
+            if (pos == labelToColumnPtr->end()) throw string("Column label ").append(columnLabel).append(" not found!");
+            return lineData[pos->second];
+        }
+        // Stores x, formatted with operator<<, in the cell of columnLabel; throws a string for an unknown label.
+        template<class T> Line & set(const string & columnLabel, T x)
+        {
+            string_hash<int>::const_iterator pos=labelToColumnPtr->find(columnLabel);
+            if (pos == labelToColumnPtr->end()) throw string("Column label ").append(columnLabel).append(" not found!");
+            stringstream cell;
+            cell << x;
+            lineData[pos->second]=cell.str();
+            return *this;
+        }
+        // Returns the cells in column order, separated by " ".
+        string toString() const
+        {
+            stringstream joined;
+            for (size_t i=0; i<lineData.size(); ++i) {
+                if (i>0) joined << " ";
+                joined << lineData[i];
+            }
+            return joined.str();
+        }
+        vector<string> lineData;
+    protected:
+        const string_hash<int> *labelToColumnPtr;
+    };
+    // Counts the row and writes it, followed by a line break, to the output stream.
+    void output(const OutputData::Line & line)
+    {
+        ++numLines;
+        (*out) << line.toString() << endl;
+    }
+    ostream *out;
+protected:
+    typedef string_hash<int>::iterator HashIt;
+    string_hash<int> labelToColumn;
+    vector<string> labels;
+    int numLines;
+};
+
+
+
+
+#endif /* OUTPUTDATA_HPP_ */
diff --git a/README b/README
new file mode 100644
index 0000000..f10cc5f
--- /dev/null
+++ b/README
@@ -0,0 +1,15 @@
+The source code for Dindel itself is released under GPL3.
+
+In the directory you will find parts of the Boost library (www.boost.org), and
+parts of the Seqan library (www.seqan.de). You will still need to install a copy
+of the Boost library in order for the program_options to compile. The parts of
+the Boost library included in Dindel are there to make it compatible with old
+versions of Boost which didn't have 'foreach.hpp' for instance. The Seqan
+library is included as Dindel uses its Needleman-Wunsch algorithm to align
+candidate haplotypes to the reference sequence.
+
+You will need to download the source code for SAMtools in order to compile
+Dindel. The path to the SAMtools source files can be specified in the Makefile.
+
+If all libraries are in place, simply type "make" at the command line to compile
+Dindel.
diff --git a/Read.hpp b/Read.hpp
new file mode 100644
index 0000000..851518a
--- /dev/null
+++ b/Read.hpp
@@ -0,0 +1,451 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef READ_HPP_
+#define READ_HPP_
+#include <cmath>
+#include "Haplotype.hpp"
+#include "bam.h"
+#include "Library.hpp"
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <ext/hash_map>
+#include "MyBam.hpp"
+
+using namespace std;
+
+class Read
+{
+    static const char * libraryErrorPrefix() { return "Cannot find library: "; } // prefix of the library-lookup error; shared by the BAM constructor and fetchFuncVectorPooled()
+public:
+    class FetchReadData // state handed to fetchFuncVectorPooled() through its void* argument
+    {
+    public:
+        FetchReadData(vector<Read> * _reads,int _poolID, const LibraryCollection * _libraries, vector<MyBam *> * _myBams, int _numReads = 0, int _maxNumReads = 100000)
+        {
+            reads=_reads;
+            poolID=_poolID;
+            libraries=_libraries;
+            myBams = _myBams;
+            numUnknownLib=0;
+            numReads = _numReads;
+            maxNumReads = _maxNumReads;
+        }
+        vector<Read> * reads; // destination vector; not owned
+        vector<MyBam *> * myBams; // indexed by poolID to obtain the BAM header
+        int poolID;
+        const LibraryCollection * libraries;
+        int numUnknownLib; // incremented for every string exception caught while constructing a read
+        int numReads;
+        int maxNumReads; // fetchFuncVectorPooled() throws once numReads exceeds this
+        string_hash<int> unknownLib; // library name -> counter (0 on first sight, +1 on each repeat)
+    };
+
+    Read()
+    {
+        init(0);
+    }
+    Read(uint32_t _pos)
+    {
+        init(_pos); // was init(pos), which read the not-yet-initialized member instead of the argument
+    }
+    void init(uint32_t _pos)
+    {
+        pos=_pos;
+        posStat.first=double(_pos);
+        posStat.second=1.0;
+        initBam=false;
+        onReverseStrand=false;
+        poolID=-1;
+        mateLen = -1;
+        matePos = -1;
+        library = NULL; bam = NULL; bamHeader = NULL; // keep every pointer member defined; NOTE(review): mapQual stays unset here
+    }
+    Read(const Read & r)
+    {
+        initBam=false; // nothing allocated yet, so copy() must not try to free bam
+        copy(r,0);
+    }
+    Read(const Read & r, int poolID) // NOTE(review): the poolID argument is never used; the copy keeps r.poolID
+    {
+        initBam=false;
+        copy(r,0);
+    }
+    void copy(const Read &r, int x) // copies every field of r and duplicates its BAM record; x is unused
+    {
+        seq=r.seq;
+        pos=r.pos;
+        qual=r.qual;
+        posStat=r.posStat;
+        mapQual=r.mapQual;
+        onReverseStrand=r.onReverseStrand;
+        poolID=r.poolID;
+        matePos = r.matePos;
+        mateLen = r.mateLen;
+        library = r.library;
+        bamHeader = r.bamHeader;
+        if (initBam) {
+            delete[] bam->data;
+            delete bam;
+            initBam=false;
+        }
+        // duplicate the BAM record, including its whole m_data-sized buffer
+        if (r.initBam) {
+            bam=new bam1_t;
+            *bam=*r.bam;
+            bam->data=new uint8_t[r.bam->m_data];
+            bam->m_data=r.bam->m_data;
+            for (int m=0;m<r.bam->m_data;m++) bam->data[m]=r.bam->data[m];
+            initBam=true;
+        }
+    }
+    Read & operator=(const Read & r)
+    {
+        if (&r!=this) {
+            copy(r,1);
+        }
+        return *this;
+    }
+    Read(const bam1_t *b, const LibraryCollection & libraries, int _poolID, bam_header_t * _bamHeader, const string & overrideLibName = string("") ) // throws a string on a bad quality value or an unknown library
+    {
+        const bam1_core_t *c=&b->core;
+        uint32_t len=c->l_qseq;
+        double mapPhred=(double) c->qual;
+        mapQual=(1.0-pow(10.0, -mapPhred/10.0));
+        if (mapQual<0.0 || mapQual>1.0 || isnan(mapQual) || isinf(mapQual)) throw string("Phred error.");
+        if (mapQual<1e-16) mapQual=1e-16;
+        if (mapQual>1-1e-16) mapQual=1-1e-16;
+
+        // cout << "mapPhred: " << mapPhred << " qmap: " << qmap << endl;
+
+        pos=c->pos;
+
+        seq.reserve(len);
+        qual.reserve(len);
+        for(size_t x=0;x<len;x++) {
+            seq+=( bam_nt16_rev_table[ bam1_seqi(bam1_seq(b), x) ] );
+
+            // convert phred to probability
+            double basePhred=(double) ( ( (uint8_t*) bam1_qual(b))[x] );
+            double q=(1.0-pow(10.0, -basePhred/10.0));
+            if (q<0.0 || q>1.0 ||isnan(q) || isinf(q)) throw string("Phred error.");
+            if (q<1e-16) q=1e-16;
+            if (q>1.0-1e-16) q=1.0-1e-16;
+            qual.push_back( q ); // stored as 1 - 10^(-phred/10), clamped to [1e-16, 1-1e-16]
+        }
+
+        posStat=computePositionStatistics(b);
+        // keep a private copy of the BAM record
+        bam = new bam1_t;
+        *bam=*b;
+        bam->data=new uint8_t[b->m_data];
+        bam->m_data=b->m_data;
+        for (int m=0;m<b->m_data;m++) bam->data[m]=b->data[m];
+        initBam=true;
+
+        if (bam->core.flag & BAM_FREVERSE) onReverseStrand=true; else onReverseStrand=false;
+        poolID=_poolID;
+        matePos = bam->core.mpos;
+        mateLen = -1;
+
+        this->bamHeader = _bamHeader;
+        LibraryCollection::const_iterator it;
+        if (overrideLibName.empty()) {
+            it = libraries.find( this->getLibraryName() );
+        } else {
+            it = libraries.find( overrideLibName );
+        }
+
+        if (it == libraries.end()) {
+            const string missing = this->getLibraryName(); // must be read before the BAM copy is freed: getLibraryName() dereferences bam
+            deleteBam(); initBam = false;
+            throw string(libraryErrorPrefix()).append(missing);
+        } else {
+            library = (const Library *) & (it->second);
+        }
+    }
+    uint32_t getEndPos() const
+    {
+        return bam->core.n_cigar? bam_calend(&bam->core, bam1_cigar(bam)) : bam->core.pos + 1;
+    }
+    string getLibraryName() const
+    {
+        if (this->isPaired()) {
+            const char *p = bam_get_library((bam_header_t *) this->bamHeader, this->bam);
+            if (p) {
+                return string(p);
+            } else {
+                return string("dindel_default");
+            }
+        } else {
+            return string("single_end");
+        }
+    }
+    // The accessors below dereference bam without checking initBam.
+    int32_t getBAMMatePos() const { return bam->core.mpos; }
+    bool isUnmapped() const { return (bam->core.flag & BAM_FUNMAP) != 0 ; }
+    bool mateIsUnmapped() const { return (bam->core.flag & BAM_FMUNMAP) != 0; }
+    bool mateIsReverse() const { return (bam->core.flag & BAM_FMREVERSE) != 0; }
+    bool isReverse() const { return (bam->core.flag & BAM_FREVERSE) != 0; }
+    bool isPaired() const { return (bam->core.flag & BAM_FPAIRED) != 0; }
+    void complement() // swaps A<->T and C<->G in place; every other character is left unchanged
+    {
+        for (size_t s=0;s<this->seq.seq.size();s++) {
+            char & nuc = this->seq.seq[s];
+            if (nuc == 'A') nuc = 'T';
+            else if (nuc == 'T') nuc = 'A';
+            else if (nuc == 'C') nuc = 'G';
+            else if (nuc == 'G') nuc = 'C';
+        }
+    }
+    void reverse() // reverses the base string only; qual is not touched
+    {
+        string newseq = this->seq.seq;
+        size_t len = newseq.size();
+        for (size_t x=0;x<newseq.size();x++) newseq[len-x-1]=this->seq.seq[x];
+        this->seq.seq = newseq;
+    }
+    // Formats the auxiliary fields of the BAM record as tab-prefixed "KEYtype:value" items.
+    string getAuxData() const
+    {
+        stringstream os;
+        uint8_t *s = bam1_aux(bam);
+
+        while (s < bam->data + bam->data_len) {
+            uint8_t type, key[2];
+            key[0] = s[0]; key[1] = s[1];
+            s += 2; type = *s; ++s;
+            //printf("\t%c%c:", key[0], key[1]);
+            os << "\t" << key[0] << key[1];
+            /*
+            if (type == 'A') { printf("A:%c", *s); ++s; }
+            else if (type == 'C') { printf("i:%u", *s); ++s; }
+            else if (type == 'c') { printf("i:%d", *s); ++s; }
+            else if (type == 'S') { printf("i:%u", *(uint16_t*)s); s += 2; }
+            else if (type == 's') { printf("i:%d", *(int16_t*)s); s += 2; }
+            else if (type == 'I') { printf("i:%u", *(uint32_t*)s); s += 4; }
+            else if (type == 'i') { printf("i:%d", *(int32_t*)s); s += 4; }
+            else if (type == 'f') { printf("f:%g", *(float*)s); s += 4; }
+            else if (type == 'Z' || type == 'H') { printf("%c:", type); while (*s) putchar(*s++); ++s; }
+            */
+            if (type == 'A') { os << "A:"<<(char)*s; ++s; }
+            else if (type == 'C') { os << "i:" << (unsigned int) *s; ++s; }
+            else if (type == 'c') { os << "i:" << (int) *(int8_t*)s; ++s; } // 'c' is a signed byte; reading it as uint8_t printed negative values as 128..255
+            else if (type == 'S') { os << "i:" << *(uint16_t*)s; s += 2; }
+            else if (type == 's') { os << "i:" << *(int16_t*)s; s += 2; }
+            else if (type == 'I') { os << "i:" << *(uint32_t*)s; s += 4; }
+            else if (type == 'i') { os << "i:" << *(int32_t*)s; s += 4; }
+            else if (type == 'f') { os << "f:" << *(float*)s; s += 4; }
+            else if (type == 'Z' || type == 'H') { os << type <<":"; while (*s) os << (char) (*s++); ++s; }
+        } // NOTE(review): any other type code is skipped without consuming its payload
+        return os.str();
+    }
+
+    const Library & getLibrary() const { return *this->library; };
+
+    // compute mean and standard deviation of first base position
+    static pair<double, double> computePositionStatistics(const bam1_t *b) // returns (-1,-1) for an empty CIGAR; the second value is a variance
+    {
+        const bam1_core_t *c=&b->core;
+        uint32_t *cigar=bam1_cigar(b);
+        uint32_t k;
+        int32_t pos=0, mean=0,totLen=0;
+
+        uint32_t refPos = c->pos;
+        double var=0.0;
+        if (c->n_cigar==0) {
+            return pair<double,double>(-1.0,-1.0);
+        }
+        // first pass: length-weighted mean offset of the match blocks
+        for (k = 0; k < c->n_cigar; ++k) {
+            int op = cigar[k] & BAM_CIGAR_MASK;
+            int32_t len=cigar[k] >> BAM_CIGAR_SHIFT;
+
+            if (op==BAM_CMATCH) {
+                mean+=len*(pos-totLen);
+                totLen+=len;
+            }
+            // update position for the next cigar
+            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP || op ==BAM_CHARD_CLIP) {
+                pos+=len;
+            }
+        }
+        double dmean=double(mean)/double(totLen);
+        // NOTE(review): totLen is 0 when the CIGAR has no match operation, which makes dmean and var NaN
+        pos=0;
+        totLen=0;
+        for (k = 0; k < c->n_cigar; ++k) {
+            int op = cigar[k] & BAM_CIGAR_MASK;
+            int32_t len=cigar[k] >> BAM_CIGAR_SHIFT;
+
+            if (op==BAM_CMATCH) {
+                var+=double(len)*(double(pos-totLen)-dmean)*(double(pos-totLen)-dmean);
+                totLen+=len;
+            }
+            // update position for the next cigar
+            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP || op ==BAM_CHARD_CLIP) {
+                pos+=len;
+            }
+        }
+        var=var/double(totLen);
+        return pair<double,double>(dmean+double(refPos), var);
+    }
+    /*
+    static void filterReads(vector<Read> & reads, size_t max)
+    {
+    // filter using map quality
+    class SortFunc {
+    public:
+    static bool sortFunc(const Read & r1, const Read & r2)
+    {
+    // sort in decreasing order
+    if (r1.mapQual>r2.mapQual) return true; else return false;
+    }
+    };
+    sort(reads.begin(), reads.end(), SortFunc::sortFunc);
+    //reads.resize(max);
+    vector<Read> filteredReads;
+    for (size_t i=0;i<reads.size() && i<max ;i++) {
+    filteredReads[i]=reads[i];
+    }
+    reads.swap(filteredReads);
+    }
+    static void filterReads(vector<Read> & reads, size_t maxNum, double minMapQual, size_t maxReadLength)
+    {
+    vector<Read> filteredReads;
+    if (minMapQual<0.0) minMapQual=0.0;
+    // filter using map quality
+    class SortFunc {
+    public:
+    static bool sortFunc(const Read & r1, const Read & r2)
+    {
+    // sort in decreasing order
+    if (r1.mapQual>r2.mapQual) return true; else return false;
+    }
+    };
+    for (size_t r=0;r<reads.size();r++) {
+    const bam1_core_t *c=&(reads[r].bam->core);
+    if (reads[r].size()>maxReadLength || c->n_cigar==0) reads[r].mapQual=-1.0;
+
+    }
+
+    sort(reads.begin(), reads.end(), SortFunc::sortFunc);
+    size_t max; for (max=0;max<maxNum && max<reads.size();max++) if (!(reads[max].mapQual<minMapQual)) filteredReads.push_back(Read(reads[max])); else break;
+    reads.swap(filteredReads);
+    }
+    */
+    static void filterReads(vector<Read> & reads, int exclude, const string & match) // keeps reads whose aux string contains match (exclude==0) or lacks it (exclude!=0)
+    {
+        vector<Read> filteredReads;
+        for (size_t r=0;r<reads.size();r++) {
+            string str=reads[r].getAuxData();
+            size_t found=str.find(match);
+            if (exclude) {
+                if (found==string::npos) {
+                    filteredReads.push_back(reads[r]);
+                }
+            } else {
+                // include if match
+                if (found!=string::npos) filteredReads.push_back(reads[r]);
+            }
+        }
+        reads.swap(filteredReads);
+    }
+    /*
+    static int fetchFuncVector(const bam1_t *b, void *data)
+    {
+    FetchReadData *ptr=(FetchReadData *) data;
+
+    if (!( (b->core.flag & BAM_FDUP) || (b->core.flag & BAM_FQCFAIL) )) {
+    ptr->reads->push_back(Read(b, *(ptr->libraries)));
+    }
+    return 0;
+    }
+    */
+    // Callback taking a BAM record and a FetchReadData*; appends the read unless it is flagged duplicate or QC-fail.
+    static int fetchFuncVectorPooled(const bam1_t *b, void *data)
+    {
+        FetchReadData *ptr=(FetchReadData *) data;
+
+        if (!( (b->core.flag & BAM_FDUP) || (b->core.flag & BAM_FQCFAIL) )) {
+            try {
+                ptr->reads->push_back(Read(b, *(ptr->libraries), ptr->poolID, (*(ptr->myBams))[ptr->poolID]->bh));
+                ptr->numReads++;
+            } catch (const string & s) {
+                const string prefix(libraryErrorPrefix());
+                if (s.compare(0, prefix.size(), prefix) == 0) {
+                    // the library name is everything after the prefix (the fixed offset 22 did not match the prefix length)
+                    const string lib = s.substr(prefix.size());
+                    string_hash<int>::iterator _it = ptr->unknownLib.find(lib);
+                    if (_it == ptr->unknownLib.end()) ptr->unknownLib[lib] = 0; else _it->second++;
+                }
+                ptr->numUnknownLib++;
+                ptr->reads->push_back(Read(b, *(ptr->libraries), ptr->poolID, (*(ptr->myBams))[ptr->poolID]->bh, string("single_end")));
+                ptr->numReads++;
+            }
+        }
+        if (ptr->numReads > ptr->maxNumReads) {
+            throw string("Too many reads in region");
+        }
+        if (ptr->numReads % 10000 == 9999) cout << "numreads: " << ptr->numReads << endl;
+        return 0;
+    }
+
+    friend ostream &operator<<(ostream &stream, const Read & read)
+    {
+        stream << "pos: " << read.pos << " 1-mapping quality: " << 1.0-read.mapQual << " "; // was written to cout regardless of the target stream
+        for (size_t b=0;b<read.seq.size();b++) stream << read.seq[b];
+        for (size_t b=0;b<read.qual.size();b++) stream << " " << read.qual[b];
+        return stream;
+    };
+    bam1_t * getBam() const { return bam; };
+
+    size_t size() const { return seq.size(); };
+    void setAllQual(double v) { qual.clear(); qual.reserve(seq.size()); for (size_t x=0;x<seq.size();x++) qual.push_back(v); };
+
+
+    Haplotype seq;
+    vector<double> qual;
+    pair<double,double> posStat;
+    // offset of read with respect to some reference position
+
+    int32_t pos, matePos, mateLen;
+
+    double mapQual;
+    bool initBam; // true while bam and bam->data are owned by this object
+    bool onReverseStrand;
+    int poolID;
+
+    bam_header_t * bamHeader;
+    const Library * library; // pointer to library this read was generated from
+
+    bam1_t *bam;
+
+    void deleteBam() // frees the private BAM copy; callers reset initBam themselves
+    {
+        delete[] bam->data; delete bam;
+    }
+
+    ~Read()
+    {
+        if (initBam) {
+            deleteBam();
+        }
+    }
+};
+
+#endif /*READ_HPP_*/
diff --git a/ReadIndelErrorModel.hpp b/ReadIndelErrorModel.hpp
new file mode 100644
index 0000000..fced785
--- /dev/null
+++ b/ReadIndelErrorModel.hpp
@@ -0,0 +1,57 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * ReadIndelErrorModel.hpp
+ *
+ * Created on: Oct 12, 2009
+ * Author: caa
+ */
+
+#ifndef READINDELERRORMODEL_HPP_
+#define READINDELERRORMODEL_HPP_
+
+// Table-driven error model queried by run length (presumably homopolymer
+// length, going by the "HP" naming - confirm with the caller).
+class ReadIndelErrorModel
+{
+public:
+    // Loads the ten baseline probabilities used for lengths 1..10.
+    ReadIndelErrorModel()
+    {
+        static const double hp[] = { 2.9e-5, 2.9e-5, 2.9e-5, 2.9e-5, 4.3e-5, 1.1e-4, 2.4e-4, 5.7e-4, 1.0e-3, 1.4e-3 };
+        baselineProbs.assign(hp, hp + sizeof(hp) / sizeof(hp[0]));
+    }
+
+    // Returns the error probability for a run of hpLen bases.
+    //  - hpLen below 1 is treated as 1;
+    //  - lengths 1..10 use the table, longer runs add 4.3e-4 per extra base to
+    //    the last table entry;
+    //  - the per-base value is multiplied by the (clamped) length and capped at 0.99.
+    // The clamped length is used for the multiplication too, so the result is
+    // always a positive probability (the unclamped value produced 0 or a
+    // negative number for hpLen <= 0).
+    double getViterbiHPError(int hpLen) const
+    {
+        const int len = (hpLen < 1) ? 1 : hpLen;
+        double pbe;
+        if (len <= 10) {
+            pbe = baselineProbs[len - 1];
+        } else {
+            pbe = baselineProbs[9] + 4.3e-4 * double(len - 10);
+        }
+        pbe *= double(len);
+        if (pbe > 0.99) pbe = 0.99;
+        return pbe;
+    }
+private:
+    std::vector<double> baselineProbs;
+};
+
+
+#endif /* READINDELERRORMODEL_HPP_ */
diff --git a/StringHash.hpp b/StringHash.hpp
new file mode 100644
index 0000000..b8d27cf
--- /dev/null
+++ b/StringHash.hpp
@@ -0,0 +1,40 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * StringHash.hpp
+ *
+ * Created on: Sep 7, 2009
+ * Author: caa
+ */
+
+#ifndef STRINGHASH_HPP_
+#define STRINGHASH_HPP_
+#include <vector>
+#include <ext/hash_map>
+#include <string>
+#include <iostream>
+#include "foreach.hpp"
+using namespace std;
+using namespace boost;
+using __gnu_cxx::hash;
+
+// Hash functor for std::string keys: forwards the string's C representation to
+// the GNU extension hash for const char*.
+// The hash template is named with its namespace so the call cannot be confused
+// with any other `hash` made visible by the using-directives in this header.
+struct my_hash_funct : public std::unary_function<std::string, std::size_t>
+{
+    std::size_t operator()(const std::string & x) const
+    {
+        return __gnu_cxx::hash<const char*>()(x.c_str());
+    }
+};
+
+// hash_map keyed by string and hashed with my_hash_funct; T is the mapped type.
+template<class T>
+class string_hash : public hash_map<string, T, my_hash_funct>
+{
+};
+
+#endif /* STRINGHASH_HPP_ */
diff --git a/Utils.hpp b/Utils.hpp
new file mode 100644
index 0000000..1b4e41b
--- /dev/null
+++ b/Utils.hpp
@@ -0,0 +1,51 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Utils.hpp
+ *
+ * Created on: Mar 11, 2009
+ * Author: caa
+ */
+
+#ifndef UTILS_HPP_
+#define UTILS_HPP_
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <string>
+using namespace std;
+
+// Returns log(exp(l1) + exp(l2)) without leaving log space.
+// The larger argument is factored out so exp() never overflows.
+// Two equal infinities (for example log(0) + log(0)) are returned as-is; the
+// subtraction inf - inf would otherwise turn the result into NaN.
+inline double addLogs(const double l1, const double l2)
+{
+    const double hi = (l1 > l2) ? l1 : l2;
+    const double lo = (l1 > l2) ? l2 : l1;
+    if (hi == lo && std::isinf(hi)) return hi;
+    // log1p keeps precision when exp(lo - hi) is tiny.
+    return hi + std::log1p(std::exp(lo - hi));
+}
+
+// Parses the leading part of `s` into `t` after applying the stream
+// manipulator `f` (for example std::dec or std::hex).
+// Returns true when the extraction succeeded; trailing characters after the
+// parsed value are not checked.
+template <class T>
+bool from_string(T& t,
+                 const std::string& s,
+                 std::ios_base& (*f)(std::ios_base&))
+{
+    std::istringstream parser(s);
+    parser >> f;
+    parser >> t;
+    return !parser.fail();
+}
+
+
+
+#endif /* UTILS_HPP_ */
diff --git a/Variant.hpp b/Variant.hpp
new file mode 100644
index 0000000..36da630
--- /dev/null
+++ b/Variant.hpp
@@ -0,0 +1,179 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Variant.hpp
+ *
+ * Created on: Aug 27, 2009
+ * Author: caa
+ */
+
+#ifndef VARIANT_HPP_
+#define VARIANT_HPP_
+#include <string>
+#include <vector>
+
+using namespace std;
+
+
+// A sequence variant parsed from its textual code:
+//   "-SEQ"  deletion of SEQ      (type DEL, length = SEQ length)
+//   "+SEQ"  insertion of SEQ     (type INS, length = SEQ length)
+//   "X=>Y"  single-base change   (type SNP, length 1, seq keeps the whole code)
+//   "*REF"  reference            (type REF, length 1)
+class Variant
+{
+public:
+    // A default-constructed variant has a defined state: type REF, length 0 and
+    // empty strings (type and length used to be left uninitialised).
+    Variant() : type(REF), length(0) {}
+    // Parses _str; throws std::string("Unrecognized variant") for any other form.
+    Variant(const std::string & _str) : type(REF), length(0) { initFromString(_str); }
+
+    int size() const { return length; }
+    typedef enum { INS, DEL, SNP, REF } Type;
+    const std::string & getString() const { return str; }
+    const std::string & getSeq() const { return seq; }
+    Type getType() const { return type; }
+    bool isIndel() const { return type==INS || type==DEL; }
+    bool isSNP() const { return type==SNP; }
+    bool isRef() const { return type==REF; }
+protected:
+    // Fills type, seq, length and str from the textual code described above.
+    // On an unrecognised code the input is echoed to cout and a std::string is
+    // thrown; the object is left unchanged in that case.
+    void initFromString(const std::string & str)
+    {
+        bool ok = true;
+        if (str.size()>1) {
+            if (str[0]=='-') {
+                // deletion
+                length=int(str.size())-1;
+                seq=str.substr(1, length);
+                type=Variant::DEL;
+            } else if (str[0]=='+') {
+                // insertion
+                length=int(str.size())-1;
+                seq=str.substr(1, length);
+                type=Variant::INS;
+            } else if (str.size()==4 && str[1]=='=' && str[2]=='>') {
+                type=Variant::SNP;
+                seq=str;
+                length=1;
+            } else if (str=="*REF") {
+                type=Variant::REF;
+                seq=std::string("*REF");
+                length=1;
+            } else ok = false;
+        } else ok = false;
+        if (!ok) { std::cout << "input string: " << str << std::endl; throw std::string("Unrecognized variant"); }
+        this->str=str;
+    }
+    Type type;
+    std::string seq;
+    std::string str;
+    int length;
+};
+
+
+// A Variant together with its coordinates in a haplotype and, when known, in a
+// read aligned to that haplotype.
+class AlignedVariant : public Variant
+{
+public:
+    // Default state: every coordinate is -1, freq is -1.0 and addComb is false
+    // (the same "unset" values the other constructors use). The inherited
+    // type/length are given defined values here as well, so a default-built
+    // object never exposes indeterminate members.
+    AlignedVariant()
+        : startHap(-1), endHap(-1), startRead(-1), endRead(-1),
+          leftFlankHap(-1), rightFlankHap(-1), leftFlankRead(-1), rightFlankRead(-1),
+          freq(-1.0), addComb(false)
+    {
+        type = REF;
+        length = 0;
+    }
+
+    // Variant with explicit start/end positions in both haplotype and read.
+    // The flanking positions start out equal to the start/end positions;
+    // freq is -1.0 and addComb is false.
+    AlignedVariant(const string & _str, int _startHap, int _endHap, int _startRead, int _endRead)
+    {
+        initFromString(_str);
+        startHap=_startHap;
+        endHap=_endHap;
+        startRead=_startRead;
+        endRead=_endRead;
+
+        leftFlankHap = startHap;
+        leftFlankRead = startRead;
+
+        rightFlankHap = endHap;
+        rightFlankRead = endRead;
+
+        freq = -1.0;
+        addComb = false;
+    }
+
+    // Variant placed at canonicalPos in the haplotype. A deletion spans
+    // [canonicalPos, canonicalPos+length-1]; every other type starts and ends at
+    // canonicalPos. Read coordinates are set to -1.
+    AlignedVariant(const string & _str, int canonicalPos, double _freq=-1.0, bool _addComb = false)
+    {
+        initFromString(_str);
+        startHap = canonicalPos;
+        if (type==DEL) {
+            endHap = startHap+length-1;
+        } else {
+            endHap = startHap;
+        }
+        startRead = -1;
+        endRead = -1;
+
+        leftFlankHap = startHap;
+        leftFlankRead = startRead;
+
+        rightFlankHap = endHap;
+        rightFlankRead = endRead;
+
+        freq=_freq;
+        addComb = _addComb;
+    }
+
+    // True when [startRead, endRead] lies inside [firstBase+pad, lastBase-pad].
+    bool isCovered(int pad, int firstBase, int lastBase) const
+    {
+        return firstBase+pad<=startRead && lastBase-pad>=endRead;
+    }
+
+    // Orders by haplotype start position, then by the textual code.
+    bool operator<(const AlignedVariant & v) const
+    {
+        if (startHap!=v.startHap) return startHap<v.startHap;
+        return this->getString()<v.getString();
+    }
+
+    // Compares against a (position, type, code) triple. Type and start position
+    // must match; after that
+    //   SNP: characters 1..3 of both codes must agree,
+    //   INS: the codes must be identical,
+    //   DEL: only the code lengths are compared,
+    //   anything else is never equal.
+    bool isEqual(int pos, int type, const string & str) const {
+        if (this->type != type || this->startHap != pos) return false;
+        if (type == AlignedVariant::SNP) return str.substr(1,3)==this->str.substr(1,3);
+        if (type == AlignedVariant::INS) return this->getString() == str;
+        if (type == AlignedVariant::DEL) return this->getString().size()==str.size();
+        return false;
+    }
+
+    int getStartRead() const { return startRead; }
+    int getStartHap() const { return startHap; }
+    int getEndHap() const { return endHap; }
+    double getFreq() const { return freq; }
+    bool getAddComb() const { return addComb; }
+
+    int getLeftFlankHap() const { return leftFlankHap; }
+    int getRightFlankHap() const { return rightFlankHap; }
+    int getLeftFlankRead() const { return leftFlankRead; }
+    int getRightFlankRead() const { return rightFlankRead; }
+
+    // Overrides the four flanking positions.
+    void setFlanking(int _leftFlankHap, int _rightFlankHap, int _leftFlankRead, int _rightFlankRead)
+    {
+        leftFlankRead = _leftFlankRead;
+        rightFlankRead = _rightFlankRead;
+
+        leftFlankHap = _leftFlankHap;
+        rightFlankHap = _rightFlankHap;
+    }
+
+protected:
+    int startHap, endHap; // position of variant in the haplotype the read is aligned to.
+    int startRead, endRead; // position of variant in the read aligned to the haplotype
+    int leftFlankHap, rightFlankHap; // position of left and right base flanking the indel in the _haplotype_ (ie the target sequence)
+    int leftFlankRead, rightFlankRead; // position of left and right base flanking the indel in the _read_ (ie the sequence aligned to the target sequence)
+    double freq;
+    bool addComb; // add combinatorially in generation of candidate haplotypes?
+};
+
+
+
+#endif /* VARIANT_HPP_ */
diff --git a/VariantFile.hpp b/VariantFile.hpp
new file mode 100644
index 0000000..2598fac
--- /dev/null
+++ b/VariantFile.hpp
@@ -0,0 +1,304 @@
+/*
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * VariantFile.hpp
+ *
+ * Created on: Sep 9, 2009
+ * Author: caa
+ */
+
+#ifndef VARIANTFILE_HPP_
+#define VARIANTFILE_HPP_
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "Utils.hpp"
+#include "Variant.hpp"
+using namespace std;
+
+/*
+ * class used as input for realignment
+ */
+
+// A set of aligned candidate variants for one region [leftPos, rightPos] of
+// the sequence named by tid.
+class AlignedCandidates
+{
+public:
+    // Empty candidate set: no variants, empty tid, all positions 0 (the
+    // positions used to be left uninitialised).
+    AlignedCandidates()
+        : tid(""), centerPos(0), leftPos(0), rightPos(0)
+    {
+    }
+    // Stores the arguments and derives centerPos as the midpoint of the region.
+    AlignedCandidates(const string & _tid, const vector<AlignedVariant> & _variants, int _leftPos, int _rightPos)
+        : variants(_variants), tid(_tid), centerPos(0), leftPos(_leftPos), rightPos(_rightPos)
+    {
+        computePositions();
+    }
+    vector<AlignedVariant> variants;
+    string tid;
+    int centerPos, leftPos, rightPos;
+    // Returns a pointer to the first stored variant for which
+    // isEqual(pos, type, str) holds, or NULL when there is none. The pointer
+    // refers to an element of `variants`.
+    const AlignedVariant * findVariant(int pos, int type, const string & str) const
+    {
+        for (size_t x=0;x<variants.size();x++) {
+            if (variants[x].isEqual(pos, type, str)) return &variants[x];
+        }
+        return NULL;
+    }
+private:
+    // centerPos = midpoint of [leftPos, rightPos], rounded towards leftPos.
+    void computePositions()
+    {
+        centerPos = leftPos+(rightPos-leftPos)/2;
+    }
+};
+
+
+// Line-oriented reader for candidate-variant files. Two line layouts are
+// supported, one per reader method:
+//   getLine():        tid pos variant [variant ...] [prior [freq ...]] [# comment]
+//   getLineVector():  tid leftPos rightPos pos,variant[,freq[,addComb]] ...
+//                     (fields inside an entry may be separated by ',' or ';';
+//                      an entry starting with '#' or '%' ends the line)
+// Both return an empty (default-constructed) result for blank or unusable lines.
+class VariantFile
+{
+public:
+    // One parsed line of the getLine() layout. prior is -1.0 when the line
+    // carries no usable prior; freqs is empty when no consistent list of
+    // frequencies was given.
+    class Candidates
+    {
+    public:
+        Candidates()
+        {
+            tid="";
+            pos=0;
+            prior=-1.0;
+        }
+        Candidates(const string & _tid, uint32_t _pos, double _prior, const vector<Variant> & _variants, const vector<double> & _freqs)
+        {
+            tid=_tid;
+            pos=_pos;
+            prior=_prior;
+            variants=_variants;
+            freqs=_freqs;
+        }
+        vector<Variant> variants;
+        vector<double> freqs;
+        double prior;
+        string tid;
+        uint32_t pos;
+    };
+
+
+public:
+    // Opens fileName; throws std::string when the file cannot be opened.
+    VariantFile(const string & fileName)
+    {
+        index=0;
+        isOpen=false;
+        open(fileName);
+    }
+
+    // Opens fileName for reading. Returns 0 on success and throws
+    // std::string("Cannot open variant file <name>") on failure.
+    int open(const string & fileName)
+    {
+        fin.open(fileName.c_str());
+        if (!fin.is_open()) throw string("Cannot open variant file ").append(fileName);
+        isOpen=true;
+        return 0;
+    }
+
+    // True when no file is open or the open file has reached end-of-file.
+    bool eof() { if (isOpen) return fin.eof(); else return true; }
+
+    // Reads the next line in the "tid pos variant..." layout.
+    // isOneBased: subtract one from the position read from the file.
+    // Returns an empty Candidates when the file is not open, the line is blank,
+    // a variant code is malformed, or no variant could be read.
+    Candidates getLine(bool isOneBased=false)
+    {
+        if (!isOpen) return empty;
+
+        uint32_t pos = 0;
+        string tid;
+        double prior=-1.0;
+
+        string line;
+        getline(fin, line);
+        if (line.empty()) return empty;
+
+        istringstream is(line);
+
+        index++;
+
+        if (!is.eof()) is >> tid; else return empty;
+        if (!is.eof()) is >> pos; else return empty;
+
+        // convert to zero-based coordinates
+        if (isOneBased) pos--;
+
+        // get variants from line
+        vector<Variant> variants;
+        vector<double> freqs;
+
+        string col;
+        // Set when the variant list was ended by a token that is not a variant
+        // (the prior column or a comment); col then holds that token.
+        bool sawOtherToken = false;
+
+        try {
+            // Testing the extraction itself (rather than eof() beforehand) keeps
+            // trailing whitespace from re-processing the previous token.
+            while (is >> col) {
+                if (!hasVariantPrefix(col)) { sawOtherToken = true; break; }
+                Variant variant(col);
+                if (variant.getSeq().size()!=0) variants.push_back(variant);
+            }
+        } catch (string err) {
+            cerr << "Could not parse variants in line " << index << " in variants file." << endl;
+            return empty;
+        }
+
+        if (sawOtherToken) {
+            if (col.find('#') != string::npos) return Candidates(tid, pos, prior, variants, freqs);
+
+            // from_string() returns a success flag; the parsed value is stored
+            // through its first argument. Keep -1.0 when the column is not a number.
+            if (!from_string<double>(prior, col, std::dec)) prior = -1.0;
+        }
+
+        bool error=false;
+        string in;
+        while (is >> in) {
+            if (in.find('#')!=string::npos) break;
+            double freq;
+            if (!from_string<double>(freq, in, std::dec)) { error=true; break; }
+            freqs.push_back(freq);
+        }
+
+        // Frequencies are only kept when there is exactly one per variant.
+        if (error || (freqs.size()>0 && freqs.size()!=variants.size())) {
+            freqs.clear();
+            cerr << "Could not parse all frequencies in line " << index << " in variants file." << endl;
+        }
+
+        if (variants.size()==0) {
+            cerr << "Could not parse any variants in line: " << index << " SKIPPING." << endl;
+            return empty;
+        }
+
+        return Candidates(tid, pos, prior, variants, freqs);
+    }
+
+    // Reads the next line in the "tid leftPos rightPos entry..." layout.
+    // isOneBased: subtract one from each variant position read from the file.
+    // Throws std::string when a region boundary cannot be parsed. Returns an
+    // empty AlignedCandidates when the file is not open, the line is blank or
+    // too short, an entry is malformed, or no variant could be read.
+    AlignedCandidates getLineVector(bool isOneBased=false)
+    {
+        if (!isOpen) return aligned_empty;
+
+        uint32_t pos = 0;
+        int leftPos = 0, rightPos = 0;
+        string tid;
+
+        string line;
+        getline(fin, line);
+        if (line.empty()) return aligned_empty;
+
+        istringstream is(line);
+
+        index++;
+
+        if (!is.eof()) is >> tid; else return aligned_empty;
+        if (!is.eof()) {
+            string str;
+            is >> str;
+            if (!from_string<int>(leftPos, str, std::dec)) throw string("Cannot read left boundary of region.");
+
+        } else return aligned_empty;
+        if (!is.eof()) {
+            string str;
+            is >> str;
+            if (!from_string<int>(rightPos, str, std::dec)) throw string("Cannot read right boundary of region.");
+
+        } else return aligned_empty;
+
+        // get variants from line
+        vector<AlignedVariant> variants;
+
+        try {
+            string pvf_str;
+            while (is >> pvf_str) {
+                if (pvf_str[0]=='#' || pvf_str[0]=='%') break;
+
+                // split the entry into pos, variant, freq, addComb
+                vector<string> els;
+                int lastpos=0;
+                for (int x=0;x<int(pvf_str.size());x++) {
+                    if ((pvf_str[x]==';' || pvf_str[x]==',') && x-lastpos>0) {
+                        els.push_back(pvf_str.substr(lastpos,x-lastpos));
+                        lastpos=x+1;
+                    }
+                }
+                els.push_back(pvf_str.substr(lastpos, pvf_str.size()-lastpos));
+
+                if (els.size()<2) {
+                    cerr << "Error reading line in variantfile!\n";
+                } else {
+                    double freq=-1.0;
+                    bool addComb=false;
+                    if (!from_string<uint32_t>(pos, els[0], std::dec)) throw string("Cannot read position");
+                    // convert to zero-based coordinates
+                    if (isOneBased) pos--;
+
+                    string & col = els[1];
+                    if (!hasVariantPrefix(col)) throw string("Unrecognized variant");
+
+                    if (els.size()>2) {
+                        if (!from_string<double>(freq, els[2],std::dec)) throw string("Cannot read prior/frequency");
+                    }
+                    if (els.size()>3) {
+                        int addc;
+                        if (!from_string<int>(addc, els[3],std::dec)) throw string("Cannot add_combinatorial");
+                        if (addc) addComb = true;
+                    }
+
+                    AlignedVariant variant(col,pos, freq, addComb);
+                    if (variant.getSeq().size()!=0) {
+                        variants.push_back(variant);
+                    }
+                }
+            }
+        } catch (string err) {
+            cerr << "Could not parse variants in line " << index << " in variants file." << endl;
+            cerr << "Error: " << err << endl;
+            return aligned_empty;
+        }
+
+        if (variants.size()==0) {
+            cerr << "Could not parse any variants in line: " << index << " SKIPPING." << endl;
+            return aligned_empty;
+        }
+
+        return AlignedCandidates(tid, variants, leftPos, rightPos);
+    }
+
+    ~VariantFile()
+    {
+        fin.close();
+    }
+
+protected:
+    ifstream fin;
+    bool isOpen;
+    Candidates empty;
+    AlignedCandidates aligned_empty;
+    int index;   // number of non-blank lines read so far; used in diagnostics
+
+private:
+    // True when s is non-empty and starts with one of the characters that can
+    // begin a variant code in these files: '-', '+', 'A', 'C', 'G', 'T', 'R'.
+    static bool hasVariantPrefix(const string & s)
+    {
+        if (s.empty()) return false;
+        const char c = s[0];
+        return c=='-' || c=='+' || c=='A' || c=='C' || c=='G' || c=='T' || c=='R';
+    }
+};
+
+
+#endif /* VARIANTFILE_HPP_ */
diff --git a/digamma.hpp b/digamma.hpp
new file mode 100644
index 0000000..a1008d2
--- /dev/null
+++ b/digamma.hpp
@@ -0,0 +1,450 @@
+// (C) Copyright John Maddock 2006.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_SF_DIGAMMA_HPP
+#define BOOST_MATH_SF_DIGAMMA_HPP
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#include <boost/math/tools/rational.hpp>
+#include <boost/math/tools/promotion.hpp>
+#include <boost/math/policies/error_handling.hpp>
+#include <boost/math/constants/constants.hpp>
+#include <boost/mpl/comparison.hpp>
+
+namespace boost{
+namespace math{
+namespace detail{
+//
+// Begin by defining the smallest value for which it is safe to
+// use the asymptotic expansion for digamma:
+//
+// Threshold returned for the mpl::int_<0> tag.
+inline unsigned digamma_large_lim(const mpl::int_<0>*)
+{
+   return 20;
+}
+
+// Threshold returned for every other tag (any pointer converts to const void*).
+inline unsigned digamma_large_lim(const void*)
+{
+   return 10;
+}
+//
+// Implementations of the asymptotic expansion come next,
+// the coefficients of the series have been evaluated
+// in advance at high precision, and the series truncated
+// at the first term that's too small to affect the result.
+// Note that the series becomes divergent after a while
+// so truncation is very important.
+//
+// This first one gives 34-digit precision for x >= 20:
+//
+// Large-argument evaluation selected by the mpl::int_<0> tag.
+// Computes log(y) + 1/(2y) - z*P(z) with y = x-1 and z = 1/y^2, where P is the
+// 17-coefficient table below.
+template <class T>
+inline T digamma_imp_large(T x, const mpl::int_<0>*)
+{
+ BOOST_MATH_STD_USING // ADL of std functions.
+ // Coefficients of the polynomial in z, lowest order first.
+ static const T P[] = {
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.0083333333333333333333333333333333333333333333333333L,
+ 0.003968253968253968253968253968253968253968253968254L,
+ -0.0041666666666666666666666666666666666666666666666667L,
+ 0.0075757575757575757575757575757575757575757575757576L,
+ -0.021092796092796092796092796092796092796092796092796L,
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.44325980392156862745098039215686274509803921568627L,
+ 3.0539543302701197438039543302701197438039543302701L,
+ -26.456212121212121212121212121212121212121212121212L,
+ 281.4601449275362318840579710144927536231884057971L,
+ -3607.510546398046398046398046398046398046398046398L,
+ 54827.583333333333333333333333333333333333333333333L,
+ -974936.82385057471264367816091954022988505747126437L,
+ 20052695.796688078946143462272494530559046688078946L,
+ -472384867.72162990196078431372549019607843137254902L,
+ 12635724795.916666666666666666666666666666666666667L
+ };
+ // Everything below works on x-1.
+ x -= 1;
+ T result = log(x);
+ result += 1 / (2 * x);
+ T z = 1 / (x*x);
+ result -= z * tools::evaluate_polynomial(P, z);
+ return result;
+}
+//
+// 19-digit precision for x >= 10:
+//
+// Large-argument evaluation selected by the mpl::int_<64> tag.
+// Same formula as the other overloads - log(y) + 1/(2y) - z*P(z) with y = x-1
+// and z = 1/y^2 - using an 11-coefficient table.
+template <class T>
+inline T digamma_imp_large(T x, const mpl::int_<64>*)
+{
+ BOOST_MATH_STD_USING // ADL of std functions.
+ // Coefficients of the polynomial in z, lowest order first.
+ static const T P[] = {
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.0083333333333333333333333333333333333333333333333333L,
+ 0.003968253968253968253968253968253968253968253968254L,
+ -0.0041666666666666666666666666666666666666666666666667L,
+ 0.0075757575757575757575757575757575757575757575757576L,
+ -0.021092796092796092796092796092796092796092796092796L,
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.44325980392156862745098039215686274509803921568627L,
+ 3.0539543302701197438039543302701197438039543302701L,
+ -26.456212121212121212121212121212121212121212121212L,
+ 281.4601449275362318840579710144927536231884057971L,
+ };
+ // Everything below works on x-1.
+ x -= 1;
+ T result = log(x);
+ result += 1 / (2 * x);
+ T z = 1 / (x*x);
+ result -= z * tools::evaluate_polynomial(P, z);
+ return result;
+}
+//
+// 17-digit precision for x >= 10:
+//
+// Large-argument evaluation selected by the mpl::int_<53> tag.
+// Same formula as the other overloads - log(y) + 1/(2y) - z*P(z) with y = x-1
+// and z = 1/y^2 - using an 8-coefficient table.
+template <class T>
+inline T digamma_imp_large(T x, const mpl::int_<53>*)
+{
+ BOOST_MATH_STD_USING // ADL of std functions.
+ // Coefficients of the polynomial in z, lowest order first.
+ static const T P[] = {
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.0083333333333333333333333333333333333333333333333333L,
+ 0.003968253968253968253968253968253968253968253968254L,
+ -0.0041666666666666666666666666666666666666666666666667L,
+ 0.0075757575757575757575757575757575757575757575757576L,
+ -0.021092796092796092796092796092796092796092796092796L,
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.44325980392156862745098039215686274509803921568627L
+ };
+ // Everything below works on x-1.
+ x -= 1;
+ T result = log(x);
+ result += 1 / (2 * x);
+ T z = 1 / (x*x);
+ result -= z * tools::evaluate_polynomial(P, z);
+ return result;
+}
+//
+// 9-digit precision for x >= 10:
+//
+// Large-argument evaluation selected by the mpl::int_<24> tag.
+// Same formula as the other overloads - log(y) + 1/(2y) - z*P(z) with y = x-1
+// and z = 1/y^2 - using a 3-coefficient table.
+template <class T>
+inline T digamma_imp_large(T x, const mpl::int_<24>*)
+{
+ BOOST_MATH_STD_USING // ADL of std functions.
+ // Coefficients of the polynomial in z, lowest order first.
+ static const T P[] = {
+ 0.083333333333333333333333333333333333333333333333333L,
+ -0.0083333333333333333333333333333333333333333333333333L,
+ 0.003968253968253968253968253968253968253968253968254L
+ };
+ // Everything below works on x-1.
+ x -= 1;
+ T result = log(x);
+ result += 1 / (2 * x);
+ T z = 1 / (x*x);
+ result -= z * tools::evaluate_polynomial(P, z);
+ return result;
+}
+//
+// Now follow rational approximations over the range [1,2].
+//
+// 35-digit precision:
+//
+// Rational approximation selected by the mpl::int_<0> tag; the caller reduces
+// x to the interval [1,2] before calling. Returns g*Y + g*R(x-1), where g is x
+// minus the root and R = P/Q.
+template <class T>
+T digamma_imp_1_2(T x, const mpl::int_<0>*)
+{
+ //
+ // Now the approximation, we use the form:
+ //
+ // digamma(x) = (x - root) * (Y + R(x-1))
+ //
+ // Where root is the location of the positive root of digamma,
+ // Y is a constant, and R is optimised for low absolute error
+ // compared to Y.
+ //
+ // Max error found at 128-bit long double precision: 5.541e-35
+ // Maximum Deviation Found (approximation error): 1.965e-35
+ //
+ static const float Y = 0.99558162689208984375F;
+
+ // The root is stored as five pieces of decreasing magnitude that are
+ // subtracted one after another below (presumably to limit rounding error
+ // in x - root).
+ static const T root1 = 1569415565.0 / 1073741824uL;
+ static const T root2 = (381566830.0 / 1073741824uL) / 1073741824uL;
+ static const T root3 = ((111616537.0 / 1073741824uL) / 1073741824uL) / 1073741824uL;
+ static const T root4 = (((503992070.0 / 1073741824uL) / 1073741824uL) / 1073741824uL) / 1073741824uL;
+ static const T root5 = 0.52112228569249997894452490385577338504019838794544e-36L;
+
+ // Numerator coefficients of R, lowest order first.
+ static const T P[] = {
+ 0.25479851061131551526977464225335883769L,
+ -0.18684290534374944114622235683619897417L,
+ -0.80360876047931768958995775910991929922L,
+ -0.67227342794829064330498117008564270136L,
+ -0.26569010991230617151285010695543858005L,
+ -0.05775672694575986971640757748003553385L,
+ -0.0071432147823164975485922555833274240665L,
+ -0.00048740753910766168912364555706064993274L,
+ -0.16454996865214115723416538844975174761e-4L,
+ -0.20327832297631728077731148515093164955e-6L
+ };
+ // Denominator coefficients of R, lowest order first.
+ static const T Q[] = {
+ 1,
+ 2.6210924610812025425088411043163287646L,
+ 2.6850757078559596612621337395886392594L,
+ 1.4320913706209965531250495490639289418L,
+ 0.4410872083455009362557012239501953402L,
+ 0.081385727399251729505165509278152487225L,
+ 0.0089478633066857163432104815183858149496L,
+ 0.00055861622855066424871506755481997374154L,
+ 0.1760168552357342401304462967950178554e-4L,
+ 0.20585454493572473724556649516040874384e-6L,
+ -0.90745971844439990284514121823069162795e-11L,
+ 0.48857673606545846774761343500033283272e-13L,
+ };
+ // g = x - root, built up piece by piece.
+ T g = x - root1;
+ g -= root2;
+ g -= root3;
+ g -= root4;
+ g -= root5;
+ T r = tools::evaluate_polynomial(P, x-1) / tools::evaluate_polynomial(Q, x-1);
+ T result = g * Y + g * r;
+
+ return result;
+}
+//
+// 19-digit precision:
+//
+// Rational approximation selected by the mpl::int_<64> tag; the caller reduces
+// x to the interval [1,2] before calling. Returns g*Y + g*R(x-1), where g is x
+// minus the root and R = P/Q.
+template <class T>
+T digamma_imp_1_2(T x, const mpl::int_<64>*)
+{
+ //
+ // Now the approximation, we use the form:
+ //
+ // digamma(x) = (x - root) * (Y + R(x-1))
+ //
+ // Where root is the location of the positive root of digamma,
+ // Y is a constant, and R is optimised for low absolute error
+ // compared to Y.
+ //
+ // Max error found at 80-bit long double precision: 5.016e-20
+ // Maximum Deviation Found (approximation error): 3.575e-20
+ //
+ static const float Y = 0.99558162689208984375F;
+
+ // The root is stored as three pieces of decreasing magnitude that are
+ // subtracted one after another below.
+ static const T root1 = 1569415565.0 / 1073741824uL;
+ static const T root2 = (381566830.0 / 1073741824uL) / 1073741824uL;
+ static const T root3 = 0.9016312093258695918615325266959189453125e-19L;
+
+ // Numerator coefficients of R, lowest order first.
+ static const T P[] = {
+ 0.254798510611315515235L,
+ -0.314628554532916496608L,
+ -0.665836341559876230295L,
+ -0.314767657147375752913L,
+ -0.0541156266153505273939L,
+ -0.00289268368333918761452L
+ };
+ // Denominator coefficients of R, lowest order first.
+ static const T Q[] = {
+ 1,
+ 2.1195759927055347547L,
+ 1.54350554664961128724L,
+ 0.486986018231042975162L,
+ 0.0660481487173569812846L,
+ 0.00298999662592323990972L,
+ -0.165079794012604905639e-5L,
+ 0.317940243105952177571e-7L
+ };
+ // g = x - root, built up piece by piece.
+ T g = x - root1;
+ g -= root2;
+ g -= root3;
+ T r = tools::evaluate_polynomial(P, x-1) / tools::evaluate_polynomial(Q, x-1);
+ T result = g * Y + g * r;
+
+ return result;
+}
+//
+// 18-digit precision:
+//
+// Rational approximation selected by the mpl::int_<53> tag; the caller reduces
+// x to the interval [1,2] before calling. Returns g*Y + g*R(x-1), where g is x
+// minus the root and R = P/Q.
+template <class T>
+T digamma_imp_1_2(T x, const mpl::int_<53>*)
+{
+ //
+ // Now the approximation, we use the form:
+ //
+ // digamma(x) = (x - root) * (Y + R(x-1))
+ //
+ // Where root is the location of the positive root of digamma,
+ // Y is a constant, and R is optimised for low absolute error
+ // compared to Y.
+ //
+ // Maximum Deviation Found: 1.466e-18
+ // At double precision, max error found: 2.452e-17
+ //
+ static const float Y = 0.99558162689208984F;
+
+ // The root is stored as three pieces of decreasing magnitude that are
+ // subtracted one after another below.
+ static const T root1 = 1569415565.0 / 1073741824uL;
+ static const T root2 = (381566830.0 / 1073741824uL) / 1073741824uL;
+ static const T root3 = 0.9016312093258695918615325266959189453125e-19L;
+
+ // Numerator coefficients of R, lowest order first.
+ static const T P[] = {
+ 0.25479851061131551L,
+ -0.32555031186804491L,
+ -0.65031853770896507L,
+ -0.28919126444774784L,
+ -0.045251321448739056L,
+ -0.0020713321167745952L
+ };
+ // Denominator coefficients of R, lowest order first.
+ static const T Q[] = {
+ 1L,
+ 2.0767117023730469L,
+ 1.4606242909763515L,
+ 0.43593529692665969L,
+ 0.054151797245674225L,
+ 0.0021284987017821144L,
+ -0.55789841321675513e-6L
+ };
+ // g = x - root, built up piece by piece.
+ T g = x - root1;
+ g -= root2;
+ g -= root3;
+ T r = tools::evaluate_polynomial(P, x-1) / tools::evaluate_polynomial(Q, x-1);
+ T result = g * Y + g * r;
+
+ return result;
+}
+//
+// 9-digit precision:
+//
+// Rational approximation selected by the mpl::int_<24> tag; the caller reduces
+// x to the interval [1,2] before calling. Returns g*Y + g*R(x-1), where g is x
+// minus the root and R = P/Q.
+template <class T>
+inline T digamma_imp_1_2(T x, const mpl::int_<24>*)
+{
+ //
+ // Now the approximation, we use the form:
+ //
+ // digamma(x) = (x - root) * (Y + R(x-1))
+ //
+ // Where root is the location of the positive root of digamma,
+ // Y is a constant, and R is optimised for low absolute error
+ // compared to Y.
+ //
+ // Maximum Deviation Found: 3.388e-010
+ // At float precision, max error found: 2.008725e-008
+ //
+ static const float Y = 0.99558162689208984f;
+ // The root is stored as a main part plus a small correction; both are
+ // subtracted from x below.
+ static const T root = 1532632.0f / 1048576;
+ static const T root_minor = static_cast<T>(0.3700660185912626595423257213284682051735604e-6L);
+ // Numerator coefficients of R, lowest order first.
+ static const T P[] = {
+ 0.25479851023250261e0,
+ -0.44981331915268368e0,
+ -0.43916936919946835e0,
+ -0.61041765350579073e-1
+ };
+ // Denominator coefficients of R, lowest order first.
+ static const T Q[] = {
+ 0.1e1,
+ 0.15890202430554952e1,
+ 0.65341249856146947e0,
+ 0.63851690523355715e-1
+ };
+ // g = x - root.
+ T g = x - root;
+ g -= root_minor;
+ T r = tools::evaluate_polynomial(P, x-1) / tools::evaluate_polynomial(Q, x-1);
+ T result = g * Y + g * r;
+
+ return result;
+}
+
+// Driver for all precisions. `t` is a tag pointer that only selects which
+// digamma_large_lim / digamma_imp_large / digamma_imp_1_2 overload is used;
+// `pol` is forwarded to the pole-error handler.
+// Steps: reflect negative x, then either use the large-argument routine or
+// shift x into [1,2] and use the rational approximation.
+template <class T, class Tag, class Policy>
+T digamma_imp(T x, const Tag* t, const Policy& pol)
+{
+ //
+ // This handles reflection of negative arguments, and all our
+ // error handling, then forwards to the T-specific approximation.
+ //
+ BOOST_MATH_STD_USING // ADL of std functions.
+
+ T result = 0;
+ //
+ // Check for negative arguments and use reflection:
+ //
+ if(x < 0)
+ {
+ // Reflect:
+ x = 1 - x;
+ // Argument reduction for tan:
+ T remainder = x - floor(x);
+ // Shift to negative if > 0.5:
+ if(remainder > 0.5)
+ {
+ remainder -= 1;
+ }
+ //
+ // check for evaluation at a negative pole:
+ //
+ if(remainder == 0)
+ {
+ // x was reflected above, so 1-x is the argument the caller passed in.
+ return policies::raise_pole_error<T>("boost::math::digamma<%1%>(%1%)", 0, (1-x), pol);
+ }
+ result = constants::pi<T>() / tan(constants::pi<T>() * remainder);
+ }
+ //
+ // If we're above the lower-limit for the
+ // asymptotic expansion then use it:
+ //
+ if(x >= digamma_large_lim(t))
+ {
+ result += digamma_imp_large(x, t);
+ }
+ else
+ {
+ //
+ // If x > 2 reduce to the interval [1,2]:
+ //
+ while(x > 2)
+ {
+ x -= 1;
+ result += 1/x;
+ }
+ //
+ // If x < 1 use recurrence to shift to > 1:
+ //
+ if(x < 1)
+ {
+ // Plain assignment is enough: this branch is only reachable when the
+ // reflection above was not taken (reflection leaves x > 1), so result
+ // is still 0 here.
+ result = -1/x;
+ x += 1;
+ }
+ result += digamma_imp_1_2(x, t);
+ }
+ return result;
+}
+
+} // namespace detail
+
+// Public entry point taking an explicit policy.
+// Chooses an implementation tag from the policy's precision for T, evaluates
+// detail::digamma_imp in the policy's evaluation type, and narrows the result
+// back to the promoted argument type with a checked cast.
+template <class T, class Policy>
+inline typename tools::promote_args<T>::type
+ digamma(T x, const Policy& pol)
+{
+ typedef typename tools::promote_args<T>::type result_type;
+ typedef typename policies::evaluation<result_type, Policy>::type value_type;
+ typedef typename policies::precision<T, Policy>::type precision_type;
+ // Tag selection by precision:
+ //   <= 0 or > 64  -> int_<0>
+ //   < 25          -> int_<24>
+ //   < 54          -> int_<53>
+ //   otherwise     -> int_<64>
+ typedef typename mpl::if_<
+ mpl::or_<
+ mpl::less_equal<precision_type, mpl::int_<0> >,
+ mpl::greater<precision_type, mpl::int_<64> >
+ >,
+ mpl::int_<0>,
+ typename mpl::if_<
+ mpl::less<precision_type, mpl::int_<25> >,
+ mpl::int_<24>,
+ typename mpl::if_<
+ mpl::less<precision_type, mpl::int_<54> >,
+ mpl::int_<53>,
+ mpl::int_<64>
+ >::type
+ >::type
+ >::type tag_type;
+
+ // The tag is passed as a null pointer; only its type is used for overload
+ // selection.
+ return policies::checked_narrowing_cast<result_type, Policy>(detail::digamma_imp(
+ static_cast<value_type>(x),
+ static_cast<const tag_type*>(0), pol), "boost::math::digamma<%1%>(%1%)");
+}
+
+// Convenience overload: evaluates digamma(x) with a default-constructed
+// policies::policy<>.
+template <class T>
+inline typename tools::promote_args<T>::type
+   digamma(T x)
+{
+   policies::policy<> default_policy;
+   return digamma(x, default_policy);
+}
+
+} // namespace math
+} // namespace boost
+#endif
+
diff --git a/foreach.hpp b/foreach.hpp
new file mode 100644
index 0000000..b9018f8
--- /dev/null
+++ b/foreach.hpp
@@ -0,0 +1,812 @@
+///////////////////////////////////////////////////////////////////////////////
+// foreach.hpp header file
+//
+// Copyright 2004 Eric Niebler.
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+//
+// Credits:
+// Anson Tsao - for the initial inspiration and several good suggestions.
+// Thorsten Ottosen - for Boost.Range, and for suggesting a way to detect
+// const-qualified rvalues at compile time on VC7.1+
+// Russell Hind - For help porting to Borland
+// Alisdair Meredith - For help porting to Borland
+// Stefan Slapeta - For help porting to Intel
+
+#ifndef BOOST_FOREACH
+
+// MS compatible compilers support #pragma once
+#if defined(_MSC_VER) && (_MSC_VER >= 1020)
+# pragma once
+#endif
+
+#include <cstddef>
+#include <utility> // for std::pair
+
+#include <boost/config.hpp>
+#include <boost/detail/workaround.hpp>
+
+// Some compilers let us detect even const-qualified rvalues at compile-time
+#if BOOST_WORKAROUND(BOOST_MSVC, >= 1310) \
+ || (BOOST_WORKAROUND(__GNUC__, >= 4) && !defined(BOOST_INTEL)) \
+ || (BOOST_WORKAROUND(__GNUC__, == 3) && (__GNUC_MINOR__ >= 4) && !defined(BOOST_INTEL))
+# define BOOST_FOREACH_COMPILE_TIME_CONST_RVALUE_DETECTION
+#else
+// Some compilers allow temporaries to be bound to non-const references.
+// These compilers make it impossible for BOOST_FOREACH to detect
+// temporaries and avoid reevaluation of the collection expression.
+# if BOOST_WORKAROUND(BOOST_MSVC, <= 1300) \
+ || BOOST_WORKAROUND(__BORLANDC__, BOOST_TESTED_AT(0x564)) \
+ || (BOOST_WORKAROUND(BOOST_INTEL_CXX_VERSION, <= 700) && defined(_MSC_VER)) \
+ || BOOST_WORKAROUND(__SUNPRO_CC, BOOST_TESTED_AT(0x570)) \
+ || BOOST_WORKAROUND(__DECCXX_VER, BOOST_TESTED_AT(60590042))
+# define BOOST_FOREACH_NO_RVALUE_DETECTION
+# endif
+// Some compilers do not correctly implement the lvalue/rvalue conversion
+// rules of the ternary conditional operator.
+# if defined(BOOST_FOREACH_NO_RVALUE_DETECTION) \
+ || defined(BOOST_NO_SFINAE) \
+ || BOOST_WORKAROUND(BOOST_MSVC, BOOST_TESTED_AT(1400)) \
+ || BOOST_WORKAROUND(BOOST_INTEL_WIN, <= 810) \
+ || BOOST_WORKAROUND(__GNUC__, < 3) \
+ || (BOOST_WORKAROUND(__GNUC__, == 3) && (__GNUC_MINOR__ <= 2)) \
+ || (BOOST_WORKAROUND(__GNUC__, == 3) && (__GNUC_MINOR__ <= 3) && defined(__APPLE_CC__)) \
+ || BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) \
+ || BOOST_WORKAROUND(__MWERKS__, BOOST_TESTED_AT(0x3206))
+# define BOOST_FOREACH_NO_CONST_RVALUE_DETECTION
+# else
+# define BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION
+# endif
+#endif
+
+#include <boost/mpl/if.hpp>
+#include <boost/mpl/logical.hpp>
+#include <boost/mpl/eval_if.hpp>
+#include <boost/noncopyable.hpp>
+#include <boost/range/end.hpp>
+#include <boost/range/begin.hpp>
+#include <boost/range/result_iterator.hpp>
+#include <boost/type_traits/is_array.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <boost/type_traits/is_abstract.hpp>
+#include <boost/type_traits/is_base_and_derived.hpp>
+#include <boost/iterator/iterator_traits.hpp>
+#include <boost/utility/addressof.hpp>
+
+#ifdef BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION
+# include <new>
+# include <boost/aligned_storage.hpp>
+# include <boost/utility/enable_if.hpp>
+# include <boost/type_traits/remove_const.hpp>
+#endif
+
+// Tag enum for the boost_foreach_is_* customization functions. This must be at global scope, hence the uglified name
+enum boost_foreach_argument_dependent_lookup_hack
+{
+ boost_foreach_argument_dependent_lookup_hack_value // the value the BOOST_FOREACH_IS_* macros pass as the tag argument
+};
+
+namespace boost
+{
+
+// forward declarations for iterator_range
+template<typename T>
+class iterator_range;
+
+// forward declarations for sub_range
+template<typename T>
+class sub_range;
+
+namespace foreach
+{
+ ///////////////////////////////////////////////////////////////////////////////
+ // in_range
+ // Bundles two values of the same type T (named begin and end) into a std::pair<T, T>.
+ template<typename T>
+ inline std::pair<T, T> in_range(T begin, T end)
+ {
+ return std::make_pair(begin, end); // both arguments are taken and returned by value
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // boost::foreach::tag
+ // Namespace-scoped alias for the global enum boost_foreach_argument_dependent_lookup_hack.
+ typedef boost_foreach_argument_dependent_lookup_hack tag;
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // boost::foreach::is_lightweight_proxy
+ // Specialize this for user-defined collection types if they are inexpensive to copy.
+ // This tells BOOST_FOREACH it can avoid the rvalue/lvalue detection stuff.
+ template<typename T>
+ struct is_lightweight_proxy
+ : boost::mpl::false_ // primary template answers false; specializations opt a type in
+ {
+ };
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // boost::foreach::is_noncopyable
+ // Specialize this for user-defined collection types if they cannot be copied.
+ // This also tells BOOST_FOREACH to avoid the rvalue/lvalue detection stuff.
+ template<typename T>
+ struct is_noncopyable
+ #if !defined(BOOST_BROKEN_IS_BASE_AND_DERIVED) && !defined(BOOST_NO_IS_ABSTRACT)
+ : boost::mpl::or_< // both traits usable: true for abstract types and for types derived from boost::noncopyable
+ boost::is_abstract<T>
+ , boost::is_base_and_derived<boost::noncopyable, T>
+ >
+ #elif !defined(BOOST_BROKEN_IS_BASE_AND_DERIVED)
+ : boost::is_base_and_derived<boost::noncopyable, T> // no is_abstract: only derivation from boost::noncopyable is checked
+ #elif !defined(BOOST_NO_IS_ABSTRACT)
+ : boost::is_abstract<T> // is_base_and_derived is broken: only abstractness is checked
+ #else
+ : boost::mpl::false_ // neither trait usable: always false unless specialized
+ #endif
+ {
+ };
+
+} // namespace foreach
+
+} // namespace boost
+
+// Tag parameter type of the generic overloads below. vc6/7 needs help ordering those overloads, so there it is an ellipsis
+#ifdef BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+# define BOOST_FOREACH_TAG_DEFAULT ...
+#else
+# define BOOST_FOREACH_TAG_DEFAULT boost::foreach::tag
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// boost_foreach_is_lightweight_proxy
+// Another customization point for the is_lightweight_proxy optimization,
+// this one works on legacy compilers. Overload boost_foreach_is_lightweight_proxy
+// at the global namespace for your type.
+template<typename T>
+inline boost::foreach::is_lightweight_proxy<T> *
+boost_foreach_is_lightweight_proxy(T *&, BOOST_FOREACH_TAG_DEFAULT) { return 0; } // generic case: the answer is the is_lightweight_proxy<T> trait
+
+template<typename T>
+inline boost::mpl::true_ *
+boost_foreach_is_lightweight_proxy(std::pair<T, T> *&, boost::foreach::tag) { return 0; } // std::pair<T, T> is always a proxy
+
+template<typename T>
+inline boost::mpl::true_ *
+boost_foreach_is_lightweight_proxy(boost::iterator_range<T> *&, boost::foreach::tag) { return 0; } // boost::iterator_range<T> is always a proxy
+
+template<typename T>
+inline boost::mpl::true_ *
+boost_foreach_is_lightweight_proxy(boost::sub_range<T> *&, boost::foreach::tag) { return 0; } // boost::sub_range<T> is always a proxy
+
+template<typename T>
+inline boost::mpl::true_ *
+boost_foreach_is_lightweight_proxy(T **&, boost::foreach::tag) { return 0; } // collection type is itself a pointer T* (to_ptr yields T**&)
+
+///////////////////////////////////////////////////////////////////////////////
+// boost_foreach_is_noncopyable
+// Another customization point for the is_noncopyable trait,
+// this one works on legacy compilers. Overload boost_foreach_is_noncopyable
+// at the global namespace for your type.
+template<typename T>
+inline boost::foreach::is_noncopyable<T> *
+boost_foreach_is_noncopyable(T *&, BOOST_FOREACH_TAG_DEFAULT) { return 0; } // generic case: the answer is the is_noncopyable<T> trait; always returns null
+
+namespace boost
+{
+
+namespace foreach_detail_
+{
+
+///////////////////////////////////////////////////////////////////////////////
+// Define some utilities for assessing the properties of expressions
+// (yes_type and no_type differ in size, so sizeof can tell the two is_true overloads apart)
+typedef char yes_type; // sizeof(yes_type) == 1
+typedef char (&no_type)[2]; // reference to char[2]: sizeof(no_type) == 2
+yes_type is_true(boost::mpl::true_ *); // no body here; called only inside sizeof below
+no_type is_true(boost::mpl::false_ *); // no body here; called only inside sizeof below
+
+// Extracts the desired property from the expression without evaluating it: expr appears only inside sizeof, and the result is a null mpl::bool_<> pointer
+#define BOOST_FOREACH_PROTECT(expr) \
+ (static_cast<boost::mpl::bool_<1 == sizeof(boost::foreach_detail_::is_true(expr))> *>(0))
+
+template<typename Bool1, typename Bool2> // and_/or_/not_: combine compile-time booleans passed as pointers; every overload returns null
+inline boost::mpl::and_<Bool1, Bool2> *and_(Bool1 *, Bool2 *) { return 0; } // result type is a pointer to mpl::and_ of the two pointee types
+
+template<typename Bool1, typename Bool2, typename Bool3>
+inline boost::mpl::and_<Bool1, Bool2, Bool3> *and_(Bool1 *, Bool2 *, Bool3 *) { return 0; } // three-operand form of and_
+
+template<typename Bool1, typename Bool2>
+inline boost::mpl::or_<Bool1, Bool2> *or_(Bool1 *, Bool2 *) { return 0; } // result type is a pointer to mpl::or_ of the two pointee types
+
+template<typename Bool1, typename Bool2, typename Bool3>
+inline boost::mpl::or_<Bool1, Bool2, Bool3> *or_(Bool1 *, Bool2 *, Bool3 *) { return 0; } // three-operand form of or_
+
+template<typename Bool>
+inline boost::mpl::not_<Bool> *not_(Bool *) { return 0; } // result type is a pointer to mpl::not_ of the pointee type
+
+template<typename T> // expression classifiers: each returns a null pointer whose type carries the answer
+inline boost::mpl::false_ *is_rvalue_(T &, int) { return 0; } // argument bound to a non-const reference parameter: reported as not an rvalue
+
+template<typename T>
+inline boost::mpl::true_ *is_rvalue_(T const &, ...) { return 0; } // fallback overload (T const & plus ellipsis): reported as an rvalue
+
+template<typename T>
+inline boost::is_array<T> *is_array_(T const &) { return 0; } // pointee type is boost::is_array<T> for the argument's type
+
+template<typename T>
+inline boost::is_const<T> *is_const_(T &) { return 0; } // pointee type is boost::is_const<T> for the deduced T
+
+#ifndef BOOST_FOREACH_NO_RVALUE_DETECTION
+template<typename T>
+inline boost::mpl::true_ *is_const_(T const &) { return 0; } // overload taking T const &: always reports true_
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// auto_any_t/auto_any
+// General utility for putting an object of any type into automatic storage
+struct auto_any_base
+{
+ // auto_any_base must evaluate to false in boolean context so that
+ // it can be declared in the condition of an if() statement (see the BOOST_FOREACH macro).
+ operator bool() const
+ {
+ return false; // unconditional: the if() never takes its then-branch
+ }
+};
+
+template<typename T>
+struct auto_any : auto_any_base
+{
+ auto_any(T const &t) // implicit on purpose: helper functions in this file return a T where auto_any<T> is expected
+ : item(t)
+ {
+ }
+
+ // temporaries of type auto_any will be bound to const auto_any_base
+ // references, but we still want to be able to mutate the stored
+ // data, so declare it as mutable.
+ mutable T item;
+};
+
+typedef auto_any_base const &auto_any_t; // type-erased handle: a const reference to the base of some auto_any<T>
+
+template<typename T, typename C> // T: stored type the caller asserts; C: compile-time bool selecting const access
+inline BOOST_DEDUCED_TYPENAME boost::mpl::if_<C, T const, T>::type &auto_any_cast(auto_any_t a)
+{
+ return static_cast<auto_any<T> const &>(a).item; // unchecked downcast; item is mutable, so a T & can be formed when C is false
+}
+
+typedef boost::mpl::true_ const_; // shorthand used as the C argument meaning "const"
+
+///////////////////////////////////////////////////////////////////////////////
+// type2type
+// Names a collection type: its nested ::type is T const when C is true and T otherwise (C defaults to false).
+template<typename T, typename C = boost::mpl::false_>
+struct type2type
+ : boost::mpl::if_<C, T const, T>
+{
+};
+
+template<typename T, typename C = boost::mpl::false_> // iterator type used to walk a collection of type T
+struct foreach_iterator
+{
+ typedef BOOST_DEDUCED_TYPENAME boost::mpl::eval_if<
+ C
+ , range_const_iterator<T> // C true: range_const_iterator<T>::type
+ , range_iterator<T> // C false: range_iterator<T>::type
+ >::type type;
+};
+
+template<typename T, typename C = boost::mpl::false_> // reference type obtained by dereferencing foreach_iterator<T, C>::type
+struct foreach_reference
+ : iterator_reference<BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type>
+{
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// encode_type
+// Turns an expression's type into a null type2type<> pointer; the second argument says whether to mark it const.
+template<typename T>
+inline type2type<T> *encode_type(T &, boost::mpl::false_ *) { return 0; } // not const: type2type<T>
+
+template<typename T>
+inline type2type<T, const_> *encode_type(T const &, boost::mpl::true_ *) { return 0; } // const: type2type<T, const_>
+
+///////////////////////////////////////////////////////////////////////////////
+// set_false
+// Assigns false to b and returns the result of that assignment, i.e. always false.
+inline bool set_false(bool &b) { return b = false; }
+
+///////////////////////////////////////////////////////////////////////////////
+// to_ptr
+// Maps an expression of type T to an lvalue of type T*; the argument itself is never read.
+template<typename T>
+inline T *&to_ptr(T const &)
+{
+ static T *t = 0; // function-local static null pointer, one per T
+ return t;
+}
+
+// Borland needs a little extra help with arrays
+#if BOOST_WORKAROUND(__BORLANDC__, BOOST_TESTED_AT(0x564))
+template<typename T,std::size_t N>
+inline T (*&to_ptr(T (&)[N]))[N] // array overload: returns a reference to a pointer-to-array T (*)[N]
+{
+ static T (*t)[N] = 0;
+ return t;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// derefof
+// Returns a T & referring to the object t points at, built from casts rather than a plain *t.
+template<typename T>
+inline T &derefof(T *t)
+{
+ // This is a work-around for a compiler bug in Borland. If T* is a pointer to array type U(*)[N],
+ // then dereferencing it results in a U* instead of U(&)[N]. The cast forces the issue.
+ return reinterpret_cast<T &>(
+ *const_cast<char *>( // strip const/volatile from the char pointer before dereferencing
+ reinterpret_cast<char const volatile *>(t) // view t as a pointer to char, whatever T's cv-qualification
+ )
+ );
+}
+
+#ifdef BOOST_FOREACH_COMPILE_TIME_CONST_RVALUE_DETECTION
+///////////////////////////////////////////////////////////////////////////////
+// Detect at compile-time whether an expression yields an rvalue or
+// an lvalue. This is rather non-standard, but some popular compilers
+// accept it.
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+// rvalue_probe
+// Compile-time variant: convertible both to a value and to T &; members are declared here without bodies.
+template<typename T>
+struct rvalue_probe
+{
+ struct private_type_ {}; // stand-in conversion target for types that cannot be returned by value
+ // can't ever return an array (or an abstract type) by value
+ typedef BOOST_DEDUCED_TYPENAME boost::mpl::if_<
+ boost::mpl::or_<boost::is_abstract<T>, boost::is_array<T> >, private_type_, T
+ >::type value_type;
+ operator value_type(); // by-value conversion
+ operator T &() const; // by-reference conversion
+};
+
+template<typename T>
+rvalue_probe<T> const make_probe(T const &t); // declaration only here; its one use below sits inside BOOST_FOREACH_PROTECT
+
+# define BOOST_FOREACH_IS_RVALUE(COL) /* null pointer typed true when COL is not an array and is_rvalue_ reports an rvalue */ \
+ boost::foreach_detail_::and_( \
+ boost::foreach_detail_::not_(boost::foreach_detail_::is_array_(COL)) \
+ , BOOST_FOREACH_PROTECT(boost::foreach_detail_::is_rvalue_( \
+ (true ? boost::foreach_detail_::make_probe(COL) : (COL)), 0)))
+
+#elif defined(BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION)
+///////////////////////////////////////////////////////////////////////////////
+// Detect at run-time whether an expression yields an rvalue
+// or an lvalue. This is 100% standard C++, but not all compilers
+// accept it. Also, it causes FOREACH to break when used with non-
+// copyable collection types.
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+// rvalue_probe
+// Run-time variant: wraps a reference to the collection and a flag that is set when the by-value conversion is used.
+template<typename T>
+struct rvalue_probe
+{
+ rvalue_probe(T &t, bool &b) // stores references only; neither the collection nor the flag is copied
+ : value(t)
+ , is_rvalue(b)
+ {
+ }
+
+ struct private_type_ {}; // stand-in conversion target for types that cannot be returned by value
+ // can't ever return an array or an abstract type by value
+ #ifdef BOOST_NO_IS_ABSTRACT
+ typedef BOOST_DEDUCED_TYPENAME boost::mpl::if_<
+ boost::is_array<T>, private_type_, T
+ >::type value_type;
+ #else
+ typedef BOOST_DEDUCED_TYPENAME boost::mpl::if_<
+ boost::mpl::or_<boost::is_abstract<T>, boost::is_array<T> >, private_type_, T
+ >::type value_type;
+ #endif
+
+ operator value_type() // by-value conversion: records the fact in the flag
+ {
+ this->is_rvalue = true;
+ return this->value;
+ }
+
+ operator T &() const // by-reference conversion: leaves the flag untouched
+ {
+ return this->value;
+ }
+
+private:
+ T &value; // the probed collection
+ bool &is_rvalue; // caller-owned flag (BOOST_FOREACH_EVALUATE passes _foreach_is_rvalue)
+};
+
+template<typename T>
+rvalue_probe<T> make_probe(T &t, bool &b) { return rvalue_probe<T>(t, b); } // non-const collection
+
+template<typename T>
+rvalue_probe<T const> make_probe(T const &t, bool &b) { return rvalue_probe<T const>(t, b); } // const collection: probe holds a T const &
+
+///////////////////////////////////////////////////////////////////////////////
+// simple_variant
+// holds either a T (a copy, when is_rvalue is true) or a T const* (a borrowed pointer, when it is false)
+template<typename T>
+struct simple_variant
+{
+ simple_variant(T const *t) // pointer mode: stores t itself in the buffer
+ : is_rvalue(false)
+ {
+ *static_cast<T const **>(this->data.address()) = t;
+ }
+
+ simple_variant(T const &t) // value mode: copy-constructs a T inside the buffer
+ : is_rvalue(true)
+ {
+ ::new(this->data.address()) T(t);
+ }
+
+ simple_variant(simple_variant const &that) // copies in whichever mode that is in
+ : is_rvalue(that.is_rvalue)
+ {
+ if(this->is_rvalue)
+ ::new(this->data.address()) T(*that.get());
+ else
+ *static_cast<T const **>(this->data.address()) = that.get();
+ }
+
+ ~simple_variant()
+ {
+ if(this->is_rvalue) // only value mode holds a constructed T that needs destroying
+ this->get()->~T();
+ }
+
+ T const *get() const // pointer to the T in either mode
+ {
+ if(this->is_rvalue)
+ return static_cast<T const *>(this->data.address());
+ else
+ return *static_cast<T const * const *>(this->data.address());
+ }
+
+private:
+ enum size_type { size = sizeof(T) > sizeof(T*) ? sizeof(T) : sizeof(T*) }; // buffer must fit the larger of T and T*
+ simple_variant &operator =(simple_variant const &); // private with no body here: assignment is not available
+ bool const is_rvalue; // fixed at construction; selects value mode (true) or pointer mode (false)
+ aligned_storage<size> data; // raw storage accessed through data.address()
+};
+
+// If the collection is an array or is noncopyable, it must be an lvalue.
+// If the collection is a lightweight proxy, treat it as an rvalue
+// BUGBUG what about a noncopyable proxy?
+template<typename LValue, typename IsProxy>
+inline BOOST_DEDUCED_TYPENAME boost::enable_if<boost::mpl::or_<LValue, IsProxy>, IsProxy>::type *
+should_copy_impl(LValue *, IsProxy *, bool *)
+{
+ return 0; // decided at compile time: the answer is the returned pointer type (IsProxy *); the bool * is ignored
+}
+
+// Otherwise, we must determine at runtime whether it's an lvalue or rvalue
+inline bool *
+should_copy_impl(boost::mpl::false_ *, boost::mpl::false_ *, bool *is_rvalue)
+{
+ return is_rvalue; // hands the run-time flag's address through unchanged
+}
+
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// contain
+// Wraps the collection in an auto_any: as a copy, as a pointer, or as a simple_variant, depending on the second argument.
+template<typename T>
+inline auto_any<T> contain(T const &t, boost::mpl::true_ *) // rvalue
+{
+ return t; // copied into the auto_any<T>
+}
+
+template<typename T>
+inline auto_any<T *> contain(T &t, boost::mpl::false_ *) // lvalue
+{
+ // Cannot seem to get sunpro to handle addressof() with array types.
+ #if BOOST_WORKAROUND(__SUNPRO_CC, BOOST_TESTED_AT(0x570))
+ return &t;
+ #else
+ return boost::addressof(t); // only the address is stored; the collection is not copied
+ #endif
+}
+
+#ifdef BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION
+template<typename T>
+auto_any<simple_variant<T> >
+contain(T const &t, bool *rvalue) // copy-or-pointer chosen at run time
+{
+ return *rvalue ? simple_variant<T>(t) : simple_variant<T>(&t); // *rvalue true: hold a copy; false: hold a pointer to t
+}
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+// begin
+// Returns boost::begin() of the contained collection, wrapped in an auto_any; the third argument says how col stores it.
+template<typename T, typename C>
+inline auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type>
+begin(auto_any_t col, type2type<T, C> *, boost::mpl::true_ *) // rvalue
+{
+ return boost::begin(auto_any_cast<T, C>(col)); // col holds the collection itself
+}
+
+template<typename T, typename C>
+inline auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type>
+begin(auto_any_t col, type2type<T, C> *, boost::mpl::false_ *) // lvalue
+{
+ typedef BOOST_DEDUCED_TYPENAME type2type<T, C>::type type; // T or T const
+ typedef BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type iterator;
+ return iterator(boost::begin(derefof(auto_any_cast<type *, boost::mpl::false_>(col)))); // col holds a pointer to the collection
+}
+
+#ifdef BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION
+template<typename T>
+auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, const_>::type>
+begin(auto_any_t col, type2type<T, const_> *, bool *) // run-time detection: col holds a simple_variant<T>
+{
+ return boost::begin(*auto_any_cast<simple_variant<T>, boost::mpl::false_>(col).get());
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// end
+// Returns boost::end() of the contained collection, wrapped in an auto_any; the overloads mirror those of begin.
+template<typename T, typename C>
+inline auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type>
+end(auto_any_t col, type2type<T, C> *, boost::mpl::true_ *) // rvalue
+{
+ return boost::end(auto_any_cast<T, C>(col)); // col holds the collection itself
+}
+
+template<typename T, typename C>
+inline auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type>
+end(auto_any_t col, type2type<T, C> *, boost::mpl::false_ *) // lvalue
+{
+ typedef BOOST_DEDUCED_TYPENAME type2type<T, C>::type type; // T or T const
+ typedef BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type iterator;
+ return iterator(boost::end(derefof(auto_any_cast<type *, boost::mpl::false_>(col)))); // col holds a pointer to the collection
+}
+
+#ifdef BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION
+template<typename T>
+auto_any<BOOST_DEDUCED_TYPENAME foreach_iterator<T, const_>::type>
+end(auto_any_t col, type2type<T, const_> *, bool *) // run-time detection: col holds a simple_variant<T>
+{
+ return boost::end(*auto_any_cast<simple_variant<T>, boost::mpl::false_>(col).get());
+}
+#endif
+
+#ifndef BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+template<typename T, typename C>
+inline auto_any<int>
+end(auto_any_t col, type2type<T *, C> *, boost::mpl::true_ *) // null-terminated C-style strings
+{
+ return 0; // not used: the matching done() overload tests the pointed-to element instead of comparing with an end
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// done
+// Loop termination test: true once the current position has reached the end of the collection.
+template<typename T, typename C>
+inline bool done(auto_any_t cur, auto_any_t end, type2type<T, C> *)
+{
+ typedef BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type iter_t;
+ return auto_any_cast<iter_t, boost::mpl::false_>(cur) == auto_any_cast<iter_t, boost::mpl::false_>(end); // plain iterator equality
+}
+
+#ifndef BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+template<typename T, typename C>
+inline bool done(auto_any_t cur, auto_any_t, type2type<T *, C> *) // null-terminated C-style strings
+{
+ return ! *auto_any_cast<T *, boost::mpl::false_>(cur); // done when the current element compares false (the terminator); end is ignored
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// next
+// Advances the iterator stored in cur by one position, in place.
+template<typename T, typename C>
+inline void next(auto_any_t cur, type2type<T, C> *)
+{
+ typedef BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type iter_t;
+ ++auto_any_cast<iter_t, boost::mpl::false_>(cur); // allowed through the const handle because auto_any::item is mutable
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// deref
+// Returns the result of dereferencing the iterator stored in cur, typed as foreach_reference<T, C>::type.
+template<typename T, typename C>
+inline BOOST_DEDUCED_TYPENAME foreach_reference<T, C>::type
+deref(auto_any_t cur, type2type<T, C> *)
+{
+ typedef BOOST_DEDUCED_TYPENAME foreach_iterator<T, C>::type iter_t;
+ return *auto_any_cast<iter_t, boost::mpl::false_>(cur); // the iterator itself is not advanced
+}
+
+} // namespace foreach_detail_
+} // namespace boost
+
+// A sneaky way to get the type of the collection without evaluating the expression: COL appears only in the never-taken branch of (true ? 0 : ...)
+#define BOOST_FOREACH_TYPEOF(COL) \
+ (true ? 0 : boost::foreach_detail_::encode_type(COL, boost::foreach_detail_::is_const_(COL)))
+
+// returns true_* if the type is noncopyable (asks boost_foreach_is_noncopyable, passing to_ptr(COL) and the global tag value)
+#define BOOST_FOREACH_IS_NONCOPYABLE(COL) \
+ boost_foreach_is_noncopyable( \
+ boost::foreach_detail_::to_ptr(COL) \
+ , boost_foreach_argument_dependent_lookup_hack_value)
+
+// returns true_* if the type is a lightweight proxy (and is not noncopyable)
+#define BOOST_FOREACH_IS_LIGHTWEIGHT_PROXY(COL) \
+ boost::foreach_detail_::and_( \
+ boost::foreach_detail_::not_(BOOST_FOREACH_IS_NONCOPYABLE(COL)) \
+ , boost_foreach_is_lightweight_proxy( \
+ boost::foreach_detail_::to_ptr(COL) \
+ , boost_foreach_argument_dependent_lookup_hack_value))
+
+#ifdef BOOST_FOREACH_COMPILE_TIME_CONST_RVALUE_DETECTION
+///////////////////////////////////////////////////////////////////////////////
+// R-values and const R-values supported here with zero runtime overhead
+///////////////////////////////////////////////////////////////////////////////
+
+// No variable is needed to track the rvalue-ness of the collection expression
+# define BOOST_FOREACH_PREAMBLE() \
+ /**/
+
+// Evaluate the collection expression
+# define BOOST_FOREACH_EVALUATE(COL) \
+ (COL)
+
+# define BOOST_FOREACH_SHOULD_COPY(COL) /* null pointer whose type says: copy if COL is an rvalue or a lightweight proxy */ \
+ (true ? 0 : boost::foreach_detail_::or_( \
+ BOOST_FOREACH_IS_RVALUE(COL) \
+ , BOOST_FOREACH_IS_LIGHTWEIGHT_PROXY(COL)))
+
+#elif defined(BOOST_FOREACH_RUN_TIME_CONST_RVALUE_DETECTION)
+///////////////////////////////////////////////////////////////////////////////
+// R-values and const R-values supported here
+///////////////////////////////////////////////////////////////////////////////
+
+// Declare a variable to track the rvalue-ness of the collection expression (initially false; scoped to the if/else that follows)
+# define BOOST_FOREACH_PREAMBLE() \
+ if (bool _foreach_is_rvalue = false) {} else
+
+// Evaluate the collection expression, and detect if it is an lvalue or an rvalue
+# define BOOST_FOREACH_EVALUATE(COL) \
+ (true ? boost::foreach_detail_::make_probe((COL), _foreach_is_rvalue) : (COL))
+
+// The rvalue/lvalue-ness of the collection expression is determined dynamically, unless
+// the type is an array or is noncopyable or is non-const, in which case we know it's an lvalue.
+// If the type happens to be a lightweight proxy, always make a copy.
+# define BOOST_FOREACH_SHOULD_COPY(COL) \
+ (boost::foreach_detail_::should_copy_impl( \
+ true ? 0 : boost::foreach_detail_::or_( \
+ boost::foreach_detail_::is_array_(COL) \
+ , BOOST_FOREACH_IS_NONCOPYABLE(COL) \
+ , boost::foreach_detail_::not_(boost::foreach_detail_::is_const_(COL))) \
+ , true ? 0 : BOOST_FOREACH_IS_LIGHTWEIGHT_PROXY(COL) \
+ , &_foreach_is_rvalue))
+
+#elif !defined(BOOST_FOREACH_NO_RVALUE_DETECTION)
+///////////////////////////////////////////////////////////////////////////////
+// R-values supported here, const R-values NOT supported here
+///////////////////////////////////////////////////////////////////////////////
+
+// No variable is needed to track the rvalue-ness of the collection expression
+# define BOOST_FOREACH_PREAMBLE() \
+ /**/
+
+// Evaluate the collection expression
+# define BOOST_FOREACH_EVALUATE(COL) \
+ (COL)
+
+// Determine whether the collection expression is an lvalue or an rvalue (copy if rvalue or lightweight proxy).
+// NOTE: this gets the answer wrong for const rvalues.
+# define BOOST_FOREACH_SHOULD_COPY(COL) \
+ (true ? 0 : boost::foreach_detail_::or_( \
+ boost::foreach_detail_::is_rvalue_((COL), 0) \
+ , BOOST_FOREACH_IS_LIGHTWEIGHT_PROXY(COL)))
+
+#else
+///////////////////////////////////////////////////////////////////////////////
+// R-values NOT supported here
+///////////////////////////////////////////////////////////////////////////////
+
+// No variable is needed to track the rvalue-ness of the collection expression
+# define BOOST_FOREACH_PREAMBLE() \
+ /**/
+
+// Evaluate the collection expression
+# define BOOST_FOREACH_EVALUATE(COL) \
+ (COL)
+
+// Can't use rvalues with BOOST_FOREACH (unless they are lightweight proxies): only proxies are copied
+# define BOOST_FOREACH_SHOULD_COPY(COL) \
+ (true ? 0 : BOOST_FOREACH_IS_LIGHTWEIGHT_PROXY(COL))
+
+#endif
+
+#define BOOST_FOREACH_CONTAIN(COL) /* evaluates COL and wraps it (copy, pointer or variant) via contain() */ \
+ boost::foreach_detail_::contain( \
+ BOOST_FOREACH_EVALUATE(COL) \
+ , BOOST_FOREACH_SHOULD_COPY(COL))
+
+#define BOOST_FOREACH_BEGIN(COL) /* begin position of the collection held in _foreach_col */ \
+ boost::foreach_detail_::begin( \
+ _foreach_col \
+ , BOOST_FOREACH_TYPEOF(COL) \
+ , BOOST_FOREACH_SHOULD_COPY(COL))
+
+#define BOOST_FOREACH_END(COL) /* end position of the collection held in _foreach_col */ \
+ boost::foreach_detail_::end( \
+ _foreach_col \
+ , BOOST_FOREACH_TYPEOF(COL) \
+ , BOOST_FOREACH_SHOULD_COPY(COL))
+
+#define BOOST_FOREACH_DONE(COL) /* true when _foreach_cur has reached _foreach_end */ \
+ boost::foreach_detail_::done( \
+ _foreach_cur \
+ , _foreach_end \
+ , BOOST_FOREACH_TYPEOF(COL))
+
+#define BOOST_FOREACH_NEXT(COL) /* advances _foreach_cur by one */ \
+ boost::foreach_detail_::next( \
+ _foreach_cur \
+ , BOOST_FOREACH_TYPEOF(COL))
+
+#define BOOST_FOREACH_DEREF(COL) /* element _foreach_cur currently refers to */ \
+ boost::foreach_detail_::deref( \
+ _foreach_cur \
+ , BOOST_FOREACH_TYPEOF(COL))
+
+///////////////////////////////////////////////////////////////////////////////
+// BOOST_FOREACH
+//
+// For iterating over collections. Collections can be
+// arrays, null-terminated strings, or STL containers.
+// The loop variable can be a value or reference. For
+// example:
+//
+// std::list<int> int_list(/*stuff*/);
+// BOOST_FOREACH(int &i, int_list)
+// {
+// /*
+// * loop body goes here.
+// * i is a reference to the int in int_list.
+// */
+// }
+//
+// Alternatively, you can declare the loop variable first,
+// so you can access it after the loop finishes. Obviously,
+// if you do it this way, then the loop variable cannot be
+// a reference.
+//
+// int i;
+// BOOST_FOREACH(i, int_list)
+// { ... }
+// Expansion: helpers live in 'if (...) {} else' conditions; the inner for runs the body once per element, and a 'break' in it leaves _foreach_continue false, ending the outer for.
+#define BOOST_FOREACH(VAR, COL) \
+ BOOST_FOREACH_PREAMBLE() \
+ if (boost::foreach_detail_::auto_any_t _foreach_col = BOOST_FOREACH_CONTAIN(COL)) {} else \
+ if (boost::foreach_detail_::auto_any_t _foreach_cur = BOOST_FOREACH_BEGIN(COL)) {} else \
+ if (boost::foreach_detail_::auto_any_t _foreach_end = BOOST_FOREACH_END(COL)) {} else \
+ for (bool _foreach_continue = true; \
+ _foreach_continue && !BOOST_FOREACH_DONE(COL); \
+ _foreach_continue ? BOOST_FOREACH_NEXT(COL) : (void)0) \
+ if (boost::foreach_detail_::set_false(_foreach_continue)) {} else \
+ for (VAR = BOOST_FOREACH_DEREF(COL); !_foreach_continue; _foreach_continue = true)
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dindel.git
More information about the debian-med-commit
mailing list