[med-svn] [bitseq] 11/14: New upstream version 0.7.5+dfsg
Andreas Tille
tille at debian.org
Sat Dec 2 08:47:48 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository bitseq.
commit ebcd5e842eff36228eb7742306a08fa2b01c6cfa
Author: Andreas Tille <tille at debian.org>
Date: Sat Dec 2 09:45:57 2017 +0100
New upstream version 0.7.5+dfsg
---
ArgumentParser.cpp | 308 ++++++
ArgumentParser.h | 110 ++
CollapsedSampler.cpp | 95 ++
CollapsedSampler.h | 16 +
FileHeader.cpp | 117 +++
FileHeader.h | 45 +
GibbsParameters.cpp | 110 ++
GibbsParameters.h | 45 +
GibbsSampler.cpp | 97 ++
GibbsSampler.h | 17 +
Makefile | 129 +++
MyTimer.cpp | 60 ++
MyTimer.h | 28 +
PosteriorSamples.cpp | 299 ++++++
PosteriorSamples.h | 66 ++
README.md | 34 +
ReadDistribution.cpp | 1126 +++++++++++++++++++++
ReadDistribution.h | 134 +++
Sampler.cpp | 220 ++++
Sampler.h | 99 ++
SimpleSparse.cpp | 97 ++
SimpleSparse.h | 27 +
TagAlignments.cpp | 132 +++
TagAlignments.h | 44 +
TranscriptExpression.cpp | 87 ++
TranscriptExpression.h | 38 +
TranscriptInfo.cpp | 258 +++++
TranscriptInfo.h | 75 ++
TranscriptSequence.cpp | 197 ++++
TranscriptSequence.h | 82 ++
VariationalBayes.cpp | 384 +++++++
VariationalBayes.h | 45 +
_release_Makefile | 122 +++
asa103/LICENSE.txt | 165 +++
asa103/asa103.hpp | 96 ++
biocUpdate.sh | 24 +
changeList | 86 ++
checkTR.py | 76 ++
common.cpp | 22 +
common.h | 41 +
convertSamples.cpp | 197 ++++
debian/changelog | 14 -
debian/compat | 1 -
debian/control | 30 -
debian/copyright | 22 -
debian/patches/hardening.patch | 17 -
debian/patches/link_against_system_samtools.patch | 51 -
debian/patches/series | 2 -
debian/rules | 63 --
debian/source/format | 1 -
debian/watch | 4 -
estimateDE.cpp | 326 ++++++
estimateExpression.cpp | 597 +++++++++++
estimateHyperPar.cpp | 369 +++++++
estimateVBExpression.cpp | 238 +++++
extractSamples.cpp | 126 +++
extractTranscriptInfo.py | 91 ++
getCounts.py | 78 ++
getFoldChange.cpp | 112 ++
getGeneExpression.cpp | 120 +++
getPPLR.cpp | 152 +++
getVariance.cpp | 167 +++
getWithinGeneExpression.cpp | 248 +++++
lowess.cpp | 511 ++++++++++
lowess.h | 30 +
misc.cpp | 240 +++++
misc.h | 84 ++
parameters1.txt | 32 +
parseAlignment.cpp | 612 +++++++++++
parseAlignment.py | 482 +++++++++
releaseDo.sh | 52 +
releaseList | 55 +
tagAlignment.h | 37 +
transposeFiles.cpp | 146 +++
transposeFiles.h | 4 +
transposeLargeFile.cpp | 22 +
76 files changed, 10381 insertions(+), 205 deletions(-)
diff --git a/ArgumentParser.cpp b/ArgumentParser.cpp
new file mode 100644
index 0000000..63b94d7
--- /dev/null
+++ b/ArgumentParser.cpp
@@ -0,0 +1,308 @@
+#include<algorithm>
+#include<cstdlib>
+#include<sstream>
+
+#include"ArgumentParser.h"
+
+#include "misc.h"
+
+#include "common.h"
+
+#define FF first
+#define SS second
+#define Sof(x) (long)x.size()
+
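+// Parse a string of doubles separated by `space` (default ",") into a vector.
+// Example (illustrative): tokenizeD("0.5,1,2.5") returns {0.5, 1.0, 2.5};
+// empty tokens from repeated separators are skipped.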
+vector <double> tokenizeD(const string &input,const string &space = ","){//{{{
+ vector <double> ret;
+ long pos=0,f=0,n=input.size();
+ while((pos<n)&&(f<n)&&(f>=0)){
+ f=input.find(space,pos);
+ if(f==pos)pos++;
+ else{
+ if((f <n)&&(f>=0)){
+ ret.push_back(atof(input.substr(pos,f-pos).c_str()));
+ pos=f+1;
+ }
+ }
+ }
+ if(pos<n)ret.push_back(atof(input.substr(pos,n-pos).c_str()));
+ return ret;
+} //}}}
+
+
+// GET {{{
+string ArgumentParser::getS(const string &name) const{
+ if(!existsOption(name,true))return "";
+ if(mapS.find(name)!=mapS.end())
+ return mapS.find(name)->second;
+ return "";
+}
+string ArgumentParser::getLowerS(const string &name) const{
+ if(!existsOption(name,true))return "";
+ if(mapS.find(name)!=mapS.end())
+ return ns_misc::toLower(mapS.find(name)->second);
+ return "";
+}
+long ArgumentParser::getL(const string &name) const{
+ if(!existsOption(name,true))return -1;
+ if(mapL.find(name)!=mapL.end())
+ return mapL.find(name)->second;
+ return -1;
+}
+double ArgumentParser::getD(const string &name) const{
+ if(!existsOption(name,true))return -1;
+ if(mapD.find(name)!=mapD.end())
+ return mapD.find(name)->second;
+ return -1;
+}
+bool ArgumentParser::flag(const string &name) const {
+ return isSet(name);
+}
+vector<double> ArgumentParser::getTokenizedS2D(const string &name) const{
+ if(!existsOption(name,true))return vector<double>();
+ if(mapS.find(name)!=mapS.end())
+ return tokenizeD(mapS.find(name)->second);
+ return vector<double>();
+}//}}}
+// SET {{{
+void ArgumentParser::updateS(const string &name, const string &value){
+ if(!existsOption(name))error("ArgumentParser: argument name %s unknown.\n",name.c_str());
+ if(mapS.find(name)!=mapS.end())
+ mapS.find(name)->second = value;
+}//}}}
+bool ArgumentParser::parse(int argc,char * argv[]){//{{{
+// for(long i=0;i<argc;i++)message("_%s_\n",(args[i]).c_str());
+ // add verbose if possible {{{
+ if(! (existsName("v")||existsName("verbose")||existsOption("verbose")))
+ addOptionB("v","verbose","verbose",0,"Verbose output.");
+ //if(! (existsName("V")||existsName("veryVerbose")||existsOption("veryVerbose")))
+ // addOptionB("V","veryVerbose","veryVerbose",0,"Very verbose output.");
+ // }}}
+ programName=(string)argv[0];
+ string val,opt;
+ for(long i = 1; i<argc;i++){
+ val=(string)argv[i];
+ if(val[0]!='-'){
+ arguments.push_back(val);
+ continue;
+ }
+ if(Sof(val)==2){
+ opt=val.substr(1,1);
+ val="";
+ }else{
+ if(val.find("=")!=string::npos){
+ opt=val.substr(2,val.find("=")-2);
+ val=val.substr(val.find("=")+1);
+ }else{
+ opt=val.substr(2);
+ val="";
+ }
+ }
+ if((opt=="help")||(opt=="h")){
+ usage();
+ return false;
+ }
+ if(names.find(opt)==names.end()){
+ error("Unknown option '%s'.\n",argv[i]);
+ return false;
+ }
+ if(validOptions[names[opt]].type!=OTBOOL){
+ if(val==""){
+ i++;
+ if(i==argc)break;
+ val = argv[i];
+ }
+ switch(validOptions[names[opt]].type){
+ case OTSTRING:
+ mapS[names[opt]]=val;
+ break;
+ case OTLONG:
+ mapL[names[opt]]=atoi(val.c_str());
+ break;
+ case OTDOUBLE:
+ mapD[names[opt]]=atof(val.c_str());
+ break;
+ case OTBOOL:;
+ }
+ }else{
+ mapB[names[opt]]=!mapB[names[opt]];
+ }
+ }
+ //writeAll();
+ if(Sof(arguments)<minimumArguments){
+ error("Need at least %ld arguments.\n\n",minimumArguments);
+ usage();
+ return false;
+ }
+ for(long i = 0;i<Sof(compulsory);i++){
+ if(! isSet(compulsory[i])){
+ error("Missing option \"%s\"\n",(compulsory[i]).c_str());
+ usage();
+ return false;
+ }
+ }
+ // set public variable verbose
+ verbose = flag("verbose")||(existsOption("veryVerbose")&&flag("veryVerbose"));
+ return true;
+}//}}}
+void ArgumentParser::writeAll(){//{{{
+ message("arguments: ");
+ for(long i=0;i<Sof(arguments);i++)
+ message("%s ",(arguments[i]).c_str());
+ message("\n");
+ for(map<string,string>::iterator it=mapS.begin();it!=mapS.end();it++){
+ message("OPT:%s VAL:%s\n",(it->FF).c_str(),(it->SS).c_str());
+ }
+ for(map<string,long>::iterator it=mapL.begin();it!=mapL.end();it++){
+ message("OPT:%s VAL:%ld\n",(it->FF).c_str(),it->SS);
+ }
+ for(map<string,double>::iterator it=mapD.begin();it!=mapD.end();it++){
+ message("OPT:%s VAL:%lf\n",(it->FF).c_str(),(it->SS));
+ }
+ for(map<string,bool>::iterator it=mapB.begin();it!=mapB.end();it++){
+ message("OPT:%s VAL:%d\n",(it->FF).c_str(),(it->SS));
+ }
+}//}}}
+void ArgumentParser::addOptionL(const string &shortName,const string &longName, const string &name, bool comp, const string &description, long defValue){//{{{
+ Option newOpt;
+ if(existsOption(name)){
+ error("ArgumentParser: Option \"%s\"\n",(name).c_str());
+ return;
+ }
+ newOpt.type=OTLONG;
+ newOpt.shortName=shortName;
+ newOpt.longName=longName;
+ newOpt.description=description;
+ if(defValue!=-47){
+ appendDescription<long>(&newOpt.description,defValue);
+ mapL[name]=defValue;
+ }
+ validOptions[name]=newOpt;
+ if(shortName!="")names[shortName]=name;
+ if(longName!="")names[longName]=name;
+ if(comp)compulsory.push_back(name);
+}//}}}
+void ArgumentParser::addOptionD(const string &shortName,const string &longName, const string &name, bool comp, const string &description, double defValue){//{{{
+ Option newOpt;
+ if(existsOption(name)){
+ error("ArgumentParser: Option \"%s\"\n",(name).c_str());
+ return;
+ }
+ newOpt.type=OTDOUBLE;
+ newOpt.shortName=shortName;
+ newOpt.longName=longName;
+ newOpt.description=description;
+ if(defValue!=-47.47){
+ appendDescription<double>(&newOpt.description,defValue);
+ mapD[name]=defValue;
+ }
+ validOptions[name]=newOpt;
+ if(shortName!="")names[shortName]=name;
+ if(longName!="")names[longName]=name;
+ if(comp)compulsory.push_back(name);
+}//}}}
+void ArgumentParser::addOptionB(const string &shortName,const string &longName, const string &name, bool comp, const string &description, bool defValue){//{{{
+ Option newOpt;
+ if(existsOption(name)){
+ error("ArgumentParser: Option \"%s\"\n",(name).c_str());
+ return;
+ }
+ mapB[name]=defValue;
+ newOpt.type=OTBOOL;
+ newOpt.shortName=shortName;
+ newOpt.longName=longName;
+ newOpt.description=description;
+ if(defValue) newOpt.description +=" (default: On)";
+ else newOpt.description+=" (default: Off)";
+ validOptions[name]=newOpt;
+ if(shortName!="")names[shortName]=name;
+ if(longName!="")names[longName]=name;
+ if(comp)compulsory.push_back(name);
+}//}}}
+void ArgumentParser::addOptionS(const string &shortName,const string &longName, const string &name, bool comp, const string &description, const string &defValue){//{{{
+ Option newOpt;
+ if(existsOption(name)){
+ error("ArgumentParser: Option \"%s\"\n",(name).c_str());
+ return;
+ }
+ newOpt.type=OTSTRING;
+ newOpt.shortName=shortName;
+ newOpt.longName=longName;
+ newOpt.description=description;
+ if(defValue!="noDefault"){
+ appendDescription<string>(&newOpt.description,defValue);
+ mapS[name]=defValue;
+ }
+ validOptions[name]=newOpt;
+ if(shortName!="")names[shortName]=name;
+ if(longName!="")names[longName]=name;
+ if(comp)compulsory.push_back(name);
+}//}}}
+//{{{ void ArgumentParser::appendDescription(string &desc,valueType defValue)
+template <typename valueType>
+void ArgumentParser::appendDescription(string *desc,valueType defValue){
+ stringstream descStream;
+ descStream<<*desc<<" (default: "<<defValue<<")";
+ *desc = descStream.str();
+}//}}}
+void ArgumentParser::usage(){//{{{
+ map<string,Option>::iterator it;
+ vector<string>::iterator itV;
+ Option opt;
+ message("\nUsage: %s ",(programName).c_str());
+ sort(compulsory.begin(),compulsory.end());
+ for(itV=compulsory.begin();itV!=compulsory.end();itV++){
+ if(validOptions[*itV].shortName!="")
+ message("-%s ",(validOptions[*itV].shortName).c_str());
+ else
+ message("--%s ",(validOptions[*itV].longName).c_str());
+ if(validOptions[*itV].type!=OTBOOL)message("<%s> ",(*itV).c_str());
+ }
+ message(" [OPTIONS] %s\n",(argumentDesc).c_str());
+ message("\n%s\n\nOptions:\n",(programDesc).c_str());
+ message(" --help\n Show this help information.\n\n");
+ for(it=validOptions.begin();it!=validOptions.end();it++){
+ opt=it->SS;
+ message(" ");
+ if(opt.shortName!=""){
+ message("-%s",(opt.shortName).c_str());
+ if(opt.type!=OTBOOL)message(" <%s>",(it->FF).c_str());
+ if(opt.longName!="")message(" , ");
+ }
+ if(opt.longName!=""){
+ message("--%s",(opt.longName).c_str());
+ if(opt.type!=OTBOOL)message("=<%s>",(it->FF).c_str());
+ }
+ message("\n");
+ if(opt.description!=""){
+ message(" %s\n\n",(opt.description).c_str());
+ }
+ }
+}//}}}
+bool ArgumentParser::isSet(const string &name) const {//{{{
+ if(! existsOption(name,true))return false;
+ switch(validOptions.find(name)->second.type){
+ case OTSTRING:
+ if(mapS.find(name)==mapS.end())return false;
+ else return true;
+ case OTLONG:
+ if(mapL.find(name)==mapL.end())return false;
+ else return true;
+ case OTBOOL:
+ if(mapB.find(name)==mapB.end())return false;
+ else return mapB.find(name)->second;
+ case OTDOUBLE:
+ if(mapD.find(name)==mapD.end())return false;
+ else return true;
+ }
+ return false;
+}//}}}
+bool ArgumentParser::existsName(const string &name) const {//{{{
+ if(names.find(name)==names.end())return false;
+ return true;
+}//}}}
+bool ArgumentParser::existsOption(const string &name, bool warn) const {//{{{
+ if(validOptions.find(name)!=validOptions.end())return true;
+ if(warn)error("ArgumentParser: argument name %s unknown.\n",(name).c_str());
+ return false;
+}//}}}
diff --git a/ArgumentParser.h b/ArgumentParser.h
new file mode 100644
index 0000000..aaaea94
--- /dev/null
+++ b/ArgumentParser.h
@@ -0,0 +1,110 @@
+#ifndef ARGUMENTPARSER_H
+#define ARGUMENTPARSER_H
+
+#include<map>
+#include<string>
+#include<vector>
+
+using namespace std;
+
+enum OptionType {OTSTRING, OTLONG, OTBOOL, OTDOUBLE};
+struct Option{//{{{
+ OptionType type;
+ string shortName,longName,description;
+};//}}}
+
+class ArgumentParser{
+ private:
+ map<string,string> mapS;
+ map<string,long> mapL;
+ map<string,bool> mapB;
+ map<string,double> mapD;
+ map<string,string> names;
+ map<string,Option> validOptions;
+ string programName, argumentDesc, programDesc;
+ vector<string> arguments;
+ vector<string> compulsory;
+ long minimumArguments;
+
+ bool existsOption(const string &name, bool warn = false) const;
+ bool existsName(const string &name) const;
+ template <typename valueType>
+ void appendDescription(string *desc,valueType defValue);
+ public:
+ // The value of verbose option for direct access.
+ bool verbose;
+
+ // Constructor for the class sets: programDescription, additional string
+ // and minimum number of required arguments.
+ ArgumentParser(const string &pD="",const string &aD="[FILES]", long minArgs = 1){//{{{
+ verbose = false;
+ init(pD,aD,minArgs);
+ }//}}}
+ // Init function for initialization, sets the same values as constructor.
+ void init(const string &pD="",const string &aD="[FILES]", long minArgs = 1){//{{{
+ programDesc=pD;
+ argumentDesc=aD;
+ minimumArguments = minArgs;
+ }//}}}
+ // Parse function given number of arguments and array of arguments
+ // it processes the arguments and makes options available through
+ // get[S/L/D] functions and args() function.
+ bool parse(int n,char * argv[]);
+ /*
+ * SETTERS:
+ */
+ // Add option (string) adds new option, name is the name used for referring
+ // to it.
+ void addOptionS(const string &shortName,
+ const string &longName,
+ const string &name,
+ bool comp,
+ const string &description="",
+ const string &defValue="noDefault");
+ // Add option (long).
+ void addOptionL(const string &shortName, const string &longName,
+ const string &name, bool comp, const string &description="",
+ long defValue=-47);
+ // Add option (double).
+ void addOptionD(const string &shortName, const string &longName,
+ const string &name, bool comp, const string &description="",
+ double defValue=-47.47);
+ // Add option (boolean or 'flag').
+ void addOptionB(const string &shortName, const string &longName,
+ const string &name, bool comp, const string &description="",
+ bool defValue=false);
+ /*
+ * GETTERS:
+ */
+ // Return reference to vector of arguments
+ // (i.e. the strings provided with no -/-- modifier).
+ const vector<string>& args() const { return arguments; }
+ // Return true if option <name> was set.
+ bool isSet(const string &name) const;
+ // Return value of string option <name>.
+ string getS(const string &name) const;
+ // Return value of string option <name> in lower case.
+ string getLowerS(const string &name) const;
+ // Return value of integer option <name>.
+ long getL(const string &name) const;
+ // Return value of double option <name>.
+ double getD(const string &name) const;
+ // Return value of bool option <name>.
+ bool flag(const string &name) const;
+ // Return value of verbose.
+ bool verb() const { return verbose; }
+
+ /*
+ * OTHER:
+ */
+ // (Advanced get) Return tokenized (comma separated) string as vector of doubles.
+ vector<double> getTokenizedS2D(const string &name) const;
+ // Write usage string.
+ void usage();
+ // Write all options.
+ void writeAll();
+ // Update value of existing string option.
+ void updateS(const string &name, const string &value);
+};
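+
+// Example usage (illustrative sketch; option names invented for the example):
+// ArgumentParser args("Program description.", "[FILES]", 1);
+// args.addOptionS("o", "outPrefix", "outPrefix", true, "Output prefix.");
+// args.addOptionL("N", "samplesN", "samplesN", false, "Number of samples.", 1000);
+// if(!args.parse(argc, argv)) return 1;
+// string prefix = args.getS("outPrefix");
+// long samplesN = args.getL("samplesN");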
+
+#endif
diff --git a/CollapsedSampler.cpp b/CollapsedSampler.cpp
new file mode 100644
index 0000000..6d3c131
--- /dev/null
+++ b/CollapsedSampler.cpp
@@ -0,0 +1,95 @@
+#ifdef DoSTATS
+#include<sys/time.h>
+#endif
+
+#include "CollapsedSampler.h"
+#include "common.h"
+
+void CollapsedSampler::sampleZ(){//{{{
+ int_least32_t i,j,k;
+ // Resize Z and initialize if not big enough. {{{
+ if((long)Z.size() != Nmap){
+ Z.assign(Nmap,0);
+ // init Z&C
+ for(i=0;i<Nmap;i++){
+ //choose random transcript;
+ k = (int_least32_t) (m * uniformDistribution(rng_mt));
+ Z[i]=k;
+ C[k]++;
+ }
+ }//}}}
+ // TimeStats {{{
+#ifdef DoSTATS
+ nZ++;
+ struct timeval start, end;
+ gettimeofday(&start, NULL);
+#endif
+ // }}}
+ vector<double> phi(m,0);
+ // phi of size m should be enough, because the probabilities of
+ // alignments to the same isoform are summed when reading the data
+ double probNorm,r,sum,const1a,const1b,const2a;
+ int_least32_t readsAlignmentsN;
+
+ const1a = beta->beta + Nunmap;
+ const1b = m * dir->alpha + Nmap - 1;
+ const2a = beta->alpha + Nmap - 1;
+ // randomize order: ???
+ for(i=0;i<Nmap;i++){
+ probNorm=0;
+ C[Z[i]]--; // use counts without the current one
+ readsAlignmentsN = alignments->getReadsI(i+1) - alignments->getReadsI(i);
+ for(j=0, k=alignments->getReadsI(i); j<readsAlignmentsN; j++, k++){
+ //message("%ld %lf ",(*alignments)[k].getTrId(),(*alignments)[k].getProb());
+ if(alignments->getTrId(k) == 0){
+ phi[j] = alignments->getProb(k) *
+ (const1a + C[0]) *
+ (const1b - C[0]); // this comes from division in "false part"
+ }else{
+ phi[j] = alignments->getProb(k) *
+ (const2a - C[0]) *
+ (dir->alpha + C[ alignments->getTrId(k) ]);
+ /*
+ /(m * dir->alpha + Nmap - 1 - C[0]) ;
+ this term was replaced by *(const1b - C[0])
+ and moved into "true part" as multiplication
+ */
+ }
+ probNorm += phi[j];
+ }
+ r = uniformDistribution(rng_mt);
+ // Apply Normalization constant:
+ r *= probNorm;
+ for(j = 0, sum = 0 ; (sum<r) && (j<readsAlignmentsN); j++){
+ sum += phi[j];
+ }
+ if(j==0){
+ // e.g. if probNorm == 0
+ // assign to noise.
+ Z[i] = 0;
+ } else {
+ Z[i] = alignments->getTrId(alignments->getReadsI(i) + j -1);
+ }
+ C[ Z[i] ]++;
+ }
+ // TimeStats {{{
+#ifdef DoSTATS
+ gettimeofday(&end, NULL);
+ tZ += (end.tv_sec-start.tv_sec)*1000*1000+(end.tv_usec-start.tv_usec);
+#endif
+ // }}}
+}//}}}
+
+void CollapsedSampler::update(){//{{{
+ Sampler::update();
+
+ sampleTheta();
+
+ updateSums();
+ if((doLog)&&(save))appendFile();
+}//}}}
+void CollapsedSampler::sample(){//{{{
+ Sampler::sample();
+
+ sampleZ();
+}//}}}
diff --git a/CollapsedSampler.h b/CollapsedSampler.h
new file mode 100644
index 0000000..eb8c56d
--- /dev/null
+++ b/CollapsedSampler.h
@@ -0,0 +1,16 @@
+#include<stdint.h>
+
+#include "Sampler.h"
+
+class CollapsedSampler : public Sampler{
+ private:
+ vector<int_least32_t> Z;
+
+ void sampleZ();
+
+ public:
+
+ virtual void update();
+ virtual void sample();
+
+};
diff --git a/FileHeader.cpp b/FileHeader.cpp
new file mode 100644
index 0000000..d40f58c
--- /dev/null
+++ b/FileHeader.cpp
@@ -0,0 +1,117 @@
+#include<cstdlib>
+
+#include "FileHeader.h"
+#include "misc.h"
+
+#include "common.h"
+
+using namespace ns_fileHeader;
+
+void FileHeader::skipEmptyLines() {//{{{
+ if(!file) return;
+ while(file->good() &&
+ ((file->peek() == ' ') ||
+ (file->peek() == '\n')))
+ file->get();
+}//}}}
+
+bool FileHeader::readValues(ofstream *outF){//{{{
+ if((file==NULL)||(!file->is_open())){
+ error("FileHeader: Input file not opened for reading.\n");
+ return false;
+ }
+ string line;
+ vector<string> words;
+ long value;
+ char *chP;
+ skipEmptyLines();
+ while(file->good() && (file->peek() == '#')){
+ // Read line.
+ getline(*file, line);
+ // If outF is defined, copy the header there.
+ if(outF!=NULL)(*outF)<<line<<endl;
+ skipEmptyLines();
+ // Tokenize line into words.
+ words = ns_misc::tokenize(line);
+ // Store words as flags. Start with the 1st word, since the 0th one is the leading '#'.
+ // If word is followed by a numeric value, use it as a value for the flag.
+ for(long i=1;i<(long)words.size();i++){
+ // Only add new entry if it wasn't there already.
+ if(values.count(words[i])==0)
+ values[words[i]] = no_value;
+ // See if next word is numeric and if so, then use it as a value.
+ if(i+1<(long)words.size()){
+ value = strtol(words[i+1].c_str(), &chP, 10);
+ // Conversion was successful if the value is non-zero OR the pointer points to the end of the string (null character).
+ if((value!=0)||(*chP=='\0')) {
+ // Save value and skip the number.
+ values[words[i]] = value;
+ i++;
+ }
+ }
+ }
+ }
+ return true;
+}//}}}
+
+bool FileHeader::samplesHeader(long *n, long *m, bool *transposed, bool *logged){//{{{
+ if(!readValues()){
+ *n=0;
+ *m=0;
+ return false;
+ }
+ if(logged!=NULL)if(values.count("L"))*logged = true;
+ if(values.count("T"))*transposed = true;
+ if(values.count("M") && (values["M"]!=no_value))*m = values["M"];
+ if(values.count("N") && (values["N"]!=no_value))*n = values["N"];
+ return true;
+}//}}}
+
+bool FileHeader::transcriptsHeader(long *m, long *colN){//{{{
+ if(!readValues()){
+ *m=0;
+ return false;
+ }
+ if(values.count("M") && (values["M"]!=no_value))*m = values["M"];
+ if(colN!=NULL)
+ if(values.count("colN") && (values["colN"]!=no_value))*colN = values["colN"];
+ return true;
+}//}}}
+
+bool FileHeader::probHeader(long *Nmap, long *Ntotal, long *M, AlignmentFileType *format){//{{{
+ if(!readValues()){
+ *M=0;
+ *Nmap=0;
+ return false;
+ }
+ if(values.count("LOGFORMAT")){*format = LOG_FORMAT;}
+ else if(values.count("NEWFORMAT")){*format = NEW_FORMAT;}
+ else *format = OLD_FORMAT;
+ if(values.count("Ntotal") && (values["Ntotal"]!=no_value))*Ntotal = values["Ntotal"];
+ if(values.count("Nmap") && (values["Nmap"]!=no_value))*Nmap = values["Nmap"];
+ if(values.count("M") && (values["M"]!=no_value))*M = values["M"];
+ return true;
+}//}}}
+
+bool FileHeader::varianceHeader(long *m,bool *logged){//{{{
+ if(!readValues()){
+ *m=0;
+ return false;
+ }
+ if(logged!=NULL)if(values.count("L"))*logged = true;
+ if(values.count("M") && (values["M"]!=no_value))*m = values["M"];
+ return true;
+}//}}}
+
+bool FileHeader::paramsHeader(long *parN, ofstream *outF){//{{{
+ if(!readValues(outF)){
+ *parN=0;
+ return false;
+ }
+ *parN = 0;
+ if(values.count("PN") && (values["PN"]!=no_value))*parN = values["PN"];
+ return true;
+}//}}}
+
+
+
diff --git a/FileHeader.h b/FileHeader.h
new file mode 100644
index 0000000..e1ae93f
--- /dev/null
+++ b/FileHeader.h
@@ -0,0 +1,45 @@
+#ifndef FILEHEADER_H
+#define FILEHEADER_H
+
+#include<fstream>
+#include<map>
+#include<vector>
+
+using namespace std;
+
+const long no_value = -4747;
+
+namespace ns_fileHeader {
+enum AlignmentFileType { OLD_FORMAT, NEW_FORMAT, LOG_FORMAT };
+} // namespace ns_fileHeader
+
+// FileHeader class parses file headers (lines starting with # at the beginning of the file).
+// Every word (space separated string) is considered a possible FLAG.
+// If a FLAG is followed by a numeric value, then the value is stored as the FLAG's value.
+// The individual functions then just check whether a FLAG was present and, in the case of integers, whether it had a value assigned to it.
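+// Example (illustrative): a header line such as
+// # T L M 1000 N 500
+// is parsed into values = {"T": no_value, "L": no_value, "M": 1000, "N": 500},
+// which samplesHeader() reports as a transposed, logged file with
+// M=1000 transcripts and N=500 samples.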
+class FileHeader {
+ private:
+ ifstream *file;
+ map<string,long> values;
+ bool readValues(ofstream *outF = NULL);
+
+ void skipEmptyLines();
+ public:
+ FileHeader(ifstream *f = NULL) {
+ file = f;
+ }
+ void setFile(ifstream *f){
+ file = f;
+ }
+ void close(){
+ file->close();
+ file=NULL;
+ }
+ bool samplesHeader(long *n, long *m, bool *transposed, bool *logged = NULL);
+ bool transcriptsHeader(long *m, long *colN);
+ bool probHeader(long *Nmap, long *Ntotal, long *M, ns_fileHeader::AlignmentFileType *format);
+ bool varianceHeader(long *m, bool *logged);
+ bool paramsHeader(long *parN, ofstream *outF);
+};
+
+#endif
diff --git a/GibbsParameters.cpp b/GibbsParameters.cpp
new file mode 100644
index 0000000..82479b4
--- /dev/null
+++ b/GibbsParameters.cpp
@@ -0,0 +1,110 @@
+#include<fstream>
+
+using namespace std;
+
+#include "GibbsParameters.h"
+#include "common.h"
+
+#define DEBUGGP(x)
+#define Sof(x) (long)x.size()
+
+
+/*void gibbsParameters::setLogFiles(string tau,string tauMeans){//{{{
+ gs_samplesFile=tau;
+ gs_meansFile=tauMeans;
+}//}}}*/
+void gibbsParameters::getAllParameters(){//{{{
+ message("Parameters:\n burnIn: %ld\
+\n samplesN: %ld\n samplesSave: %ld\
+\n samplesNmax: %ld\n chainsN: %ld\
+\n targetScaleReduction: %lf\n dirAlpha: %lf\
+\n dirBeta: %lf\n betaAlpha: %lf\n betaBeta: %lf\n",
+gs_burnIn,gs_samplesN,gs_samplesSave,gs_samplesNmax,gs_chainsN,gs_targetScaleReduction,dirP.alpha,dirP.beta,betaP.alpha,betaP.beta);
+}//}}}
+bool gibbsParameters::setParameters(string paramFileName){//{{{
+ this->paramFileName = paramFileName;
+ return readParameters();
+}//}}}
+bool gibbsParameters::setParameters(ArgumentParser &args){//{{{
+ if(args.isSet("MCMC_burnIn"))gs_burnIn=args.getL("MCMC_burnIn");
+ if(args.isSet("MCMC_samplesN"))gs_samplesN=args.getL("MCMC_samplesN");
+ if(args.isSet("MCMC_samplesSave"))gs_samplesSave=args.getL("MCMC_samplesSave");
+ if(args.isSet("MCMC_samplesNmax"))gs_samplesNmax=args.getL("MCMC_samplesNmax");
+ if(args.isSet("MCMC_chainsN"))gs_chainsN=args.getL("MCMC_chainsN");
+ if(args.isSet("MCMC_scaleReduction"))gs_targetScaleReduction=args.getD("MCMC_scaleReduction");
+ if(args.isSet("MCMC_dirAlpha"))dirP.alpha=args.getD("MCMC_dirAlpha");
+ return true;
+}//}}}
+bool gibbsParameters::readParameters(){//{{{
+ ifstream pFile;
+ string param;
+ double val;
+ char tmp[256];
+ pFile.open(paramFileName.c_str());
+ while((pFile.is_open())&&(! pFile.eof())){
+ if((! (pFile>>param)) || (Sof(param)==0) || (param[0]=='#')){
+ pFile.getline(tmp,256);
+ continue;
+ }
+ pFile>>val;
+ if(pFile.good()){
+ DEBUGGP(message("# DEBUG gPar ||%s==%lf||\n",(param).c_str(),val);)
+ if(param=="burnIn")parameter("burnIn",gs_burnIn,val);
+ if(param=="samplesN")parameter("samplesN",gs_samplesN,val);
+ if(param=="samplesSave")parameter("samplesSave",gs_samplesSave,val);
+ if(param=="samplesNmax")parameter("samplesNmax",gs_samplesNmax,val);
+ if(param=="chainsN")parameter("chainsN",gs_chainsN,val);
+ if(param=="targetScaleReduction")parameter("targetScaleReduction",gs_targetScaleReduction,val);
+ if(param=="dirAlpha")parameter("dirAlpha",dirP.alpha,val);
+ if(param=="dirBeta")parameter("dirBeta",dirP.beta,val);
+ if(param=="betaAlpha")parameter("betaAlpha",betaP.alpha,val);
+ if(param=="betaBeta")parameter("betaBeta",betaP.beta,val);
+ //if(param=="output")parameter("output",gs_output,val);
+ }
+ pFile.getline(tmp,256);
+ }
+ //if(gs_samplesN>gs_samplesNmax)gs_samplesNmax=gs_samplesN;
+ pFile.close();
+ return true;
+}//}}}
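+// Example parameter file accepted by readParameters() (illustrative):
+// # MCMC settings
+// burnIn 1000
+// samplesN 2000
+// chainsN 4
+// targetScaleReduction 1.1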
+void gibbsParameters::parameter(string name, double &variable, double value){//{{{
+ bool output=false;
+ if(verbose && (variable != value))output = true;
+ variable = value;
+ if(output){
+ message("### %s: %lf\n",(name).c_str(),variable);
+ }
+}//}}}
+void gibbsParameters::parameter(string name, long &variable, double value){//{{{
+ bool output=false;
+ if(verbose && (variable != (long) value))output = true;
+ variable = (long) value;
+ if(output){
+ message("### %s: %ld\n",(name).c_str(),variable);
+ }
+}//}}}
+void gibbsParameters::parameter(string name, bool &variable, double value){//{{{
+ bool output=false;
+ if(verbose && (variable !=(bool)((long) value)))output = true;
+ variable = (bool)((long)value);
+ if(output){
+ message("### %s: %d\n",(name).c_str(),variable);
+ }
+}//}}}
+
+gibbsParameters::gibbsParameters(bool verbose){//{{{
+ this->verbose = verbose;
+ gs_burnIn=1000;
+ gs_samplesN=1000;
+ gs_samplesNmax=50000;
+ gs_samplesSave=500;
+ gs_chainsN=4;
+ gs_targetScaleReduction=1.2;
+ dirP.alpha=1;
+ dirP.beta=1;
+ betaP.alpha=10;
+ betaP.beta=2;
+ gs_samplesFile="gibbs_log.rpkmS";
+ gs_meansFile="gibbs_log.thetaMeans";
+ //gs_output=RPKM;
+}//}}}
diff --git a/GibbsParameters.h b/GibbsParameters.h
new file mode 100644
index 0000000..372b51c
--- /dev/null
+++ b/GibbsParameters.h
@@ -0,0 +1,45 @@
+#ifndef GIBBSPARAMETERS_H
+#define GIBBSPARAMETERS_H
+
+#include<string>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+
+struct distributionParameters{//{{{
+ double alpha,beta;
+};//}}}
+
+class gibbsParameters{
+ private:
+ long gs_burnIn, gs_samplesN, gs_chainsN, gs_samplesNmax, gs_samplesSave;
+ double gs_targetScaleReduction;
+ bool verbose;
+ distributionParameters dirP, betaP;
+ string gs_samplesFile,gs_meansFile,paramFileName;
+ void parameter(string name, bool &variable, double value);
+ void parameter(string name, long &variable, double value);
+ void parameter(string name, double &variable, double value);
+ public:
+ gibbsParameters(bool verbose = true);
+ bool setParameters(string paramFileName);
+ bool setParameters(ArgumentParser &args);
+ bool readParameters();
+ void getAllParameters();
+ long burnIn() const {return gs_burnIn;}
+ long samplesN() const {return gs_samplesN;}
+ long samplesSave() const {return gs_samplesSave;}
+ long samplesNmax() const {return gs_samplesNmax;}
+ long chainsN() const {return gs_chainsN;}
+ const distributionParameters& dir() const {return dirP;}
+ const distributionParameters& beta()const {return betaP;}
+ double targetScaleReduction() const {return gs_targetScaleReduction;}
+// string samplesFile() const {return gs_samplesFile;}
+// string meansFile() const {return gs_meansFile;}
+// void setLogFiles(string tau,string tauMeans);
+// outputType output() const {return (outputType)gs_output;}
+};
+
+
+#endif
diff --git a/GibbsSampler.cpp b/GibbsSampler.cpp
new file mode 100644
index 0000000..7e00e9e
--- /dev/null
+++ b/GibbsSampler.cpp
@@ -0,0 +1,97 @@
+#ifdef DoSTATS
+#include<sys/time.h>
+#endif
+
+#include "GibbsSampler.h"
+#include "common.h"
+
+GibbsSampler::GibbsSampler(){ //{{{
+ thetaAct=0;
+}//}}}
+void GibbsSampler::sampleZ(){//{{{
+ // TimeStats {{{
+#ifdef DoSTATS
+ nZ++;
+ struct timeval start, end;
+ gettimeofday(&start, NULL);
+#endif
+ // }}}
+ long i,j,k;
+ vector<double> phi(m,0);
+ // phi of size m should be enough, because the probabilities of
+ // alignments to the same isoform are summed when reading the data
+ double probNorm,r,sum;
+ int_least32_t readsAlignmentsN;
+
+ // Reset C to zeros.
+ C.assign(C.size(),0);
+ // Assign reads.
+ for(i=0;i<Nmap;i++){
+ probNorm=0;
+ readsAlignmentsN = alignments->getReadsI(i+1) - alignments->getReadsI(i);
+ for(j=0, k=alignments->getReadsI(i); j < readsAlignmentsN; j++, k++){
+ if(alignments->getTrId(k) == 0){
+ phi[j] = alignments->getProb(k) * (1 - thetaAct);
+ }else{
+ phi[j] = alignments->getProb(k) *
+ thetaAct * theta[alignments->getTrId(k)];
+ }
+ probNorm += phi[j];
+ }
+ r = uniformDistribution(rng_mt);
+ // Apply Normalization constant:
+ r *= probNorm;
+ for(j = 0, sum = 0 ; (sum<r) && (j<readsAlignmentsN); j++){
+ sum += phi[j];
+ }
+ if(j==0){
+ // e.g. if probNorm == 0
+ // assign to noise.
+ C[0]++;
+ }else{
+ // Assign to the chosen transcript.
+ C[ alignments->getTrId( alignments->getReadsI(i)+j-1 ) ]++;
+ }
+ }
+ // TimeStats {{{
+#ifdef DoSTATS
+ gettimeofday(&end, NULL);
+ tZ += (end.tv_sec-start.tv_sec)*1000*1000+(end.tv_usec-start.tv_usec);
+#endif
+ // }}}
+}//}}}
+void GibbsSampler::sampleThetaAct(){//{{{
+#ifdef DoSTATS
+ nTa++;
+ struct timeval start, end;
+ gettimeofday(&start, NULL);
+#endif
+ double C0=C[0]+Nunmap,X,Y;
+ // counting C_0 from all reads
+ // generate thetaAct~Beta(a,b) as thetaAct = X/(X+Y) ; X~Gamma(a,1), Y~Gamma(b,1)
+ gammaDistribution.param(gDP(beta->alpha + Nmap+Nunmap - C0, 1));
+ X = gammaDistribution(rng_mt);
+ gammaDistribution.param(gDP(beta->beta + C0, 1));
+ Y = gammaDistribution(rng_mt);
+
+ thetaAct = X / (X+Y);
+#ifdef DoSTATS
+ gettimeofday(&end, NULL);
+ tTa += (end.tv_sec-start.tv_sec)*1000*1000+(end.tv_usec-start.tv_usec);
+#endif
+}//}}}
+void GibbsSampler::update(){//{{{
+ Sampler::update();
+
+ theta[0]=thetaAct; // save thetaAct as theta_0
+
+ updateSums();
+ if((doLog)&&(save))appendFile();
+}//}}}
+void GibbsSampler::sample(){//{{{
+ Sampler::sample();
+
+ sampleTheta();
+ sampleThetaAct();
+ sampleZ();
+}//}}}
diff --git a/GibbsSampler.h b/GibbsSampler.h
new file mode 100644
index 0000000..2eb387b
--- /dev/null
+++ b/GibbsSampler.h
@@ -0,0 +1,17 @@
+
+#include "Sampler.h"
+
+class GibbsSampler : public Sampler{
+ private:
+ double thetaAct;
+
+ void sampleThetaAct();
+ void sampleZ();
+
+ public:
+
+ GibbsSampler();
+
+ virtual void update();
+ virtual void sample();
+};
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a6d5c11
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,129 @@
+CXX = g++
+ARCH = -mtune=generic
+VERSION = 0.7.5
+# ARCH = -march=core2
+# ARCH = -march=native
+
+
+# Use O1 for debugging so it's not totally slow.
+DBGFLAGS = -O1 -ggdb -U_FORTIFY_SOURCE
+COFLAGS = $(ARCH) -O2 -pipe
+CXXFLAGS = -DBS_VERSION=\"$(VERSION)\" -Wall $(COFLAGS)
+# -Wvla does not work with old gcc
+# -ffast-math segfaults with old gcc, don't use.
+LDFLAGS = -Wl,-gc-sections
+BOOSTFLAGS = -I .
+OPENMP = -fopenmp -DSUPPORT_OPENMP
+
+PROGRAMS = \
+ convertSamples \
+ estimateDE \
+ estimateExpression \
+ estimateHyperPar \
+ estimateVBExpression \
+ extractSamples \
+ getFoldChange \
+ getGeneExpression \
+ getPPLR \
+ getVariance \
+ getWithinGeneExpression \
+ parseAlignment \
+ transposeLargeFile
+
+all: $(PROGRAMS)
+
+COMMON_DEPS = ArgumentParser.o common.o FileHeader.o misc.o MyTimer.o
+# PROGRAMS:
+convertSamples: convertSamples.cpp $(COMMON_DEPS) TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) convertSamples.cpp $(COMMON_DEPS) TranscriptInfo.o -o convertSamples
+
+estimateDE: estimateDE.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(LDFLAGS) estimateDE.cpp $(COMMON_DEPS) PosteriorSamples.o -o estimateDE
+
+estimateExpression: estimateExpression.cpp $(COMMON_DEPS) CollapsedSampler.o GibbsParameters.o GibbsSampler.o Sampler.o TagAlignments.o TranscriptInfo.o transposeFiles.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) $(LDFLAGS) estimateExpression.cpp $(COMMON_DEPS) CollapsedSampler.o GibbsParameters.o GibbsSampler.o Sampler.o TagAlignments.o TranscriptInfo.o transposeFiles.o -o estimateExpression
+
+estimateHyperPar: estimateHyperPar.cpp $(COMMON_DEPS) lowess.o PosteriorSamples.o TranscriptExpression.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(LDFLAGS) estimateHyperPar.cpp $(COMMON_DEPS) lowess.o PosteriorSamples.o TranscriptExpression.o -o estimateHyperPar
+
+estimateVBExpression: estimateVBExpression.cpp $(COMMON_DEPS) SimpleSparse.o TagAlignments.o TranscriptInfo.o transposeFiles.o VariationalBayes.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) $(LDFLAGS) estimateVBExpression.cpp $(COMMON_DEPS) SimpleSparse.o TagAlignments.o TranscriptInfo.o transposeFiles.o VariationalBayes.o -o estimateVBExpression
+
+extractSamples: extractSamples.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) extractSamples.cpp $(COMMON_DEPS) PosteriorSamples.o -o extractSamples
+
+getFoldChange: getFoldChange.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getFoldChange.cpp $(COMMON_DEPS) PosteriorSamples.o -o getFoldChange
+
+getGeneExpression: getGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o -o getGeneExpression
+
+getPPLR: getPPLR.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getPPLR.cpp $(COMMON_DEPS) PosteriorSamples.o -o getPPLR
+
+getVariance: getVariance.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getVariance.cpp $(COMMON_DEPS) PosteriorSamples.o -o getVariance
+
+getWithinGeneExpression: getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o -o getWithinGeneExpression
+
+parseAlignment: parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/sam.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o
+ $(CXX) $(CXXFLAGS) $(OPENMP) $(LDFLAGS) -pthread parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/*.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o -lz -o parseAlignment
+
+transposeLargeFile: transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o -o transposeLargeFile
+
+# LIBRARIES:
+ArgumentParser.o: ArgumentParser.cpp ArgumentParser.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c ArgumentParser.cpp
+
+CollapsedSampler.o: CollapsedSampler.cpp CollapsedSampler.h GibbsParameters.h Sampler.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c CollapsedSampler.cpp
+
+FileHeader.o: common.h misc.h FileHeader.cpp FileHeader.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -ffunction-sections -fdata-sections -c FileHeader.cpp
+
+GibbsSampler.o: GibbsSampler.cpp GibbsSampler.h GibbsParameters.h Sampler.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c GibbsSampler.cpp
+
+misc.o: ArgumentParser.h PosteriorSamples.h misc.cpp misc.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c misc.cpp
+
+MyTimer.o: MyTimer.h MyTimer.cpp
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c MyTimer.cpp
+
+PosteriorSamples.o: PosteriorSamples.cpp PosteriorSamples.h FileHeader.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c PosteriorSamples.cpp
+
+ReadDistribution.o: ReadDistribution.cpp ReadDistribution.h TranscriptExpression.h TranscriptInfo.h TranscriptSequence.h
+ $(CXX) $(CXXFLAGS) $(OPENMP) -c ReadDistribution.cpp
+
+Sampler.o: Sampler.cpp Sampler.h GibbsParameters.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c Sampler.cpp
+
+SimpleSparse.o: SimpleSparse.cpp SimpleSparse.h
+ $(CXX) $(CXXFLAGS) $(OPENMP) -c SimpleSparse.cpp
+
+VariationalBayes.o: VariationalBayes.cpp VariationalBayes.h SimpleSparse.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) -c VariationalBayes.cpp
+
+common.o: common.cpp common.h
+GibbsParameters.o: ArgumentParser.h GibbsParameters.cpp GibbsParameters.h
+lowess.o: lowess.cpp lowess.h
+TagAlignments.o: TagAlignments.cpp TagAlignments.h
+TranscriptExpression.o: TranscriptExpression.cpp TranscriptExpression.h
+TranscriptInfo.o: TranscriptInfo.cpp TranscriptInfo.h
+TranscriptSequence.o: TranscriptSequence.cpp TranscriptSequence.h
+transposeFiles.o: transposeFiles.cpp transposeFiles.h FileHeader.h
+
+# EXTERNAL LIBRARIES:
+samtools/sam.o:
+ make --directory samtools
+
+# CLEAN:
+clean:
+ rm *.o $(PROGRAMS)
+
+clean-all:
+ rm samtools/*.o *.o $(PROGRAMS)
+
diff --git a/MyTimer.cpp b/MyTimer.cpp
new file mode 100644
index 0000000..7bb34c0
--- /dev/null
+++ b/MyTimer.cpp
@@ -0,0 +1,60 @@
+#include<ctime>
+
+#include "MyTimer.h"
+
+#include "common.h"
+
+void MyTimer::adjust(double &time,char f){//{{{
+ if(f=='m')time/=60.0;
+ if(f=='h')time/=3600.0;
+}//}}}
+void MyTimer::write(double time,char f){//{{{
+ if(!quiet)messageF("[time: +%.2lf %c]\n",time,f);
+}//}}}
+MyTimer::MyTimer(){//{{{
+ N=1;
+ quiet=false;
+ times.resize(N);
+ times[0]=time(NULL);
+}//}}}
+void MyTimer::start(long timer){//{{{
+ if(timer>=N){
+ N=timer+1;
+ times.resize(N);
+ }
+ times[timer]=time(NULL);
+}//}}}
+double MyTimer::split(long timer, char f){//{{{
+ if(timer>=N)return 0;
+ double ret;
+ ret=time(NULL)-times[timer];
+ adjust(ret,f);
+ write(ret,f);
+ times[timer]=time(NULL);
+ return ret;
+}//}}}
+double MyTimer::getTime(long timer, char f){//{{{
+ if(timer>=N)return 0;
+ double ret;
+ ret=time(NULL)-times[timer];
+ adjust(ret,f);
+ return ret;
+}//}}}
+double MyTimer::current(long timer, char f){//{{{
+ if(timer>=N)return 0;
+ double ret;
+ ret=time(NULL)-times[timer];
+ adjust(ret,f);
+ write(ret,f);
+ return ret;
+}//}}}
+double MyTimer::stop(long timer, char f){//{{{
+ if(timer>=N)return 0;
+ double ret;
+ ret=time(NULL)-times[timer];
+ adjust(ret,f);
+ write(ret,f);
+ times[timer]=time(NULL);
+ return ret;
+}//}}}
+
diff --git a/MyTimer.h b/MyTimer.h
new file mode 100644
index 0000000..26d3bcd
--- /dev/null
+++ b/MyTimer.h
@@ -0,0 +1,28 @@
+#ifndef MYTIMER_H
+#define MYTIMER_H
+
+#include<vector>
+
+using namespace std;
+
+class MyTimer{
+ private:
+ vector<time_t> times;
+ long N;
+ bool quiet;
+ // Adjust time to format m or h.
+ void adjust(double &time,char f);
+ // Write time in format.
+ void write(double time,char f);
+ public:
+ MyTimer();
+ void setQuiet(){quiet=true;}
+ void setVerbose(){quiet=false;}
+ void start(long timer=0);
+ double split(long timer=0, char f='s');
+ double getTime(long timer=0, char f='s');
+ double current(long timer=0, char f='s');
+ double stop(long timer=0, char f='s');
+};
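+
+// Example usage (illustrative):
+// MyTimer timer; // timer 0 starts at construction
+// timer.start(1);
+// // ... do some work ...
+// timer.split(1, 'm'); // print elapsed minutes and restart timer 1
+// timer.current(0, 'h'); // print hours since construction, without reset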
+
+#endif
diff --git a/PosteriorSamples.cpp b/PosteriorSamples.cpp
new file mode 100644
index 0000000..580478e
--- /dev/null
+++ b/PosteriorSamples.cpp
@@ -0,0 +1,299 @@
+#include<algorithm>
+#include<cstdlib>
+#include<vector>
+
+using namespace std;
+
+#include "PosteriorSamples.h"
+
+#include "FileHeader.h"
+#include "misc.h"
+
+#include "common.h"
+
+#define Sof(x) (long)x.size()
+#define SS second
+#define FF first
+
+#define MINUS_INF -47
+#define PLUS_INF 1e10
+
+void PosteriorSamples::clear(){//{{{
+ N=0;
+ M=0;
+ norm = 1.0;
+ failed=true;
+ transposed=true;
+ areLogged=false;
+}//}}}
+bool PosteriorSamples::open(string fileName){//{{{
+ if(samplesF.is_open())samplesF.close();
+ samplesF.open(fileName.c_str());
+ if(!samplesF.is_open()){
+ error("PosterioSamples: File open failed: %s\n",(fileName).c_str());
+ failed=true;
+ return false;
+ }
+ return true;
+}//}}}
+bool PosteriorSamples::initSet(long *m,long *n, string fileName){//{{{
+ failed=false;
+ if(! open(fileName))return false;
+
+ FileHeader fh(&samplesF);
+ if(!fh.samplesHeader(n,m,&transposed,&areLogged)){
+ error("PosteriorSamples: File header reading failed.\n");
+ failed=true;
+ return false;
+ }
+ N=*n;
+ M=*m;
+ return read();
+}//}}}
+bool PosteriorSamples::read(){//{{{
+ if(failed)return false;
+ if(transposed){
+ lines=vector<long>(M,-1);
+ lines[0]=samplesF.tellg();
+ }else{
+ if(N*M > PS_maxStoredSamples){
+ error("PosteriorSamples: Too many samples to store,use trasposed file.\n");
+ failed=true;
+ return false;
+ }
+ samples.resize(M,vector<double>(N,0));
+ for(long i=0;i<N;i++)
+ for(long j=0;j<M;j++)
+ samplesF>>samples[j][i];
+ if(!samplesF.good()){
+ failed=true;
+ return false;
+ }
+ }
+ return true;
+}//}}}
+bool PosteriorSamples::getTranscript(long tr,vector<double> &trSamples){//{{{
+ if((tr>=M)||(failed))return false;
+ string str;
+ bool good=true;
+ if(Sof(trSamples)!=N)trSamples.resize(N);
+ if(transposed){
+ long i;
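+ // lines[] lazily caches each transcript's line offset in the file;
+ // when tr's offset is unknown, scan forward from the last cached one.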
+ if(lines[tr]==-1){
+ for(i=0;lines[i+1]!=-1;i++);
+ samplesF.seekg(lines[i]);
+ while((samplesF.good())&&(i<tr)){
+ i++;
+ samplesF.ignore(10000000,'\n');
+ lines[i]=samplesF.tellg();
+ }
+ }else{
+ samplesF.seekg(lines[tr]);
+ }
+ for(i=0;(i<N)&&(samplesF.good());i++){
+ samplesF>>trSamples[i];
+ // apply normalisation.
+ trSamples[i] *= norm;
+ if(samplesF.eof())break;
+ if(samplesF.fail()){
+ samplesF.clear();
+ samplesF.seekg(-1,ios::cur);
+ samplesF>>str;
+ if(ns_misc::toLower(str)=="-inf")trSamples[i]=MINUS_INF;
+ else if(ns_misc::toLower(str)=="nan")trSamples[i]=PLUS_INF;
+ else error("PosteriorSamples: Unknown value: %s in [tr:%ld,pos:%ld]\n",(str).c_str(),tr,i);
+ good=false;
+ }
+ }
+ if(i!=N){
+ good=false;
+ error("PosteriorSamples: Reading failed at position: [tr:%ld,pos:%ld]\n",tr,i);
+ }
+ }else{
+ trSamples = samples[tr];
+ // FIXME(glausp) it is not very efficient to do this every time.
+ // However this part only works for small data files.
+ if(norm!=1.0){
+ for(long i=0;i<N;i++)trSamples[i] *= norm;
+ }
+ }
+ return good;
+}//}}}
+void PosteriorSamples::close(){//{{{
+ samplesF.close();
+ failed=true;
+}//}}}
+
+
+Conditions::Conditions(){//{{{
+ mapping=false;
+ CN=0;
+ C=0;
+}//}}}
+long Conditions::getIndex(long max){ // {{{returns index, without checking for duplicates
+ return rand() % max;
+}//}}}
+long Conditions::getRC(long c) const { //{{{
+ if(c>C)return -1;
+ return cIndex[c].SS;
+}//}}}
+bool Conditions::init(string trFileName, vector<string> filesGot, long *m, long *n){//{{{
+ long c;
+ return init(trFileName,filesGot,&c,m,n);
+}//}}}
+bool Conditions::init(string trFileName, vector<string> filesGot, long *c, long *m, long *n){//{{{
+ long i,j,x,colN;
+ bool sameMs=true;
+ vector<string> files;
+ cIndex.resize(1,pair<long,long>(0,0));
+ for(i=0;i<Sof(filesGot);i++){
+ if(filesGot[i]=="C"){
+ if((cIndex.end()-1)->SS!=0){
+ cIndex.push_back(pair<long,long>(Sof(files),0));
+ }
+ }else{
+ (cIndex.end()-1)->SS++;
+ files.push_back(filesGot[i]);
+ }
+ }
+ if((cIndex.end()-1)->SS==0){
+ cIndex.pop_back();
+ }
+ C = Sof(cIndex);
+ *c = C;
+ //message("File names processed.\n");
+
+ CN = Sof(files);
+ samples.resize(CN);
+ Ms.resize(CN);
+ Ns.resize(CN);
+ if(! samples[0].initSet(&Ms[0],&Ns[0],files[0])){
+ error("Conditions: file %s failed to open.\n",(files[0]).c_str());
+ return false;
+ }
+ areLogged = samples[0].logged();
+ N=Ns[0];
+ M=Ms[0];
+ for(i=1;i<CN;i++){
+ if(! samples[i].initSet(&Ms[i],&Ns[i],files[i])){
+ error("Conditions: file %s failed to open.\n",(files[i]).c_str());
+ return false;
+ }
+ if(areLogged != samples[i].logged()){
+ error("Conditions: Problem reading %s: some samples are logged and some are not.\n",(files[i]).c_str());
+ return false;
+ }
+ if(M!=Ms[i]){
+ sameMs=false;
+ }
+ if(N>Ns[i])N=Ns[i];
+ }
+ *n=N;
+
+ ifstream trFile(trFileName.c_str());
+ if(! trFile.is_open()){
+ // if there is no transcript join file, then we have to make sure that the Ms are the same
+ if(sameMs){
+ M=Ms[0];
+ *m=M;
+ mapping = false;
+ return true;
+ }else{
+ error("Conditions: Different number of transcripts and missing transcript-join file\n");
+ return false;
+ }
+ }else{
+ FileHeader fh(&trFile);
+ if((!fh.transcriptsHeader(&M,&colN))||(M==0)||(colN<CN+1)){
+ error("Conditions: Wrong transcript join descriptor file - m: %ld colN: %ld\n",M,colN);
+ return false;
+ }
+ *m=M;
+ trMap.resize(M,vector<long>(CN));
+ for(i=0;i<M;i++){
+ trFile>>x;
+ for(j=0;j<colN;j++)
+ if(j<CN)trFile >> trMap[i][j];
+ else trFile >> x;
+ }
+ trFile.close();
+ sort(trMap.begin(),trMap.end());// sort for faster disc access
+ mapping=true;
+ return true;
+ }
+ return false; // we should not get here
+}//}}}
+bool Conditions::setNorm(vector<double> norms){//{{{
+ if((long)norms.size()!=CN){
+ error("Conditions: The number of normalization constants does not match number of experiments (files with samples).\n");
+ return false;
+ }
+ for(long i=0;i<CN;i++){
+ samples[i].setNorm(norms[i]);
+ }
+ return true;
+}//}}}
+bool Conditions::getTranscript(long cond, long rep, long tr, vector<double> &trSamples){//{{{
+ if((cond>C)||(rep>cIndex[cond].SS)){
+ trSamples.clear();
+ return false;
+ }
+ return getTranscript(rep+cIndex[cond].FF, tr, trSamples);
+}//}}}
+bool Conditions::getTranscript(long cond, long tr, vector<double> &trSamples){//{{{
+ bool status=false;
+ static vector<double> tmpSamples;
+ if(cond>=CN){
+ error("Conditions: Wrong condition request.\n");
+ return false;
+ }
+ if(tr>=M){
+ error("Conitions: Wrong transcript request.\n");
+ return false;
+ }
+ if(mapping) tr = trMap[tr][cond];
+ if(N != Ns[cond]){
+ status = samples[cond].getTranscript(tr, tmpSamples);
+ if(Sof(trSamples) != N)trSamples.resize(N);
+ for(long i=0;i<N;i++)trSamples[i] = tmpSamples[ getIndex(Ns[cond]) ];
+ }else{
+ status = samples[cond].getTranscript(tr, trSamples);
+ }
+ return status;
+}//}}}
+bool Conditions::getTranscript(long cond, long tr, vector<double> &trSamples, long samplesN){//{{{
+ bool status=false;
+ static vector<double> tmpSamples;
+ if(cond>=CN){
+ error("Conditions: Wrong condition request.\n");
+ return false;
+ }
+ if(tr>=M){
+ error("Conitions: Wrong transcript request.\n");
+ return false;
+ }
+ if(samplesN > Ns[cond]){
+ error("Conitions: Wrong not enough samples.\n");
+ return false;
+ }
+ if(samplesN <1){
+ error("Conitions: Wrong number of samples.\n");
+ return false;
+ }
+ if(mapping)tr=trMap[tr][cond];
+ if(samplesN != Ns[cond]){
+ status = samples[cond].getTranscript(tr, tmpSamples);
+ if(Sof(trSamples) != samplesN)trSamples.resize(samplesN);
+ for(long i=0;i<samplesN;i++)
+ trSamples[i] = tmpSamples[ getIndex(Ns[cond]) ];
+ }else{
+ status = samples[cond].getTranscript(tr, trSamples);
+ }
+ return status;
+}//}}}
+void Conditions::close(){//{{{
+ for(long i=0;i<CN;i++){
+ samples[i].close();
+ }
+ cIndex.clear();
+}//}}}
diff --git a/PosteriorSamples.h b/PosteriorSamples.h
new file mode 100644
index 0000000..bdb6180
--- /dev/null
+++ b/PosteriorSamples.h
@@ -0,0 +1,66 @@
+#ifndef POSTERIORSAMPLES_H
+#define POSTERIORSAMPLES_H
+
+#include<vector>
+#include<fstream>
+#include<string>
+
+using namespace std;
+
+const long PS_maxStoredSamples = 100000000;
+
+class PosteriorSamples{//{{{
+ private:
+ long N,M;
+ double norm;
+ bool transposed,failed,areLogged;
+ ifstream samplesF;
+ vector<long> lines;
+ vector<vector<double> > samples;
+
+ bool open(string fileName);
+ bool read();
+ public:
+ PosteriorSamples() { clear(); }
+ ~PosteriorSamples() { close(); }
+ // Copy constructor and assignment. Both just create a new, empty object; defined only so the class can be stored in vectors.
+ PosteriorSamples(const PosteriorSamples &other) { clear(); }
+ PosteriorSamples& operator=(const PosteriorSamples & other) { //{{{
+ close();
+ clear();
+ return *this;
+ } //}}}
+ void clear();
+ bool initSet(long *m, long *n, string fileName);
+ bool getTranscript(long tr, vector<double> &trSamples);
+ void close();
+ bool logged(){return areLogged;}
+ void setNorm(double norm){this->norm = norm;}
+};//}}}
+
+class Conditions{//{{{
+ private:
+ long M,N,CN,C;
+ bool mapping,areLogged;
+ vector<long> Ms,Ns;
+ vector<vector <long> > trMap;
+ vector<PosteriorSamples> samples;
+ vector<pair<long,long> > cIndex;
+
+ long getIndex(long max); // return index without checking for duplicates
+ public:
+ Conditions();
+ void close();
+ long getRC(long c) const;
+ long getRN() const { return CN;}
+ long getC() const { return C;}
+ bool init(string trFileName, vector<string> filesGot, long *c, long *m, long *n);
+ bool init(string trFileName, vector<string> filesGot, long *m, long *n);
+ bool setNorm(vector<double> norms);
+ bool getTranscript(long cond, long rep, long tr, vector<double> &trSamples);
+ bool getTranscript(long cond, long tr, vector<double> &trSamples);
+ bool getTranscript(long cond, long tr, vector<double> &trSamples, long samplesN);
+ bool logged() const { return areLogged; }
+};//}}}
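+
+// Example usage (illustrative; the file name is invented for the example):
+// PosteriorSamples ps;
+// long M, N;
+// if(ps.initSet(&M, &N, "cond1.rpkm")) {
+// vector<double> trSamples;
+// ps.getTranscript(0, trSamples); // N samples of the first transcript
+// }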
+
+#endif
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5f64e41
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+Dependencies:
+---------------------------------------
+GNU make, g++, zlib (for the samtools API used by the parseAlignment program)
+Optional:
+python - for the helper scripts getCounts.py, extractTranscriptInfo.py
+
+
+Compilation:
+--------------------------------------
+Enter BitSeq directory and run:
+make
+
+
+Help & Usage:
+--------------------------------
+See wiki for more information:
+https://github.com/BitSeq/BitSeq/wiki/Basic-usage
+
+
+Contact:
+----------------------------------
+Please use the local issue tracker: https://github.com/BitSeq/BitSeq/issues for
+help, issue resolution and comments.
+
+For direct contact, please contact Peter Glaus (glaus [at] cs.man.ac.uk).
+
+
+License:
+----------------------------------
+Artistic-2.0
+ + Boost_1_0 for directory boost
+ + MIT for directory samtools
+ + LGPL for directory asa103
+
diff --git a/ReadDistribution.cpp b/ReadDistribution.cpp
new file mode 100644
index 0000000..f9ea7c4
--- /dev/null
+++ b/ReadDistribution.cpp
@@ -0,0 +1,1126 @@
+#include<algorithm>
+#include<cmath>
+#ifdef _OPENMP
+#include<omp.h>
+#endif
+
+#include "ReadDistribution.h"
+
+#include "misc.h"
+#include "MyTimer.h"
+
+#include "common.h"
+
+#define DEBUG(x)
+
+namespace ns_rD {
+// Base 2 Int mapping. //{{{
+vector<char> tableB2I;
+vector<int> tableB2BI;
+//}}}
+/*void inline progressLogRD(long cur,long outOf) {//{{{
+ // output progress status every 10%
+ if((outOf>10)&&(cur%((long)(outOf/10))==0)&&(cur!=0))message("# %ld done.\n",cur);
+}//}}} */
+void fillTable() {//{{{
+ if(tableB2I.size()<256){
+ tableB2I.assign(256,-1);
+ tableB2I['A'] = tableB2I['a'] = 0;
+ tableB2I['C'] = tableB2I['c'] = 1;
+ tableB2I['G'] = tableB2I['g'] = 2;
+ tableB2I['T'] = tableB2I['t'] = 3;
+ }
+ if(tableB2BI.size()>=256)return;
+ tableB2BI.assign(256,15);
+ tableB2BI['A'] = tableB2BI['a'] = 1;
+ tableB2BI['C'] = tableB2BI['c'] = 2;
+ tableB2BI['G'] = tableB2BI['g'] = 4;
+ tableB2BI['T'] = tableB2BI['t'] = 8;
+}//}}}
+inline char base2int(char B){//{{{
+ /* switch(B){
+ case 'A': case 'a': return 0;
+ case 'C': case 'c': return 1;
+ case 'G': case 'g': return 2;
+ case 'T': case 't': return 3;
+ default: return -1;
+ } */
+ return tableB2I[B];
+}//}}}
+inline int base2BAMint(char B){//{{{
+ return tableB2BI[B];
+}//}}}
+template<class keyT,class valT> inline void mapAdd(map<keyT,valT> &m, keyT key, valT val){//{{{
+ if(m.count(key)==0)
+ m[key] = val;
+ else
+ m[key] += val;
+}//}}}
+inline bool readHasPhred(const bam1_t *samA){//{{{
+ if(samA->core.l_qseq < 1) return false;
+ return bam1_qual(samA)[0] != 0xff;
+}//}}}
+// Count (number of deletions) - (number of insertions). {{{
+long countDeletions(const bam1_t *samA){
+ long deletionN = 0;
+ for(long i=0;i<samA->core.n_cigar;i++){
+ switch(bam1_cigar(samA)[i]&BAM_CIGAR_MASK){
+ case BAM_CDEL:
+ deletionN += (long)(bam1_cigar(samA)[i]>>BAM_CIGAR_SHIFT);
+ break;
+ case BAM_CINS:
+ deletionN -= (long)(bam1_cigar(samA)[i]>>BAM_CIGAR_SHIFT);
+ break;
+ }
+ }
+ return deletionN;
+}//}}}
+inline bool getCigarOp(const bam1_t *samA, long cigarI, long *cigarOp, long *cigarOpCount){//{{{
+ if((cigarI<0) || (cigarI >= samA->core.n_cigar)) return false;
+ *cigarOp = bam1_cigar(samA)[cigarI]&BAM_CIGAR_MASK;
+ *cigarOpCount = (long)(bam1_cigar(samA)[cigarI]>>BAM_CIGAR_SHIFT);
+ return true;
+}//}}}
+} // namespace ns_rD
+
+using namespace ns_rD;
+
+ReadDistribution::ReadDistribution(){ //{{{
+ M=0;
+ uniform = lengthSet = gotExpression = normalized = validLength = false;
+ warnFirst = false;
+ warnPos = warnTIDmismatch = warnUnknownTID = noteFirstMateDown = 0;
+ procN = 1;
+#ifdef _OPENMP
+ omp_set_num_threads(procN);
+#endif
+ lMu=100;
+ lSigma=10;
+ verbose = true;
+ singleReadLength = 0;
+ minFragLen=10000;
+ lowProbMismatches = LOW_PROB_MISSES;
+ lProbMis.resize(256,0);
+ lProbHit.resize(256,0);
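+ // lProbMis[Q] = log(10^(-Q/10)), the log error probability implied by
+ // Phred score Q; lProbHit[Q] = log(1 - 10^(-Q/10)).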
+ for(long i=0; i<256; i++){
+ lProbMis[i] = - i / 10.0 * log(10.0);
+ lProbHit[i] = log1p(-exp(lProbMis[i]));
+ }
+ fillTable();
+}//}}}
+void ReadDistribution::writeWarnings() {//{{{
+ if(warnPos>0){
+ warning("ReadDistribution: %ld reads from a pair did not align to the expected strand of a transcript.\n Use --unstranded option in case the 5' and 3' mate are not expected to be from sense and anti-sense strands respectively.\n", warnPos);
+ }
+ if(warnTIDmismatch>0){
+ warning("ReadDistribution: %ld pair reads were aligned to different transcripts.\n", warnTIDmismatch);
+ }
+ if(warnUnknownTID>0){
+ warning("ReadDistribution: %ld fragments were aligned to unknown transcripts.\n", warnUnknownTID);
+ }
+ if(noteFirstMateDown){
+ message("NOTE: ReadDistribution: First mate from a pair was downstream (%ld times).\n", noteFirstMateDown);
+ }
+ warnPos = warnTIDmismatch = warnUnknownTID = noteFirstMateDown = 0;
+}//}}}
+void ReadDistribution::setProcN(long procN){//{{{
+ if(procN<0)procN=1;
+ if(procN>32)procN=4;
+#ifdef _OPENMP
+ this->procN = procN;
+ omp_set_num_threads(procN);
+#else
+ this->procN = 1;
+#endif
+}//}}}
+void ReadDistribution::showFirstWarnings(){//{{{
+ warnFirst = true;
+}//}}}
+bool ReadDistribution::init(long m, TranscriptInfo* trI, TranscriptSequence* trS, TranscriptExpression* trE, bool unstranded, bool verb){ //{{{
+ M = m;
+ verbose = verb;
+ if(trI==NULL){
+ error("ReadDistribution: Missing TranscriptInfo.\n");
+ return false;
+ }
+ if(trS==NULL){
+ error("ReadDistribution: Missing TranscriptSequence.\n");
+ return false;
+ }
+ uniform = false;
+ this->unstranded = unstranded;
+ trInf=trI;
+ trSeq=trS;
+ trExp=trE;
+ if(trExp) gotExpression = true;
+ else gotExpression = false;
+ lengthSet = false;
+ logLengthSum = logLengthSqSum = 0;
+ fragSeen = 0;
+ // Initialize tr - frag_length - expression maps:
+ trFragSeen5.resize(M);
+ trFragSeen3.resize(M);
+ weightNorms.resize(3,vector<map<long, double> >(M));
+ // Initialize position bias matrices:
+ posProb.resize( 6, vector<vector<double> >(trSizesN + 1, vector<double>(trNumberOfBins,0.01/trNumberOfBins)));
+ // Initialize sequence bias VLMMs:
+ seqProb.resize(4);
+ for(long i=0;i<vlmmNodesN;i++){
+ for(long j=0;j<4;j++)
+ seqProb[j].push_back(VlmmNode(vlmmNodeDependence[i]));
+ }
+ return true;
+}//}}}
+bool ReadDistribution::initUniform(long m, TranscriptInfo* trI, TranscriptSequence* trS, bool verb){ //{{{
+ M = m;
+ verbose = verb;
+ if(trI==NULL){
+ error("ReadDistribution: Missing TranscriptInfo.\n");
+ return false;
+ }
+ trInf = trI;
+ trSeq = trS;
+ trExp = NULL;
+ uniform = true;
+ lengthSet = false;
+ gotExpression = false;
+ logLengthSum = logLengthSqSum = 0;
+ fragSeen = 0;
+ return true;
+}//}}}
+void ReadDistribution::setLowProbMismatches(long m){//{{{
+ lowProbMismatches = m>1 ? m:1;
+}//}}}
+void ReadDistribution::setLength(double mu, double sigma){ //{{{
+ lMu=mu;
+ lSigma=sigma;
+ lengthSet=true;
+ validLength=true;
+ computeLengthProb();
+}//}}}
+bool ReadDistribution::observed(fragmentP frag){ //{{{
+ DEBUG(message("%s===%s\n",bam1_qname(frag->first),bam1_qname(frag->second));)
+ long tid = frag->first->core.tid;
+ if((tid < 0)||(tid>=M)){
+ if(warnFirst && (warnUnknownTID==0))
+ warning("TID unknown: %s: %ld\n",bam1_qname(frag->first),tid);
+ warnUnknownTID++;
+ return false;
+ }
+ if((frag->paired)&&(tid!=frag->second->core.tid)){
+ if(warnFirst && (warnTIDmismatch==0))
+ warning("TID mismatch: %s: %s %s\n",bam1_qname(frag->first),
+ trInf->trName(tid).c_str(),
+ trInf->trName(frag->second->core.tid).c_str());
+ warnTIDmismatch++;
+ return false;
+ }
+ // Set inverse expression
+ double Iexp = (gotExpression)? 1.0/trExp->exp(tid) : 1.0;
+ // Calculate reads' true end position:
+ long frag_first_endPos, frag_second_endPos=0;
+ frag_first_endPos = bam_calend(&frag->first->core, bam1_cigar(frag->first));
+ if(frag->paired){
+ frag_second_endPos = bam_calend(&frag->second->core, bam1_cigar(frag->second));
+ }
+ // update lengths: //{{{
+ DEBUG(message(" length update\n");)
+ double len,logLen;
+ if(frag->paired){
+ fragSeen ++;
+ if(frag->second->core.pos>frag->first->core.pos)
+ len = frag_second_endPos - frag->first->core.pos;
+ else{
+ len = frag_first_endPos - frag->second->core.pos;
+ }
+ if(minFragLen>(long)len)minFragLen = (long) len;
+ logLen = log(len);
+ logLengthSum += logLen;
+ logLengthSqSum += logLen*logLen;
+ DEBUG(if(len<=75)message("%s %ld %d %ld %d %ld\n",bam1_qname(frag->first), len, frag->first->core.pos,frag_first_endPos,frag->second->core.pos,frag_second_endPos));
+ mapAdd(fragLengths,(long)len,(long)1);
+ }else{
+ len = frag_first_endPos - frag->first->core.pos;
+ singleReadLength = (long)len;
+ if(singleReadLength<minFragLen)minFragLen = singleReadLength;
+ } //}}}
+  // Update mismatch frequencies if there are no Phred scores. //{{{
+ if((!readHasPhred(frag->first)) || (frag->paired && !readHasPhred(frag->second))){
+ updateMismatchFreq(frag->first);
+ if(frag->paired)updateMismatchFreq(frag->second);
+ }
+ // }}}
+ // for uniform distribution ignore other estimation:
+ if(uniform) return true;
+
+ // check mates relative position: {{{
+ if((frag->paired) && (frag->first->core.pos > frag->second->core.pos)){
+ noteFirstMateDown ++;
+ bam1_t *tmp = frag->second;
+ frag->second = frag->first;
+ frag->first = tmp;
+ }
+ if((frag->paired) && (!unstranded) &&
+ ((frag->first->core.flag & BAM_FREVERSE) ||
+       (!(frag->second->core.flag & BAM_FREVERSE)))){
+ if(warnFirst && (warnPos==0))
+ warning("wrong strand: %s: %s\n",bam1_qname(frag->first),
+ trInf->trName(tid).c_str());
+ warnPos ++;
+ return false;
+ }//}}}
+ // positional bias:
+ // sequence bias:
+ DEBUG(message(" positional & sequence bias\n");)
+ if(! frag->paired){
+ if(frag->first->core.flag & BAM_FREVERSE){
+ // Antisense strand of transcript is 3'end of fragment
+ updatePosBias(frag_first_endPos, readM_3, tid, Iexp);
+ // readM_5 and uniformM_5 are always "second mates"
+ // this is assumed also in getP(...);
+ updateSeqBias(frag_first_endPos, readM_3, tid, Iexp);
+ // update sum of expression of fragments of given length
+ mapAdd(trFragSeen3[tid], (long)len, Iexp);
+ }else{
+ // Sense strand of transcript is 5'end of fragment
+ updatePosBias( frag->first->core.pos, readM_5, tid, Iexp);
+ updateSeqBias( frag->first->core.pos, readM_5, tid, Iexp);
+ mapAdd(trFragSeen5[tid], (long)len, Iexp);
+ }
+ }else{
+ updatePosBias( frag->first->core.pos, readM_5, tid, Iexp);
+ updateSeqBias( frag->first->core.pos, readM_5, tid, Iexp);
+ mapAdd(trFragSeen5[tid], (long)len, Iexp);
+
+ updatePosBias( frag_second_endPos, readM_3, tid, Iexp);
+ updateSeqBias( frag_second_endPos, readM_3, tid, Iexp);
+ mapAdd(trFragSeen3[tid], (long)len, Iexp);
+ }
+ return true;
+}//}}}
+void ReadDistribution::normalize(){ //{{{
+ // length distribution: {{{
+ double newMu=0, newSigma=0;
+
+ if(fragSeen>10){
+ // Estimate mean and sigma for length distribution.
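+      // (Method-of-moments fit of the log-normal on the log scale:
+      //  mu = mean(log len), sigma^2 = mean((log len)^2) - mu^2.)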
+ newMu = logLengthSum / fragSeen;
+ newSigma = sqrt(logLengthSqSum / fragSeen - newMu*newMu);
+ if(verbose)message("ReadDistribution: fragment length mu: %lg sigma: %lg\n",newMu,newSigma);
+ validLength = true;
+ }
+ if(lengthSet){
+ // check difference between estimated mean and provided mean
+ if(abs(newMu-lMu)>lSigma){
+ warning("ReadDistribution: Estimated length mean (%lg) differs too much from the one provided (%lg).\n",newMu,lMu);
+ }
+ }else{
+ // Use estimated mean and sigma;
+ lMu = newMu;
+ lSigma = newSigma;
+ if(validLength)computeLengthProb();
+ }
+ // }}}
+ // mismatch frequencies: {{{
+ double lFreqSum;
+ for(size_t i=0;i<lFreqHit.size();i++){
+ lFreqSum = log(lFreqHit[i]+lFreqMis[i]);
+ lFreqHit[i] = log(lFreqHit[i]) - lFreqSum;
+ lFreqMis[i] = log(lFreqMis[i]) - lFreqSum;
+ }
+ // }}}
+ if(uniform) return;
+ map<long,double>::iterator mIt;
+ long i,j,m,group,trLen,fragLen;
+ double Iexp,norm;
+ double binSize;
+  // set uniform positional bias: //{{{
+ if(verbose)message("ReadDistribution: Computing uniform positional bias.\n");
+ for(m=0;m<M;m++){
+ //if(verbose)progressLogRD(m,M);
+ trLen = trInf->L(m);
+ if(trLen<trNumberOfBins)continue;
+ binSize = (double)trLen / trNumberOfBins;
+ //message(" %ld %ld %ld\n",m,trLen,trFragSeen[m].size());
+ for(group=0;group<trSizesN;group++)
+ if(trLen<trSizes[group])break;
+ // update 5' positional bias
+ for( mIt=trFragSeen5[m].begin(); mIt != trFragSeen5[m].end(); mIt++){
+ fragLen = mIt->first;
+ Iexp = mIt->second / (trLen - fragLen + 1);
+ for(i=0;i<trNumberOfBins;i++){
+ // update probability of each bin by Iexp*"effective length of current bin"
+ if((i+1) * binSize <= fragLen)continue;
+ if(i * binSize < fragLen){
+ posProb[uniformM_5][group][trNumberOfBins -1 -i] +=
+ Iexp * ((i+1) * binSize - fragLen + 1);
+ }else{
+ posProb[uniformM_5][group][trNumberOfBins -1 -i] +=
+ Iexp * binSize;
+ }
+ }
+ }
+ // update 3' positional bias
+ for( mIt=trFragSeen3[m].begin(); mIt != trFragSeen3[m].end(); mIt++){
+ fragLen = mIt->first;
+ Iexp = mIt->second / (trLen - fragLen + 1);
+ for(i=0;i<trNumberOfBins;i++){
+ // update probability of each bin by Iexp*"effective length of current bin"
+ if((i+1) * binSize <= fragLen)continue;
+ if(i * binSize < fragLen){
+ posProb[uniformM_3][group][i] +=
+ Iexp * ((i+1) * binSize - fragLen + 1);
+ }else{
+ posProb[uniformM_3][group][i] +=
+ Iexp * binSize;
+ }
+ }
+ }
+ }// }}}
+ // pre-compute position bias weights: {{{
+ for(j=0;j<4;j++)
+ for(group=0;group<=trSizesN;group++){
+ norm = 0;
+ for(i=0;i<trNumberOfBins;i++)norm += posProb[j][group][i];
+ for(i=0;i<trNumberOfBins;i++)posProb[j][group][i] /= norm;
+ }
+ for(group=0;group <= trSizesN;group++){
+ for(i=0;i<trNumberOfBins;i++){
+ // FIX HERE
+ posProb[weight_5][group][i] = posProb[readM_5][group][i]/posProb[uniformM_5][group][i];
+ // FIX HERE
+ posProb[weight_3][group][i] = posProb[readM_3][group][i]/posProb[uniformM_3][group][i];
+ }
+ }//}}}
+  // set uniform sequence bias: {{{
+ if(verbose)message("ReadDistribution: Computing uniform sequence bias.\n");
+ double IexpSum5,IexpSum3;
+ map<long,double>::reverse_iterator mItR;
+ long p;
+ for(m=0;m<M;m++){
+ //if(verbose)progressLogRD(m,M);
+ trLen = trInf->L(m);
+ IexpSum5=0;
+ for(mIt=trFragSeen5[m].begin();mIt!= trFragSeen5[m].end();mIt++)
+ IexpSum5+=mIt->second / (trLen - mIt->first + 1);
+ IexpSum3=0;
+ mItR=trFragSeen5[m].rbegin();
+ mIt=trFragSeen3[m].begin();
+    // STL map iteration IS sorted by key (i.e. fragment length)
+ for(p=0;p<trLen;p++){
+ while((mIt!=trFragSeen3[m].end())&&(mIt->first <= p+1)){IexpSum3+=mIt->second/ (trLen - mIt->first + 1); mIt++;}
+ while((mItR!=trFragSeen5[m].rend())&&(trLen-p < mItR->first)){IexpSum5-= mItR->second / (trLen - mItR->first + 1) ; mItR++;}
+ updateSeqBias(p, uniformM_5, m, IexpSum5);
+ // 3' end is expected to be "after"
+ updateSeqBias(p+1, uniformM_3, m, IexpSum3);
+ }
+ }//}}}
+ // normalize VLMM nodes: {{{
+ for(i=0;i<vlmmNodesN;i++){
+ for(long j=0;j<4;j++)
+ seqProb[j][i].normalize();
+ }//}}}
+}//}}}
+void ReadDistribution::logProfiles(string logFileName){//{{{
+ ofstream outF;
+ outF.open(logFileName.c_str());
+ outF.precision(6);
+ outF<<scientific;
+ if(!outF.is_open()){
+ error("ReadDistribution: Unable to open profile file: %s\n",(logFileName).c_str());
+ return;
+ }
+ long i,j,g;
+ outF<<"# BASES: (readM_5, readM_3, uniformM_5, uniformM_3)"<<endl;
+ if(!uniform){
+ for(j=0;j<4;j++){
+ outF<<"# "<<endl;
+ for(i=0;i<vlmmNodesN;i++){
+ outF<<seqProb[j][i].getPsum('A')<<" "<<seqProb[j][i].getPsum('C')<<" "<<seqProb[j][i].getPsum('G')<<" "<<seqProb[j][i].getPsum('T')<<endl;
+ }
+ }
+ }
+
+ outF<<"#\n# Position: (readM_5, readM_3, uniformM_5, uniformM_3, weight_5, weight_3)"<<endl;
+ if(!uniform){
+ for(j=0;j<6;j++){
+ outF<<"# "<<endl;
+ for(g=0;g<=trSizesN;g++){
+ for(i=0;i<trNumberOfBins;i++)
+ outF<<posProb[j][g][i]<<" ";
+ outF<<endl;
+ }
+ }
+ }
+ outF<<"# Mismatch likelihood: (probHit, probMis)"<<endl;
+ if(!lFreqHit.empty()){
+ for(i=0;i<(long)lFreqHit.size();i++)outF<<exp(lFreqHit[i])<<" ";
+ outF<<endl;
+ for(i=0;i<(long)lFreqMis.size();i++)outF<<exp(lFreqMis[i])<<" ";
+ outF<<endl;
+ }
+ outF<<"# Fragment lengths:\n";
+ if(validLength){
+ outF<<"# Distribution parameters: mu: "<<lMu<<" sigma: "<<lSigma<<endl;
+ outF<<"# Length distribution: (length, counts) L "<<fragLengths.size()<<endl;
+ for(map<long,long>::iterator it=fragLengths.begin();it!=fragLengths.end();it++)
+ outF<<it->first<<" ";
+ outF<<endl;
+ for(map<long,long>::iterator it=fragLengths.begin();it!=fragLengths.end();it++)
+ outF<<it->second<<" ";
+ outF<<endl;
+ }
+ outF.close();
+}//}}}
+void ReadDistribution::updateMismatchFreq(bam1_t *samA) {//{{{
+ if(! samA) return;
+ bam1_core_t *samC = &samA->core;
+ long i,j,k,kStart,kDir,len=samC->l_qseq;
+ // Make sure we have place for storing data.
+ if(len>(long)lFreqHit.size()){
+ lFreqHit.resize(len,1.0);
+ lFreqMis.resize(len,1.0);
+ }
+ // Set direction for storing mismatches depending on read orientation.
+ if(samC->flag & BAM_FREVERSE){
+ kStart = len - 1;
+ kDir = -1;
+ }else{
+ kStart = 0;
+ kDir = +1;
+ }
+ long deletionN = countDeletions(samA);
+ string seq = trSeq->getSeq(samC->tid, samC->pos, len+deletionN, false);
+ long cigarOp,cigarI,cigarOpCount;
+ cigarOp=cigarI=cigarOpCount=0;
+ // i - iterates within reference sequence
+ // j - iterates within read
+ // k - iterates within frequency arrays, can be reversed
+ for(i=j=0,k=kStart;(i<len+deletionN) && (j<len);){
+ if(cigarOpCount == 0){
+ if(! getCigarOp(samA, cigarI, &cigarOp, &cigarOpCount))break;
+ cigarI++;
+ }
+ switch(cigarOp){
+ case BAM_CDEL: i+=cigarOpCount; cigarOpCount=0; continue;
+ case BAM_CINS:
+ j+= cigarOpCount;
+ k+= kDir * cigarOpCount;
+ cigarOpCount=0;
+ continue;
+ }
+ if(base2int(seq[i]) > -1){
+ if(base2BAMint(seq[i]) != bam1_seqi(bam1_seq(samA),j))lFreqMis[k]+=1;
+ else lFreqHit[k]+=1;
+ }
+ i++;
+ j++;
+ k+=kDir;
+ cigarOpCount --;
+ }
+}//}}}
+pair<double,double> ReadDistribution::getSequenceLProb(bam1_t *samA) const{//{{{
+ if(! samA) return pair<double, double>(0,0);
+ double lProb=0,lowLProb=0, lPHit, lPMis;
+ bam1_core_t *samC = &samA->core;
+ uint8_t *qualP=bam1_qual(samA);
+ bool hasPhred = readHasPhred(samA);
+ long i,j,k,len=samC->l_qseq;
+ long deletionN = countDeletions(samA);
+ string seq = trSeq->getSeq(samC->tid, samC->pos, len+deletionN, false);
+ long hitC, misC, addMisC;
+ long cigarOp,cigarI,cigarOpCount;
+ bool reversed = (samC->flag & BAM_FREVERSE);
+
+   // First count the number of misses to add for low probability. {{{
+ cigarOp = cigarI = cigarOpCount = 0;
+ hitC = misC = 0;
+ // i - iterates within reference sequence
+ // j - iterates within read
+ for(i=j=0;(i<len+deletionN) && (j<len);){
+ if(cigarOpCount == 0){
+ if(! getCigarOp(samA, cigarI, &cigarOp, &cigarOpCount))break;
+ cigarI++;
+ }
+ switch(cigarOp){
+ case BAM_CDEL: i+=cigarOpCount; cigarOpCount=0; continue;
+ case BAM_CINS: j+=cigarOpCount; cigarOpCount=0; continue;
+ }
+ if((base2int(seq[i]) == -1)||
+ (base2BAMint(seq[i]) != bam1_seqi(bam1_seq(samA),j)))misC++;
+ else hitC++;
+ i++;
+ j++;
+ cigarOpCount --;
+ }
+ addMisC = max((long)1, lowProbMismatches - misC);
+ // }}}
+
+ cigarOp = cigarI = cigarOpCount = 0;
+ for(i=j=0;(i<len+deletionN) && (j<len);){
+ if(cigarOpCount == 0){
+ if(! getCigarOp(samA, cigarI, &cigarOp, &cigarOpCount))break;
+ cigarI++;
+ }
+ switch(cigarOp){
+ case BAM_CDEL: i+=cigarOpCount; cigarOpCount=0; continue;
+ case BAM_CINS: j+=cigarOpCount; cigarOpCount=0; continue;
+ /*case BAM_CMATCH:
+ case BAM_CEQUAL:
+ case BAM_CDIFF:*/
+ }
+ if(hasPhred){
+ lPHit = lProbHit[qualP[j]];
+ lPMis = lProbMis[qualP[j]];
+ }else{
+ if(!reversed)k = j;
+ else k = len-j-1;
+ if((k>=0)&&(k<(long)lFreqHit.size())){
+ lPHit = lFreqHit[k];
+ lPMis = lFreqMis[k];
+ }else{
+ lPHit = lPMis = 0.5;
+ }
+ }
+ if((base2int(seq[i]) == -1) ||
+ (base2BAMint(seq[i]) != bam1_seqi(bam1_seq(samA),j))){
+ // If bases don't match, multiply probability by probability of error.
+ lProb += lPMis;
+ lowLProb += lPMis;
+ }else{
+ lProb += lPHit;
+ hitC --;
+ if((addMisC>0) && (reversed || (addMisC>hitC))){
+ // If there are some misses left add a 'miss' to the 'low probability'.
+ lowLProb += lPMis;
+ addMisC--;
+ }else{
+ lowLProb += lPHit;
+ }
+ }
+ i++;
+ j++;
+ cigarOpCount --;
+ }
+ return pair<double, double>(lProb,lowLProb);
+}//}}}
+bool ReadDistribution::getP(fragmentP frag,double &lProb,double &lProbNoise){ //{{{
+ lProb = ns_misc::LOG_ZERO;
+ lProbNoise = ns_misc::LOG_ZERO;
+ long tid = frag->first->core.tid;
+ long trLen = trInf->L(tid),len;
+ // Check transcript IDs {{{
+ if((tid < 0)||(tid>=M)){
+ if(warnFirst && (warnUnknownTID==0))
+ warning("TID unknown: %s: %ld\n",bam1_qname(frag->first),tid);
+ warnUnknownTID++;
+ return false;
+ }
+ if((frag->paired)&&(tid!=frag->second->core.tid)){
+ if(warnFirst && (warnTIDmismatch==0))
+ warning("TID mismatch: %s: %s %s\n",bam1_qname(frag->first),
+ trInf->trName(tid).c_str(),
+ trInf->trName(frag->second->core.tid).c_str());
+ warnTIDmismatch++;
+ return false;
+ }
+ //}}}
+ double lP = 0;
+ // Get probability based on base mismatches: {{{
+ pair<double, double> lpSeq1(0,0),lpSeq2(0,0);
+ lpSeq1 = getSequenceLProb(frag->first);
+ if(frag->paired)lpSeq2 = getSequenceLProb(frag->second);
+ // }}}
+ // Calculate reads' true end position: {{{
+ long frag_first_endPos, frag_second_endPos=0;
+ frag_first_endPos = bam_calend(&frag->first->core, bam1_cigar(frag->first));
+ if(frag->paired){
+ frag_second_endPos = bam_calend(&frag->second->core, bam1_cigar(frag->second));
+ }
+ // }}}
+ if(frag->paired){
+ // Get probability of length {{{
+ if(frag->second->core.pos > frag->first->core.pos)
+ len = frag_second_endPos - frag->first->core.pos;
+ else{
+ len = frag_first_endPos - frag->second->core.pos;
+ }
+ // compute length probability and normalize by probability of all possible lengths (cdf):
+ // P*=lengthP/lengthNorm
+ // }}}
+ if(validLength) lP += getLengthLP(len) - getLengthLNorm(trLen);
+ }else{
+ len = frag_first_endPos - frag->first->core.pos;
+ }
+ if(uniform){
+ // Get probability of position for uniform distribution
+ // P*=1/(trLen-len+1)
+ lP -= log(trLen - len + 1.0);
+ }else{ // Positional & Sequence bias {{{
+ // Get probability of position given read bias model
+ // check mates' relative position:
+ if( frag->paired && (frag->first->core.pos > frag->second->core.pos)){
+ noteFirstMateDown ++;
+ bam1_t *tmp = frag->second;
+ frag->second = frag->first;
+ frag->first = tmp;
+ }
+ if(!frag->paired){
+ if(frag->first->core.flag & BAM_FREVERSE){
+         // If the read was reverse complemented, then it is the 3' mate.
+ // P*=posBias3'*seqBias3'/weightNorm3'
+ lP += log(getPosBias(frag->first->core.pos, frag_first_endPos,
+ mate_3, trLen)) +
+ log(getSeqBias(frag_first_endPos , mate_3, tid )) -
+ log(getWeightNorm( (long) len, mate_3, tid));
+ }else{
+ // P*=posBias5'*seqBias5'/weightNorm5'
+ lP += log(getPosBias(frag->first->core.pos, frag_first_endPos,
+ mate_5, trLen)) +
+ log(getSeqBias(frag->first->core.pos, mate_5, tid )) -
+ log(getWeightNorm( (long) len, mate_5, tid));
+ }
+ }else{
+ // check strand of the reads:
+ if((!unstranded) &&
+ ((frag->first->core.flag & BAM_FREVERSE) ||
+          (!(frag->second->core.flag & BAM_FREVERSE)))){
+ if(warnFirst && (warnPos==0))
+ warning("wrong strand: %s: %s\n",bam1_qname(frag->first),
+ trInf->trName(tid).c_str());
+ warnPos ++;
+ return false;
+ }
+//#pragma omp parallel sections num_threads (2) reduction(*:P)
+//{
+// #pragma omp section
+ // P*=1/weightNormFull
+ lP -= log(getWeightNorm( (long) len, FullPair, tid));
+// #pragma omp section
+// {
+ // P*=posBias5'*posBias3'*seqBias5'*seqBias3'
+ lP += log(getPosBias(frag->first->core.pos, frag_second_endPos,
+ FullPair, trLen))
+ + log(getSeqBias(frag->first->core.pos, mate_5, tid ))
+ + log(getSeqBias(frag_second_endPos , mate_3, tid ));
+// }
+//}
+ }
+ } //}}}
+ lProb = lP + lpSeq1.first+lpSeq2.first;
+ lProbNoise = lP + lpSeq1.second+lpSeq2.second;
+ return true;
+}//}}}
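+/* In summary, getP() factorises a fragment's log-likelihood for a
+ * transcript as
+ *   log P = log P(length) + log P(position) + log P(bases | alignment),
+ * where the position term is either uniform, -log(trLen-len+1), or the
+ * bias-model weights, and lProbNoise swaps the base term for the
+ * low-probability variant from getSequenceLProb(). */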
+void ReadDistribution::updatePosBias(long pos, biasT bias, long tid, double Iexp){ //{{{
+ if(bias == readM_3)pos--;
+ long group, rel, trLen;
+ trLen = trInf->L(tid);
+ // transcript too short:
+ if(trLen < trNumberOfBins) return;
+ // choose group:
+ for(group = 0;group < trSizesN;group++)
+ if(trLen<trSizes[group])break;
+ // find relative position:
+ rel = (pos * trNumberOfBins) / trLen;
+ if(rel>=trNumberOfBins)rel=trNumberOfBins-1;
+ //add inverse expression:
+ posProb[bias][ group ][ rel ] += Iexp;
+}//}}}
+void ReadDistribution::updateSeqBias(long pos, biasT bias, long tid, double Iexp){ //{{{
+ if(Iexp<=0)return;
+ if(bias>3)return; //this should not happen
+ long start ;
+ string seq;
+ // Set correct start based on orientation.
+ if((bias == readM_5)||(bias == uniformM_5)){
+ start = pos - vlmmStartOffset - MAX_NODE_PAR;
+ seq = trSeq->getSeq(tid, start, vlmmNodesN + MAX_NODE_PAR);
+ }else{
+ start = pos + vlmmStartOffset - vlmmNodesN ;
+    // No complementing of the sequence is needed here.
+ seq = trSeq->getSeq(tid, start, vlmmNodesN + MAX_NODE_PAR);
+ // Only reverse the sequence.
+ reverse(seq.begin(),seq.end());
+ }
+ // Update bias weights.
+ for(long i=0;i<vlmmNodesN;i++){
+ seqProb[bias][i].update( Iexp, seq[i+2], seq[i+1], seq[i]);
+ }
+}//}}}
+double ReadDistribution::getPosBias(long start, long end, readT read, long trLen) const { //{{{
+ end --;
+ // transcript too short:
+ if(trLen < trNumberOfBins) return 1;
+ long group, relS, relE;
+ // choose group:
+ for(group = 0;group < trSizesN;group++)
+ if(trLen<trSizes[group])break;
+ // find relative positions:
+ relS = (start * trNumberOfBins) / trLen;
+ if(relS>=trNumberOfBins)relS=trNumberOfBins-1;
+ relE = (end * trNumberOfBins) / trLen;
+ if(relE>=trNumberOfBins)relE=trNumberOfBins-1;
+ double posBias = 1;
+ // return bias weight
+ if((read == FullPair) || (read == mate_5))
+ posBias *= posProb[ weight_5 ][ group ][ relS ];
+ if((read == FullPair) || (read == mate_3))
+ posBias *= posProb[ weight_3 ][ group ][ relE ];
+ return posBias;
+}//}}}
+double ReadDistribution::getSeqBias(long pos, readT read, long tid) const{ //{{{
+ if(read==FullPair)return 0; // this should never happen
+ long start;
+ biasT bias,biasNorm;
+ // Get sequence based on which fragment end we are dealing with.
+ if(read == mate_5){
+ start = pos - vlmmStartOffset - MAX_NODE_PAR;
+ }else{
+ start = pos + vlmmStartOffset - vlmmNodesN;
+ }
+ string seq = trSeq->getSeq(tid, start, vlmmNodesN + MAX_NODE_PAR);
+ if(read == mate_5){
+ bias = readM_5;
+ biasNorm = uniformM_5;
+ }else{
+ bias = readM_3;
+ biasNorm = uniformM_3;
+ // Reverse the sequence for 3' end.
+ reverse(seq.begin(),seq.end());
+ }
+ double B = 1;
+ for(long i=0;i<vlmmNodesN;i++)
+ // FIX HERE (probably that we are always doing 'same' division)
+ B *= seqProb[bias][i].getP( seq[i+2], seq[i+1], seq[i]) /
+ seqProb[biasNorm][i].getP( seq[i+2], seq[i+1], seq[i]);
+ return B;
+}//}}}
+inline char ReadDistribution::getBase(long pos, const string &fSeq) const{ //{{{
+ if((pos<0)||(pos>=(long)fSeq.size()))return 'N';
+ return fSeq[pos];
+}//}}}
+double ReadDistribution::getSeqBias(long start, long end, readT read, const string &fSeq) const{ //{{{
+ start = start - vlmmStartOffset - MAX_NODE_PAR;
+ end = end + vlmmStartOffset + MAX_NODE_PAR - 1;
+
+ double B = 1;
+ long i,j;
+ if((read==FullPair) || (read == mate_5)){
+ for(i=0,j=start; i<vlmmNodesN; i++, j++)
+ // FIX HERE (probably that we are always doing 'same' division)
+ B *= seqProb[readM_5][i].getP( getBase(j+2,fSeq), getBase(j+1,fSeq), getBase(j,fSeq)) /
+ seqProb[uniformM_5][i].getP( getBase(j+2,fSeq), getBase(j+1,fSeq), getBase(j,fSeq));
+ }
+ if((read==FullPair) || (read == mate_3)){
+ // For 3' bias we go from 'end' position backwards.
+ for(i=0,j=end; i<vlmmNodesN; i++, j--)
+ // FIX HERE (probably that we are always doing 'same' division)
+ B *= seqProb[readM_3][i].getP( getBase(j-2,fSeq), getBase(j-1,fSeq), getBase(j,fSeq)) /
+ seqProb[uniformM_3][i].getP( getBase(j-2,fSeq), getBase(j-1,fSeq), getBase(j,fSeq));
+ }
+ return B;
+}//}}}
+/* inline char ReadDistribution::complementBase(char base) const{ //{{{
+ if((base=='A')||(base=='a')) return'T';
+ if((base=='T')||(base=='t')) return 'A';
+ if((base=='C')||(base=='c')) return 'G';
+ if((base=='G')||(base=='g')) return 'C';
+ return 'N';
+}//}}} */
+double ReadDistribution::getWeightNorm(long len, readT read, long tid){ //{{{
+ if(len == 0)return 1;
+ if(weightNorms[read][tid].count(len) == 0){
+ const string &trS = trSeq->getTr(tid);
+ // We are not complementing.
+ //for(size_t i=0;i<trRS.size();i++)trRS[i] = complementBase(trRS[i]);
+ long trLen = trInf->L(tid), pos;
+ double norm = 0,w;
+ #pragma omp parallel for \
+ private(w) \
+ reduction(+:norm)
+ for(pos = 0;pos <= trLen-len;pos++){
+ w = getPosBias(pos, pos + len, read, trLen) *
+ getSeqBias(pos, pos + len, read, trS);
+ norm+=w;
+ }
+ weightNorms[read][tid][len] = norm;
+// message("w: %ld %ld %ld %ld%lf\n",read,tid,len,trLen<" ",norm);
+ return norm;
+ }
+ return weightNorms[read][tid][len];
+}//}}}
+long ReadDistribution::getWeightNormCount() const{//{{{
+ long length_sum=0;
+ for(size_t i=0;i<weightNorms.size();i++)
+ for(size_t j=0;j<weightNorms[i].size();j++)
+ length_sum+=weightNorms[i][j].size();
+ return length_sum;
+}//}}}
+double ReadDistribution::getLengthLP(long len) const{//{{{
+ if(len>=(double)lLengthP.size())return computeLengthLP(len);
+ return lLengthP[len];
+}//}}}
+double ReadDistribution::computeLengthLP(double len) const{//{{{
+ //return 1./(len*lSigma*sqrt_2_pi)*exp(-pow(log(len) - lMu, (double)2.0)/(2 * pow(lSigma, (double)2)));
+ if(len == 0)return ns_misc::LOG_ZERO;
+ const double log_sqrt_2_pi = .918938533192; // log(sqrt(2*pi))
+ const double lLen = log(len);
+ return - (lLen +
+ log(lSigma) +
+ log_sqrt_2_pi +
+ pow( (lLen - lMu) / lSigma, 2.0) / 2.0 );
+}//}}}
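+/* The expression above is the log of the log-normal density
+ *   f(l) = 1/(l*sigma*sqrt(2*pi)) * exp(-(log(l)-mu)^2/(2*sigma^2)),
+ * i.e. -(log(l) + log(sigma) + log(sqrt(2*pi)) + ((log(l)-mu)/sigma)^2/2). */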
+double ReadDistribution::getLengthLNorm(long trLen) const{//{{{
+ if(trLen<(double)lLengthNorm.size())return lLengthNorm[trLen];
+
+ // erfc needs compiler with C99 standard
+ // other option might be to use boost/math/special_functions/erf.hpp
+ const long double sqrt_2 = 1.41421356237309;
+ long double CDF2 = erfcl((lMu-log((long double)trLen)) / (lSigma * sqrt_2));
+ if(CDF2 == 0)return log(0.5)+ns_misc::LOG_ZERO;
+ return (double)(log(0.5)+log(CDF2));
+}//}}}
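+/* The normalisation is the log-normal CDF at trLen, written via erfc:
+ *   F(L) = Phi((log(L)-mu)/sigma) = 0.5*erfc((mu-log(L))/(sigma*sqrt(2))),
+ * returned in log space as log(0.5) + log(erfc(...)). */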
+void ReadDistribution::computeLengthProb() {//{{{
+ MyTimer timer;
+ if(verbose){
+ message("Pre-computing length probabilities. ");
+ timer.start();
+ }
+ long max=0;
+ if(trInf){
+ for(long i=0;i<M;i++)if(trInf->L(i)>max)max=trInf->L(i);
+ max = min(max,(long)150000);
+ }else{
+ max = 100000;
+ }
+ lLengthP.assign(max+1,ns_misc::LOG_ZERO);
+ lLengthNorm.assign(max+1,ns_misc::LOG_ZERO);
+ bool normIsOne = false;
+ for(long i=1;i<=max;i++){
+ if(normIsOne){
+ // lP is LOG_ZERO already, set norm to log(1).
+ lLengthNorm[i] = 0;
+ continue;
+ }
+ lLengthP[i] = computeLengthLP(i);
+ lLengthNorm[i] = ns_math::logAddExp(lLengthNorm[i-1],lLengthP[i]);
+ if(lLengthNorm[i] > -1e-15){
+ normIsOne=true;
+ }
+ }
+ if(verbose)timer.current();
+}//}}}
+vector<double> ReadDistribution::getEffectiveLengths(){ //{{{
+ vector<double> effL(M,0);
+ long m,len,trLen,pos;
+ double eL, lCdfNorm,lenP, wNorm;
+ string trRS;
+ // Make one caching array for each process.
+ vector<vector<double> > posBias5All(procN),posBias3All(procN);
+ MyTimer timer;
+ timer.start();
+ DEBUG(message("Eff length: validLength %d ; minFragLen: %ld.\n",(int)validLength,minFragLen));
+ #pragma omp parallel for \
+ schedule (dynamic,5) \
+ private (len,trLen,pos,eL,lenP,wNorm,lCdfNorm,trRS)
+ for(m=0;m<M;m++){
+ if(verbose && (m!=0) && (M>20) && (m%(M/10)==0)){
+ #pragma omp critical
+ {
+ message("# %ld done. ",m);
+ timer.current();
+ }
+ }
+ long threadID = 0;
+#ifdef _OPENMP
+ threadID = omp_get_thread_num();
+#endif
+ trLen = trInf->L(m);
+ if(!validLength){
+ if(trLen>singleReadLength*2) effL[m] = trLen - singleReadLength;
+ else if(trLen>singleReadLength) effL[m] = singleReadLength;
+ else effL[m] = trLen;
+ continue;
+ }
+ lCdfNorm = getLengthLNorm(trLen);
+// always computing the effective length using fragLen only
+ if(uniform){
+ eL = 0;
+ for(len=1;len<=trLen;len++){
+ eL += exp(getLengthLP(len)-lCdfNorm) * (trLen-len);
+ }
+      // don't go below the minimal fragment length
+ effL[m] = eL>minFragLen?eL:trLen;
+ }else{
+ DEBUG(message("Copy sequence.\n"));
+ const string &trS = trSeq->getTr(m);
+ vector<double> &posBias5 = posBias5All[threadID];
+ vector<double> &posBias3 = posBias3All[threadID];
+ posBias5.resize(trLen);
+ posBias3.resize(trLen);
+ DEBUG(message("Precomputing posBias.\n"));
+ for(pos = 0;pos<trLen;pos++){
+ // Don't care about end position.
+ posBias5[pos] = getPosBias(pos, trLen, mate_5, trLen)*
+ getSeqBias(pos, trLen, mate_5, trS);
+ // Don't care about start position.
+ posBias3[pos] = getPosBias(0, pos+1, mate_3, trLen)*
+ getSeqBias(0, pos+1, mate_3, trS);
+ }
+ eL=0;
+ DEBUG(message("Computing norms.\n"));
+ for(len=1;len<=trLen;len++){
+ wNorm = 0;
+ for(pos=0;pos <= trLen - len;pos++){
+ wNorm += posBias5[pos] * posBias3[pos+len-1];
+ }
+ lenP = exp(getLengthLP( len ) - lCdfNorm);
+ eL += lenP * wNorm;
+ }
+      // Check for weirdness and don't go below 0 (some transcripts are as short as 5 bases).
+ // Function isnormal assumes C99 or C++11.
+ if((!isnormal(eL)) || (eL <= 1)){
+ effL[m] = trLen;
+ DEBUG(message("weird: %lf %ld %ld\n",eL,trLen,m));
+ }else{
+ effL[m] = eL;
+ }
+ }
+ }
+ DEBUG(long same = 0);
+ if(! uniform){
+ // normalize effective length to same sum as original length
+ double effSum=0,lSum=0;
+ for(m=0;m<M;m++){
+ DEBUG(if(effL[m] == trInf->L(m))same++);
+ lSum+=trInf->L(m);
+ effSum+=effL[m];
+ }
+ for(m=0;m<M;m++)effL[m] *= lSum/effSum;
+ }
+ DEBUG(message(" same: %ld.\n",same));
+ for(m=0;m<M;m++)if(effL[m]<=0) effL[m]=trInf->L(m);
+ return effL;
+}//}}}
+
+double VlmmNode::getPsum(char b) const{//{{{
+   if(base2int(b) == -1) return 1.0/4.0;
+ if(parentsN == 2)return getP(b,'N','N')*16;
+ if(parentsN == 1)return getP(b,'N','N')*4;
+ return probs[base2int(b)];
+}//}}}
+VlmmNode::VlmmNode(long p) {//{{{
+ setParentsN(p);
+}//}}}
+void VlmmNode::setParentsN(long p) {//{{{
+ parentsN = p;
+ if(parentsN>2){
+ warning("VlmmNode: Code not read for using more than 2 parents.\n");
+ parentsN = 2;
+ }
+ // initialize probability matrix, set pseudocount:
+ probs.assign(pows4[parentsN+1], 0.01/pows4[parentsN+1]);
+}//}}}
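+/* The flat 'probs' array stores one entry per parent/base context; with
+ * two parents the context (bpp,bp,b) lives at index
+ *   pows4[2]*base2int(bpp) + pows4[1]*base2int(bp) + base2int(b),
+ * which is how update() and getP() below address it. */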
+void VlmmNode::update(double Iexp, char b, char bp, char bpp) {//{{{
+ double expDiv = 1.0;
+ if(base2int(b) == -1)expDiv *=4.0;
+ if((parentsN>0)&&(base2int(bp) == -1))expDiv *=4.0;
+ if((parentsN>1)&&(base2int(bpp) == -1))expDiv *=4.0;
+ if(expDiv == 1){
+ // All bases are known:
+ long i=0;
+ switch(parentsN){
+ case 2:
+ i += pows4[2]*base2int(bpp);
+ case 1:
+ i += pows4[1]*base2int(bp);
+ default:
+ i += base2int(b);
+ }
+ probs[ i ] += Iexp;
+ }else{
+ long i=0,j=0,k=0;
+ Iexp /= expDiv;
+ if(parentsN==2){
+ for(i=0;i<4;i++)
+ if((base2int(bpp) == i) || (base2int(bpp) == -1))
+ for(j=0;j<4;j++)
+ if((base2int(bp) == j) || (base2int(bp) == -1))
+ for(k=0;k<4;k++)
+ if((base2int(b) == k) || (base2int(b) == -1))
+ probs[pows4[2]*i + pows4[1]*j+ k]+=Iexp;
+ }else if(parentsN==1){
+ for(j=0;j<4;j++)
+ if((base2int(bp) == j) || (base2int(bp) == -1))
+ for(k=0;k<4;k++)
+ if((base2int(b) == k) || (base2int(b) == -1))
+ probs[pows4[1]*j+ k]+=Iexp;
+ }else{
+ for(k=0;k<4;k++)
+ // if((base2int(b) == k) || (base2int(b) == -1)); WE KNOW THAT b == 'N'
+ probs[k]+=Iexp;
+ }
+ }
+}//}}}
+void VlmmNode::normalize() {//{{{
+ double sum=0;
+ long i,j,k,index;
+ if(parentsN == 2){
+ for(k=0;k<4;k++)
+ for(j=0;j<4;j++){
+ index = pows4[2]*k + pows4[1]*j;
+ sum = 0;
+ for(i=0;i<4;i++)sum += probs[i + index];
+ for(i=0;i<4;i++)probs[i + index] /= sum;
+ }
+ }else if(parentsN == 1){
+ for(j=0;j<4;j++){
+ index = pows4[1]*j;
+ sum = 0;
+ for(i=0;i<4;i++)sum += probs[i + index];
+ for(i=0;i<4;i++)probs[i + index] /= sum;
+ }
+ }else{
+ sum = 0;
+ for(i=0;i<pows4[parentsN+1];i++)sum += probs[i];
+ for(i=0;i<pows4[parentsN+1];i++)probs[i] /= sum;
+ }
+}//}}}
+double VlmmNode::getP(char b, char bp, char bpp) const{//{{{
+ if(base2int(b) == -1)return 1.0/4.0;
+ double probDiv = 1.0;
+ if((parentsN>0)&&(base2int(bp) == -1))probDiv *=4.0;
+ if((parentsN>1)&&(base2int(bpp) == -1))probDiv *=4.0;
+ if(probDiv == 1.0){
+ // All bases are known:
+ long i=0;
+ switch(parentsN){
+ case 2:
+ i += pows4[2]*base2int(bpp);
+ case 1:
+ i += pows4[1]*base2int(bp);
+ default:
+ i += base2int(b);
+ }
+ return probs[ i ];
+ }else{
+ long i=0,j=0,k=0;
+ double prob = 0;
+      // either one or both parents are unknown (undefined)
+ if(parentsN==2){
+ k = base2int(b);
+ for(i=0;i<4;i++)
+ if((base2int(bpp) == i) || (base2int(bpp) == -1))
+ for(j=0;j<4;j++)
+ if((base2int(bp) == j) || (base2int(bp) == -1))
+ prob += probs[pows4[2]*i + pows4[1]*j+ k];
+ }else if(parentsN==1){
+ // there was an unknown => we know that parent is unknown
+ k = base2int(b);
+ for(j=0;j<4;j++)
+ prob += probs[pows4[1]*j+ k];
+ }else ;// Covered by all bases unknown;
+ return prob / probDiv;
+ }
+}//}}}
+
diff --git a/ReadDistribution.h b/ReadDistribution.h
new file mode 100644
index 0000000..6a78fa9
--- /dev/null
+++ b/ReadDistribution.h
@@ -0,0 +1,134 @@
+#ifndef READDISTRIBUTION_H
+#define READDISTRIBUTION_H
+
+#include<vector>
+#include<map>
+
+using namespace std;
+
+#include "TranscriptInfo.h"
+#include "TranscriptExpression.h"
+#include "TranscriptSequence.h"
+
+#include "samtools/bam.h"
+#include "samtools/sam.h"
+
+namespace ns_rD {
+
+// Defaults: {{{
+const char LOW_PROB_MISSES = 6;
+const char MAX_NODE_PAR = 2;
+const long trSizes [] = { 1334,2104,2977,4389};
+const char trSizesN = 4;
+const char trNumberOfBins = 20;
+const char vlmmNodeDependence [] = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0};
+const char vlmmNodesN = 21;
+const char vlmmStartOffset = 8;
+const long pows4 [] = {1,4,16,64,256,1024,4096};
+//}}}
+
+struct fragmentT{//{{{
+ bam1_t *first,*second;
+ bool paired;
+ fragmentT(){
+ first = bam_init1();
+ second = bam_init1();
+ paired = true;
+ }
+ ~fragmentT(){
+ bam_destroy1(first);
+ bam_destroy1(second);
+ }
+ void copyFragment(const fragmentT *sourceF){
+ paired = sourceF->paired;
+ bam_copy1(first, sourceF->first);
+ bam_copy1(second, sourceF->second);
+ }
+};
+
+typedef fragmentT *fragmentP;
+//}}}
+
+class VlmmNode{//{{{
+ private:
+ long parentsN;
+ vector<double> probs;
+
+ public:
+ VlmmNode(){parentsN = 0;}
+ VlmmNode(long p);
+ void setParentsN(long p);
+ void update(double Iexp, char b, char bp, char bpp);
+ void normalize();
+ double getP(char b, char bp, char bpp) const;
+ double getPsum(char b) const;
+};//}}}
+
+enum biasT { readM_5, readM_3, uniformM_5, uniformM_3, weight_5, weight_3};
+enum readT { mate_5, mate_3, FullPair };
+
+} // namespace ns_rD
+
+class ReadDistribution{
+ private:
+ long procN,M,fragSeen,singleReadLength,minFragLen;
+ double lMu,lSigma,logLengthSum,logLengthSqSum;
+ long lowProbMismatches;
+ bool verbose,warnFirst,uniform,unstranded,lengthSet,gotExpression,normalized;
+ bool validLength;
+ long warnPos, warnTIDmismatch, warnUnknownTID, noteFirstMateDown;
+ TranscriptInfo* trInf;
+ TranscriptSequence* trSeq;
+ TranscriptExpression* trExp;
+ // for each transcript, remember seen fragments in map: length->(sum of probs)
+ vector<map<long,double> > trFragSeen5,trFragSeen3;
+ // cache for already computed weight norms for:
+ // (single reads 5',3', Pair) x Transcript x Length
+ vector<vector<map<long, double> > > weightNorms;
+ // position probability arrays (RE-FACTOR to array of 4 vectors)
+ vector<vector<vector<double> > > posProb;
+ vector<vector<ns_rD::VlmmNode> > seqProb;
+ // Cache probabilities for Phred score.
+ vector<double> lProbMis;
+ vector<double> lProbHit;
+   // Mismatch likelihoods along the read.
+ vector<double> lFreqHit;
+ vector<double> lFreqMis;
+ // Cache length probabilities.
+ vector<double> lLengthP,lLengthNorm;
+ map<long,long> fragLengths;
+
+ double getLengthLP(long len) const;
+ double computeLengthLP(double len) const;
+ double getLengthLNorm(long trLen) const;
+ void computeLengthProb();
+ void updateMismatchFreq(bam1_t *samA);
+ void updatePosBias(long pos, ns_rD::biasT bias, long tid, double Iexp);
+ void updateSeqBias(long pos, ns_rD::biasT bias, long tid, double Iexp);
+ double getPosBias(long start, long end, ns_rD::readT read,
+ long trLen) const;
+ double getSeqBias(long pos, ns_rD::readT read, long tid) const;
+ inline char getBase(long pos, const string &fSeq) const;
+ double getSeqBias(long start, long end, ns_rD::readT read,
+ const string &fSeq) const;
+ //inline char complementBase(char base) const;
+ double getWeightNorm(long len, ns_rD::readT read, long tid);
+ pair<double, double> getSequenceLProb(bam1_t *samA) const;
+ public:
+ ReadDistribution();
+ void setProcN(long procN);
+ void showFirstWarnings();
+ void writeWarnings();
+ bool init(long m, TranscriptInfo* trI, TranscriptSequence* trS, TranscriptExpression* trE, bool unstranded, bool verb = true);
+ bool initUniform(long m, TranscriptInfo* trI, TranscriptSequence* trS, bool verb = true);
+ void setLowProbMismatches(long m);
+ void setLength(double mu, double sigma);
+ bool observed(ns_rD::fragmentP frag);
+ void normalize();
+ void logProfiles(string logFileName = "");
+ bool getP(ns_rD::fragmentP frag,double &prob,double &probNoise);
+ long getWeightNormCount() const;
+ vector<double> getEffectiveLengths();
+};
+
+#endif
diff --git a/Sampler.cpp b/Sampler.cpp
new file mode 100644
index 0000000..2f5fa19
--- /dev/null
+++ b/Sampler.cpp
@@ -0,0 +1,220 @@
+#ifdef DoSTATS
+#include<sys/time.h>
+#endif
+
+#include "Sampler.h"
+#include "common.h"
+
+Sampler::Sampler(){ //{{{
+ m=samplesN=samplesLogged=samplesTotal=samplesOut=Nmap=Nunmap=0;
+ isoformLengths = NULL;
+#ifdef DoSTATS
+ tT=tTa=tZ=0;
+ nT=nTa=nZ=0;
+#endif
+}//}}}
+Sampler::~Sampler(){ //{{{
+#ifdef DoSTATS
+ message("---------------------------\nSTATISTICS:\n");
+ message("Theta: %lg %lgm av:%lgs\n",nT,tT/60000.0,tT/1000.0/nT);
+ message("Z: %lg %lgm av:%lgs\n",nZ,tZ/60000.0,tZ/1000.0/nZ);
+ if(nTa>0)message("Theta Act: %ld %lgm av:%lgs\n",nTa,tTa/60000.0,tTa/1000.0/nTa);
+ message("Total time: %lgm\n",(tT+tZ)/60000.0);
+#endif
+}//}}}
+void Sampler::init(long m, long samplesTotal, long samplesOut, long Nunmap,const TagAlignments *alignments, const distributionParameters &betaPar, const distributionParameters &dirPar, long &seed){//{{{
+// this->n=n;
+ this->m=m;
+ this->samplesOut=samplesOut;
+ this->Nmap=alignments->getNreads();
+ this->Nunmap=Nunmap;
+ this->alignments=alignments;
+ beta=&betaPar;
+ dir=&dirPar;
+ //dir=new distributionParameters;
+ //dir->alpha=1.0/m;
+ //dir->beta=dirPar.beta;
+ rng_mt.seed(seed);
+ seed = (long) (1717171717.17*uniformDistribution(rng_mt));
+
+ resetSampler(samplesTotal);
+
+ theta.assign(m,0);
+ C.assign(m,0);
+}//}}}
+void Sampler::resetSampler(long samplesTotal){//{{{
+ this->samplesTotal=samplesTotal;
+ samplesN = 0;
+ samplesLogged = 0;
+ logRate=(double)samplesOut/samplesTotal;
+ sumC0 = 0;
+ sumNorm.first = sumNorm.second = 0;
+ thetaSum.assign(m,pairD(0,0));
+ thetaSqSum.assign(m,pairD(0,0));
+}//}}}
+long Sampler::getAverageC0(){//{{{
+ return (long) (sumC0 / sumNorm.first);
+}//}}}
+void Sampler::getAverage(vector<pairD> &av){//{{{
+ long i;
+ if((long)av.size()<m)
+ av.assign(m,pairD(0,0));
+ for(i=0;i<m;i++){
+ if(sumNorm.first != 0)
+ av[i].first=thetaSum[i].first/sumNorm.first;
+ if(sumNorm.second != 0)
+ av[i].second=thetaSum[i].second/sumNorm.second;
+ }
+}//}}}
+pairD Sampler::getAverage(long i){//{{{
+ double av1,av2;
+ av1=(sumNorm.first==0)?0:thetaSum[i].first/sumNorm.first;
+ av2=(sumNorm.second==0)?0:thetaSum[i].second/sumNorm.second;
+ return pairD(av1,av2);
+}//}}}
+pairD Sampler::getWithinVariance(long i){//{{{
+ double va1,va2;
+ if(sumNorm.first==0)
+ va1=0;
+ else
+ va1=thetaSqSum[i].first/(sumNorm.first-1.0) -
+ (thetaSum[i].first/(sumNorm.first-1.0))*
+ (thetaSum[i].first/sumNorm.first);
+ if(sumNorm.second==0)
+ va2=0;
+ else
+ va2=thetaSqSum[i].second/(sumNorm.second-1.0) -
+ (thetaSum[i].second/(sumNorm.second-1.0))*
+ (thetaSum[i].second/sumNorm.second);
+ if(va1<0)message("minus %lg %lg %lg\n",thetaSqSum[i].first,thetaSum[i].first,sumNorm.first);
+ return pairD(va1,va2);
+}//}}}
+void Sampler::getThetaSums(long i, double *thSqSum, double *thSum, double *sumN){//{{{
+ if(i >= m){
+ (*thSqSum) = (*thSum) = (*sumN) = 0;
+ return;
+ }
+ *thSqSum = thetaSqSum[i].first;
+ *thSum = thetaSum[i].first;
+ *sumN = sumNorm.first;
+}//}}}
+void Sampler::getTau(vector<double> &tau, double norm){//{{{
+ double tauSum=0;
+
+ if ((theta.size() > tau.size()) || (isoformLengths->size() != tau.size()))
+ error("Sampler failed");
+
+ tau.assign(tau.size(),0);
+
+ tau[0]=theta[0]; // set thetaAct
+ // divide by length:
+ for(size_t i=1;i<theta.size();i++){
+ tau[ i ] = theta[i] / (*isoformLengths)[ i ] * norm;
+ tauSum += tau[i];
+ }
+ // DO normalize:
+ for(size_t i=1;i<tau.size();i++)
+ if(tau[i]>0) tau[i] /= tauSum;
+}//}}}
+void Sampler::appendFile(){//{{{
+ long i;
+ double norm=saveNorm;
+ if((!save) || (outFile == NULL))return;
+ thetaActLog.push_back(theta[0]);
+ outFile->precision(9);
+ (*outFile)<<scientific;
+ if(saveType == "counts"){
+ if(norm == 0)norm = Nmap;
+ for(i=1;i<m;i++)
+ (*outFile)<<theta[i]*norm<<" ";
+ }else if(saveType == "rpkm"){
+ if(norm == 0)norm = 1000000000.0;
+ for(i=1;i<m;i++)
+ if((*isoformLengths)[i]>0)
+ (*outFile)<<theta[i]*norm/(*isoformLengths)[i]<<" ";
+ else
+ (*outFile)<<theta[i]*norm<<" ";
+ }else if(saveType == "theta"){
+ if(norm == 0)norm=1.0;
+ for(i=1;i<m;i++)
+ (*outFile)<<theta[i]*norm<<" ";
+ }else if(saveType == "tau"){
+ if(norm == 0)norm=1.0;
+ vector<double> tau(m);
+ getTau(tau,norm);
+ for(i=1;i<m;i++)
+ (*outFile)<<tau[i]<<" ";
+ }
+ (*outFile)<<endl;
+}//}}}
+void Sampler::updateSums(){//{{{
+ long i;
+ double s;
+ for(i=0;i<m;i++){
+ thetaSum[i].first+=theta[i];
+ thetaSqSum[i].first+=theta[i]*theta[i];
+ }
+ sumC0+=C[0];
+ sumNorm.first++;
+ //if(doLog){
+ for(i=0;i<m;i++){
+ s = log(theta[i]) - log(1-theta[i]);//LOGIT
+ thetaSum[i].second += s;
+ thetaSqSum[i].second += s * s;
+ }
+ sumNorm.second++;
+ //}
+}//}}}
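+/* The ".second" sums accumulate the logit-transformed samples,
+ *   s = logit(theta) = log(theta/(1-theta)) = log(theta) - log(1-theta),
+ * so that means and variances can also be computed on the logit scale. */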
+void Sampler::saveSamples(ofstream *outFile, const vector<double> *isoformLengths, const string &saveType, double norm){//{{{
+ this->outFile = outFile;
+ this->isoformLengths = isoformLengths;
+ this->saveType = saveType;
+ saveNorm = norm;
+ save = true;
+ thetaActLog.clear();
+}//}}}
+void Sampler::noSave(){//{{{
+ save = false;
+ outFile = NULL;
+ if(isoformLengths){
+ delete isoformLengths;
+ isoformLengths = NULL;
+ }
+}//}}}
+
+void Sampler::sampleTheta(){//{{{
+#ifdef DoSTATS
+ nT++;
+ struct timeval start, end;
+ gettimeofday(&start, NULL);
+#endif
+ vector<double> gamma(m,0);
+ double gammaSum=0;
+ long i;
+ for(i=1;i<m;i++){
+ gammaDistribution.param(gDP(dir->alpha + C[i], dir->beta));
+ gamma[i]=gammaDistribution(rng_mt);
+ gammaSum+=gamma[i];
+ }
+ if (gammaSum<=0) // at least something should be more than zero
+ error("Sampler failed");
+
+ for(i=1;i<m;i++){
+ theta[i]=gamma[i]/gammaSum;
+ }
+#ifdef DoSTATS
+ gettimeofday(&end, NULL);
+ tT += (end.tv_sec-start.tv_sec)*1000*1000+(end.tv_usec-start.tv_usec);
+#endif
+}//}}}
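+/* sampleTheta() uses the standard gamma construction of a Dirichlet draw:
+ * if gamma[i] ~ Gamma(alpha + C[i], beta) independently for i=1..m-1, then
+ * theta[i] = gamma[i]/sum_j gamma[j] ~ Dirichlet(alpha + C); the common
+ * rate 'beta' cancels in the normalisation. */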
+void Sampler::sample(){//{{{
+ samplesN++;
+}//}}}
+void Sampler::update(){//{{{
+ doLog = false;
+ if(samplesOut-samplesLogged>0){
+ if(samplesTotal-samplesN<=samplesOut-samplesLogged)doLog=true;
+ else if((long)(logRate * samplesN) > samplesLogged)doLog=true;
+ }
+ if(doLog) samplesLogged ++;
+}//}}}
diff --git a/Sampler.h b/Sampler.h
new file mode 100644
index 0000000..40a56c8
--- /dev/null
+++ b/Sampler.h
@@ -0,0 +1,99 @@
+#ifndef SAMPLER_H
+#define SAMPLER_H
+
+#include<vector>
+#include<fstream>
+#include "boost/random/mersenne_twister.hpp"
+#include "boost/random/gamma_distribution.hpp"
+#include "boost/random/uniform_01.hpp"
+
+using namespace std;
+
+#include "GibbsParameters.h"
+#include "TagAlignments.h"
+
+// compute statistics
+//#define DoSTATS
+//#define DoDebug
+
+typedef pair<double,double> pairD;
+
+class Sampler{
+ protected:
+ long m, samplesN, samplesLogged, samplesTotal, samplesOut, Nmap, Nunmap;
+ const distributionParameters *beta,*dir;
+ const TagAlignments *alignments;
+ const vector<double> *isoformLengths;
+ boost::random::mt11213b rng_mt;
+ boost::random::gamma_distribution<double> gammaDistribution;
+ typedef boost::random::gamma_distribution<double>::param_type gDP;
+   // Needed by children:
+ boost::random::uniform_01<double> uniformDistribution;
+
+ bool doLog,save;
+ string saveType;
+ ofstream *outFile;
+ double saveNorm,logRate;
+#ifdef DoSTATS
+ long long nT,nZ,nTa;
+ double tT,tZ,tTa;
+#endif
+
+ vector<long> C;
+ double sumC0;
+ vector<double> theta;
+ vector<double> thetaActLog;
+ vector<pairD> thetaSum;
+ vector<pairD> thetaSqSum;
+ pairD sumNorm;
+
+ // Sample theta.
+ void sampleTheta();
+ // Compute tau.
+ void getTau(vector <double> &tau, double norm);
+ // Append current expression samples into file opened for saving samples.
+ void appendFile();
+ // Update sums of theta and theta^2.
+ void updateSums();
+
+ public:
+
+ Sampler();
+ virtual ~Sampler();
+ // Initialize sampler, set seed and use it to generate new seed.
+ void init(long m, long samplesTotal, long samplesOut, long Nunmap,
+ const TagAlignments *alignments,
+ const distributionParameters &betaPar,
+ const distributionParameters &dirPar,
+ long &seed);
+ // Reset sampler's stats before new iteration
+ void resetSampler(long samplesTotal);
+ // Return mean C[0].
+ long getAverageC0();
+   // Get vector of mean theta expression. Has "two columns": the first is
+   // calculated from all samples, the second only from the "thinned" samples.
+ void getAverage(vector<pairD> &av);
+ // Get mean for transcript i.
+ pairD getAverage(long i);
+ // Get within variance for transcript i.
+ pairD getWithinVariance(long i);
+ // Get sum of theta^2, sum of theta, and their norm for transcript i.
+ void getThetaSums(long i, double *thSqSum, double *thSum, double *sumN);
+ // Return norms for theta sums.
+ pairD getSumNorms() const { return sumNorm; }
+ // Set sampler into state where samples are saved into the outFile.
+ void saveSamples(ofstream *outFile, const vector<double> *isoformLengths,
+ const string &saveType, double norm = 0);
+ // Stop saving samples into the file.
+ void noSave();
+ // Get theta act logged values.
+ const vector<double>& getThetaActLog(){return thetaActLog;}
+
+   // Produce new MCMC samples.
+ virtual void sample();
+ // If necessary ("thinned sample") sample theta; update sums.
+ virtual void update();
+};
+
+
+#endif
diff --git a/SimpleSparse.cpp b/SimpleSparse.cpp
new file mode 100644
index 0000000..35ba9d8
--- /dev/null
+++ b/SimpleSparse.cpp
@@ -0,0 +1,97 @@
+#include<cmath>
+#include<cstring>
+#ifdef _OPENMP
+#include<omp.h>
+#endif
+
+#include "SimpleSparse.h"
+
+double SimpleSparse::logSumExpVal(long st, long en) const{//{{{
+ if(st<0)st = 0;
+ if((en == -1) || (en > T)) en = T;
+ if(st >= en) return 0;
+ long i;
+ double sumE = 0, m = val[st];
+ for(i = st; i < en; i++)if(val[i] > m)m = val[i];
+ for(i = st; i < en; i++)
+ sumE += exp(val[i] - m);
+ return m + log(sumE);
+}//}}}
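+/* logSumExpVal applies the usual max-shift for numerical stability:
+ *   log(sum_i exp(v_i)) = m + log(sum_i exp(v_i - m)),  m = max_i v_i,
+ * so every exponent is <= 0 and cannot overflow. */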
+void SimpleSparse::sumRows(double res[]) const{//{{{
+ long i,r;
+ for(r=0;r<N;r++){
+ res[r]=0;
+ for(i=rowStart[r];i<rowStart[r+1];i++){
+ res[r]+=val[i];
+ }
+ }
+}//}}}
+void SimpleSparse::sumCols(double res[]) const{//{{{
+ memset(res,0,M*sizeof(double));
+ for(long i=0;i<T;i++)res[col[i]]+=val[i];
+}//}}}
+long SimpleSparse::countAboveDelta(double delta) const{//{{{
+ long i,count=0;
+ #pragma omp parallel for reduction(+:count)
+ for(i=0;i<T;i++){
+ if(val[i]>delta)count++;
+ }
+ return count;
+}//}}}
+
+void SimpleSparse::softmaxInplace(SimpleSparse *res){//{{{
+ double logRowSum = 0;
+ long i,r;
+ #pragma omp parallel for private(i,logRowSum)
+ for(r=0;r<N;r++){
+ logRowSum = logSumExpVal(rowStart[r],rowStart[r+1]);
+ for(i=rowStart[r];i<rowStart[r+1];i++){
+ val[i] = val[i] - logRowSum;
+ res->val[i] = exp( val[i] );
+ }
+ }
+}//}}}
+void SimpleSparse::softmax(SimpleSparse *res) const{//{{{
+ double logRowSum = 0;
+ long i,r;
+ #pragma omp parallel for private(i,logRowSum)
+ for(r=0;r<N;r++){
+ logRowSum = logSumExpVal(rowStart[r],rowStart[r+1]);
+ for(i=rowStart[r];i<rowStart[r+1];i++){
+ res->val[i] = exp(val[i] - logRowSum);
+ }
+ }
+}//}}}
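+/* Both softmax variants compute a numerically stable row-wise softmax,
+ *   res[r][i] = exp(val[r][i] - logSumExp(val[r])),
+ * so each row of 'res' sums to one; softmaxInplace() additionally leaves
+ * the normalised log-probabilities in 'val'. */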
+
+SimpleSparse::SimpleSparse(long n,long m, long t){//{{{
+ N=n;
+ M=m;
+ T=t;
+ val = new double[T];
+   base = true; // base matrix with its own col & rowStart information
+ col = new int_least32_t[T];
+ rowStart = new int_least32_t[N+1];
+ //colStart = new long[M+1];
+}//}}}
+SimpleSparse::SimpleSparse(SimpleSparse *m0){//{{{
+ N=m0->N;
+ M=m0->M;
+ T=m0->T;
+ val = new double[T];
+ base = false; // use col & rowStart information from the base matrix m0
+ col = m0->col;
+ rowStart = m0->rowStart;
+ /*col = new long[T];
+ rowStart = new long[N+1];
+ memcpy(col, m0->col, T*sizeof(long));
+ memcpy(rowStart, m0->rowStart, (N+1)*sizeof(long));
+ */
+}//}}}
+SimpleSparse::~SimpleSparse(){//{{{
+ delete[] val;
+ if(base){
+ // BEWARE there could be other matrices using this data
+ delete[] col;
+ delete[] rowStart;
+ }
+}//}}}
diff --git a/SimpleSparse.h b/SimpleSparse.h
new file mode 100644
index 0000000..916dce5
--- /dev/null
+++ b/SimpleSparse.h
@@ -0,0 +1,27 @@
+#ifndef SIMPLESPARSE_H
+#define SIMPLESPARSE_H
+
+#include<stdint.h>
+
+//#define setVal(x,i,y) {for(i=0;i<x->T;i++)x->val[i]=y;}
+
+class SimpleSparse {
+ private:
+ bool base;
+ public:
+ long N,M,T; // reads, transcripts, total
+ int_least32_t *rowStart,*colStart,*col;
+ double *val;
+
+ SimpleSparse(long n,long m, long t);
+ SimpleSparse(SimpleSparse *m0);
+ ~SimpleSparse();
+ void softmax(SimpleSparse *res) const;
+ void softmaxInplace(SimpleSparse *res);
+ long countAboveDelta(double delta = 0.99) const;
+ void sumCols(double res[]) const;
+ void sumRows(double res[]) const;
+ double logSumExpVal(long st, long en) const;
+};
+
+#endif
diff --git a/TagAlignments.cpp b/TagAlignments.cpp
new file mode 100644
index 0000000..b0f0bde
--- /dev/null
+++ b/TagAlignments.cpp
@@ -0,0 +1,132 @@
+#include<cmath>
+
+#include "TagAlignments.h"
+
+#include "misc.h"
+
+#include "common.h"
+
+//#define MEM_USAGE
+
+TagAlignments::TagAlignments(bool storeL){//{{{
+ knowNtotal=false;
+ knowNreads=false;
+ Ntotal=0;
+ Nreads=0;
+ storeLog = storeL;
+}//}}}
+void TagAlignments::init(long Nreads,long Ntotal, long M){//{{{
+ currentRead = 0;
+ reservedN = 0;
+ if(Nreads>0){
+ this->Nreads=Nreads;
+ knowNreads=true;
+ readIndex.reserve(Nreads+2);
+ }
+ readIndex.push_back(0);
+
+ if(Ntotal>0){
+ this->Ntotal=Ntotal;
+ knowNtotal=true;
+ reservedN = Ntotal+1;
+ trIds.reserve(reservedN);
+ probs.reserve(reservedN);
+ }
+ if(M>0){
+ this->M=M;
+ readsInIsoform.assign(M,-1);
+ }else{
+ readsInIsoform.clear();
+ this->M=0;
+ }
+}//}}}
+void TagAlignments::pushAlignment(long trId, double prob){//{{{
+ if(prob<=0)pushAlignmentL(trId, ns_misc::LOG_ZERO);
+ else pushAlignmentL(trId, log(prob));
+}//}}}
+void TagAlignments::pushAlignmentL(long trId, double lProb){//{{{
+ if(trId>=M){
+ M=trId+1;
+ readsInIsoform.resize(M,-1);
+ }
+ if(readsInIsoform[trId] == currentRead){
+ // The read has already one alignment to this transcript.
+ for(long i=readIndex[currentRead];i<(long)trIds.size();i++)
+ if(trIds[i] == trId){
+ probs[i] = ns_math::logAddExp(probs[i], lProb);
+ break;
+ }
+ }else{
+ if(! knowNtotal){
+         // The size of the arrays is unknown; try to reserve a sensible amount of space if we know Nreads.
+ if(knowNreads && reservedN && ((long)probs.size() == reservedN)){
+ // we reached the size of reserved space
+ double dens = (double)probs.size() / currentRead;
+ dens *= 1.05; //increase it by 5%
+ reservedN =(long)( reservedN + (dens) * (Nreads - currentRead + 1000.0) );
+ #ifdef MEM_USAGE
+ message("TagAlignments:\n size: %ld reserving: %ld capacity before: %ld\n",probs.size(),reservedN,probs.capacity());
+ #endif
+ trIds.reserve(reservedN);
+ probs.reserve(reservedN);
+ #ifdef MEM_USAGE
+ message(" capacity after: %ld\n",probs.capacity());
+ #endif
+ }else if(knowNreads && (! reservedN) && (currentRead == Nreads / 4 )){
+ // one quarter in, try to reserve sensible amount of space
+ double dens = (double)probs.size() / currentRead;
+ dens *= 1.05; //increase it by 5%
+ reservedN =(long)((dens) * (Nreads));
+ #ifdef MEM_USAGE
+ message("TagAlignments:\n size: %ld reserving: %ld capacity before: %ld\n",probs.size(),reservedN,probs.capacity());
+ #endif
+ trIds.reserve(reservedN);
+ probs.reserve(reservedN);
+ #ifdef MEM_USAGE
+ message(" capacity after: %ld\n",probs.capacity());
+ #endif
+ }
+ }
+ trIds.push_back(trId);
+ probs.push_back(lProb);
+ // Mark that transcript trId already has alignment from this read.
+ readsInIsoform[trId] = currentRead;
+ }
+}//}}}
+void TagAlignments::pushRead(){//{{{
+ // Check whether there were any valid alignments added for this read:
+ if(readIndex[currentRead] == (int_least32_t) probs.size()){
+ // If no new alignments, do nothing.
+ return;
+ }
+   // If there are alignments, transform from log space if necessary and move to the next read.
+ if(!storeLog){
+ double logSum = ns_math::logSumExp(probs, readIndex[currentRead], probs.size());
+ for(long i = readIndex[currentRead]; i<(long)probs.size(); i++)
+ probs[i] = exp(probs[i]-logSum);
+ }
+ // Move to the next read.
+ currentRead++;
+ readIndex.push_back(probs.size());
+}//}}}
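+/* When probabilities are not kept in log space, pushRead() turns the
+ * read's alignment log-probabilities into a normalised distribution,
+ *   p_i = exp(lp_i - logSumExp(lp)),  so that sum_i p_i = 1. */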
+void TagAlignments::finalizeRead(long *M, long *Nreads, long *Ntotal){//{{{
+ *M = this->M = readsInIsoform.size();
+ *Nreads = this->Nreads = readIndex.size() - 1;
+ *Ntotal = this->Ntotal = probs.size();
+#ifdef MEM_USAGE
+ message("TagAlignments: readIndex size: %ld capacity %ld\n",readIndex.size(),readIndex.capacity());
+ message("TagAlignments: probs size: %ld capacity %ld\n",probs.size(),probs.capacity());
+#endif
+}//}}}
+int_least32_t TagAlignments::getTrId(long i) const {//{{{
+ if(i<Ntotal)return trIds[i];
+ return 0;
+}//}}}
+double TagAlignments::getProb(long i) const {//{{{
+ if(i<Ntotal)return probs[i];
+ return 0;
+}//}}}
+int_least32_t TagAlignments::getReadsI(long i) const {//{{{
+ if(i<=Nreads)return readIndex[i];
+ return 0;
+}//}}}
diff --git a/TagAlignments.h b/TagAlignments.h
new file mode 100644
index 0000000..691bbb3
--- /dev/null
+++ b/TagAlignments.h
@@ -0,0 +1,44 @@
+#ifndef TAGALIGNMENTS_H
+#define TAGALIGNMENTS_H
+
+#include<stdint.h>
+#include<vector>
+
+using namespace std;
+
+// Probabilities are stored in log scale.
+
+class TagAlignments{
+ private:
+ vector<int_least32_t> trIds;
+ vector<double> probs;
+ vector<int_least32_t> readIndex;
+ vector<int_least32_t> readsInIsoform;
+
+ bool storeLog,knowNtotal,knowNreads;
+ long M,Ntotal,Nreads,currentRead,reservedN;
+ public:
+ // Constructor, can specify whether the probabilities should be stored in log space.
+ TagAlignments(bool storeL = true);
+ // Initialize reader. For non-zero arguments, also reserves some memory.
+ void init(long Nreads = 0,long Ntotal = 0,long M = 0);
+ // Add alignment for currently processed read.
+ void pushAlignment(long trId, double prob);
+ // Add alignment for currently processed read, with probability in log scale.
+ void pushAlignmentL(long trId, double lProb);
+   // Finish processing the current read and move on to the next read.
+ void pushRead();
+   // Finalize reading and set M, Nreads, Ntotal.
+ void finalizeRead(long *M, long *Nreads, long *Ntotal);
+ // Return TrID of i-th alignment.
+ int_least32_t getTrId(long i) const;
+ // Return alignment probability of i-th alignment as it is stored.
+ // (if it is stored in log space, return log-probability)
+ double getProb(long i) const;
+ // Get index for i-th read's alignments.
+ int_least32_t getReadsI(long i) const;
+ // Get number of reads.
+ long getNreads() const { return Nreads;}
+};
+
+#endif
diff --git a/TranscriptExpression.cpp b/TranscriptExpression.cpp
new file mode 100644
index 0000000..75deb04
--- /dev/null
+++ b/TranscriptExpression.cpp
@@ -0,0 +1,87 @@
+#include<algorithm>
+
+#include "TranscriptExpression.h"
+#include "FileHeader.h"
+#include "common.h"
+
+TE_FileType TranscriptExpression::guessFileType(const string &fileName){//{{{
+ string extension = fileName.substr(fileName.rfind(".")+1);
+ if(extension == "thetaMeans") return SAMPLER_MEANS;
+ if(extension == "m_alphas") return M_ALPHAS;
+ // Ends with 'mean' or 'variance' or 'var'
+ if((extension.rfind("mean") == extension.size() - 4) ||
+ (extension.rfind("variance") == extension.size() - 8) ||
+ (extension.rfind("var") == extension.size() - 3)) return MEAN_VARIANCE;
+ // Default is SAMPLER_MEANS.
+ return SAMPLER_MEANS;
+}//}}}
+TranscriptExpression::TranscriptExpression(){//{{{
+ M=0;
+ logged=false;
+}//}}}
+TranscriptExpression::TranscriptExpression(const string &fileName, TE_FileType fileType){//{{{
+   M=0;
+   logged=false;
+ readExpression(fileName,fileType);
+}//}}}
+bool TranscriptExpression::readExpression(const string &fileName, TE_FileType fileType){//{{{
+ long i;
+ if(fileType == GUESS)fileType = guessFileType(fileName);
+ ifstream varFile(fileName.c_str());
+ FileHeader fh(&varFile);
+ if((!fh.varianceHeader(&M,&logged))||(M==0)){
+ error("TranscriptExpression: Problem loading variance file %s\n",(fileName).c_str());
+ return false;
+ }
+   // M_ALPHAS file contains the noise transcript.
+ if(fileType == M_ALPHAS) M--;
+ trs.resize(M);
+ if(fileType == SAMPLER_MEANS){
+ double count,mean2;
+ for(i=0;i<M;i++){
+ varFile>>trs[i].id>>trs[i].exp>>count>>mean2>>trs[i].var;
+ // IDs in SAMPLER_MEANS file are shifted by 1
+ trs[i].id--;
+ varFile.ignore(1000,'\n');
+ if(varFile.bad()){
+ error("TranscriptExpression: Problem reading transcript %ld.\n",i);
+ return false;
+ }
+ }
+ }else if(fileType == MEAN_VARIANCE){
+ for(i=0;i<M;i++){
+ trs[i].id=i;
+ varFile>>trs[i].exp>>trs[i].var;
+ varFile.ignore(1000,'\n');
+ if(varFile.bad()){
+ error("TranscriptExpression: Problem reading transcript %ld.\n",i);
+ return false;
+ }
+ }
+ }else if(fileType == M_ALPHAS){
+ double alpha, beta, beta0;
+ // Skip first entry - noise transcript.
+ varFile>>trs[0].exp>>alpha>>beta0;
+ varFile.ignore(1000,'\n');
+ for(i=0;i<M;i++){
+ trs[i].id=i;
+ varFile>>trs[i].exp>>alpha>>beta;
+ // Beta0 is the sum of all except noise.
+ trs[i].exp = alpha / beta0;
+ trs[i].var = alpha * (beta0-alpha) / (beta0 * beta0 * (beta0 + 1));
+ varFile.ignore(1000,'\n');
+ if(varFile.bad()){
+ error("TranscriptExpression: Problem reading transcript %ld.\n",i);
+ return false;
+ }
+ }
+ }
+ fh.close();
+ return true;
+}//}}}
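+/* In the M_ALPHAS branch the per-transcript moments are the marginal Beta
+ * moments of a Dirichlet with concentrations alpha_i and total concentration
+ * beta0 (read from the first, noise entry):
+ *   E[theta_i]   = alpha_i / beta0,
+ *   Var[theta_i] = alpha_i*(beta0 - alpha_i) / (beta0^2 * (beta0 + 1)). */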
+void TranscriptExpression::doSort(bool reverse){//{{{
+ if(! reverse)
+ sort(trs.begin(),trs.end());
+ else
+ sort(trs.rbegin(),trs.rend());
+}//}}}
+
diff --git a/TranscriptExpression.h b/TranscriptExpression.h
new file mode 100644
index 0000000..c04c9e1
--- /dev/null
+++ b/TranscriptExpression.h
@@ -0,0 +1,38 @@
+#ifndef TRANSCRIPTEXPRESSION_H
+#define TRANSCRIPTEXPRESSION_H
+#include<vector>
+#include<string>
+#include<stdint.h>
+
+using namespace std;
+
+enum TE_FileType{ SAMPLER_MEANS, MEAN_VARIANCE , M_ALPHAS, GUESS };
+
+struct trExpInfoT{
+ double exp,var;
+ int_least32_t id;
+ bool operator< (const trExpInfoT& d2) const{
+ return exp<d2.exp;
+ }
+};
+
+class TranscriptExpression{
+ private:
+ long M;
+ bool logged;
+ vector<trExpInfoT> trs;
+ TE_FileType guessFileType(const string &fileName);
+
+ public:
+ TranscriptExpression();
+ TranscriptExpression(const string &fileName, TE_FileType fileType = SAMPLER_MEANS);
+ bool readExpression(const string &fileName, TE_FileType fileType = SAMPLER_MEANS);
+ void doSort(bool reverse = false);
+ long getM(){return M;}
+ bool isLogged(){return logged;}
+ double exp(long tr){return trs[tr].exp;}
+ double var(long tr){return trs[tr].var;}
+ long id(long tr){return trs[tr].id;}
+};
+
+#endif
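For orientation, a short usage sketch of the interface above (the file name is
hypothetical and this assumes the sources from this commit are compiled in):

#include <cstdio>
#include "TranscriptExpression.h"

int main(){
  // GUESS picks the file type from the extension; ".thetaMeans" maps to SAMPLER_MEANS.
  TranscriptExpression te("sample.thetaMeans", GUESS);
  if(te.getM() == 0) return 1;
  te.doSort(true); // sort by expression, descending
  printf("top transcript: id=%ld exp=%g var=%g\n", te.id(0), te.exp(0), te.var(0));
  return 0;
}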
diff --git a/TranscriptInfo.cpp b/TranscriptInfo.cpp
new file mode 100644
index 0000000..224036d
--- /dev/null
+++ b/TranscriptInfo.cpp
@@ -0,0 +1,258 @@
+#include<fstream>
+#include<set>
+
+#include"TranscriptInfo.h"
+
+#include "common.h"
+
+bool TranscriptInfo::writeInfo(string fileName, bool force) const{//{{{
+ ofstream trF;
+ if(! force){
+ // Do nothing if file exists.
+ ifstream testF(fileName.c_str());
+ if(testF.is_open()){
+ testF.close();
+ return false;
+ }
+ testF.close();
+ }
+ trF.open(fileName.c_str(),ios::out | ios::trunc);
+ if(! trF.is_open() ) return false;
+ trF<<"# M "<<M<<endl;
+ for(long i=0;i<M;i++)
+ trF<<transcripts[i].g<<" "<<transcripts[i].t<<" "<<transcripts[i].l<<" "<<transcripts[i].effL<<endl;
+ trF.close();
+ return true;
+}//}}}
+bool TranscriptInfo::writeGeneInfo(string fileName) const{//{{{
+ ofstream geF;
+ geF.open(fileName.c_str(),ios::out | ios::trunc);
+ if(! geF.is_open() ) return false;
+ geF<<"# G "<<G<<endl;
+ geF<<"# <gene name> <# of transcripts> <average length>"<<endl;
+ double length;
+ for(long i=0;i<G;i++){
+ length = 0;
+ for(long j=0;j<genes[i].m;j++)length+=transcripts[genes[i].trs[j]].l;
+ geF<<genes[i].name<<" "<<genes[i].m<<" "<<length/genes[i].m<<endl;
+ }
+ geF.close();
+ return true;
+}//}}}
+bool TranscriptInfo::setInfo(vector<string> gNames,vector<string> tNames, vector<long> lengths){//{{{
+ // The sizes have to be equal.
+ if((gNames.size()!=tNames.size())||(tNames.size()!=lengths.size())) return false;
+ transcriptT newT;
+ M = (long) gNames.size();
+ // Create new entry for each transcript.
+ for(long i=0;i<M;i++){
+ newT.g=gNames[i];
+ newT.t=tNames[i];
+ newT.gI = 0;
+ newT.l=(int_least32_t)lengths[i];
+ newT.effL = lengths[i];
+ transcripts.push_back(newT);
+ }
+ // Initialize gene info based on gene names.
+ setGeneInfo();
+ isInitialized = true;
+ return isInitialized;
+}//}}}
+void TranscriptInfo::setGeneInfo(){//{{{
+ // Cleanup previous gene list.
+ genes.clear();
+ // Map of genes: name -> position within gene vector.
+ map<string,long> names;
+ geneT tmpG;
+ long gi=0,i;
+ groupedByGenes = true;
+ string previousName = "!-noname-!";
+ for(i=0;i<M;i++){
+ // If gene name same as previous, then just add new transcript.
+ if(transcripts[i].g == previousName){
+ transcripts[i].gI = gi;
+ genes[gi].m++;
+ genes[gi].trs.push_back(i);
+ }else{
+ previousName=transcripts[i].g;
+ // Check whether the gene name is new or was seen before.
+ if(names.count(transcripts[i].g) == 0){
+ // Prepare entry for new gene, starting with one (current) transcript.
+ tmpG.name = transcripts[i].g;
+ tmpG.m = 1;
+ tmpG.trs = vector<long>(1,i);
+ // Add entry to the gene list.
+ genes.push_back(tmpG);
+ // Set current gene index.
+ gi=genes.size()-1;
+ transcripts[i].gI = gi;
+        // Map gene name to its index and update previousName.
+ names[transcripts[i].g] = gi;
+ }else{
+ // If gene name was seen before then transcripts are not grouped by genes.
+ groupedByGenes=false;
+ //warning("TranscriptInfo: Transcripts of gene %ld are not grouped.\n",transcripts[i].g);
+ gi = names[transcripts[i].g];
+ transcripts[i].gI = gi;
+ genes[gi].m++;
+ genes[gi].trs.push_back(i);
+ }
+ }
+ }
+ G = genes.size();
+ // Add empty record to the end.
+ tmpG.name = "";
+ tmpG.m = 0;
+ tmpG.trs.clear();
+ genes.push_back(tmpG);
+}//}}}
+TranscriptInfo::TranscriptInfo(){ clearTranscriptInfo(); }
+void TranscriptInfo::clearTranscriptInfo(){//{{{
+ M=G=0;
+ isInitialized=false;
+ groupedByGenes=true;
+ transcripts.clear();
+ genes.clear();
+}//}}}
+TranscriptInfo::TranscriptInfo(string fileName){//{{{
+ noName="wrongID";
+  // Remaining members are initialized by clearTranscriptInfo() inside readInfo().
+ readInfo(fileName);
+}//}}}
+bool TranscriptInfo::readInfo(string fileName){//{{{
+ clearTranscriptInfo();
+ ifstream trFile(fileName.c_str());
+ if(!trFile.is_open()){
+ error("TranscriptInfo: problem reading transcript file.\n");
+ return false;
+ }
+ transcriptT newT;
+ // Read all lines of file ignoring lines starting with #.
+ while(trFile.good()){
+ while(trFile.good() && (trFile.peek()=='#'))
+ trFile.ignore(100000000,'\n');
+ if(!trFile.good()) break;
+ // Read gene name, tr name and length.
+ trFile>>newT.g>>newT.t>>newT.l;
+ newT.gI = 0;
+ // Should not hit EOF or any other error yet.
+ if(!trFile.good()) break;
+ // Read effective length if present:
+ while((trFile.peek() == '\t')||(trFile.peek() == ' ')) trFile.get();
+ // If end of line is reached then use length as effective length.
+ if((trFile.good()) && (trFile.peek() == '\n')) newT.effL = newT.l;
+ else trFile>>newT.effL;
+ // If the line was OK, then push new entry (EOF when looking for effective length is allowed).
+ if(!trFile.fail())
+ transcripts.push_back(newT);
+ // Ignore rest of the line.
+ trFile.ignore(100000000,'\n');
+ }
+ trFile.close();
+ isInitialized = true;
+ M = (long)transcripts.size();
+ setGeneInfo();
+ return isInitialized;
+}//}}}
+long TranscriptInfo::getM() const{//{{{
+ return M;
+}//}}}
+long TranscriptInfo::getG() const{//{{{
+ return G;
+}//}}}
+const vector<long> &TranscriptInfo::getGtrs(long i) const{//{{{
+ if((i>G) || (i<0)){
+ // Return empty record.
+ return genes[G].trs;
+ }
+ return genes[i].trs;
+}//}}}
+double TranscriptInfo::effL(long i) const{//{{{
+ if(isInitialized && (i<M))return transcripts[i].effL;
+ return 0;
+}//}}}
+long TranscriptInfo::L(long i) const{//{{{
+ if(isInitialized && (i<M))return transcripts[i].l;
+ return 0;
+}//}}}
+const string &TranscriptInfo::trName(long i) const{//{{{
+ if(isInitialized && (i<M))return transcripts[i].t;
+ return noName;
+}//}}}
+const string &TranscriptInfo::geName(long i) const{//{{{
+ if(isInitialized && (i<M))return transcripts[i].g;
+ return noName;
+}//}}}
+long TranscriptInfo::geId(long i) const{//{{{
+ if(isInitialized && (i<M))return transcripts[i].gI;
+ return -1;
+}//}}}
+void TranscriptInfo::setEffectiveLength(vector<double> effL){//{{{
+ if((long)effL.size() != M){
+ warning("TranscriptInfo: Wrong array size for effective length adjustment.\n");
+ return;
+ }
+ // Adjust effective length to similar scale as normal length
+ double sumL = 0,sumN = 0,norm;
+ for(long i=0;i<M;i++){
+ sumN+=effL[i];
+ sumL+=transcripts[i].l;
+ }
+// don't normalize
+// norm = sumL / sumN;
+ norm = 1;
+ for(long i=0;i<M;i++){
+ transcripts[i].effL = effL[i] * norm;
+ }
+}//}}}
+vector<double> *TranscriptInfo::getShiftedLengths(bool effective) const{//{{{
+ vector<double> *Ls = new vector<double>(M+1);
+ for(long i=0;i<M;i++){
+ if(effective)(*Ls)[i+1] = transcripts[i].effL;
+ else (*Ls)[i+1] = transcripts[i].l;
+ }
+ return Ls;
+}//}}}
+bool TranscriptInfo::updateTrNames(const vector<string> &trList){//{{{
+ if((long)trList.size() != M)return false;
+ // Check uniqueness of new names.
+ set<string> trSet(trList.begin(),trList.end());
+ if((long)trSet.size() != M)return false;
+ for(long i=0;i<M;i++){
+ transcripts[i].t = trList[i];
+ }
+ return true;
+}//}}}
+bool TranscriptInfo::updateGeneNames(const vector<string> &geneList){//{{{
+ if((long)geneList.size() != M){
+ warning("TranscriptInfo: Number of items in gene list (%ld) does not match number of transcripts (%ld).",geneList.size(),M);
+ return false;
+ }
+ // Copy gene names in the order they are.
+ for(long i=0;i<M;i++){
+ transcripts[i].g = geneList[i];
+ }
+ // Initialize gene info.
+ setGeneInfo();
+ return true;
+}//}}}
+bool TranscriptInfo::updateGeneNames(const map<string,string> &trGeneList){//{{{
+ if((long)trGeneList.size() < M){
+ warning("TranscriptInfo: Number of items in TR->GE map (%ld) is less than the number of transcripts (%ld).",trGeneList.size(),M);
+ return false;
+ }
+ // Check all transcripts have associated gene name.
+ for(long i=0;i<M;i++){
+ if(!trGeneList.count(transcripts[i].t)){
+ warning("TranscriptInfo: No gene name for transcript [%s].",transcripts[i].t.c_str());
+ return false;
+ }
+ }
+ // Set gene names.
+ for(long i=0;i<M;i++){
+ transcripts[i].g = trGeneList.find(transcripts[i].t)->second;
+ }
+ // Initialize gene info.
+ setGeneInfo();
+ return true;
+}//}}}
diff --git a/TranscriptInfo.h b/TranscriptInfo.h
new file mode 100644
index 0000000..03d3391
--- /dev/null
+++ b/TranscriptInfo.h
@@ -0,0 +1,75 @@
+#ifndef TRANSCRIPTINFO_H
+#define TRANSCRIPTINFO_H
+#include<string>
+#include<vector>
+#include<map>
+#include<stdint.h>
+
+using namespace std;
+
+struct transcriptT {//{{{
+ string g,t;
+ int_least32_t l,gI;
+ double effL;
+ bool operator< (const transcriptT& d2) const{
+ if(g==d2.g)return t<d2.t;
+ return g<d2.g;
+ }
+};//}}}
+
+struct geneT {//{{{
+ string name;
+ int_least32_t m;
+ vector<long> trs;
+};//}}}
+
+class TranscriptInfo{
+ private:
+ // Number of transcripts, genes.
+ long M,G;
+ // Flags.
+ bool isInitialized, groupedByGenes;
+ // Transcript information:
+ // gene name, transcript name, length, effective length
+ vector<transcriptT> transcripts;
+ // Gene information:
+ // name, number of transcripts, list of transcripts
+ // Length is G+1 after initialization (with dummy record at the end).
+ vector<geneT> genes;
+ // Populate genes variable with gene information based on gene names saved in transcript information.
+ void setGeneInfo();
+ string noName;
+
+ public:
+ TranscriptInfo();
+ // Clears all information.
+ void clearTranscriptInfo();
+ TranscriptInfo(string fileName);
+ // Read info from a file name.
+ // Header (# M <num>) is ignored. The file is read until EOF.
+ bool readInfo(string fileName);
+  // Write transcript info into file. Does not overwrite existing file unless force=true.
+ bool writeInfo(string fileName, bool force = false) const;
+ bool writeGeneInfo(string fileName) const;
+ bool setInfo(vector<string> gNames, vector<string> tNames, vector<long> lengths);
+ bool isOK() const{ return isInitialized; }
+ long getM() const;
+ long getG() const;
+ const vector<long> &getGtrs(long i) const;
+ long L(long i) const;
+ double effL(long i) const;
+ const string &trName(long i) const;
+ const string &geName(long i) const;
+ long geId(long i) const;
+ bool genesOrdered() const{ return groupedByGenes; }
+ void setEffectiveLength(vector<double> effL);
+ // Return pointer to a vector of lengths with transcript IDs starting from 1.
+ vector<double> *getShiftedLengths(bool effective = false) const;
+ // Update transcript names with new names from the list.
+ bool updateTrNames(const vector<string> &trList);
+ // Sets gene names to transcripts and calls setGeneInfo to initialize gene information.
+ bool updateGeneNames(const vector<string> &geneList);
+ bool updateGeneNames(const map<string,string> &trGeneList);
+};
+
+#endif
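A short usage sketch for the class above (file name hypothetical; assumes the
sources from this commit are compiled in):

#include <cstdio>
#include "TranscriptInfo.h"

int main(){
  TranscriptInfo ti("ensGene.tr");
  if(!ti.isOK()) return 1;
  printf("%ld transcripts in %ld genes\n", ti.getM(), ti.getG());
  // List transcripts of the first gene; getGtrs() returns an empty list for bad indices.
  const vector<long> &trs = ti.getGtrs(0);
  for(size_t j = 0; j < trs.size(); j++)
    printf("  %s (length %ld)\n", ti.trName(trs[j]).c_str(), ti.L(trs[j]));
  return 0;
}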
diff --git a/TranscriptSequence.cpp b/TranscriptSequence.cpp
new file mode 100644
index 0000000..b60d4bc
--- /dev/null
+++ b/TranscriptSequence.cpp
@@ -0,0 +1,197 @@
+#include<algorithm>
+#include<fstream>
+#include<set>
+#include<sstream>
+
+#include "TranscriptSequence.h"
+
+#include "misc.h"
+
+#include "common.h"
+
+// Number of times we randomly probe for old cache record.
+// CR: #define WORST_SEARCH_N 10
+
+TranscriptSequence::TranscriptSequence(){//{{{
+ // CR: srand(time(NULL));
+ M=0;
+ cM=0;
+ gotGeneNames=false;
+ // CR: useCounter = 0;
+}//}}}
+TranscriptSequence::TranscriptSequence(string fileName, refFormatT format){//{{{
+  // Initialize members directly; writing TranscriptSequence() here would
+  // only construct and discard a temporary instead of initializing *this.
+  M=0;
+  cM=0;
+  gotGeneNames=false;
+  readSequence(fileName,format);
+}//}}}
+bool TranscriptSequence::readSequence(string fileName, refFormatT format){//{{{
+ fastaF.open(fileName.c_str());
+ if(!fastaF.is_open()){
+ error("TranscriptSequence: problem reading transcript file.\n");
+ return false;
+ }
+ trSeqInfoT newTr;
+ // CR: newTr.lastUse=0;
+ // CR: newTr.cache=-1;
+ string trDesc,geneName;
+ long pos;
+ istringstream geneDesc;
+ trNames.clear();
+ geneNames.clear();
+ gotGeneNames = true;
+ // Record trNames only from gencode ref.
+ gotTrNames = (format == GENCODE);
+ while(fastaF.good()){
+ while((fastaF.peek()!='>')&&(fastaF.good()))
+ fastaF.ignore(1000,'\n');
+ if(! fastaF.good())break;
+ // Read description line:
+ getline(fastaF, trDesc, '\n');
+ // look for gene name if previous lines had gene name:
+ if(gotGeneNames){
+ if(format == GENCODE){
+ vector<string> lineTokens = ns_misc::tokenize(trDesc,"|");
+ if(lineTokens.size()>1){
+ geneNames.push_back(lineTokens[1]);
+ trNames.push_back(lineTokens[0].substr(1));
+ }else{
+ gotGeneNames = false;
+ gotTrNames = false;
+ }
+ }else{ // format == STANDARD
+ pos=min(trDesc.find("gene:"),trDesc.find("gene="));
+ if(pos!=(long)string::npos){
+ geneDesc.clear();
+ geneDesc.str(trDesc.substr(pos+5));
+ geneDesc >> geneName;
+ geneNames.push_back(geneName);
+ }else{
+ gotGeneNames = false;
+ }
+ }
+ }
+ // remember position:
+ newTr.seek=fastaF.tellg();
+ trs.push_back(newTr);
+ }
+ // Exit if there was an error while reading the file.
+ if(fastaF.bad()){
+ error("TranscriptSequence: problem reading file.\n");
+ return false;
+ }
+ M = trs.size();
+ // Allocate cache for all.
+ cache.resize(M);
+ //cache.resize(min(M,(long)TRS_CACHE_MAX));
+ //cachedTrs.resize(min(M,(long)TRS_CACHE_MAX));
+ // Clear eof flag from input stream.
+ fastaF.clear();
+ return loadSequence();
+}//}}}
+bool TranscriptSequence::loadSequence(){//{{{
+ cache.resize(M);
+ string seqLine;
+ for(long tr=0;tr<M;tr++){
+ // Set input stream to transcript's position.
+ fastaF.seekg(trs[tr].seek);
+ // Read line by line until reaching EOF or next header line '>'.
+ while((fastaF.peek()!='>')&&( getline(fastaF,seqLine,'\n').good())){
+ cache[tr]+=seqLine;
+ }
+ if(fastaF.bad()){
+ error("TranscriptSequence: Failed reading transcript %ld\n",tr);
+ return false;
+ }
+ // Clear flags (just in case).
+ fastaF.clear();
+ }
+ return true;
+}//}}}
+long TranscriptSequence::getG() const{//{{{
+ if(!gotGeneNames)return 0;
+ return (set<string>(geneNames.begin(),geneNames.end())).size();
+}//}}}
+const string &TranscriptSequence::getTr(long tr) const{//{{{
+ if((tr<0)||(tr>=M))return noneTr;
+ // Return pointer to the sequence in cache.
+ return cache[tr];
+  /* Used with caching. {{{
+ // Update last use info.
+ trs[tr].lastUse = useCounter++;
+ return cache[acquireSequence(tr)];
+ }}} */
+}//}}}
+string TranscriptSequence::getSeq(long trI,long start,long l,bool doReverse) const{//{{{
+ // Return empty string for unknown transcript.
+ if((trI<0)||(trI>=M))return "";
+  /* Used with caching. {{{
+ // Update last use info.
+ trs[tr].lastUse = useCounter++;
+ // Get position within cache.
+ long trI = acquireSequence(tr);
+ }}} */
+
+ // If position is not within the sequence, return Ns.
+ if(start>=(long)cache[trI].size())return string(l,'N');
+
+ string ret;
+ // Copy appropriate sequence, fill up the rest with Ns.
+ if(start<0){
+ ret.assign(-start,'N');
+ ret+=cache[trI].substr(0,l+start);
+ }else{
+ ret = cache[trI].substr(start,l);
+ if(((long)ret.size()) < l)ret.append(l-ret.size(), 'N');
+ }
+
+ if(!doReverse){
+ return ret;
+ }else{
+ // For reverse return reversed string with complemented bases.
+ reverse(ret.begin(),ret.end());
+ for(long i=0;i<l;i++)
+ if((ret[i]=='A')||(ret[i]=='a'))ret[i]='T';
+ else if((ret[i]=='T')||(ret[i]=='t'))ret[i]='A';
+ else if((ret[i]=='C')||(ret[i]=='c'))ret[i]='G';
+ else if((ret[i]=='G')||(ret[i]=='g'))ret[i]='C';
+ return ret;
+ }
+}//}}}
+/* long TranscriptSequence::acquireSequence(long tr){//{{{
+ // If the sequence is stored in cache then just return it's cache index.
+ if(trs[tr].cache!=-1)return trs[tr].cache;
+ long i,newP,j;
+ // See if cache is full.
+ if(cM<TRS_CACHE_MAX){
+ // If cache limit not reached, just add new sequence.
+ newP=cM;
+ cM++;
+ }else{
+ // If cache is full, look at WORST_SEARCH_N positions and choose the one least used.
+ newP=rand()%cM;
+ for(i=0;i<WORST_SEARCH_N;i++){
+ j=rand()%cM;
+ if(trs[cachedTrs[newP]].lastUse > trs[cachedTrs[j]].lastUse)newP=j;
+ }
+ // "remove" the transcript from position newP from cache.
+ trs[cachedTrs[newP]].cache=-1;
+ cache[newP].clear();
+ }
+ // Set input stream to transcript's position.
+ fastaF.seekg(trs[tr].seek);
+ string seqLine;
+ // Read line by line until reaching EOF or next header line '>'.
+ while((fastaF.peek()!='>')&&( getline(fastaF,seqLine,'\n').good())){
+ cache[newP]+=seqLine;
+ }
+ if(fastaF.bad()){
+ error("TranscriptSequence: Failed reading transcript %ld\n",tr);
+ return 0;
+ }
+ // Clear flags.
+ fastaF.clear();
+ // Update cache information.
+ cachedTrs[newP]=tr;
+ trs[tr].cache=newP;
+ // Return transcripts index within cache.
+ return newP;
+}//}}} */
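getSeq() above pads out-of-range positions with 'N' and, with doReverse set,
returns the reverse complement. A minimal stand-alone sketch of the same
complementing loop (generic illustration, not the class API):

#include <algorithm>
#include <cstdio>
#include <string>

// Reverse-complement a DNA string, mirroring the doReverse branch of getSeq();
// characters other than ACGT/acgt (e.g. the 'N' padding) are left unchanged.
static std::string revComp(std::string s){
  std::reverse(s.begin(), s.end());
  for(std::string::size_type i = 0; i < s.size(); i++)
    if((s[i]=='A')||(s[i]=='a')) s[i]='T';
    else if((s[i]=='T')||(s[i]=='t')) s[i]='A';
    else if((s[i]=='C')||(s[i]=='c')) s[i]='G';
    else if((s[i]=='G')||(s[i]=='g')) s[i]='C';
  return s;
}

int main(){
  printf("%s\n", revComp("ACGTN").c_str()); // prints NACGT
  return 0;
}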
diff --git a/TranscriptSequence.h b/TranscriptSequence.h
new file mode 100644
index 0000000..c6dc85f
--- /dev/null
+++ b/TranscriptSequence.h
@@ -0,0 +1,82 @@
+#ifndef TRANSCRIPTSEQUENCE_H
+#define TRANSCRIPTSEQUENCE_H
+#include<fstream>
+#include<stdint.h>
+#include<string>
+#include<vector>
+
+using namespace std;
+
+/*
+ Lines commented with CR: -> cache related.
+  This was commented out when caching was removed.
+*/
+
+// Max number of transcripts to be cached at a time.
+// CR: #define TRS_CACHE_MAX 200000
+
+struct trSeqInfoT{
+ streampos seek;
+// CR: long cache;
+// CR: uint_least64_t lastUse;
+};
+
+enum refFormatT { STANDARD, GENCODE };
+
+/*
+The TranscriptSequence class manages a fasta file with transcript sequences.
+// CR: Only up to TRS_CACHE_MAX transcripts are "cached" at a time.
+*/
+class TranscriptSequence{
+ private:
+ // Total number of transcripts and number of cached transcripts.
+ long M,cM;
+    // Flags indicating whether gene and transcript names could be obtained from the reference file.
+ bool gotGeneNames,gotTrNames;
+ // Transcript names.
+ vector<string> trNames;
+ // Gene names for each transcript.
+ vector<string> geneNames;
+    // Transcript information: seek position (the use/cache fields were removed with caching).
+ vector<trSeqInfoT> trs;
+ // Cache of transcript sequences.
+ vector<string> cache;
+ // Input stream for the fasta file.
+ ifstream fastaF;
+ // Empty transcript.
+ string noneTr;
+
+ // Counter for the least recently used entry.
+ // CR: uint_least64_t useCounter;
+ // IDs of transcripts currently in the cache (same order as cache).
+ // CR: vector<long> cachedTrs;
+  // Read transcript sequence from the file, save to cache and return its cache index.
+ // CR: long acquireSequence(long tr);
+
+ bool loadSequence();
+ public:
+ TranscriptSequence();
+  // Initialize class and call readSequence(fileName).
+ TranscriptSequence(string fileName, refFormatT format = STANDARD);
+ // Process input file fileName and record beginning of each transcript.
+ bool readSequence(string fileName, refFormatT format = STANDARD);
+ // Return number of transcripts.
+ long getM() const{ return M; }
+ // Return number of UNIQUE gene names.
+ long getG() const;
+  // Return reference to the transcript sequence. The reference is not persistent.
+  // Returns the empty transcript for an unknown transcript index.
+ const string &getTr(long tr) const;
+ // Return sequence from transcript <tr> starting from <start> of length <l>.
+ string getSeq(long trI, long start, long l,bool doReverse = false) const;
+ // Reports whether transcript names were extracted from the sequence file.
+ bool hasTrNames() const{ return gotTrNames; }
+ // Reports whether gene names were extracted from the sequence file.
+ bool hasGeneNames() const{ return gotGeneNames; }
+ // Return reference to const vector containing the geneNames.
+ const vector<string> &getGeneNames() const{ return geneNames; }
+ // Return reference to const vector of transcript names.
+ const vector<string> &getTrNames() const{ return trNames; }
+};
+
+#endif
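Typical use of the class above (file name hypothetical; assumes the sources
from this commit are compiled in):

#include <cstdio>
#include "TranscriptSequence.h"

int main(){
  TranscriptSequence ts("transcripts.fa", STANDARD);
  if(ts.getM() == 0) return 1;
  printf("%ld transcripts, %ld unique genes\n", ts.getM(), ts.getG());
  // First 20 bases of transcript 0, forward and reverse-complemented.
  printf("%s\n", ts.getSeq(0, 0, 20).c_str());
  printf("%s\n", ts.getSeq(0, 0, 20, true).c_str());
  return 0;
}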
diff --git a/VariationalBayes.cpp b/VariationalBayes.cpp
new file mode 100644
index 0000000..edf495f
--- /dev/null
+++ b/VariationalBayes.cpp
@@ -0,0 +1,384 @@
+#include<fstream>
+#include<iomanip>
+#include<cmath>
+#include<cstring>
+#ifdef _OPENMP
+#include<omp.h>
+#endif
+#include "asa103/asa103.hpp"
+#include "boost/random/normal_distribution.hpp"
+#include "boost/random/gamma_distribution.hpp"
+
+#include "VariationalBayes.h"
+
+#include "common.h"
+
+#define SWAPD(x,y) {tmpD=x;x=y;y=tmpD;}
+#define ZERO_LIMIT 1e-12
+
+typedef boost::random::gamma_distribution<double>::param_type gDP;
+
+void VariationalBayes::setLog(string logFileName,MyTimer *timer){//{{{
+ this->logFileName=logFileName;
+ this->logTimer=timer;
+}//}}}
+VariationalBayes::VariationalBayes(SimpleSparse *_beta,double *_alpha,long seed,long procN){//{{{
+/*
+ As bitseq_vb::__init__(self, alpha, beta) in python
+ Python difference:
+ - python version excludes beta.data <= 1e-40
+*/
+ quiet = false;
+ logFileName = "tmp.convLog";
+ logTimer = NULL;
+#ifdef SUPPORT_OPENMP
+ omp_set_num_threads(procN);
+#endif
+ long i;
+ beta=_beta;
+ N=beta->N;
+ M=beta->M;
+ T=beta->T;
+
+ //logBeta= new SimpleSparse(beta);
+ //beta already contains log probabilities.
+
+ if(_alpha){
+ alpha = _alpha;
+ }else{
+ alpha = new double[M];
+ for(i=0;i<M;i++)alpha[i]=1.;
+ }
+ phiHat = new double[M];
+ digA_pH = new double[M];
+
+ rng_mt.seed(seed);
+ boost::random::normal_distribution<long double> normalD;
+ //typedef boost::random::normal_distribution<long double>::param_type nDP;
+ //normalD.param(nDP(0,1));
+
+ phi_sm = new SimpleSparse(beta);
+ for(i=0;i<T;i++)phi_sm->val[i] = normalD(rng_mt);
+ phi = new SimpleSparse(beta);
+  // PyDif: makes phi a copy of phi_sm; not important here because unpack() follows next.
+
+ unpack(phi_sm->val); //unpack(pack());
+
+ double alphaS=0,gAlphaS=0;
+ for(i=0;i<M;i++){
+ alphaS+=alpha[i];
+ gAlphaS+=lgamma(alpha[i]);
+ }
+ boundConstant = lgamma(alphaS) - gAlphaS - lgamma(alphaS+N);
+}//}}}
+VariationalBayes::~VariationalBayes(){//{{{
+ delete[] alpha;
+ delete[] phiHat;
+ delete[] digA_pH;
+ delete phi_sm;
+ delete phi;
+}//}}}
+void VariationalBayes::unpack(double vals[],double adds[]){//{{{
+ if(adds==NULL){
+ if(vals!=phi_sm->val)memcpy(phi_sm->val,vals,T*sizeof(double));
+ }else{
+ long i;
+ #pragma omp parallel for
+ for(i=0;i<T;i++)phi_sm->val[i] = vals[i]+adds[i];
+ }
+ phi_sm->softmaxInplace(phi); //softmax phi_sm into phi; and set phi_sm to log(phi)
+ phi->sumCols(phiHat); // sumCols of phi into phiHat
+}//}}}
+
+void VariationalBayes::negGradient(double res[]){//{{{
+ long i;
+ int err=0,totalError=0;
+ #pragma omp parallel for private(err) reduction(+:totalError)
+ for(i=0;i<M;i++){
+ digA_pH[i]=digama(alpha[i]+phiHat[i], &err);
+ totalError += err;
+ }
+ if(totalError){error("VariationalBayes: Digamma error (%d).\n",totalError); }
+ // beta is logged now
+ #pragma omp parallel for
+ for(i=0;i<T;i++)res[i]= - (beta->val[i] - phi_sm->val[i] - 1.0 + digA_pH[beta->col[i]]);
+}//}}}
+double VariationalBayes::getBound(){//{{{
+ // the lower bound on the model likelihood
+ double A=0,B=0,C=0;
+ long i;
+ #pragma omp parallel for reduction(+:A,B)
+ for(i=0;i<T;i++){
+ // beta is logged now.
+ A += phi->val[i] * beta->val[i];
+    // PyDif: uses nansum instead of ZERO_LIMIT (nansum sums all elements, treating NaN as zero).
+ if(phi->val[i]>ZERO_LIMIT){
+ B += phi->val[i] * phi_sm->val[i];
+ }
+ }
+ #pragma omp parallel for reduction(+:C)
+ for(i=0;i<M;i++){
+ C += lgamma(alpha[i]+phiHat[i]);
+ }
+ return A+B+C+boundConstant;
+}//}}}
+
+void VariationalBayes::optimize(bool verbose,OPT_TYPE method,long maxIter,double ftol, double gtol){//{{{
+ bool usedSteepest;
+ long iteration=0,i,r;
+ double boundOld,bound,squareNorm,squareNormOld=1,valBeta=0,valBetaDiv,natGrad_i,gradGamma_i,phiGradPhiSum_r;
+ double *gradPhi,*natGrad,*gradGamma,*searchDir,*tmpD,*phiOld;
+ gradPhi=natGrad=gradGamma=searchDir=tmpD=phiOld=NULL;
+ MyTimer timer;
+ // allocate stuff {{{
+ //SimpleSparse *phiGradPhi=new SimpleSparse(beta);
+ gradPhi = new double[T];
+ // phiOld = new double[T]; will use gradPhi memory for this
+ phiOld = NULL;
+ natGrad = new double[T];
+ if(method == OPTT_HS)
+ gradGamma = new double[T];
+ searchDir = new double[T];
+ //searchDirOld = new double[T];
+ //phiGradPhi_sum = new double[N];
+ // }}}
+#ifdef LOG_CONV
+ ofstream logF(logFileName.c_str());
+ logF.precision(15);
+ logF<<"# iter bound squareNorm time(m) [M*means M*vars]"<<endl;
+ if(logTimer)logTimer->setQuiet();
+ #ifdef LONG_LOG
+ vector<double> dirAlpha(M);
+ #endif
+#endif
+ boundOld=getBound();
+ timer.start();
+ while(true){
+ negGradient(gradPhi);
+ // "yuck"
+ //setVal(phiGradPhi,i,phi->val[i]*gradPhi[i]);
+ //phiGradPhi->sumRows(phiGradPhi_sum);
+ // removed need for phiGradPhi matrix:
+ // removed need for phiGradPhi_sum
+ /*for(r=0;r<N;r++){
+ phiGradPhi_sum[r] = 0;
+ for(i=phi->rowStart[r];i<phi->rowStart[r+1];i++) phiGradPhi_sum[r] += phi->val[i] * gradPhi[i];
+ }*/
+
+ // set natGrad & gradGamma
+ squareNorm=0;
+ valBeta = 0;
+ valBetaDiv = 0;
+ #pragma omp parallel for private(i,phiGradPhiSum_r,natGrad_i,gradGamma_i) reduction(+:squareNorm,valBeta,valBetaDiv)
+ for(r=0;r<N;r++){
+ phiGradPhiSum_r = 0;
+ for(i = phi->rowStart[r]; i < phi->rowStart[r+1]; i++)
+ phiGradPhiSum_r += phi->val[i] * gradPhi[i];
+
+ for(i = phi->rowStart[r]; i < phi->rowStart[r+1]; i++){
+ natGrad_i = gradPhi[i] - phiGradPhiSum_r;
+ gradGamma_i = natGrad_i * phi->val[i];
+ squareNorm += natGrad_i * gradGamma_i;
+
+ if(method==OPTT_PR){
+ valBeta += (natGrad_i - natGrad[i])*gradGamma_i;
+ }
+ if(method==OPTT_HS){
+ valBeta += (natGrad_i-natGrad[i])*gradGamma_i;
+ valBetaDiv += (natGrad_i-natGrad[i])*gradGamma[i];
+ gradGamma[i] = gradGamma_i;
+ }
+ natGrad[i] = natGrad_i;
+ }
+ }
+
+ if((method==OPTT_STEEPEST) || (iteration % (N*M)==0)){
+ valBeta=0;
+ }else if(method==OPTT_PR ){
+ // already computed:
+ // valBeta=0;
+ // for(i=0;i<T;i++)valBeta+= (natGrad[i]-natGradOld[i])*gradGamma[i];
+ valBeta /= squareNormOld;
+ }else if(method==OPTT_FR ){
+ valBeta = squareNorm / squareNormOld;
+ }else if(method==OPTT_HS ){
+ // already computed:
+ //valBeta=div=0;
+ //for(i=0;i<T;i++){
+ // valBeta += (natGrad[i]-natGradOld[i])*gradGamma[i];
+ // div += (natGrad[i]-natGradOld[i])*gradGammaOld[i];
+ //}
+ if(valBetaDiv!=0)valBeta /= valBetaDiv;
+ else valBeta = 0;
+ }
+
+ if(valBeta>0){
+ usedSteepest = false;
+ //for(i=0;i<T;i++)searchDir[i]= -natGrad[i] + valBeta*searchDirOld[i];
+ // removed need for searchDirOld:
+ #pragma omp parallel for
+ for(i=0;i<T;i++)
+ searchDir[i]= -natGrad[i] + valBeta*searchDir[i];
+ }else{
+ usedSteepest = true;
+ #pragma omp parallel for
+ for(i=0;i<T;i++)
+ searchDir[i]= -natGrad[i];
+ }
+
+ //try conjugate step
+ SWAPD(gradPhi,phiOld);
+ memcpy(phiOld,phi_sm->val,T*sizeof(double)); // memcpy(phiOld,pack(),T*sizeof(double));
+ unpack(phiOld,searchDir);
+ bound = getBound();
+ iteration++;
+ // make sure there is an increase in L, else revert to steepest
+ if((bound<boundOld) && (valBeta>0)){
+ usedSteepest = true;
+ #pragma omp parallel for
+ for(i=0;i<T;i++)
+ searchDir[i]= -natGrad[i];
+ unpack(phiOld,searchDir);
+ bound = getBound();
+ // this should not be increased: iteration++;
+ }
+ if(bound<boundOld) { // If bound decreased even after using steepest, step back and quit.
+ unpack(phiOld);
+ }
+ SWAPD(gradPhi,phiOld);
+ if(verbose){
+ #ifdef SHOW_FIXED
+ messageF("iter(%c): %5.ld bound: %.3lf grad: %.7lf beta: %.7lf fixed: %ld\n",(usedSteepest?'s':'o'),iteration,bound,squareNorm,valBeta,phi->countAboveDelta(0.999));
+ #else
+ messageF("iter(%c)[%5.lds]: %5.ld bound: %.3lf grad: %.7lf beta: %.7lf\n",(usedSteepest?'s':'o'),(long)timer.getTime(),iteration,bound,squareNorm,valBeta);
+ #endif
+ }else if(!quiet){
+ messageF("\riter(%c): %5.ld bound: %.3lf grad: %.7lf beta: %.7lf ",(usedSteepest?'s':'o'),iteration,bound,squareNorm,valBeta);
+ }
+#ifdef LOG_CONV
+ if((iteration%100==0) ||
+ ((iteration<500) && (iteration%50==0)) ||
+ ((iteration<150) && (iteration%10==0)) ||
+ ((iteration<50) && (iteration%5==0))){
+ logF<<iteration<<" "<<bound<<" "<<squareNorm;
+ if(logTimer)logF<<" "<<logTimer->current(0,'m');
+ #ifdef LONG_LOG
+ double alphaSum = 0, alphaVarNorm;
+ // True 'alpha' - Dirichlet parameter is alpha+phiHat.
+ for(i=1;i<M;i++){
+ dirAlpha[i] = alpha[i] + phiHat[i];
+ alphaSum += dirAlpha[i];
+ }
+ for(i=1;i<M;i++)logF<< " " << dirAlpha[i] / alphaSum;
+ alphaVarNorm = alphaSum*alphaSum*(alphaSum+1);
+ for(i=1;i<M;i++)logF<<" "<<dirAlpha[i]*(alphaSum-dirAlpha[i])/alphaVarNorm;
+ #endif
+ logF<<endl;
+ }
+#endif
+
+ // convergence check {{{
+ if(bound<boundOld){
+ message("\nEnd: bound decrease\n");
+ break;
+ }
+    // fabs, not integer abs(): abs() would truncate any |difference| < 1 to zero.
+    if(fabs(bound-boundOld)<=ftol){
+ message("\nEnd: converged (ftol)\n");
+ break;
+ }
+ if(squareNorm<=gtol){
+ message("\nEnd: converged (gtol)\n");
+ break;
+ }
+ if(iteration>=maxIter){
+ message("\nEnd: maxIter exceeded\n");
+ break;
+ }
+ // }}}
+ // store essentials {{{
+ squareNormOld=squareNorm;
+ boundOld=bound;
+ // }}}
+ R_INTERUPT;
+ }
+ if(quiet){
+ messageF("iter(%c): %5.ld bound: %.3lf grad: %.7lf beta: %.7lf\n",(usedSteepest?'s':'o'),iteration,bound,squareNorm,valBeta);
+ }
+#ifdef LOG_CONV
+ logF<<iteration<<" "<<bound<<" "<<squareNorm;
+ if(logTimer)logF<<" "<<logTimer->current(0,'m');
+ #ifdef LONG_LOG
+ double alphaSum = 0, alphaVarNorm;
+ // True 'alpha' - Dirichlet parameter is alpha+phiHat.
+ for(i=1;i<M;i++){
+ dirAlpha[i] = alpha[i] + phiHat[i];
+ alphaSum += dirAlpha[i];
+ }
+ for(i=1;i<M;i++)logF<< " " << dirAlpha[i] / alphaSum;
+ alphaVarNorm = alphaSum*alphaSum*(alphaSum+1);
+ for(i=1;i<M;i++)logF<<" "<<dirAlpha[i]*(alphaSum-dirAlpha[i])/alphaVarNorm;
+ #endif
+ logF<<endl;
+ if(logTimer)logTimer->setVerbose();
+ logF.close();
+#endif
+ // free memory {{{
+ //delete phiGradPhi;
+ delete[] gradPhi;
+ delete[] natGrad;
+ if(method == OPTT_HS)
+ delete[] gradGamma;
+ delete[] searchDir;
+ //delete[] searchDirOld;
+ //delete[] phiGradPhi_sum;
+ // }}}
+}//}}}
+
+double *VariationalBayes::getAlphas(){//{{{
+ double *alphas = new double[M];
+ for(long i=0;i<M;i++)alphas[i] = alpha[i] + phiHat[i];
+ return alphas;
+}//}}}
+
+void VariationalBayes::generateSamples(long samplesN, const string &outTypeS, const vector<double> *isoformLengths, ofstream *outF) {//{{{
+ vector<double> gamma(M,0);
+ vector<gDP> alphaParam;
+ boost::random::gamma_distribution<double> gammaDistribution;
+ long n,m;
+ double gammaSum, norm, normC = 1.0;
+ // Set normalisation.
+ if(outTypeS == "counts") normC = N; // N is Nmap.
+ if(outTypeS == "rpkm") normC = 1e9;
+ // Pre-compute Dirichlet's alpha and save them as parameters for Gamma.
+ for(m=0;m<M;m++)alphaParam.push_back(gDP(alpha[m] + phiHat[m], 1.0));
+ // Sample.
+ outF->precision(9);
+ (*outF)<<scientific;
+ for(n=0;n<samplesN;n++){
+ // Compute M gammas and sum. Ignore 0 - noise transcript.
+ gammaSum = 0;
+ for(m=1;m<M;m++){
+ gammaDistribution.param(alphaParam[m]);
+ gamma[m] = gammaDistribution(rng_mt);
+ gammaSum += gamma[m];
+ }
+ // For rpkm normalize by length.
+ if(outTypeS == "rpkm"){
+ if((long)isoformLengths->size() < M){
+ error("VariationalBayes: Too few isoform lengths for RPKM computation.");
+ return;
+ }
+ for(m=1;m<M;m++)
+ if((*isoformLengths)[m]>0)
+ gamma[m] /= (*isoformLengths)[m];
+ }
+ norm = normC / gammaSum;
+ for(m=1;m < M;m++){
+ (*outF)<<gamma[m] * norm<<" ";
+ }
+ (*outF)<<endl;
+ R_INTERUPT;
+ }
+ // Delete lengths.
+ delete isoformLengths;
+}//}}}
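The optimize() loop above is a nonlinear conjugate-gradient ascent on the
softmax parameters: each iteration mixes the negative natural gradient with the
previous search direction through a coefficient beta (Polak-Ribiere,
Fletcher-Reeves or Hestenes-Stiefel), and falls back to steepest descent when
beta <= 0 or the bound fails to increase. A minimal generic sketch of the
direction update (illustration only, not the BitSeq API):

#include <cstddef>
#include <vector>

// One conjugate-direction update; falls back to steepest descent when the
// mixing coefficient beta is non-positive, as optimize() does above.
static void updateDirection(const std::vector<double> &natGrad,
                            std::vector<double> &searchDir, double beta){
  for(std::size_t i = 0; i < natGrad.size(); i++){
    if(beta > 0) searchDir[i] = -natGrad[i] + beta * searchDir[i];
    else         searchDir[i] = -natGrad[i];
  }
}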
diff --git a/VariationalBayes.h b/VariationalBayes.h
new file mode 100644
index 0000000..50ff026
--- /dev/null
+++ b/VariationalBayes.h
@@ -0,0 +1,45 @@
+#ifndef VARIATIONALBAYES_H
+#define VARIATIONALBAYES_H
+
+#include "boost/random/mersenne_twister.hpp"
+
+#include "MyTimer.h"
+#include "SimpleSparse.h"
+
+//#define LOG_CONV
+//#define LONG_LOG
+//#define SHOW_FIXED
+
+enum OPT_TYPE { OPTT_STEEPEST, OPTT_PR, OPTT_FR, OPTT_HS};
+
+class VariationalBayes {
+ private:
+    long N,M,T; // N - number of reads, M - number of transcripts, T - number of non-zero alignment entries.
+ double * alpha; // prior over expression
+ double * phiHat;
+ double * digA_pH;
+ double boundConstant;
+ SimpleSparse *beta,*phi_sm,*phi;
+ // logBeta replaced by logging beta itself
+ string logFileName;
+ MyTimer *logTimer;
+    // Mersenne Twister random number generator.
+ boost::random::mt11213b rng_mt;
+ bool quiet;
+
+ public:
+ VariationalBayes(SimpleSparse *_beta,double *_alpha=NULL,long seed = 0,long procN = 1);
+ ~VariationalBayes();
+ //double *pack(){return phi_sm->val;}
+ void unpack(double vals[], double adds[] = NULL); // set phi_m, phi=softmax(phi_m), phi_hat=sumOverCols(phi)
+ void negGradient(double res[]);
+ double getBound();
+ void optimize(bool verbose=false, OPT_TYPE method=OPTT_STEEPEST,long maxIter=10000,double ftol=1e-5, double gtol=1e-5);
+ double *getAlphas();
+ void setLog(string logFileName,MyTimer *timer);
+ // Generates samples from the distribution. The 0 (noise) transcript is left out.
+ void generateSamples(long samplesN, const string &outTypeS, const vector<double> *isoformLengths, ofstream *outF);
+ void beQuiet(){ quiet = true; }
+};
+
+#endif
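A sketch of how the class above is typically driven (hypothetical driver;
building the SimpleSparse matrix of log alignment probabilities is outside the
scope of this header):

#include "VariationalBayes.h"

void runVB(SimpleSparse *beta){
  VariationalBayes vb(beta);       // uniform prior alpha, default seed
  vb.optimize(true, OPTT_FR);      // Fletcher-Reeves conjugate updates
  double *alphas = vb.getAlphas(); // posterior Dirichlet parameters
  // ... write out or post-process alphas ...
  delete[] alphas;
}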
diff --git a/_release_Makefile b/_release_Makefile
new file mode 100644
index 0000000..469ec77
--- /dev/null
+++ b/_release_Makefile
@@ -0,0 +1,122 @@
+CXX = g++
+ARCH = -mtune=generic
+VERSION = 0.7.5
+# ARCH = -march=core2
+# ARCH = -march=native
+
+
+COFLAGS = $(ARCH) -O2 -pipe
+CXXFLAGS = -DBS_VERSION=\"$(VERSION)\" -Wall $(COFLAGS)
+LDFLAGS = -Wl,-gc-sections
+BOOSTFLAGS = -I .
+OPENMP = -fopenmp -DSUPPORT_OPENMP
+
+PROGRAMS = \
+ convertSamples \
+ estimateDE \
+ estimateExpression \
+ estimateHyperPar \
+ estimateVBExpression \
+ extractSamples \
+ getFoldChange \
+ getGeneExpression \
+ getPPLR \
+ getVariance \
+ getWithinGeneExpression \
+ parseAlignment \
+ transposeLargeFile
+
+all: $(PROGRAMS)
+
+COMMON_DEPS = ArgumentParser.o common.o FileHeader.o misc.o MyTimer.o
+# PROGRAMS:
+convertSamples: convertSamples.cpp $(COMMON_DEPS) TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) convertSamples.cpp $(COMMON_DEPS) TranscriptInfo.o -o convertSamples
+
+estimateDE: estimateDE.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(LDFLAGS) estimateDE.cpp $(COMMON_DEPS) PosteriorSamples.o -o estimateDE
+
+estimateExpression: estimateExpression.cpp $(COMMON_DEPS) CollapsedSampler.o GibbsParameters.o GibbsSampler.o Sampler.o TagAlignments.o TranscriptInfo.o transposeFiles.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) $(LDFLAGS) estimateExpression.cpp $(COMMON_DEPS) CollapsedSampler.o GibbsParameters.o GibbsSampler.o Sampler.o TagAlignments.o TranscriptInfo.o transposeFiles.o -o estimateExpression
+
+estimateHyperPar: estimateHyperPar.cpp $(COMMON_DEPS) lowess.o PosteriorSamples.o TranscriptExpression.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(LDFLAGS) estimateHyperPar.cpp $(COMMON_DEPS) lowess.o PosteriorSamples.o TranscriptExpression.o -o estimateHyperPar
+
+estimateVBExpression: estimateVBExpression.cpp $(COMMON_DEPS) SimpleSparse.o TagAlignments.o TranscriptInfo.o transposeFiles.o VariationalBayes.o
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) $(LDFLAGS) estimateVBExpression.cpp $(COMMON_DEPS) SimpleSparse.o TagAlignments.o TranscriptInfo.o transposeFiles.o VariationalBayes.o -o estimateVBExpression
+
+extractSamples: extractSamples.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) extractSamples.cpp $(COMMON_DEPS) PosteriorSamples.o -o extractSamples
+
+getFoldChange: getFoldChange.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getFoldChange.cpp $(COMMON_DEPS) PosteriorSamples.o -o getFoldChange
+
+getGeneExpression: getGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o -o getGeneExpression
+
+getPPLR: getPPLR.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getPPLR.cpp $(COMMON_DEPS) PosteriorSamples.o -o getPPLR
+
+getVariance: getVariance.cpp $(COMMON_DEPS) PosteriorSamples.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getVariance.cpp $(COMMON_DEPS) PosteriorSamples.o -o getVariance
+
+getWithinGeneExpression: getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o -o getWithinGeneExpression
+
+parseAlignment: parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/sam.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o
+ $(CXX) $(CXXFLAGS) $(OPENMP) $(LDFLAGS) -pthread parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/*.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o -lz -o parseAlignment
+
+transposeLargeFile: transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o -o transposeLargeFile
+
+# LIBRARIES:
+ArgumentParser.o: ArgumentParser.cpp ArgumentParser.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c ArgumentParser.cpp
+
+CollapsedSampler.o: CollapsedSampler.cpp CollapsedSampler.h GibbsParameters.h Sampler.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c CollapsedSampler.cpp
+
+FileHeader.o: common.h misc.h FileHeader.cpp FileHeader.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -ffunction-sections -fdata-sections -c FileHeader.cpp
+
+GibbsSampler.o: GibbsSampler.cpp GibbsSampler.h GibbsParameters.h Sampler.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c GibbsSampler.cpp
+
+misc.o: ArgumentParser.h PosteriorSamples.h misc.cpp misc.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c misc.cpp
+
+MyTimer.o: MyTimer.h MyTimer.cpp
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c MyTimer.cpp
+
+PosteriorSamples.o: PosteriorSamples.cpp PosteriorSamples.h FileHeader.h
+ $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c PosteriorSamples.cpp
+
+ReadDistribution.o: ReadDistribution.cpp ReadDistribution.h TranscriptExpression.h TranscriptInfo.h TranscriptSequence.h
+ $(CXX) $(CXXFLAGS) $(OPENMP) -c ReadDistribution.cpp
+
+Sampler.o: Sampler.cpp Sampler.h GibbsParameters.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c Sampler.cpp
+
+SimpleSparse.o: SimpleSparse.cpp SimpleSparse.h
+ $(CXX) $(CXXFLAGS) $(OPENMP) -c SimpleSparse.cpp
+
+VariationalBayes.o: VariationalBayes.cpp VariationalBayes.h SimpleSparse.h
+ $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) $(OPENMP) -c VariationalBayes.cpp
+
+common.o: common.cpp common.h
+GibbsParameters.o: ArgumentParser.h GibbsParameters.cpp GibbsParameters.h
+lowess.o: lowess.cpp lowess.h
+TagAlignments.o: TagAlignments.cpp TagAlignments.h
+TranscriptExpression.o: TranscriptExpression.cpp TranscriptExpression.h
+TranscriptInfo.o: TranscriptInfo.cpp TranscriptInfo.h
+TranscriptSequence.o: TranscriptSequence.cpp TranscriptSequence.h
+transposeFiles.o: transposeFiles.cpp transposeFiles.h FileHeader.h
+
+# EXTERNAL LIBRARIES:
+samtools/sam.o:
+ make --directory samtools
+
+# CLEAN:
+clean:
+ rm samtools/*.o *.o $(PROGRAMS)
+
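Note: individual tools can be built from this Makefile without compiling the
whole suite, for example

  make -f _release_Makefile estimateDE

builds just estimateDE and its object-file dependencies (only parseAlignment
additionally needs the bundled samtools sources).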
diff --git a/asa103/LICENSE.txt b/asa103/LICENSE.txt
new file mode 100644
index 0000000..65c5ca8
--- /dev/null
+++ b/asa103/LICENSE.txt
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/asa103/asa103.hpp b/asa103/asa103.hpp
new file mode 100644
index 0000000..98638f5
--- /dev/null
+++ b/asa103/asa103.hpp
@@ -0,0 +1,96 @@
+# include <cmath>
+
+//****************************************************************************80
+
+double digama ( double x, int *ifault )
+
+//****************************************************************************80
+//
+// Purpose:
+//
+// DIGAMA calculates DIGAMMA ( X ) = d ( LOG ( GAMMA ( X ) ) ) / dX
+//
+// Licensing:
+//
+// This code is distributed under the GNU LGPL license.
+//
+// Modified:
+//
+// 18 January 2008
+//
+// Author:
+//
+// Original FORTRAN77 version by Jose Bernardo.
+// C++ version by John Burkardt.
+//
+// Reference:
+//
+// Jose Bernardo,
+// Algorithm AS 103:
+// Psi ( Digamma ) Function,
+// Applied Statistics,
+// Volume 25, Number 3, 1976, pages 315-317.
+//
+// Parameters:
+//
+// Input, double X, the argument of the digamma function.
+// 0 < X.
+//
+// Output, int *IFAULT, error flag.
+// 0, no error.
+// 1, X <= 0.
+//
+// Output, double DIGAMA, the value of the digamma function at X.
+//
+{
+ double c = 8.5;
+ double d1 = -0.5772156649;
+ double r;
+ double s = 0.00001;
+ double s3 = 0.08333333333;
+ double s4 = 0.0083333333333;
+ double s5 = 0.003968253968;
+ double value;
+ double y;
+//
+// Check the input.
+//
+ if ( x <= 0.0 )
+ {
+ value = 0.0;
+ *ifault = 1;
+ return value;
+ }
+//
+// Initialize.
+//
+ *ifault = 0;
+ y = x;
+ value = 0.0;
+//
+// Use approximation if argument <= S.
+//
+ if ( y <= s )
+ {
+ value = d1 - 1.0 / y;
+ return value;
+ }
+//
+// Reduce to DIGAMA(X + N) where (X + N) >= C.
+//
+ while ( y < c )
+ {
+ value = value - 1.0 / y;
+ y = y + 1.0;
+ }
+//
+// Use Stirling's (actually de Moivre's) expansion if argument > C.
+//
+ r = 1.0 / y;
+ value = value + log ( y ) - 0.5 * r;
+ r = r * r;
+ value = value - r * ( s3 - r * ( s4 - r * s5 ) );
+
+ return value;
+}
+//****************************************************************************80
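A quick check of the routine above (a sketch; digamma(2.5) is approximately
0.703157):

#include <cstdio>
#include "asa103/asa103.hpp"

int main(){
  int ifault;
  double v = digama(2.5, &ifault);
  if(ifault == 0) printf("digamma(2.5) = %f\n", v); // approx. 0.703157
  return 0;
}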
diff --git a/biocUpdate.sh b/biocUpdate.sh
new file mode 100755
index 0000000..92b74b6
--- /dev/null
+++ b/biocUpdate.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Script that automates copying into the Bioconductor repository.
+# THE ASSUMPTION IS that no updates were made to the Bioc sources, so they can simply be replaced by the sources from the C++ version.
+
+
+if [ $# -ne 1 ]
+then
+ echo "Usage: provide path to bioc sources directory (.../devel/src)"
+ echo " biocUpdate.sh [dirPath]"
+ exit
+fi
+
+for i in `ls $1`
+do
+ if [ -e $i -a -f $i ]
+ then
+ other="${1}/${i}"
+ if ! diff -q $i $other > /dev/null
+ then
+ cp -v $i $other;
+ fi
+ fi
+done
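Typical invocation, run from the C++ source directory (the path is hypothetical):

  ./biocUpdate.sh ~/bioc/devel/src

Only files that differ between the two trees are copied (and echoed by cp -v).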
diff --git a/changeList b/changeList
new file mode 100644
index 0000000..38e2250
--- /dev/null
+++ b/changeList
@@ -0,0 +1,86 @@
+0.7.5
+Improvements:
+- parseAlignment: added option mateNamesDiffer [!! add note that this can't be used with mixed alignments !!]
+
+Bug fixes:
+- estimateExpression didn't use more cores when number of chains was changed in parameters file.
+
+0.7.4
+[0.7.3] (bug in copying alignment)
+[0.7.2]
+[0.7.1]:
+Improvements:
+- parseAlignment can use *.m_alphas files generated by estimateVBExpression for expression estimates
+- estimateHyperPar forces smoothing of hyperparameters by default
+- enable excluding singletons (half alignments of paired reads) in parseAlignment[0.7.3]
+- .prob file contains number of transcripts so estimateExpression does not need .tr file for correct transcript count[0.7.3]
+
+Bug fixes:
+- fixing problems related to parsing FASTA reads
+ (and correctly assigning additional mismatches)
+- fixed problem with half alignments being omitted from read distribution estimation[0.7.2]
+
+0.7.0:
+Improvements:
+- estimateVBExpression provides a new expression inference method using fast collapsed VB
+- adding --unstranded flag to parseAlignment, to allow read pairs with various directions to be used
+- changing computation of Rhat to produce better estimates
+
+Bug fixes:
+- adding -lpthread option for parseAlignment compilation, so that samtools links even without -fopenmp
+
+[0.6.1]
+0.6.0:
+Improvements:
+- thetaMeans contains variance of theta
+- proper handling of bad alignments
+- major speed improvement for parseAlignment on multiple CPUs
+- getGeneExpression and getWithinGeneExpression can use 'external' TR->GE mapping or gene list
+- getWithinGene changed so that transcripts keep order unless --groupByGene
+- getGeneExpression produces gene info file with list of genes
+- increased output precision in gGE and gWGE
+
+Bug fixes:
+- fixed major bug in getGeneExpression and getWithinGeneExpression
+- fixed occasional underflow in effective length computation
+
+0.5.3:
+Improvements:
+- parseAlignment should be compatible with bowtie 2 output + new flag (-l/--limit) to limit maximum number of alignments
+- parseAlignment tries to determine input type based on extension (no need to use --format parameter in most cases)
+- more helpful error messages in get[Within]GeneExpression programs
+
+Bug fixes:
+- fixed broken extractTranscriptInfo.py and added new 'type' that looks for "gene=<gene name>"
+
+Internals:
+- replacing Boost headers with the latest stable version 1.53.0
+- replacing samtools API with the latest stable version 0.1.19
+- parseAlignment saves probabilities in log scale
+
+0.5.0:
+Improvements:
+- parse alignment can extract and save gene names from Ensembl type reference
+- estimateDE now produces comparison for all condition pairs when used with more than 2 conditions
+- added --seed option to estimateHyperPar and estimateDE
+- estimateHyperPar works in log scale only (it accepts only mean logged expression in --meanFile; and logs expression samples automatically when needed)
+
+Bug fixes:
+- fixed output of estimateDE: fold change with confidence intervals are in log2 scale, mean expression in natural log scale
+ + changed flag confidencePerc to confidenceInterval and default value to 95
+- changed getPPLR so that it produces PPLR comparison in the same "direction" as estimateDE
+- fixed problem with long lines in reference sequence file in parseAlignment
+
+Deprecation:
+- drop obsolete parseAlignment.py
+
+0.4.3:
+- changed order in estimateDE output (logFC ConfidenceLow ConfidenceHigh)
+ -> rename to credible intervals ?
+- added normalization for getV. estimateDE estimateHP
+- estimateE
+ - new format in thetaMeans
+ - set seed
+ - doMax
+- drop example.html
+- bugfix for parseAlignment
diff --git a/checkTR.py b/checkTR.py
new file mode 100755
index 0000000..88d1df5
--- /dev/null
+++ b/checkTR.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+
+"""
+Due to an error in our code, GENE EXPRESSION and WITHIN GENE EXPRESSION results
+produced with BitSeq versions up to 0.5.3 might be wrong for some genes.
+
+This only applies to SOME genes (or transcripts of those genes) IF the
+transcripts in the .tr file were not grouped by gene.
+This program can check exactly which genes have wrong expression estimates.
+
+The results of other genes and their transcripts are correct.
+"""
+
+import sys
+
+def checkFile(fileName):
+ try:
+ inf = open(fileName);
+ except:
+ sys.exit("Can't open file "+str(fileName));
+ genesSeen = set();
+
+ giName = ""
+ prevName = "";
+
+ genesWrong = list();
+ g2ts ={};
+
+ for line in inf:
+ if line[0]=='#':continue;
+ try:
+ gn = line.split()[0];
+ tr = line.split()[1];
+ except:
+ sys.exit("The TR does not seem to have valid format in line:\n"+line);
+
+ if gn in g2ts:
+ g2ts[gn].append(tr);
+ else:
+ g2ts[gn] = [tr];
+
+ if gn == prevName:
+ if prevName != giName:
+ if giName not in genesWrong: genesWrong.append(giName);
+ if prevName not in genesWrong: genesWrong.append(prevName);
+ else:
+ if gn not in genesSeen:
+ prevName=gn;
+ giName=gn;
+ genesSeen.add(gn);
+ else:
+ giName=gn;
+ if len(genesWrong) == 0:
+ print("Everything seems to be fine.")
+ else:
+ print("These "+str(len(genesWrong))+" (out of "+str(len(genesSeen))+") have wrong GENE EXPRESSION results:")
+ trCount = 0;
+ genesStr = "";
+ for it in genesWrong:
+ genesStr += it+" ";
+ trCount+=len(g2ts[it]);
+ print(genesStr);
+ print("These "+str(trCount)+" transcripts have wrong WITHIN GENE EXPRESSION results:");
+ trsStr = "";
+ for it in genesWrong:
+ for trit in g2ts[it]:
+ trsStr += trit+" ";
+ print(trsStr)
+
+if __name__ == "__main__":
+ if len(sys.argv) <2:
+ sys.exit("Please provide file name as argument.");
+ print("Checking file "+sys.argv[1]);
+ checkFile(sys.argv[1]);
+
+
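Typical invocation (file name hypothetical):

  python checkTR.py ensGene.tr

The script prints the affected gene and transcript names, or "Everything seems
to be fine." when all transcripts in the .tr file are grouped by gene.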
diff --git a/common.cpp b/common.cpp
new file mode 100644
index 0000000..4685b13
--- /dev/null
+++ b/common.cpp
@@ -0,0 +1,22 @@
+#include <cstdlib>
+#include <string>
+
+#include "common.h"
+
+using namespace std;
+
+void buildTime(char *argv0, string compileDate, string compileTime, const char* version){
+#ifdef BIOC_BUILD
+    return; // don't want to print compile information
+#endif
+ message("### %s (version: %s) build: %s %s\n",argv0, version, compileDate.c_str(),compileTime.c_str());
+}
+
+bool progressLog(long cur,long outOf, long steps, char nl) {
+  // Output progress status every (outOf/steps) items, i.e. every 10% with the default steps=10.
+ if((outOf>steps)&&(cur%((long)(outOf/steps))==0)&&(cur!=0)){
+ message("# %ld done.%c",cur,nl);
+ return true;
+ }
+ return false;
+}
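A sketch of how progressLog() is meant to be used in a long loop (hypothetical
caller; compile with -DBS_VERSION=... as the Makefile does):

#include "common.h"

// Prints "# <i> done." roughly every 10% with the default steps=10.
void processAll(long total){
  for(long i = 0; i < total; i++){
    // ... per-item work (hypothetical) ...
    progressLog(i, total);
  }
}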
diff --git a/common.h b/common.h
new file mode 100644
index 0000000..c8e3b34
--- /dev/null
+++ b/common.h
@@ -0,0 +1,41 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#include<string>
+
+using std::string;
+
+const char bitseq_version[] = BS_VERSION;
+
+#ifdef BIOC_BUILD
+
+#include <R.h>
+#include <R_ext/Utils.h>
+
+#define R_INTERUPT R_CheckUserInterrupt()
+
+#define message(...) Rprintf(__VA_ARGS__)
+#define messageF(...) Rprintf(__VA_ARGS__)
+#define messageFlush()
+
+const long samplesAtOnce = 50;
+
+#else
+
+#include<cstdio>
+
+#define R_INTERUPT
+
+#define message(...) printf(__VA_ARGS__)
+#define messageF(...) {printf(__VA_ARGS__);fflush(stdout);}
+#define messageFlush() fflush(stdout)
+#define warning(...) {fprintf(stderr,"WARNING: ");fprintf(stderr, __VA_ARGS__);}
+#define error(...) {fprintf(stderr,"ERROR: ");fprintf(stderr, __VA_ARGS__);}
+
+#endif
+
+void buildTime(char *argv0, string compileDate, string compileTime, const char *version = bitseq_version);
+
+bool progressLog(long cur,long outOf, long steps = 10, char nl = '\n');
+
+#endif
diff --git a/convertSamples.cpp b/convertSamples.cpp
new file mode 100644
index 0000000..edd3cae
--- /dev/null
+++ b/convertSamples.cpp
@@ -0,0 +1,197 @@
+#include <cmath>
+#include <iomanip>
+#include <fstream>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "common.h"
+#include "FileHeader.h"
+#include "misc.h"
+#include "TranscriptInfo.h"
+
+namespace ns_convertS {
+double r2c(double sample, double norm, double len){
+ return sample * norm * len;
+}
+double c2r(double sample, double norm, double len){
+ return sample / norm / len;
+}
+double t2rl(double sample, double Lnorm, double len){
+ return log(sample / len) + Lnorm;
+}
+double norm(double sample, double norm, double len = 1){
+ return sample * norm;
+}
+double logNorm(double sample, double Lnorm, double len = 1){
+ return log(sample) + Lnorm;
+}
+}
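+
+// Worked example of the scaling used in main() below: with norm = 1e-9*Nmap,
+// c2r(c,norm,len) = c/(1e-9*Nmap*len) = c*1e9/(Nmap*len), i.e. reads per
+// kilobase of transcript per million mapped reads (RPKM); r2c is its exact
+// inverse, and t2rl(theta,log(1e9),len) = log(theta*1e9/len) is the logged analogue.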
+
+int main(int argc,char* argv[]){
+ string programDescription=
+"Converts or normalizes MCMC expression samples.\n\
+ [sampleFile] should contain transposed MCMC samples.";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFile]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ string actionDesc =
+"Action to perform options:\n\
+ T2R - theta to rpkm\n\
+ R2T - rpkm to theta\n\
+ T2RL - theta to log-rpkm\n\
+ C2R - counts to rpkm\n\
+ R2C - rpkm to counts\n\
+ NORM - normalize (samples are multiplied by Nmap)\n\
+ LOGNORM - log+normalize (samples are multiplied by Nmap and logged).";
+ args.addOptionS("a","action","action",1,actionDesc);
+ args.addOptionD("","Nmap","Nmap",0,"Total number of aligned reads. Or a normalization constant, when normalizing.");
+ args.addOptionS("t","trInfoFile","trInfoFileName",0,"File containing transcript information.");
+ if(!args.parse(argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ string action = args.getS("action");
+ if(! ((action=="T2R")||(action=="T2RL")||(action=="R2T")||(action=="C2R")||
+ (action=="R2C")||(action=="NORM")||(action=="LOGNORM"))){
+ error("Main: Unknown action: %s.\n",action.c_str());
+ return 1;
+ }
+
+ // }}}
+
+ long M=0,i,j,m,N;
+ double Nmap=0;
+ // Check Nmap //{{{
+ if(args.isSet("Nmap")){
+ Nmap=args.getD("Nmap");
+ if((action=="T2R")||(action=="T2RL")||(action=="R2T")){
+ warning("Main: Using %lf as normalization constant, are you sure about this?\n",Nmap);
+ }
+ }else{
+ if((action=="C2R")||(action=="R2C")){
+ error("Main: Need Nmap (total number of mapped reads) for converting from/to counts.\n");
+ return 1;
+ }
+ if((action=="NORM")||(action=="LOGNORM")){
+ error("Main: Need Nmap (normalization constant) for normalization.\n");
+ return 1;
+ }
+ }
+ //}}}
+ // T2R is just C2R with Nmap = 1. //{{{
+ if(action=="T2R"){
+ action="C2R";
+ if(!args.isSet("Nmap"))Nmap = 1;
+ }
+ if(action=="R2T"){
+ action="R2C";
+ if(!args.isSet("Nmap"))Nmap = 1;
+ }
+ //}}}
+ bool trans;
+ ifstream inFile;
+ FileHeader fh;
+ string geName,trName;
+ TranscriptInfo trInfo;
+
+ // Load TR file if necessary {{{
+ if(!((action=="NORM")||(action=="LOGNORM"))){
+ if((! args.isSet("trInfoFileName")) || (! trInfo.readInfo(args.getS("trInfoFileName")))){
+ error("Main: Transcript info file read failed. Please provide valid file with --trInfoFile option.\n");
+ return 1;
+ }
+ M=trInfo.getM();
+ } //}}}
+
+ ofstream outFile;
+ if(!ns_misc::openOutput(args,&outFile))return 1;
+
+ inFile.open(args.args()[0].c_str());
+ fh.setFile(&inFile);
+ if(!fh.samplesHeader(&N,&m,&trans)){//{{{
+ error("Main: Unable to open samples file.\n");
+ return 1;
+/* }else if((trans)&&(! ((action=="--RPKMtoCOVERAGE")||(action=="-R2C")) )){
+ error("File should not be transposed");
+ return 0;*/ //}}}
+ }else if((m==0)||((M!=0)&&(M!=m))){ //{{{
+ error("Main: Wrong number of transcripts %ld vs %ld.\n",M,m);
+ return 1;
+ }//}}}
+ M=m;
+ outFile<<"# "<<args.args()[0];
+ if((action=="LOGNORM")||(action=="T2RL"))outFile<<"\n# L ";
+ if(trans) outFile<<"\n# T (M rows,N cols)";
+ else outFile<<"\n# (N rows,M cols)";
+ outFile<<"\n# M "<<M<<"\n# N "<<N<<endl;
+ outFile.precision(9);
+ outFile<<scientific;
+
+ double sample;
+ double (*comp)(double a, double b, double c)=NULL;
+ double normC=1;
+ if(action=="R2C"){
+ normC = 1e-9*Nmap;
+ comp = &ns_convertS::r2c;
+ } else if(action=="C2R"){
+ normC = 1e-9*Nmap;
+ comp = &ns_convertS::c2r;
+ } else if(action=="T2RL"){
+ if(args.isSet("Nmap")) normC = log(Nmap * 1e9);
+ else normC = log(1e9);
+ comp = &ns_convertS::t2rl;
+ } else if(action=="NORM"){
+ normC = Nmap;
+ comp = &ns_convertS::norm;
+ } else if(action=="LOGNORM"){
+ normC = log(Nmap);
+ comp = &ns_convertS::logNorm;
+ } else {
+ error("Something went wrong.\n");
+ return 1;
+ }
+ if(!((action=="NORM")||(action=="LOGNORM"))){
+ if(trans){
+ for(j=0;j<M;j++){
+ for(i=0;i<N-1;i++){
+ inFile>>sample;
+ outFile<<comp(sample,normC,trInfo.effL(j))<<" ";
+ }
+ inFile>>sample;
+ outFile<<comp(sample,normC,trInfo.effL(j))<<endl;
+ }
+ }else{
+ for(i=0;i<N;i++){
+ for(j=0;j<M-1;j++){
+ inFile>>sample;
+ outFile<<comp(sample,normC,trInfo.effL(j))<<" ";
+ }
+ inFile>>sample;
+ outFile<<comp(sample,normC,trInfo.effL(j))<<endl;
+ }
+ }
+ }else{
+ if(trans){
+ for(j=0;j<M;j++){
+ for(i=0;i<N-1;i++){
+ inFile>>sample;
+ outFile<<comp(sample,normC,1)<<" ";
+ }
+ inFile>>sample;
+ outFile<<comp(sample,normC,1)<<endl;
+ }
+ }else{
+ for(i=0;i<N;i++){
+ for(j=0;j<M-1;j++){
+ inFile>>sample;
+ outFile<<comp(sample,normC,1)<<" ";
+ }
+ inFile>>sample;
+ outFile<<comp(sample,normC,1)<<endl;
+ }
+ }
+ }
+ inFile.close();
+ outFile.close();
+ if(args.verbose)message("Done.\n");
+ return 0;
+}
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index f004316..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,14 +0,0 @@
-bitseq (0.7.0+dfsg-1) UNRELEASED; urgency=medium
-
- * Initial upload to Debian
-
- -- Andreas Tille <tille at debian.org> Wed, 01 Jan 2014 19:04:47 +0100
-
-bitseq (0.4.3-0ubuntu2) precise; urgency=low
-
- * Initial release.
- * Build against stock libbam (only static just now)
- * Build against stock libboost
- * For now, leave the names of the binaries as-is
-
- -- Tim Booth <tbooth at ceh.ac.uk> Mon, 11 Feb 2013 09:59:05 +0000
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 93524dc..0000000
--- a/debian/control
+++ /dev/null
@@ -1,30 +0,0 @@
-Source: bitseq
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Tim Booth <tbooth at ceh.ac.uk>,
- Andreas Tille <tille at debian.org>
-Section: science
-Priority: optional
-Build-Depends: debhelper (>= 9),
- zlib1g-dev,
- libbam-dev,
- libboost-dev,
- help2man
-Standards-Version: 3.9.5
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/bitseq/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/bitseq/trunk/
-Homepage: https://github.com/BitSeq/BitSeq
-
-Package: bitseq
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends},
- python
-Suggests: samtools
-Description: Bayesian Inference of Transcripts from Sequencing Data
- BitSeq is an application for inferring expression levels of individual
- transcripts from sequencing (RNA-Seq) data and estimating differential
- expression (DE) between conditions. An advantage of this approach is the
- ability to account for both technical uncertainty and intrinsic biological
- variance in order to avoid false DE calls. The technical contribution to the
- uncertainty comes both from finite read-depth and the possibly ambiguous
- mapping of reads to multiple transcripts.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 39c351f..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,22 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: BitSeq
-Upstream-Contact: Peter Glaus <glaus at cs.man.ac.uk>
-Source: https://github.com/BitSeq/BitSeq/releases
-Files-Excluded: boost/ samtools/
-
-Files: *
-Copyright: © 2013 Peter Glaus
-License: Artistic-2
-
-Files: debian/*
-Copyright: © 2013 Tim Booth <tbooth at ceh.ac.uk>
- © 2013 Andreas Tille <tille at debian.org>
-License: Artistic-2
-
-License: Artistic-2
- This program is free software; you can redistribute it and/or modify
- it under the terms of the Artistic License, which comes with Perl.
- .
- On Debian systems, the complete text of the Artistic License can be
- found in `/usr/share/common-licenses/Artistic'.
-
diff --git a/debian/patches/hardening.patch b/debian/patches/hardening.patch
deleted file mode 100644
index cf2d122..0000000
--- a/debian/patches/hardening.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-Last-Change: Wed, 01 Jan 2014 19:04:47 +0100
-Description: Propagate hardening options
-
---- BitSeq-0.7.0.orig/Makefile
-+++ BitSeq-0.7.0/Makefile
-@@ -6,8 +6,8 @@
-
-
- COFLAGS = $(ARCH) -O2 -pipe
--CXXFLAGS = -DBS_VERSION=\"$(VERSION)\" -Wall $(COFLAGS)
--LDFLAGS = -Wl,-gc-sections
-+CXXFLAGS += -DBS_VERSION=\"$(VERSION)\" -Wall $(COFLAGS)
-+LDFLAGS += -Wl,-gc-sections
- BOOSTFLAGS = -I .
- OPENMP = -fopenmp -DSUPPORT_OPENMP
-
diff --git a/debian/patches/link_against_system_samtools.patch b/debian/patches/link_against_system_samtools.patch
deleted file mode 100644
index 5c6d282..0000000
--- a/debian/patches/link_against_system_samtools.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-Author: Tim Booth <tbooth at ceh.ac.uk>
-Last-Update: Wed, 01 Jan 2014 19:04:47 +0100 (by Andreas Tille)
-Description: link against Debian packaged samtools
-
---- BitSeq-0.7.0.orig/ReadDistribution.h
-+++ BitSeq-0.7.0/ReadDistribution.h
-@@ -21,8 +21,8 @@
-
- #else
-
--#include "bam.h"
--#include "sam.h"
-+#include <samtools/bam.h>
-+#include <samtools/sam.h>
-
- //#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
- /*
---- BitSeq-0.7.0.orig/Makefile
-+++ BitSeq-0.7.0/Makefile
-@@ -63,8 +63,8 @@
- getWithinGeneExpression: getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o
- $(CXX) $(CXXFLAGS) $(LDFLAGS) getWithinGeneExpression.cpp $(COMMON_DEPS) PosteriorSamples.o TranscriptInfo.o -o getWithinGeneExpression
-
--parseAlignment: parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/sam.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o
-- $(CXX) $(CXXFLAGS) $(OPENMP) $(LDFLAGS) -pthread -Isamtools parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o samtools/*.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o -lz -o parseAlignment
-+parseAlignment: parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o
-+ $(CXX) $(CXXFLAGS) $(OPENMP) $(LDFLAGS) -pthread parseAlignment.cpp $(COMMON_DEPS) ReadDistribution.o TranscriptExpression.o TranscriptInfo.o TranscriptSequence.o -lz -lbam -o parseAlignment
-
- transposeLargeFile: transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o
- $(CXX) $(CXXFLAGS) $(LDFLAGS) transposeLargeFile.cpp $(COMMON_DEPS) transposeFiles.o -o transposeLargeFile
-@@ -92,7 +92,7 @@
- $(CXX) $(CXXFLAGS) -ffunction-sections -fdata-sections -c PosteriorSamples.cpp
-
- ReadDistribution.o: ReadDistribution.cpp ReadDistribution.h TranscriptExpression.h TranscriptInfo.h TranscriptSequence.h
-- $(CXX) $(CXXFLAGS) $(OPENMP) -Isamtools -c ReadDistribution.cpp
-+ $(CXX) $(CXXFLAGS) $(OPENMP) -c ReadDistribution.cpp
-
- Sampler.o: Sampler.cpp Sampler.h GibbsParameters.h
- $(CXX) $(CXXFLAGS) $(BOOSTFLAGS) -c Sampler.cpp
-@@ -116,10 +116,7 @@
- asa103/asa103.o:
- make --directory asa103
-
--samtools/sam.o:
-- make --directory samtools
--
- # CLEAN:
- clean:
-- rm asa103/*.o samtools/*.o *.o $(PROGRAMS)
-+ rm -f asa103/*.o samtools/*.o *.o $(PROGRAMS)
-
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 7d71a4c..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,2 +0,0 @@
-link_against_system_samtools.patch
-hardening.patch
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 09fc5bb..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/make -f
-
-# Uncomment this to turn on verbose mode.
-export DH_VERBOSE=1
-
-pkg := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
-version=$(shell dpkg-parsechangelog -ldebian/changelog | grep Version: | cut -f2 -d' ' | cut -f1 -d- )
-mandir=$(CURDIR)/debian/$(pkg)/usr/share/man/man1/
-bindir=$(CURDIR)/debian/$(pkg)/usr/bin/
-
-%:
- dh $@
-
-override_dh_installman:
- # try to create man pages whereever possible
- mkdir -p $(mandir)
- help2man --no-info --name='convert or normalize MCMC expression samples' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/convertSamples > $(mandir)/convertSamples.1
- help2man --no-info --name='this bitseq module estimates differential expression from data sets' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/estimateDE > $(mandir)/estimateDE.1
- help2man --no-info --name='estimate expression given precomputed probabilities of (observed) reads'' alignments' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/estimateExpression > $(mandir)/estimateExpression.1
- help2man --no-info --name='estimate expression dependent hyperparameters for bitseq' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/estimateHyperPar > $(mandir)/estimateHyperPar.1
- help2man --no-info --name='estimate expression given precomputed probabilities of (observed) reads'' alignments' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/estimateVBExpression > $(mandir)/estimateVBExpression.1
- help2man --no-info --name='extract MCMC samples of selected transcripts' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/extractSamples > $(mandir)/extractSamples.1
- help2man --no-info --name='computes log_2 Fold Change from MCMC expression samples' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/getFoldChange > $(mandir)/getFoldChange.1
- help2man --no-info --name='compute expression of whole genes' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/getGeneExpression > $(mandir)/getGeneExpression.1
- help2man --no-info --name='compute PPLR from MCMC expression samples' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/getPPLR > $(mandir)/getPPLR.1
- help2man --no-info --name='estimates variance of MCMC samples' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/getVariance > $(mandir)/getVariance.1
- help2man --no-info --name='compute relative expression of transcripts within genes' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/getWithinGeneExpression > $(mandir)/getWithinGeneExpression.1
- help2man --no-info --name='pre\-compute probabilities of (observed) reads'' alignments' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/parseAlignment > $(mandir)/parseAlignment.1
- help2man --no-info --name='helper for bitseq to transpose files lines and columns' \
- --version-string="$(version)" --no-discard-stderr \
- $(bindir)/transposeLargeFile > $(mandir)/transposeLargeFile.1
-
-override_dh_auto_install:
- find -maxdepth 1 -type f -perm /111 -exec dh_install '{}' /usr/bin ';'
-
-get-orig-source:
- mkdir -p ../tarballs
- uscan --verbose --force-download --repack-compression xz --destdir=../tarballs
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index f22b3bb..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,4 +0,0 @@
-version=4
-
-opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
- https://github.com/BitSeq/BitSeq/releases .*/archive/v at ANY_VERSION@@ARCHIVE_EXT@
diff --git a/estimateDE.cpp b/estimateDE.cpp
new file mode 100644
index 0000000..d81be6b
--- /dev/null
+++ b/estimateDE.cpp
@@ -0,0 +1,326 @@
+/*
+ * Original model applying the DE model to individual sets of samples independently.
+ * One set of samples == 1 sample from each replicate of each condition.
+ */
+#include<algorithm>
+#include<cmath>
+#include<fstream>
+#include<sstream>
+#include "boost/random/gamma_distribution.hpp"
+#include "boost/random/mersenne_twister.hpp"
+#include "boost/random/normal_distribution.hpp"
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "misc.h"
+#include "MyTimer.h"
+#include "PosteriorSamples.h"
+
+#include "common.h"
+
+//#define PERCENT 0.9
+
+#define LAMBDA_0 2.0
+
+using ns_params::paramT;
+
+namespace ns_estimateDE {
+
+// Open and write headers into appropriate output files.
+// The size of outFiles[] should be C+1.
+// Returns true if everything went OK.
+bool initializeOutputFile(long C, long M, long N, const ArgumentParser &args, ofstream *outF, ofstream outFiles[]);
+// For a given mean expression expr, find the alpha and beta that were estimated for the closest expression.
+void getParams(double expr,const vector<paramT> ¶ms, paramT *par);
+// Read transcript m into tr and prepare mu_0 and mu_00; cond does not really change.
+void readNextTranscript(long m, long C, long N, Conditions *cond, const vector<paramT> ¶ms, vector<vector<vector<double> > > *tr, vector<paramT> *curParams, double *mu_00);
+
+}
+
+extern "C" int estimateDE(int *argc,char* argv[]){
+string programDescription =
+"Estimate differential expression from the dataset.\n\
+ [sampleFiles] should contain transposed MCMC samples from replicates.\n\
+ To distinguish conditions use C between them e.g.:\n\
+ samplesC1-R1.rpkm samplesC1-R2.rpkm C samplesC2-R1.rpkm samplesC2-R2.rpkm";
+ // Intro: {{{
+ ArgumentParser args(programDescription,"[sampleFiles]",1);
+ args.addOptionS("o","outPrefix","outFilePrefix",1,"Prefix for the output files.");
+ args.addOptionS("p","parameters","parFileName",1,"File containing estimated hyperparameters.");
+ args.addOptionB("s","samples","samples",0,"Produce samples of condition mean expression apart from PPLR and confidence.");
+ args.addOptionD("l","lambda0","lambda0",0,"Parameter lambda_0.",LAMBDA_0);
+ args.addOptionD("","confidenceInterval","cf",0,"Percentage for confidence intervals.", 95);
+ args.addOptionS("","norm","normalization",0,"Normalization constants for each input file provided as comma separated list of doubles (e.g. 1.0017,1.0,0.9999 ).");
+ args.addOptionL("","seed","seed",0,"Random initialization seed.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ //}}}
+ /*
+ * N - number of samples in one replicate (the smallest number for replicates with different N_r)
+ * M - number of transcripts
+ * C - number of conditions
+ */
+ long C,M,N;
+ vector<paramT> params;
+ Conditions cond;
+ // Open file with hyper parameters and read those in.
+ if(!ns_params::readParams(args.getS("parFileName"), ¶ms)) return 1;
+ if(args.verb())message("Parameters loaded.\n");
+ // Initialize sample files handled by object cond.
+ if(!ns_misc::readConditions(args, &C, &M, &N, &cond)) return 1;
+ // Initialize output files.
+ ofstream outF;
+ ofstream *outFiles = new ofstream[C+1];
+ // Use standard array as we don't want to bother with vector of pointers.
+ if(!ns_estimateDE::initializeOutputFile(C, M, N, args, &outF, outFiles)) return 1;
+
+ // variables {{{
+ vector<vector<vector<double> > > tr(C);
+ vector<paramT> curParams(C);
+ vector<vector<double> > samples(C,vector<double>(N));
+ vector<double> vars(N);
+ vector<double> mu_c(C);
+// vector<vector<double> > mus(C,vector<double>(N,0));
+// vector<double> vars(N);
+ long c,c2,m,n,r;
+ double prec,var,sum,sumSq,alpha,beta,betaPar,mu_00,normMu;
+ double lambda0 = args.getD("lambda0");
+ long RC;
+ MyTimer timer;
+ boost::random::mt11213b rng_mt(ns_misc::getSeed(args));
+ boost::random::gamma_distribution<long double> gammaDistribution;
+ typedef boost::random::gamma_distribution<long double>::param_type gDP;
+ boost::random::normal_distribution<long double> normalDistribution;
+ typedef boost::random::normal_distribution<long double>::param_type nDP;
+ double log2FC, pplr, ciLow, ciHigh;
+ vector<double> difs(N);
+ // }}}
+
+ if(args.verbose){ //{{{
+ timer.split();
+ message("Sampling condition mean expression.\n");
+ }//}}}
+ for(m=0;m<M;m++){
+ if(progressLog(m,M,10,' '))timer.split();
+ // Read into tr and assign hyperparameters into curParams, initialize mu_00.
+ // cond does not really change, just reads more data from file.
+ ns_estimateDE::readNextTranscript(m, C, N, &cond, params, &tr, &curParams, &mu_00);
+ // Zero "mean condition mean expression".
+ mu_c.assign(C,0);
+ // Sample condition mean expressions {{{
+ for(n=0;n<N;n++){
+ for(c=0;c<C;c++){
+ RC = cond.getRC(c);
+ alpha = curParams[c].alpha + RC / 2.0;
+ betaPar = lambda0*mu_00*mu_00;
+
+ sum=0;
+ sumSq=0;
+ for(r=0;r< RC;r++){
+ sum += tr[c][r][n];
+ sumSq += tr[c][r][n]*tr[c][r][n];
+ }
+ betaPar += sumSq - (lambda0*mu_00 + sum)*(lambda0*mu_00 + sum) /
+ (lambda0 + RC);
+ normMu= (lambda0*mu_00 + sum) / (lambda0 + RC);
+ beta = curParams[c].beta + betaPar / 2 ;
+ // Set parameters of gamma distribution.
+ gammaDistribution.param(gDP(alpha, 1.0/beta));
+ // Sample precision.
+ prec = gammaDistribution(rng_mt);
+ // Variance, the precision is scaled by (lambda0+RC).
+ var = 1/(prec *(lambda0 + RC));
+ vars[n] = var;
+
+ // Set parameter for normal distribution.
+ normalDistribution.param(nDP(normMu, sqrt(var)));
+ // Sample condition mean.
+ samples[c][n] = normalDistribution(rng_mt);
+ mu_c[c] += samples[c][n];
+ }
+ R_INTERUPT;
+ }
+ // }}}
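+ // The update above is the standard Normal-Gamma conjugate posterior: with
+ // prior mu ~ N(mu_00, 1/(lambda0*prec)) and prec ~ Gamma(alpha,beta), after
+ // observing RC logged replicate samples one gets
+ // prec ~ Gamma(alpha + RC/2, beta + betaPar/2) and
+ // mu | prec ~ N((lambda0*mu_00 + sum)/(lambda0 + RC), 1/(prec*(lambda0 + RC))).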
+ // Compute condition mean for each condition.
+ for(c=0;c<C;c++) mu_c[c] /= N;
+ // Calculate and write pplr for each pair of conditions. {{{
+ for(c=0;c<C;c++){
+ for(c2=c+1;c2<C;c2++){
+ pplr = 0;
+ for(n=0;n<N;n++)
+ if(samples[c2][n] > samples[c][n])pplr+=1;
+ pplr/=N;
+ outF<<pplr<<" ";
+ }
+ }
+ // }}}
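+ // PPLR is the posterior probability of a positive log-ratio: the fraction
+ // of MCMC samples in which the mean of condition c2 exceeds that of c, so
+ // 0.5 means no evidence of change and values near 0 or 1 suggest DE.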
+ // Calculate log2FC; write log2FC and CIs for each pair of conditions. {{{
+ for(c=0;c<C;c++){
+ for(c2=c+1;c2<C;c2++){
+ for(n=0;n<N;n++)
+ difs[n] = samples[c2][n]-samples[c][n];
+ ns_misc::computeCI(args.getD("cf"), &difs, &ciLow, &ciHigh);
+ ciLow /= log(2);
+ ciHigh /= log(2);
+ log2FC = (mu_c[c2] - mu_c[c])/log(2);
+ outF<<log2FC<<" "<<ciLow<<" "<<ciHigh<<" ";
+ }
+ }
+ // }}}
+ // Write logged condition mean for each condition. No space before EOL. {{{
+ for(c = 0; c < C-1; c++)outF<<mu_c[c]<<" ";
+ outF<<mu_c[C-1]<<endl;
+ // }}}
+ // Write samples if necessary. {{{
+ if(args.flag("samples")){
+ for(c=0;c<C;c++){
+ for(n=0;n<N;n++)outFiles[c]<<samples[c][n]<<" ";
+ outFiles[c]<<endl;
+ }
+ // Save sampled variance as well.
+ for(n=0;n<N;n++) outFiles[C]<<vars[n]<<" ";
+ outFiles[C]<<endl;
+ }//}}}
+ }
+ // Close and exit {{{
+ if(args.flag("samples")){
+ for(c=0;c<C+1;c++)outFiles[c].close();
+ }
+ outF.close();
+ if(args.verbose)message("DONE\n");
+ // }}}
+ return 0;
+}
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return estimateDE(&argc,argv);
+}
+#endif
+
+namespace ns_estimateDE {
+
+bool initializeOutputFile(long C, long M, long N, const ArgumentParser &args, ofstream *outF, ofstream outFiles[]){//{{{
+ if(args.flag("samples")){
+ // If samples flag is set, then write condition mean expression samples into -C?.est files.
+ // Also write variance samples into samples file.
+ stringstream fnStream;
+ string fileName;
+ // Initialize samples files.
+ for(long c=0;c<C;c++){
+ fnStream.str("");
+ fnStream<<args.getS("outFilePrefix")<<"-C"<<c<<".est";
+ fileName = fnStream.str();
+ outFiles[c].open(fileName.c_str());
+ if(! outFiles[c].is_open()){
+ error("Unable to open output file %s\n",fileName.c_str());
+ return false;
+ }
+ // Write header for samples file.
+ outFiles[c]<<"# Inferred condition mean log expression.\n"
+ "# condition "<<c+1
+ <<"\n# ";
+ for(long i=0;i<(long)args.args().size();i++){
+ outFiles[c]<<args.args()[i]<<" ";
+ }
+ outFiles[c]<<"\n# lambda_0 "<<args.getD("lambda0")<<"\n# T (Mrows_Ncols) L (logged)\n# M "<<M<<"\n# N "<<N<<endl;
+ }
+ // Initialize file for variances.
+ string varFileName = args.getS("outFilePrefix")+".estVar";
+ outFiles[C].open(varFileName.c_str());
+ if(! outFiles[C].is_open()){
+ error("Unable to open output file %s\n",varFileName.c_str());
+ return false;
+ }
+ // Write header for variance file.
+ outFiles[C]<<"# Inferred variances in last condition.\n"
+ "# lambda_0 "<<args.getD("lambda0")
+ <<"\n# T \n# M "<<M<<"\n# N "<<N
+ <<endl;
+ }
+ // Initialize PPLR file.
+ string outFileName = args.getS("outFilePrefix")+".pplr";
+ outF->open(outFileName.c_str());
+ if(! outF->is_open()){
+ error("Unable to open output file %s\n",outFileName.c_str());
+ return false;
+ }
+ // Write header for PPLR file.
+ *outF<<"# ";
+ for(long i=0;i<(long)args.args().size();i++){
+ *outF<<args.args()[i]<<" ";
+ }
+ *outF<<"\n# lambda_0 "<<args.getD("lambda0")<<"\n# T \n# M "<<M<<"\n# N "<<N<<"\n"
+ <<"# Conditions: C "<<C<<" Condition pairs("<<C*(C-1)/2<<"): ";
+ for(long c=0;c<C;c++)
+ for(long c2=c+1;c2<C;c2++)
+ *outF<<c+1<<"~"<<c2+1<<" ";
+ *outF<<"\n# Columns contain PPLR for each pair of conditions, "
+ "log2 fold change with confidence intervals for each pair of conditions and "
+ "log mean condition mean expression for each condition.\n"
+ "# CPxPPLR CPx(log2FC ConfidenceLow ConfidenceHigh) "
+ "Cx(log mean condition mean expressions)"
+ <<endl;
+ return true;
+}//}}}
+
+void getParams(double expr,const vector<paramT> ¶ms, paramT *par){//{{{
+ long i=0,j=params.size()-1,k;
+ if(expr<=params[0].expr){
+ par->alpha=params[0].alpha;
+ par->beta=params[0].beta;
+ return;
+ }
+ if(expr>=params[j].expr){
+ par->alpha=params[j].alpha;
+ par->beta=params[j].beta;
+ return;
+ }
+ while(j-i>1){
+ k=(i+j)/2;
+ if(params[k].expr<=expr)i=k;
+ else j=k;
+ }
+ if(expr-params[i].expr<params[j].expr-expr)k=i;
+ else k=j;
+
+ par->alpha=params[k].alpha;
+ par->beta=params[k].beta;
+}//}}}
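+
+// getParams assumes params is sorted by expr; the bisection above then picks
+// the hyperparameter pair whose expression grid point is nearest to expr in
+// O(log n) steps.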
+
+void readNextTranscript(long m, long C, long N, Conditions *cond, const vector<paramT> ¶ms, vector<vector<vector<double> > > *tr, vector<paramT> *curParams, double *mu_00){//{{{
+ double divT = 0, divC, mu_0;
+ long c,r,n,RC;
+ *mu_00 = 0;
+ for(c=0;c<C;c++){
+ mu_0=0;
+ divC=0;
+ RC = cond->getRC(c);
+ if((long)(*tr)[c].size() < RC){
+ (*tr)[c].resize( RC );
+ }
+ for(r=0;r<RC;r++){
+ if(cond->getTranscript(c, r, m, (*tr)[c][r], N)){
+ for(n=0;n<N;n++){
+ // Log the expression samples if the files don't have logged flag set.
+ if(!cond->logged())(*tr)[c][r][n] = ((*tr)[c][r][n] == 0)? ns_misc::LOG_ZERO : log ((*tr)[c][r][n] );
+ mu_0+=(*tr)[c][r][n];
+ }
+ divC+=1;
+ }else{
+ warning("Main: Condition %ld replicate %ld does not seem to have transcript %ld.\n",c,r,m);
+ }
+ }
+ R_INTERUPT;
+ if(divC>0){
+ mu_0 /= (divC * N); // take mean over all replicates
+ *mu_00+=mu_0;
+ divT++;
+ }
+ getParams(mu_0, params, &(*curParams)[c]);
+ }
+ *mu_00/=divT;
+}//}}}
+
+}
diff --git a/estimateExpression.cpp b/estimateExpression.cpp
new file mode 100644
index 0000000..489968f
--- /dev/null
+++ b/estimateExpression.cpp
@@ -0,0 +1,597 @@
+#include<algorithm>
+#include<ctime>
+#include<cmath>
+#ifdef _OPENMP
+#include<omp.h>
+#endif
+#include<sstream>
+
+#include "ArgumentParser.h"
+#include "CollapsedSampler.h"
+#include "FileHeader.h"
+#include "GibbsSampler.h"
+#include "misc.h"
+#include "MyTimer.h"
+#include "Sampler.h"
+#include "TagAlignments.h"
+#include "TranscriptInfo.h"
+#include "transposeFiles.h"
+
+#include "common.h"
+
+#define DEBUG(x)
+#define FF first
+#define SS second
+
+//#define LOG_NEED
+//#define LOG_RHAT
+TranscriptInfo trInfo;
+
+long M;//, mAll; // M : number of transcripts (include transcript 0 ~ Noise)
+//long N,
+long Nunmap; // N: number of read, un-mappable read, mappable reads
+
+vector<string> samplesFileNames;
+string failedMessage;
+
+void clearDataEE(){
+ samplesFileNames.clear();
+}
+
+TagAlignments* readData(const ArgumentParser &args) {//{{{
+ long i,j,num,tid;
+ double prb;
+ long Ntotal=0,Nmap=0,probM=0;
+ string readId,strand,blank;
+ ifstream inFile;
+ MyTimer timer;
+ TagAlignments *alignments = new TagAlignments(false);
+
+ // Read alignment probabilities {{{
+ inFile.open(args.args()[0].c_str());
+ FileHeader fh(&inFile);
+ ns_fileHeader::AlignmentFileType format;
+ if((!fh.probHeader(&Nmap,&Ntotal,&probM,&format)) || (Nmap ==0)){//{{{
+ error("Prob file header read failed.\n");
+ return NULL;
+ }//}}}
+ // Use number of transcripts from prob file if it is higher.
+ if(probM>M)M = probM;
+ message("N mapped: %ld\n",Nmap);
+ messageF("N total: %ld\n",Ntotal);
+ if(args.verb())message("Reading alignments.\n");
+ if(Ntotal>Nmap)Nunmap=Ntotal-Nmap;
+ else Nunmap=1; // no valid count file; assume only one read did not align properly
+ alignments->init(Nmap,0,M);
+ long mod=10000;
+ long bad = 0;
+ timer.start();
+ for(i = 0; i < Nmap; i++) {
+ inFile>>readId>>num;
+ if(format==ns_fileHeader::OLD_FORMAT)inFile>>blank;
+ if(!inFile.good())break;
+ // message("%s %ld\n",(readId).c_str(),num);
+ for(j = 0; j < num; j++) {
+ if(format == ns_fileHeader::OLD_FORMAT)inFile>>tid>>strand>>prb;
+ else inFile>>tid>>prb;
+ if(inFile.fail()){
+ inFile.clear();
+ // skip the rest of this read's alignments
+ j=num;
+ // assign this read to the noise transcript
+ tid=0;
+ // 10 means either 10 or exp(10), but either should still be large enough
+ prb=10;
+ bad++;
+ }
+ switch(format){
+ case ns_fileHeader::OLD_FORMAT:
+ if(tid!=0) prb /= trInfo.L(tid-1);
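+ // fall through: after the length normalization the old format
+ // is treated exactly like the new one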
+ case ns_fileHeader::NEW_FORMAT:
+ alignments->pushAlignment(tid, prb);
+ break;
+ case ns_fileHeader::LOG_FORMAT:
+ alignments->pushAlignmentL(tid, prb);
+ }
+ }
+ // ignore rest of line
+ inFile.ignore(10000000,'\n');
+
+ alignments->pushRead();
+
+ R_INTERUPT;
+ if(args.verb() && (i % mod == 0) && (i>0)){
+ message(" %ld ",i);
+ timer.split();
+ mod*=10;
+ }
+ }
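+ // Each alignment record parsed above has the form
+ // <readId> <#aligns> (<tid> <prob>)* in the new/log formats, or
+ // <readId> <#aligns> <blank> (<tid> <strand> <prob>)* in the old format,
+ // where old-format probabilities get divided by transcript length.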
+ if(bad>0)warning("Main: %ld reads' alignment information were corrupted.\n",bad);
+ inFile.close();
+ long Nhits,NreadsReal;
+ alignments->finalizeRead(&M, &NreadsReal, &Nhits);
+ // If the transcript info is initialized, check that the number of transcripts has not changed.
+ // The number can't be smaller as it starts off with trInfo->M
+ if((trInfo.isOK())&&(M > trInfo.getM() + 1)){
+ if(args.getS("outputType") == "rpkm"){
+ error("Main: Number of transcripts in .prob file is higher than in the .tr file (%ld %ld)!\n",M,trInfo.getM() + 1);
+ delete alignments;
+ return NULL;
+ }else{
+ warning("Main: Number of transcripts in .prob file is higher than in the .tr file (%ld %ld)!\n This can cause problems later on!\n",M,trInfo.getM() + 1);
+ }
+ }
+ //}}}
+ if(i<Nmap)message("Read only %ld reads.\n",NreadsReal);
+ message("All alignments: %ld\n",Nhits);
+ messageF("Isoforms: %ld\n",M);
+ Nmap = NreadsReal;
+ return alignments;
+ /* {{{ remapping isoforms to ignore those without any hits
+ M = mAll;
+ M = isoformsHit;
+ isoformMap.assign(M);
+ for(i=0,j=0;i<mAll;i++)
+ if(readInIsoform[i]!=-1){
+ readInIsoform[i]=j;
+ isoformMap[j]=i;
+ j++;
+ }
+ for(i=0;i<Sof(alignments);i++){
+ alignments[i].setTrId( readInIsoform[ alignments[i].getTrId() ] );
+ }
+ }}}*/
+}//}}}
+
+void MCMC(TagAlignments *alignments,gibbsParameters &gPar,ArgumentParser &args){//{{{
+ // Declarations: {{{
+ DEBUG(message("Declarations:\n"));
+ long i,j,samplesHave=0,totalSamples=0,samplesN,chainsN,samplesSave,seed;
+ pairD rMean,tmpA,tmpV,sumNorms;
+ double rH1,rH2;
+ ofstream meansFile;
+ ofstream *samplesFile = new ofstream[gPar.chainsN()];
+ MyTimer timer;
+ bool quitNext = false;
+ vector<pairD> betwVar(M),withVar(M),s2j(M),totAverage(M),av,var;
+ vector<pair<pairD,long> > rHat2(M);
+ // }}}
+ // Names: {{{
+ stringstream sstr;
+ #ifdef LOG_RHAT
+ sstr.str("");
+ sstr<<args.getS("outFilePrefix")<<".rhatLog";
+ string rhatLogFile = sstr.str();
+ #endif
+ #ifdef LOG_NEED
+ sstr.str("");
+ sstr<<args.getS("outFilePrefix")<<".effLog";
+ string effLogFile = sstr.str();
+ #endif
+ // }}}
+ // Init: {{{
+ DEBUG(message("Initialization:\n"));
+ samplesN=gPar.samplesN();
+ chainsN=gPar.chainsN();
+ samplesSave=(gPar.samplesSave()-1)/chainsN+1;
+
+ vector<Sampler*> samplers(chainsN);
+ if( ! args.flag("gibbs")){
+ for(j=0;j<chainsN;j++)
+ samplers[j] = new CollapsedSampler;
+ }else{
+ for(j=0;j<chainsN;j++)
+ samplers[j] = new GibbsSampler;
+ }
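+ // The collapsed sampler is the default; --gibbs switches every chain to the
+ // plain Gibbs sampler, which also tracks the noise parameter thetaAct.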
+
+ timer.start();
+ timer.start(1);
+ if(args.isSet("seed"))seed=args.getL("seed");
+ else seed = time(NULL);
+ if(args.verbose)message("seed: %ld\n",seed);
+ for(i=0;i<chainsN;i++){
+ // Init samplers
+ DEBUG(message("Sampler %ld init.\n",i);)
+ samplers[i]->noSave();
+ DEBUG(message("init\n");)
+ samplers[i]->init(M, samplesN, samplesSave, Nunmap, alignments, gPar.beta(), gPar.dir(), seed);
+ DEBUG(message(" seed: %ld\n",seed);)
+ // sampler is initialized with 'seed' and then sets 'seed' to new random seed for the next sampler
+ }
+ // parallel block:
+ // make sure that all functions used are CONST and variables are being READ or private
+ // private: samplesHave (or subCounter)
+#ifdef BIOC_BUILD
+ long samplesDo, subCounter;
+ for(samplesHave=0;samplesHave<gPar.burnIn();samplesHave+=samplesDo){
+ samplesDo = min(gPar.burnIn() - samplesHave, samplesAtOnce);
+ #pragma omp parallel for private(subCounter)
+ for(i=0;i<chainsN;i++){
+ for(subCounter=0;subCounter<samplesDo; subCounter++){
+ samplers[i]->sample();
+ }
+ }
+ // Check for interrupt out of the parallel part.
+ R_INTERUPT;
+ }
+#else
+ #pragma omp parallel for private(samplesHave)
+ for(i=0;i<chainsN;i++){
+ DEBUG(message(" burn in\n");)
+ for(samplesHave=0;samplesHave<gPar.burnIn();samplesHave++){
+ samplers[i]->sample();
+ }
+ }
+#endif
+ totalSamples = gPar.burnIn();
+ message("Burn in: %ld DONE. ",gPar.burnIn());
+ DEBUG(message(" reseting samplers after BurnIn\n"));
+ for(i=0;i<chainsN;i++){
+ samplers[i]->resetSampler(samplesN);
+ }
+ timer.split(0,'m');
+ //}}}
+ // Main sampling loop:
+ while(1){
+ timer.start();
+ // Sample: {{{
+ // parallel block:
+ // make sure that all functions used are CONST and variables are being READ or private
+ // private: samplesHave (or subCounter)
+#ifdef BIOC_BUILD
+ for(samplesHave=0;samplesHave<samplesN;samplesHave+=samplesDo){
+ samplesDo = min(samplesN - samplesHave, samplesAtOnce);
+ #pragma omp parallel for private(subCounter)
+ for(i=0;i<chainsN;i++){
+ for(subCounter=0;subCounter<samplesDo; subCounter++){
+ samplers[i]->sample();
+ samplers[i]->update();
+ }
+ }
+ // Check for interrupt out of the parallel part.
+ R_INTERUPT;
+ }
+#else
+ #pragma omp parallel for private(samplesHave)
+ for(i=0;i<chainsN;i++){
+ for(samplesHave = 0;samplesHave<samplesN;samplesHave++){
+ samplers[i]->sample();
+ samplers[i]->update();
+ }
+ }
+#endif
+ totalSamples += samplesN;
+ message("\nSampling DONE. ");
+ timer.split(0,'m');
+ //}}}
+ // Check for change of parameters: {{{
+ gPar.readParameters();
+ // }}}
+ // Compute convergence statistics {{{
+ totAverage.assign(M,pairD(0,0));
+ betwVar.assign(M,pairD(0,0));
+ withVar.assign(M,pairD(0,0));
+ // Norms for sums (used for variance and mean), should be same for all
+ // samplers and all transcripts.
+ sumNorms = samplers[0]->getSumNorms();
+ samplesHave = (long)sumNorms.FF;
+ for(i=0;i<M;i++){
+ for(j=0;j<chainsN;j++){
+ tmpA = samplers[j]->getAverage(i);
+ tmpV = samplers[j]->getWithinVariance(i);
+ totAverage[i].FF += tmpA.FF;
+ totAverage[i].SS += tmpA.SS;
+ withVar[i].FF += tmpV.FF;
+ withVar[i].SS += tmpV.SS;
+ }
+ totAverage[i].FF /= chainsN;
+ totAverage[i].SS /= chainsN;
+ withVar[i].FF /= chainsN;
+ withVar[i].SS /= chainsN;
+ for(j=0;j<chainsN;j++){
+ tmpA = samplers[j]->getAverage(i);
+ betwVar[i].FF += (totAverage[i].FF - tmpA.FF)*(totAverage[i].FF - tmpA.FF);
+ betwVar[i].SS += (totAverage[i].SS - tmpA.SS)*(totAverage[i].SS - tmpA.SS);
+ }
+ betwVar[i].FF /= (chainsN-1.0);
+ betwVar[i].SS /= (chainsN-1.0);
+ }
+ for(i=0;i<M;i++){
+ // betwVar[i] *= samplesHave / (chainsN - 1.0);
+ rHat2[i].SS=i;
+ if(withVar[i].FF == 0 ){
+ rHat2[i].FF.FF = 0;
+ rHat2[i].FF.SS = 0;
+ } else {
+ // First 'column' is Rhat of logit(theta).
+ rHat2[i].FF.FF = (sumNorms.SS - 1.0) / sumNorms.SS + betwVar[i].SS / withVar[i].SS ;
+ rHat2[i].FF.SS = (sumNorms.FF - 1.0) / sumNorms.FF + betwVar[i].FF / withVar[i].FF ;
+ //betwVar[i] / ( samplesHave * withVar[i] );
+ }
+ }
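+ // This is the (squared) Gelman-Rubin potential scale reduction,
+ // Rhat^2 = (n-1)/n + B/W, where B is the variance of the chain means and
+ // W the mean within-chain variance; values near 1 indicate mixed chains.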
+ sort(rHat2.rbegin(),rHat2.rend());
+ message("rHat (for %ld samples) \n",samplesN);
+ rMean.FF=0;
+ rMean.SS=0;
+ message(" rHat (rH theta| tid | mean theta)\n");
+ for(i=0;(i<10) && (i<M);i++){
+ rH1 = sqrt(rHat2[i].FF.FF);
+ rH2 = sqrt(rHat2[i].FF.SS);
+ rMean.FF+=rH1;
+ rMean.SS+=rH2;
+// message(" %lf (%lf | %ld | %lf|%lf|%lf)",rHat2[i].FF.FF,rHat2[i].FF.SS,rHat2[i].SS,totAverage[rHat2[i].SS].FF,withVar[rHat2[i].SS].FF,betwVar[rHat2[i].SS].FF/samplesHave);
+ if((i<3) || args.verbose){
+ message(" %7.4lf (%7.4lf | %6ld | %8.5lf)",rH1,rH2,rHat2[i].SS-1,totAverage[rHat2[i].SS].FF);
+ message("\n");
+ }
+// message(" %lf",sqrt(rHat2[i].FF));
+ }
+ rMean.FF /= 10.0;
+ rMean.SS /= 10.0;
+ message(" Mean rHat of worst 10 transcripts: %lf\n",rMean.FF);
+ if(args.flag("scaleReduction"))message(" (target: %.3lf)\n",gPar.targetScaleReduction());
+ message(" Mean C0: (");
+ for(j=0;j<chainsN;j++)message("%ld ",samplers[j]->getAverageC0());
+ message("). Nunmap: %ld\n",Nunmap);
+ if(args.flag("gibbs"))message(" Mean thetaAct (noise parameter)\n %lf\n",totAverage[0].FF);
+ messageF("\n");
+ //}}}
+ // Log rHat if necessary. {{{
+ #ifdef LOG_RHAT
+ ofstream rhatLog(rhatLogFile.c_str(), ofstream::app);
+ rhatLog<<totalSamples<<" "<<(long)timer.getTime(1);
+ for(i=1;i<M;i++){
+ rhatLog<<" "<<sqrt(rHat2[i].FF.FF);
+ }
+ rhatLog<<endl;
+ rhatLog.close();
+ #endif
+ // }}}
+ // Increase sample size and start over: {{{
+ if(quitNext){// Sampling iterations end {{{
+ if(sqrt(rHat2[0].FF.FF) > gPar.targetScaleReduction()){
+ message("WARNING: Following transcripts failed to converge entirely\n (however the estimates might still be usable):\n");
+ long countUncoverged=0;
+ sstr.str("");
+ sstr<<"# unconverged_transcripts: ";
+ for(i=0;(i<M) && (sqrt(rHat2[i].FF.FF) > gPar.targetScaleReduction());i++){
+ sstr<<rHat2[i].SS<<" ("<<sqrt(rHat2[i].FF.FF)<<") ";
+ countUncoverged++;
+ if(args.verbose)message(" %s( %ld , %lf )\n",(trInfo.trName(rHat2[i].SS-1)).c_str(),rHat2[i].SS-1,sqrt(rHat2[i].FF.FF));
+ }
+ sstr<<"\n";
+ failedMessage=sstr.str();
+ if(!args.verbose)message(" %ld transcripts (full list is in the output file)\n",countUncoverged);
+ }
+ // Close files and delete pointers.
+ for(j=0;j<chainsN;j++){
+ samplers[j]->noSave();
+ samplesFile[j].close();
+ }
+ delete[] samplesFile;
+ break;
+ }//}}}
+ if(! (args.flag("scaleReduction") || args.flag("MCMC_samplesDOmax"))){
+ vector<double> needS(M,0);
+ for(i=1;i<M;i++){
+ // the between-chain variance was not multiplied by samplesHave (== n)
+ // there is no chainsN in the denominator because samplesSave was already divided by chainsN
+ // Use LOGIT(theta):
+ needS[i] = samplesSave * sumNorms.SS/
+ ((sumNorms.SS-1.0)/sumNorms.SS*withVar[i].SS/betwVar[i].SS+1.0);
+ //needS[i] = samplesSave * samplesHave/
+ // ((samplesHave-1.0)/samplesHave*withVar[i].FF/betwVar[i].FF+1.0);
+ }
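+ // i.e. needS = samplesSave * n / ((n-1)/n * W/B + 1): the per-chain
+ // iterations required for samplesSave effectively independent samples
+ // given the current between/within variance ratio.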
+ // log the number of effective samples, only when testing... //{{{
+ #ifdef LOG_NEED
+ ofstream effLog(effLogFile.c_str());
+ for(i=1;i<M;i++){
+ effLog<<needS[rHat2[i].SS]<<" "<<sqrt(rHat2[i].FF.FF)<<" "<<samplesHave*betwVar[rHat2[i].SS].FF<<" "<<withVar[rHat2[i].SS].FF<<" "<<rHat2[i].SS<<endl;
+ }
+ effLog.close();
+ #endif
+ //}}}
+ sort(needS.begin(),needS.end());
+ i = (long)(M*0.95)+1; // require at least 95% of transcripts to converge
+ /* samplesN -> now it will be samples needed PER chain in order to
+ * generate samplesSave*chainsN effective samples.
+ */
+ samplesN = max((long)needS[i],samplesSave);
+ quitNext = true;
+ }else{
+ // Prepare for producing samples if Rhat^2<target scale reduction
+ // OR reached samplesNmax
+ // OR produced too many samples (>500 000)
+ if((totalSamples*chainsN < 5000000) && (rMean.FF > gPar.targetScaleReduction())){
+ samplesN *= 2;
+ }else{
+ quitNext = true;
+ }
+ if((samplesN >= gPar.samplesNmax()) || args.flag("MCMC_samplesDOmax")){
+ samplesN=gPar.samplesNmax();
+ quitNext = true;
+ }
+ }
+ // if the next iteration is the last one, prepare the files and make the samplers write samples
+ if(quitNext){
+ messageF("Producing %ld final samples from each chain.\n",samplesN);
+ // if samplesN<samplesSave, only samplesN samples will be saved
+ if(samplesN<samplesSave){
+ samplesSave = samplesN;
+ }
+ for(j=0;j<chainsN;j++){
+ sstr.str("");
+ sstr<<args.getS("outFilePrefix")<<"."<<args.getS("outputType")<<"S-"<<j;
+ samplesFileNames.push_back(sstr.str());
+ samplesFile[j].open(samplesFileNames[j].c_str());
+ if(! samplesFile[j].is_open()){
+ error("Main: Unable to open output file '%s'.\n",(sstr.str()).c_str());
+ }else{
+ samplesFile[j]<<"#\n# M "<<M-1<<"\n# N "<<samplesSave<<endl;
+ samplers[j]->saveSamples(&samplesFile[j],trInfo.getShiftedLengths(true),args.getS("outputType"));
+ }
+ }
+ }
+ for(j=0;j<chainsN;j++){
+ samplers[j]->resetSampler(samplesN);
+ }
+ samplesHave=0;
+ //}}}
+ }
+ // Write means: {{{
+ meansFile.open((args.getS("outFilePrefix")+".thetaMeans").c_str());
+ if(meansFile.is_open()){
+ meansFile<<"# T => Mrows \n# M "<<M-1<<endl;
+ meansFile<<"# file containing the mean value of theta - relative abundace of fragments and counts\n"
+ "# (overall mean, overall counts, mean of saved samples, and mean from every chain are reported)\n"
+ "# columns:\n"
+ "# <transcriptID> <meanThetaOverall> <meanReadCountOverall> <meanThetaSaved> <varThetaOverall>";
+ for(j=0;j<chainsN;j++)meansFile<<" <chain"<<j+1<<"mean>";
+ meansFile<<endl;
+ meansFile<<scientific;
+ meansFile.precision(9);
+ double sumSaved, thetaSqSum, thetaSum, sumNorm, tSS, tS, sN, thetaVar;
+ for(i=0;i<M;i++){
+ sumSaved=thetaSqSum=thetaSum=sumNorm=0;
+ for(j=0;j<chainsN;j++){
+ sumSaved+=samplers[j]->getAverage(i).SS;
+ samplers[j]->getThetaSums(i, &tSS, &tS, &sN);
+ thetaSqSum += tSS;
+ thetaSum += tS;
+ sumNorm += sN;
+ }
+ if(i==0){
+ meansFile<<"#thetaAct:";
+ }else{
+ meansFile<<i;
+ }
+ thetaVar = thetaSqSum / (sumNorm - 1.0) -
+ thetaSum / (sumNorm - 1.0) * thetaSum / sumNorm;
+ meansFile<<" "<<thetaSum/sumNorm<<" "<<(long)floor(thetaSum/sumNorm*alignments->getNreads()+0.5)<<" "<<sumSaved/chainsN<<" "<<thetaVar;
+ for(j=0;j<chainsN;j++)
+ meansFile<<" "<<samplers[j]->getAverage(i).FF;
+ meansFile<<endl;
+ }
+ meansFile.close();
+ }else{
+ warning("Main: Unable to write thetaMeans into: %s\n",(args.getS("outFilePrefix")+".thetaMeans").c_str());
+ }
+ //}}}
+ // Write thetaAct: {{{
+ if(args.isSet("thetaActFileName")){
+ ofstream actFile(args.getS("thetaActFileName").c_str());
+ if(actFile.is_open()){
+ actFile<<"# samples of thetaAct parameter (only generated when using gibbs sampling)\n";
+ actFile<<"# N "<<chainsN*samplesSave<<endl;
+ for(j=0;j<chainsN;j++){
+ for(i=0;i<(long)samplers[j]->getThetaActLog().size();i++)
+ actFile<<samplers[j]->getThetaActLog()[i]<<" ";
+ }
+ actFile<<endl;
+ actFile.close();
+ }else{
+ warning("Main: Unable to write thetaAct log: %s.\n",(args.getS("thetaActFileName")).c_str());
+ }
+ }
+ // }}}
+ // Free memory: {{{
+ for(j=0;j<chainsN;j++){
+ delete samplers[j];
+ }
+// delete [] samplers;
+ //}}}
+ message("Total samples: %ld\n",totalSamples*chainsN);
+}//}}}
+
+extern "C" int estimateExpression(int *argc, char* argv[]) {//{{{
+clearDataEE();
+string programDescription =
+"Estimates expression given precomputed probabilities of (observed) reads' alignments.\n\
+ Uses MCMC sampling algorithm to produce relative abundance or RPKM.\n";
+ // Set options {{{
+ ArgumentParser args;
+ args.init(programDescription,"[prob file]",1);
+ args.addOptionS("o","outPrefix","outFilePrefix",1,"Prefix for the output files.");
+ args.addOptionS("O","outType","outputType",0,"Output type (theta, RPKM, counts, tau).","theta");
+ args.addOptionB("G","gibbs","gibbs",0,"Use Gibbs sampling instead of collapsed Gibbs sampling.");
+ args.addOptionS("p","parFile","parFileName",0,"File containing parameters for the sampler, which can be otherwise specified by --MCMC* options. As the file is checked after every MCMC iteration, the parameters can be adjusted while running.");
+ args.addOptionS("t","trInfoFile","trInfoFileName",0,"File containing transcript information. (Necessary for RPKM)");
+ args.addOptionL("P","procN","procN",0,"Limit the maximum number of threads to be used. (Default is the number of MCMC chains.)");
+ args.addOptionS("","thetaActFile","thetaActFileName",0,"File for logging noise parameter theta^{act}.");
+ args.addOptionL("","MCMC_burnIn","MCMC_burnIn",0,"Length of sampler's burn in period.",1000);
+ args.addOptionL("","MCMC_samplesN","MCMC_samplesN",0,"Initial number of samples produced. Doubles after every iteration.",1000);
+ args.addOptionL("","MCMC_samplesSave","MCMC_samplesSave",0,"Number of samples recorder in total.",1000);
+ args.addOptionL("","MCMC_samplesNmax","MCMC_samplesNmax",0,"Maximum number of samples produced in one iteration. After producing samplesNmax samples sampler finishes.",50000);
+ args.addOptionB("","MCMC_samplesDOmax","MCMC_samplesDOmax",0,"Produce maximum number of samples (samplesNmax) in second iteration and quit.");
+ args.addOptionL("","MCMC_chainsN","MCMC_chainsN",0,"Number of parallel chains used. At least two chains will be used.",4);
+ args.addOptionD("","MCMC_scaleReduction","MCMC_scaleReduction",0,"Target scale reduction, sampler finishes after this value is met.",1.2);
+ args.addOptionD("","MCMC_dirAlpha","MCMC_dirAlpha",0,"Alpha parameter for the Dirichlet distribution.",1.0);
+ args.addOptionB("","scaleReduction","scaleReduction",0,"Use scale reduction as stopping criterion, instead of computing effective sample size.");
+ args.addOptionL("s","seed","seed",0,"Random initialization seed.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+ MyTimer timer;
+ gibbsParameters gPar;
+ TagAlignments *alignments=NULL;
+//{{{ Initialization:
+
+ gPar.setParameters(args);
+ if(args.isSet("parFileName")){
+ gPar.setParameters(args.getS("parFileName"));
+ }
+ args.updateS("outputType", ns_expression::getOutputType(args));
+ if(args.verbose)gPar.getAllParameters();
+#ifdef SUPPORT_OPENMP
+ if(args.isSet("procN"))
+ omp_set_num_threads(args.getL("procN"));
+ else
+ omp_set_num_threads(gPar.chainsN());
+#endif
+
+
+ //}}}
+ // {{{ Read transcriptInfo and .prob file
+ if((!args.isSet("trInfoFileName"))||(!trInfo.readInfo(args.getS("trInfoFileName")))){
+ if(args.getS("outputType") == "rpkm"){
+ error("Main: Missing transcript info file. The file is necessary for producing RPKM.\n");
+ return 1;
+ }
+ }else{
+ M = trInfo.getM()+1;
+ }
+ alignments = readData(args);
+ if(! alignments){
+ error("Main: Reading alignments failed.\n");
+ return 1;
+ }
+ if(M<=0){
+ error("Main: Invalid number of transcripts in .prob file.\n");
+ return 1;
+ }
+ // }}}
+
+ if(args.verbose)timer.split();
+ if(args.verbose)messageF("Starting the sampler.\n");
+ MCMC(alignments,gPar,args);
+ // {{{ Transpose and merge sample file
+ if(transposeFiles(samplesFileNames,args.getS("outFilePrefix")+"."+args.getS("outputType"),args.verbose,failedMessage)){
+ if(args.verbose)message("Sample files transposed. Deleting.\n");
+ for(long i=0;i<(long)samplesFileNames.size();i++){
+ remove(samplesFileNames[i].c_str());
+ }
+ }else{
+ message("Transposing files failed. Please check the files and try using trasposeLargeFile to transpose & merge the files into single file.\n");
+ }
+ //}}}
+ delete alignments;
+ message("DONE. ");
+ timer.split(0,'m');
+ return 0;
+}//}}}
+
+#ifndef BIOC_BUILD
+int main(int argc, char* argv[]) {
+ return estimateExpression(&argc,argv);
+}
+#endif
diff --git a/estimateHyperPar.cpp b/estimateHyperPar.cpp
new file mode 100644
index 0000000..65a936a
--- /dev/null
+++ b/estimateHyperPar.cpp
@@ -0,0 +1,369 @@
+/*
+ * Hyperparameter model in estimate[*]HyperPar.cpp always depends on the model used in
+ * relevant estimate[*]DE.cpp
+ */
+// DECLARATIONS: {{{
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <sstream>
+#include "boost/random/mersenne_twister.hpp"
+#include "boost/random/normal_distribution.hpp"
+#include "boost/random/uniform_01.hpp"
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "lowess.h"
+#include "MyTimer.h"
+#include "misc.h"
+#include "PosteriorSamples.h"
+#include "TranscriptExpression.h"
+
+#include "common.h"
+
+using ns_params::paramT;
+
+//}}}
+// Defaults: {{{
+#define ALPHA_PROP 0.1
+#define BETA_PROP 0.08
+#define subM_MIN 10
+#define subM_MAX 5000
+#define SAMPLES_N 2
+#define MAX_ITER 1000
+#define MAX_RETRIES 10
+#define MAX_PARAM 5000
+//}}}
+
+extern "C" int estimateHyperPar(int *argc,char* argv[]){
+string programDescription =
+"Estimate expression dependent hyperparameters from the dataset.\n\
+ [sampleFiles] should contain transposed MCMC samples from replicates.\n\
+ To distinguish conditions use C between them e.g.:\n\
+ samplesC1-R1.rpkm samplesC1-R2.rpkm C samplesC2-R1.rpkm samplesC2-R2.rpkm";
+ // Intro: {{{
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFiles]",1);
+ args.addOptionB("V","veryVerbose","veryVerbose",0,"More verbose output.");
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionS("p","paramsAllFile","paramsAllFileName",0,"Name of the file to which to store all parameter values generated prior to lowess smoothing.");
+ args.addOptionS("","meanFile","meanFileName",0,"Name of the file containing joint mean and variance.");
+ args.addOptionL("g","groupsNumber","groupsN",0,"Number of groups of transcript of similar size.",200);
+ args.addOptionL("s","samplesNumber","samplesN",0,"Number of samples generated for each group.",SAMPLES_N);
+ args.addOptionD("l","lambda0","lambda0",0,"Precision scaling parameter lambda0.",2.0);
+ args.addOptionD("","exThreshold","exT",0,"Threshold of lowest expression for which the estimation is done.",-5);
+ args.addOptionB("S","smoothOnly","smoothOnly",0,"Input file contains previously sampled hyperparameters which should smoothed only.");
+ args.addOptionD("","lowess-f","lowess-f",0,"Parameter F for lowess smoothing specifying amount of smoothing.",0.2);
+ args.addOptionL("","lowess-steps","lowess-steps",0,"Parameter Nsteps for lowess smoothing specifying number of iterations.",5);
+ args.addOptionB("","noforce","noforce",0,"Do not force smoothing of the parameters.",false);
+ args.addOptionS("","norm","normalization",0,"Normalization constants for each input file provided as comma separated list of doubles (e.g. 1.0017,1.0,0.9999 ).");
+ args.addOptionL("","seed","seed",0,"Random initialization seed.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+
+ MyTimer timer;
+ timer.start(1);
+ long i,M=0,N,RTN,C;
+ bool storeAll=args.isSet("paramsAllFileName");
+ vector<paramT> params;
+ paramT param;
+ TranscriptExpression trExp;
+ ofstream outF;
+
+ if(! args.flag("smoothOnly")){
+ if(! args.isSet("meanFileName")){
+ error("Main: Please provide mean file name (--meanFile).\n");
+ return 1;
+ }
+ trExp.readExpression(args.getS("meanFileName"), MEAN_VARIANCE);
+ // Force user to use logged mean and samples.
+ if(!trExp.isLogged()){
+ error("Main: Please compute the expression mean from logged samples (getVariance --log ...).\n");
+ return 1;
+ }
+ M = trExp.getM();
+ if(args.verbose)message("Transcripts in expression file: %ld\n",M);
+ trExp.doSort(true);
+ }
+
+ if(!ns_misc::openOutput(args, &outF)) return 1;
+ ///}}}
+
+ if(args.flag("smoothOnly")){
+ // Reading previously sampled parameters. (header is copied into outF)
+ readParams(args.args()[0], ¶ms, &outF);
+ }else{
+ // Sampling parameters based on data
+ // Read conditions {{{
+ Conditions cond;
+ if(!ns_misc::readConditions(args, &C, &M, &N, &cond)) return 1;
+ RTN = cond.getRN();
+ if(args.verbose)message("Number of all replicates: %ld\n",RTN);
+
+ // Prepare file for storing all sampled parameters.
+ ofstream paramsF;
+ if(storeAll){
+ if(!ns_misc::openOutput(args.getS("paramsAllFileName"), ¶msF)) return 1;
+ paramsF<<"# lambda0 "<<args.getD("lambda0")<<endl;
+ }
+ // }}}
+ // Declarations {{{
+ vector<long double> mu0(subM_MAX,0);
+ vector<vector<vector<double> > > tr(subM_MAX,vector<vector<double> >(RTN));
+ vector<vector<long double> > bAdd(subM_MAX,vector<long double> (C,0));
+ boost::random::mt11213b rng_mt(ns_misc::getSeed(args));
+ boost::random::uniform_01<long double> uniformDistribution;
+ boost::random::normal_distribution<long double> normalDistributionA,normalDistributionB;
+ typedef boost::random::normal_distribution<long double>::param_type nDP;
+
+ long double alpha,beta,alphaP,betaP,prob,probAll,probC,mean,old_mult,proposalMultiplier,acceptR,sum,sumS,lambda0,exDelta,exLast;
+ long samp,samplesN,samplesREDO,maxIter,r,c,m,curM,Rc,subM;
+ bool breaked=false,good=false;
+ //}}}
+ // Initial values {{{
+ alpha=uniformDistribution(rng_mt)*10.0;
+ beta=uniformDistribution(rng_mt)*5.0;
+ old_mult=0;
+ proposalMultiplier=2.0;
+ prob = 0;
+ lambda0 = args.getD("lambda0");
+ samplesN = args.getL("samplesN");
+ curM=0;
+ exDelta = (trExp.exp(0)-trExp.exp(M-1))/args.getL("groupsN");
+ exLast = trExp.exp(0);
+ if(args.verbose)message("Expression step: %Lg\n",exDelta);
+ // }}}
+ timer.split();
+ if(args.verbose)message("Running sampler.\n");
+ while(curM<M){
+ // Reading next group of transcripts {{{
+ mean=0;
+ m = 0;
+ while((curM<M)&&(m<subM_MAX)){
+ if(trExp.exp(curM)<args.getD("exT")){
+ if(args.verbose)message("skipping expression: %lg\n",trExp.exp(curM));
+ break;
+ }
+ for(r=0;r<RTN;r++){
+ good = cond.getTranscript(r, trExp.id(curM), tr[m][r],samplesN+MAX_RETRIES);
+ if(!good)break;
+ // If samples were not logged, log them now.
+ if(!cond.logged())
+ for(samp=0;samp<samplesN+MAX_RETRIES;samp++){
+ tr[m][r][samp] = (tr[m][r][samp] == 0)? ns_misc::LOG_ZERO:log(tr[m][r][samp]);
+ }
+ }
+ if(good){
+ mu0[m]=trExp.exp(curM);
+ mean+=mu0[m];
+ m++;
+ }
+ curM++;
+ if(args.flag("veryVerbose"))if(progressLog(curM,M,10,' '))timer.split(0,'m');
+ if((m>=subM_MIN)&&(exDelta<exLast-trExp.exp(curM-1)))break;
+ }
+ exLast = trExp.exp(curM-1);
+ if(m<subM_MIN)break;
+ subM = m;
+ mean/=subM;
+ if(args.flag("veryVerbose"))message("# mean: %Lg subM: %ld\n",mean,subM);
+ if(storeAll)paramsF<<"# mean: "<<mean<<" subM: "<<subM<<endl;
+ samplesREDO = 0;
+ //}}}
+ for(samp=0;samp<samplesN+samplesREDO;samp++){
+ // Computing Badd_gc and initializing {{{
+ for(m=0;m<subM;m++){
+ i=0; // counter over all replicates;
+ for(c=0;c<C;c++){
+ sum = 0;
+ sumS = 0;
+ Rc=cond.getRC(c);
+ for(r=0;r<Rc;r++){
+ sum += tr[m][i][samp];
+ sumS += tr[m][i][samp]*tr[m][i][samp];
+ i++;
+ }
+ bAdd[m][c]=0.5*(sumS + mu0[m]*mu0[m]*lambda0 -
+ (sum+mu0[m]*lambda0)*(sum+mu0[m]*lambda0)/(lambda0+Rc));
+ }
+ }
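+ // bAdd is the same Normal-Gamma beta increment used in estimateDE.cpp:
+ // 0.5*(sumSq + lambda0*mu0^2 - (sum + lambda0*mu0)^2/(lambda0 + Rc)).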
+ acceptR=0;
+ old_mult=0;
+ proposalMultiplier=proposalMultiplier*2.0;
+ normalDistributionA.param(nDP(0,ALPHA_PROP*proposalMultiplier));
+ normalDistributionB.param(nDP(0,BETA_PROP*proposalMultiplier));
+ maxIter=0;
+ breaked = false;
+ R_INTERUPT;
+ //}}}
+ while((acceptR<0.25)||(acceptR>0.5)||(old_mult!=proposalMultiplier)){
+ // Convergence control based on acceptance ratio. {{{
+ maxIter++;
+ if(maxIter>MAX_ITER){
+ if(args.flag("veryVerbose"))
+ message("(BREAKED acceptR %Lg mult %Lg)\n",acceptR,proposalMultiplier);
+ if(storeAll)
+ paramsF<<"#(BREAKED acceptR "<<acceptR<<" mult "<<proposalMultiplier<<")"<<endl;
+ breaked=true;
+ break;
+ }
+ if((alpha>MAX_PARAM)||(beta>MAX_PARAM)){
+ if(args.flag("veryVerbose"))
+ message("(OVERFLOW acceptR %Lg mult %Lg)\n",acceptR,proposalMultiplier);
+ if(storeAll)
+ paramsF<<"#(OVERFLOW acceptR "<<acceptR<<" mult "<<proposalMultiplier<<")"<<endl;
+ breaked=true;
+ break;
+ }
+ old_mult=proposalMultiplier;
+ if(acceptR<0.25)proposalMultiplier/=1.02;
+ if(acceptR>0.5)proposalMultiplier*=1.02;
+ if(old_mult!=proposalMultiplier){
+ normalDistributionA.param(nDP(0,ALPHA_PROP*proposalMultiplier));
+ normalDistributionB.param(nDP(0,BETA_PROP*proposalMultiplier));
+ }
+ //}}}
+ acceptR=0;
+ R_INTERUPT;
+ for(i=0;i<1000;i++){ // Sampling 1000 samples {{{
+ alphaP = alpha + normalDistributionA(rng_mt);
+ if(alphaP<0)alphaP = -alphaP;
+ betaP= beta + normalDistributionB(rng_mt);
+ if(betaP<0)betaP = -betaP;
+ if((alphaP==0)||(betaP==0)){
+ prob=0;
+ }else{
+ prob = 1.0;
+ probAll = pow(betaP,alphaP) / pow(beta,alpha);
+ for(c=0;c<C;c++){
+ probC = lgamma(alphaP + cond.getRC(c)/2.0)+
+ lgamma(alpha) -
+ lgamma(alpha + cond.getRC(c)/2.0) -
+ lgamma(alphaP);
+ probC = probAll * exp(probC);
+ for(m=0;m<subM;m++){
+ // message(" (var_g %lg) (pow %lg %lg %lg) ",bAdd[g]/2.0,pow(beta+bAdd[g]/2, alpha),pow(betaP+bAdd[g]/2, alphaP),pow((beta+bAdd[g]/2)/(betaP+bAdd[g]/2),SUB_N/2));
+ prob *= probC;
+ prob *= pow(beta+bAdd[m][c], alpha) /
+ pow(betaP+bAdd[m][c], alphaP);
+ prob *= pow( (beta+bAdd[m][c])/(betaP+bAdd[m][c]), (long double)(cond.getRC(c)/2.0));
+ }
+ }
+ if((prob>1.0)||(uniformDistribution(rng_mt)< prob)){
+ alpha=alphaP;
+ beta=betaP;
+ acceptR++;
+ }
+ }
+ } //}}}
+ acceptR/=i;
+ }
+ // Save generated parameters {{{
+ if(storeAll)
+ paramsF<<"#(acceptR "<<acceptR<<" mult "<<proposalMultiplier<<" iter "<<maxIter<<")"<<endl;
+ if(!breaked){
+ if(args.flag("veryVerbose")) message("%Lg %Lg\n",alpha,beta);
+ if(storeAll) paramsF<<alpha<<" "<<beta<<" "<<mean<<endl;
+ param.expr=mean;
+ param.alpha=alpha;
+ param.beta=beta;
+ params.push_back(param);
+ }else{
+ if(args.flag("veryVerbose")) message("# %Lg %Lg %Lg\n",alpha,beta,mean);
+ if(storeAll) paramsF<<"# "<<alpha<<" "<<beta<<endl;
+ proposalMultiplier=2;
+ normalDistributionA.param(nDP(0,ALPHA_PROP*proposalMultiplier));
+ normalDistributionB.param(nDP(0,BETA_PROP*proposalMultiplier));
+ alpha=uniformDistribution(rng_mt)*10.0;
+ beta=uniformDistribution(rng_mt)*5.0;
+ if(samplesREDO<MAX_RETRIES){
+ samplesREDO++;
+ }
+ }
+ //}}}
+ }
+ if((args.verbose)&&(!args.flag("veryVerbose"))){
+ messageF(".");
+ }
+ }
+ cond.close();
+ if(storeAll)paramsF.close();
+ outF<<"# lambda0 "<<args.getD("lambda0")<<endl;
+ if(args.verbose)message("\nSampling done.\n");
+ }
+ sort(params.begin(),params.end());
+ long pAll=(long)params.size(), pDistinct;
+ if(args.verbose)message("Have %ld parameters to smooth.\n",pAll);
+ vector<double> exp(pAll),alp(pAll),bet(pAll),alpS,betS;
+ for(i=0;i<pAll;i++){
+ exp[i]=params[i].expr;
+ alp[i]=params[i].alpha;
+ bet[i]=params[i].beta;
+ }
+ double f = args.getD("lowess-f");
+ long iter = args.getL("lowess-steps"),iterAdd;
+ bool redoSmooth;
+ for(iterAdd=0;iterAdd<6;iterAdd++){ // Increase iteration if anything is <=0
+ redoSmooth = false;
+ lowess(exp,alp,f,iter+iterAdd,alpS);
+ for(i=0;i<pAll;i++)
+ if(alpS[i]<=0){
+ redoSmooth = true;
+ if(args.flag("veryVerbose"))message(" negative alpha: %lg exp: %lg\n",alpS[i],exp[i]);
+ }
+ if(!redoSmooth)break;
+ if(args.verbose)message("Re-Smoothing alpha.\n");
+ }
+ outF<<"# alphaSmooth f: "<<f<<" nSteps: "<<iter+iterAdd<<endl;
+ if(args.verbose)message("# alphaSmooth f: %lg nSteps: %ld\n",f,iter+iterAdd);
+ if((iterAdd==6)&&(args.flag("noforce"))){
+ error("Main: Unable to produce smooth alpha >0.\nTry adjusting the parameter lowess-f.\n");
+ outF.close();
+ remove(args.getS("outFileName").c_str());
+ return 0;
+ }
+ for(iterAdd=0;iterAdd<6;iterAdd++){ // Increase iteration if anything is <=0
+ redoSmooth = false;
+ lowess(exp,bet,f,iter+iterAdd,betS);
+ for(i=0;i<pAll;i++)
+ if(betS[i]<=0){
+ redoSmooth = true;
+ if(args.flag("veryVerbose"))message(" negative beta: %lg exp: %lg\n",betS[i],exp[i]);
+ }
+ if(!redoSmooth)break;
+ if(args.verbose)message("Re-Smoothing beta.\n");
+ }
+ outF<<"# betaSmooth f: "<<f<<" nSteps: "<<iter+iterAdd<<endl;
+ if(args.verbose)message("# betaSmooth f: %lg nSteps: %ld\n",f,iter+iterAdd);
+ if((iterAdd==6)&&(args.flag("noforce"))){
+ error("Main: Unable to produce smooth beta >0.\nTry adjusting the parameter lowess-f.\n");
+ outF.close();
+ remove(args.getS("outFileName").c_str());
+ return 0;
+ }
+ if(!args.flag("noforce")){
+ for(i=0;i<pAll;i++)
+ while((i<pAll)&&((alpS[i]<=0)||(betS[i]<=0))){
+ message("Removing: %lg %lg %lg\n",alpS[i],betS[i],exp[i]);
+ alpS.erase(alpS.begin()+i); betS.erase(betS.begin()+i); exp.erase(exp.begin()+i);
+ pAll = alpS.size();
+ }
+ }
+ pDistinct = 1;
+ for(i=1;i<pAll;i++)if(exp[i]!=exp[i-1])pDistinct++;
+ outF<<"# PN "<<pDistinct<<" hyperparameters"<<endl;
+ outF<<"# columns: alpha beta expression "<<endl;
+ outF<<alpS[0]<<" "<<betS[0]<<" "<<exp[0]<<endl;
+ for(i=1;i<pAll;i++)
+ if(exp[i]!=exp[i-1])outF<<alpS[i]<<" "<<betS[i]<<" "<<exp[i]<<endl;
+ outF.close();
+ if(args.verbose){message("DONE.\n");timer.stop(1,'m');}
+ return 0;
+}
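+
+/* Illustration (editorial sketch, not part of the upstream sources): the core
+ * of the sampler above is a random-walk Metropolis-Hastings step whose
+ * proposal width is rescaled until the acceptance rate falls into [0.25, 0.5].
+ * A minimal standalone version of that adaptation loop might look like:
+ *
+ *   double mult = 2.0, acceptR = 0.0;
+ *   while((acceptR < 0.25) || (acceptR > 0.5)){
+ *     long accepted = 0;
+ *     for(long s = 0; s < 1000; s++){
+ *       double xP = fabs(x + mult * normal(rng));   // symmetric proposal
+ *       if(uniform(rng) < posterior(xP) / posterior(x)){ x = xP; accepted++; }
+ *     }
+ *     acceptR = accepted / 1000.0;
+ *     if(acceptR < 0.25) mult /= 1.02;              // narrow the proposal
+ *     if(acceptR > 0.5)  mult *= 1.02;              // widen the proposal
+ *   }
+ *
+ * Here posterior(), normal() and uniform() are placeholders for the
+ * model-specific density and the RNG distributions used above.
+ */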
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return estimateHyperPar(&argc,argv);
+}
+#endif
diff --git a/estimateVBExpression.cpp b/estimateVBExpression.cpp
new file mode 100644
index 0000000..9c78b37
--- /dev/null
+++ b/estimateVBExpression.cpp
@@ -0,0 +1,238 @@
+#include "ArgumentParser.h"
+#include "FileHeader.h"
+#include "misc.h"
+#include "MyTimer.h"
+#include "SimpleSparse.h"
+#include "TagAlignments.h"
+#include "transposeFiles.h"
+#include "VariationalBayes.h"
+
+#include "common.h"
+
+SimpleSparse* readData(const ArgumentParser &args, long trM){//{{{
+/*
+ As parse(filename,maxreads=None) in python
+ Python difference:
+ - missing maxreads check
+ (abort if more than maxreads reads were processed)
+*/
+ long i,j,num,tid;
+ double prb;
+ long Ntotal=0,Nmap=0, M=0;
+ string readId,strand,blank;
+ ifstream inFile;
+ MyTimer timer;
+ TagAlignments *alignments = new TagAlignments();
+
+ // Read alignment probabilities {{{
+ inFile.open(args.args()[0].c_str());
+ FileHeader fh(&inFile);
+ ns_fileHeader::AlignmentFileType format;
+ if((!fh.probHeader(&Nmap,&Ntotal,&M,&format)) || (Nmap ==0)){//{{{
+ error("Prob file header read failed.\n");
+ return NULL;
+ }//}}}
+ if(format == ns_fileHeader::OLD_FORMAT){
+ error("Please use new/log format of Prob file.");
+ return NULL;
+ }
+ message("N mapped: %ld\n",Nmap);
+ messageF("N total: %ld\n",Ntotal);
+ if(args.verb())message("Reading alignments.\n");
+ alignments->init(Nmap,0,M);
+ long mod=10000;
+ long bad = 0;
+ timer.start();
+ for(i = 0; i < Nmap; i++) {
+ inFile>>readId>>num;
+ if(!inFile.good())break;
+ // message("%s %ld\n",(readId).c_str(),num);
+ for(j = 0; j < num; j++) {
+ inFile>>tid>>prb;
+ if(inFile.fail()){
+ inFile.clear();
+ // ignore rest of line
+ j=num;
+ // this read goes to noise
+ tid=0;
+          // 10 means either 10 or exp(10), but it should still be large enough
+ prb=10;
+ bad++;
+ }
+ switch(format){
+ case ns_fileHeader::NEW_FORMAT:
+ alignments->pushAlignment(tid, prb);
+ break;
+ case ns_fileHeader::LOG_FORMAT:
+ alignments->pushAlignmentL(tid, prb);
+ break;
+ default:;
+ }
+ }
+ // ignore rest of line
+ inFile.ignore(10000000,'\n');
+
+ alignments->pushRead();
+
+ R_INTERUPT;
+ if(args.verb() && (i % mod == 0) && (i>0)){
+ message(" %ld ",i);
+ timer.split();
+ mod*=10;
+ }
+ }
+   if(bad>0)warning("Main: alignment information for %ld reads was corrupted.\n",bad);
+ inFile.close();
+ long Nhits,NreadsReal;
+ alignments->finalizeRead(&M, &NreadsReal, &Nhits);
+ // Increase M based on number of transcripts in trInfo file.
+ if(M<trM)M = trM;
+ //}}}
+ if(i<Nmap)message("Read only %ld reads.\n",NreadsReal);
+ message("All alignments: %ld\n",Nhits);
+ messageF("Isoforms: %ld\n",M);
+ Nmap = NreadsReal;
+
+ SimpleSparse *beta = new SimpleSparse(Nmap, M, Nhits);
+
+ for(i=0;i<=Nmap;i++)beta->rowStart[i]=alignments->getReadsI(i);
+ for(i=0;i<Nhits;i++){
+ beta->val[i]=alignments->getProb(i);
+ beta->col[i]=alignments->getTrId(i);
+ }
+
+ delete alignments;
+ return beta;
+}//}}}
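+
+/* Illustration (editorial sketch): after the header parsed by
+ * FileHeader::probHeader(), readData() expects one line per read:
+ *
+ *   <readId> <numAlignments> <trId_1> <prob_1> ... <trId_n> <prob_n>
+ *
+ * e.g. (made-up values; with LOG_FORMAT the probabilities are logged):
+ *
+ *   read_1 2 1 -0.105 2 -2.303
+ *   read_2 1 2 -0.011
+ *
+ * A line that fails to parse sends the read to the noise transcript (tid 0).
+ */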
+
+extern "C" int estimateVBExpression(int *argc, char* argv[]) {//{{{
+string programDescription =
+"Estimates expression given precomputed probabilities of (observed) reads' alignments.\n\
+ Uses Variational Bayes algorithm to produce parameters for distribution of relative abundances.\n";
+ // Set options {{{
+ ArgumentParser args;
+ args.init(programDescription,"[prob file]",1);
+ args.addOptionS("o","outPrefix","outFilePrefix",1,"Prefix for the output files.");
+ args.addOptionS("O","outType","outputType",0,"Output type (theta, RPKM, counts) of the samples sampled from the distribution.","theta");
+ args.addOptionS("t","trInfoFile","trInfoFileName",0,"File containing transcript information. (Necessary for RPKM samples)");
+ args.addOptionL("P","procN","procN",0,"Limit the maximum number of threads to be used.",4);
+ args.addOptionS("m","method","optMethod",0,"Optimization method (steepest, PR, FR, HS).","FR");
+ args.addOptionL("s","seed","seed",0,"Random initialization seed.");
+ args.addOptionL("","maxIter","maxIter",0,"Maximum number of iterations.",(long)1e4);
+ args.addOptionD("","optLimit","limit",0,"Optimisation limit in terms of minimal gradient or change of bound.",1e-5);
+ args.addOptionL("","samples","samples",0,"Number of samples to be sampled from the distribution.");
+ args.addOptionB("V","veryVerbose","veryVerbose",0,"More verbose output, better if output forwarded into file.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ OPT_TYPE optM;
+ if(args.isSet("optMethod")){
+ if((args.getLowerS("optMethod")=="steepest")||
+ (args.getLowerS("optMethod")=="vbem"))optM = OPTT_STEEPEST;
+ else if(args.getLowerS("optMethod")=="pr")optM = OPTT_PR;
+ else if(args.getLowerS("optMethod")=="fr")optM = OPTT_FR;
+ else if(args.getLowerS("optMethod")=="hs")optM = OPTT_HS;
+ else optM = OPTT_FR;
+ }else optM = OPTT_FR;
+ args.updateS("outputType", ns_expression::getOutputType(args, "theta"));
+ if(args.getS("outputType") == "tau"){
+ error("Main: 'tau' is not valid output type.\n");
+ return 1;
+ }
+ // }}}
+ MyTimer timer;
+ timer.start(2);
+ long M = 0;
+ SimpleSparse *beta;
+ TranscriptInfo trInfo;
+
+ // {{{ Read transcriptInfo and .prob file
+ if((!args.isSet("trInfoFileName"))||(!trInfo.readInfo(args.getS("trInfoFileName")))){
+ if(args.isSet("samples") && (args.getL("samples")>0) && (args.getS("outputType") == "rpkm")){
+ error("Main: Missing transcript info file. The file is necessary for producing RPKM samples.\n");
+ return 1;
+ }
+ }else{
+ M = trInfo.getM()+1;
+ }
+ beta = readData(args,M);
+ if(! beta){
+ error("Main: Reading probabilities failed.\n");
+ return 1;
+ }
+ M = beta->M;
+ if(M<=0){
+ error("Main: Invalid number of transcripts in .prob file.\n");
+ return 1;
+ }
+ // }}}
+
+ if(args.verbose)timer.split();
+
+ if(args.verbose)message("Initializing VB.\n");
+
+ VariationalBayes varB(beta,NULL,ns_misc::getSeed(args),args.getL("procN"));
+
+ if(args.verbose)timer.split();
+ if(args.verbose)message("Starting VB optimization.\n");
+
+#ifdef LOG_CONV
+ varB.setLog(args.getS("outFilePrefix")+".convLog",&timer);
+#endif
+
+ // Optimize:
+ if(!args.verbose)varB.beQuiet();
+ varB.optimize(args.flag("veryVerbose"),optM,args.getL("maxIter"),args.getD("limit"),args.getD("limit"));
+
+ if(args.verbose){timer.split(0,'m');}
+ double *alpha = varB.getAlphas();
+ double alphaSum = 0 ;
+ long i;
+ for(i=0;i<M;i++)alphaSum+=alpha[i];
+ ofstream outF;
+ if(! ns_misc::openOutput((args.getS("outFilePrefix")+".m_alphas"), &outF)){
+ return 1;
+ }
+ outF<<"# "<<args.args()[0]<<endl;
+ outF<<"# M "<<M<<"\n"
+ "# List includes also 'noise' transcript (first line)\n"
+ "# <alpha> - parameter of Dirichlet distribution\n"
+ "# <alpha> <beta> - parameters of the marginal Gamma distribution\n"
+ "# columns: <mean theta> <alpha> <beta>"<<endl;
+ outF<<scientific;
+ outF.precision(9);
+ for(i=0;i<M;i++){
+ outF<<alpha[i]/alphaSum<<" "<<alpha[i]<<" "<<alphaSum-alpha[i]<<endl;
+ }
+ outF.close();
+ // free memory
+ delete beta;
+ delete[] alpha;
+ if(args.isSet("samples") && (args.getL("samples")>0)){
+ string outTypeS = args.getS("outputType");
+ string samplesFName = args.getS("outFilePrefix")+".VB" + outTypeS;
+ string samplesTmpName = args.getS("outFilePrefix")+".VB"+outTypeS+"TMP";
+ timer.start(0);
+ if(args.verbose)messageF("Generating samples into temporary file %s. ",samplesTmpName.c_str());
+ if(!ns_misc::openOutput(samplesTmpName, &outF)) return 1;
+ // Samples are generated without the "noise transcript".
+ outF<<"# M "<<M-1<<" N "<<args.getL("samples")<<endl;
+ varB.generateSamples(args.getL("samples"), outTypeS, trInfo.getShiftedLengths(), &outF);
+ outF.close();
+ if(args.verbose)timer.split(0);
+ if(transposeFiles(vector<string>(1, samplesTmpName), samplesFName, args.verbose, "")){
+ if(args.verbose)message("Removing temporary file %s.\n", samplesTmpName.c_str());
+ remove(samplesTmpName.c_str());
+ }else {
+ error("Main: Transposing samples failed.\n");
+ return 1;
+ }
+ }
+ if(args.verbose){message("DONE. "); timer.split(2,'m');}
+ return 0;
+}//}}}
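+
+/* Editorial note: the .m_alphas file above stores, per transcript i,
+ *
+ *   E[theta_i] = alpha_i / alphaSum,   alpha_i,   alphaSum - alpha_i
+ *
+ * where alphaSum = sum_j alpha_j; the first column is the posterior mean
+ * relative abundance and the last two are the parameters the header refers
+ * to as <alpha> and <beta> of the marginal distribution of theta_i.
+ */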
+
+#ifndef BIOC_BUILD
+int main(int argc, char* argv[]) {
+ return estimateVBExpression(&argc,argv);
+}
+#endif
diff --git a/extractSamples.cpp b/extractSamples.cpp
new file mode 100644
index 0000000..4c05687
--- /dev/null
+++ b/extractSamples.cpp
@@ -0,0 +1,126 @@
+/*
+ *
+ * Extract samples of given transcripts.
+ *
+ *
+ */
+#include<iostream>
+#include<cstdlib>
+#include<algorithm>
+
+using namespace std;
+
+#include "PosteriorSamples.h"
+#include "ArgumentParser.h"
+#include "common.h"
+
+#define Sof(x) (long)x.size()
+
+vector <long> tokenizeL(const string &input,const string &space = " "){//{{{
+ vector <long> ret;
+ long pos=0,f=0,n=input.size();
+ while((pos<n)&&(f<n)&&(f>=0)){
+ f=input.find(space,pos);
+ if(f==pos)pos++;
+ else{
+ if((f <n)&&(f>=0)){
+ ret.push_back(atoi(input.substr(pos,f-pos).c_str()));
+ pos=f+1;
+ }
+ }
+ }
+ if(pos<n)ret.push_back(atoi(input.substr(pos,n-pos).c_str()));
+ return ret;
+} //}}}
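+
+// Usage note (editorial): tokenizeL splits a delimited string into longs, e.g.
+//   tokenizeL("0,17,47,1024", ",")  ->  {0, 17, 47, 1024}
+// Malformed tokens go through atoi(), so "x" would silently become 0.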
+
+int main(int argc,char* argv[]){
+ srand(time(NULL));
+ string programDescription=
+"Extracts MCMC samples of selected transcripts.\n\
+ [sampleFiles] should contain transposed MCMC samples.";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFiles]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionS("L","list","list",0,"Comma delimited list of ZERO-BASED transcript ids (i.e. lines) which should be extracted: 0,17,47,1024,4777");
+ args.addOptionL("r","random","randomN",0,"Choose random [randomN] transcripts.");
+ if(!args.parse(argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+ long i,j,c,C,N,M=0,S;
+ vector<long> trList;
+ Conditions samples;
+
+ // Initialize samples reader
+ if( (!samples.init("NONE", args.args(), &C, &M, &N)) || (C<=0) || (M<=0) || (N<=0)){
+ cerr<<"ERROR: Main: Failed loading MCMC samples."<<endl;
+ return 1;
+ }
+ C=samples.getRN();
+ if(args.isSet("list")){
+ // Process transcripts list:
+ trList = tokenizeL(args.getS("list"),",");
+ sort(trList.begin(),trList.end());
+ // Erase invalid and duplicate IDs
+ for(i=0;i<Sof(trList);i++){
+ if((trList[i]<0)||(trList[i]>=M)||((i>0)&&(trList[i]==trList[i-1]))){
+ trList.erase(trList.begin()+i);
+ i--;
+ }
+ }
+ S=Sof(trList);
+ if(S==0){
+ cerr<<"ERROR: Main: No valid transcript IDs supplied."<<endl;
+ return 1;
+ }
+ }else if(args.isSet("randomN")){
+ // Create list of [randomN] random transcripts
+ S = args.getL("randomN");
+ if((S<=0)||(S>M)){
+ cerr<<"ERROR: Main: Wrong number of transcripts ot output: "<<S<<"."<<endl;
+ return 1;
+ }
+ for(i=0;i<S;i++){
+ j = rand()%M;
+ while(find(trList.begin(),trList.end(),j)!=trList.end())
+ j = rand()%M;
+ trList.push_back(j);
+ }
+ sort(trList.begin(),trList.end());
+ }else{
+ cerr<<"ERROR: Main: Need to specify at least one of --list or --random."<<endl;
+ return 1;
+ }
+ if(args.verbose)cout<<"C: "<<C<<" samples: "<<N<<"\ntranscripts: "<<M<<"\nselected: "<<S<<endl;
+
+ // Open output file and write header
+ ofstream outFile(args.getS("outFileName").c_str());
+ if(! outFile.is_open()){
+ cerr<<"ERROR: Main: File write failed!"<<endl;
+ return 1;
+ }
+ outFile<<"# Selected transcripts from: ";
+ for(i=0;i<C;i++)outFile<<args.args()[i]<<",";
+ outFile<<endl;
+ outFile<<"# transcripts(zero-based): "<<trList[0];
+ for(i=1;i<S;i++)outFile<<","<<trList[i];
+ outFile<<"\n# T (M rows,N cols)\n";
+ outFile<<"# C "<<C<<" (conditions)\n";
+ outFile<<"# M "<<S<<" (out of: "<<M<<")\n# N "<<N<<endl;
+ outFile.precision(9);
+ outFile<<scientific;
+
+ // Copy samples
+ vector<double> tr;
+ for(j=0;j<S;j++){
+ if(args.verbose)cout<<trList[j]<<" ";
+ cout.flush();
+ for(c=0;c<C;c++){
+ samples.getTranscript(c,trList[j], tr);
+ for(i=0;i<N;i++)outFile<<tr[i]<<" ";
+ outFile<<endl;
+ }
+ }
+ outFile.close();
+ if(args.verbose)cout<<"DONE"<<endl;
+ return 0;
+}
diff --git a/extractTranscriptInfo.py b/extractTranscriptInfo.py
new file mode 100755
index 0000000..879076c
--- /dev/null
+++ b/extractTranscriptInfo.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python
+# Initialization {{{
+import sys
+from optparse import OptionParser
+parser = OptionParser(usage="%prog [options] <inputFile> <outputFile>\n\n\
+   This program extracts information about transcripts from a reference Fasta file.\n\
+ This is partially replaced by using SAM header, which however does not include information about transcript-gene grouping.\n\
+ Current version of parseAlignment extracts this information from a reference sequence file (making this script obsolete).\
+")
+parser.add_option("-v", "--verbose", default=False, dest="verbose", action="store_true", help="Verbose output")
+parser.add_option("-t","--type",dest="type", type="string",help="Type of file to parse: ensembl, cuff, other");
+
+(options, args) = parser.parse_args()
+def verbose(str):
+ if options.verbose:
+ print str;
+
+if len(args)<2:
+ sys.exit("Missing arguments");
+
+try:
+ inF = open(args[0],"r");
+except:
+ sys.exit("Unable to open input file: "+args[0]+" .");
+
+
+try:
+ outF = open(args[1],"w");
+except:
+ sys.exit("Unable to open output file: "+args[1]+" .");
+#}}}
+
+seqName="";
+geneName="";
+seqLen=0;
+seqCount=0;
+
+result = [];
+li = 0;
+
+if options.type:
+ if options.type=="ensembl":
+ itype = "ens";
+ print "Expecting header line format:\n>[tr Name] .* gene:[gene Name] .*";
+ elif options.type=="cuff":
+ itype = "cuf";
+ print "Expecting header line format:\n>[tr Name] .* gene=[gene Name] .*";
+ else:
+ itype = "non";
+ print "Expecting header line format:\n>[tr Name] .*\n -> using \"none\" as gene names";
+else:
+ itype = "non";
+ print "Expecting header line format:\n>[tr Name] .*\n -> using \"none\" as gene names";
+
+for line in inF:
+ li+=1;
+ if line[0] == '>':
+ if seqName!="":
+ result.append([geneName,seqName,str(seqLen)]);
+ seqLen=0;
+ seqCount+=1;
+ # Split line after >
+ lSplit = line[1:].split()
+ seqName = lSplit[0];
+ if seqName == "":
+ seqName = "unknown-tr"+str(seqCount);
+ print "Warning: no name on line ",li,". Using '",seqName,"'.";
+ if itype == "non":
+ geneName = "none";
+ else:
+ geneName = ""
+ for it in lSplit:
+ if (itype=="ens" and "gene:" in it) or (itype=="cuf" and "gene=" in it) :
+ geneName=it[5:];
+ if geneName == "":
+ geneName = seqName;
+ else:
+ seqLen+=len(line)-1;
+if seqName!="":
+ result.append([geneName,seqName,str(seqLen)]);
+
+inF.close();
+
+verbose(str(seqCount)+" sequences processed.");
+
+outF.write("# M "+str(seqCount)+"\n");
+for it in result:
+ outF.write(it[0]+" "+it[1]+" "+it[2]+"\n");
+
+outF.close();
+
diff --git a/getCounts.py b/getCounts.py
new file mode 100755
index 0000000..44e514b
--- /dev/null
+++ b/getCounts.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+# Initialization {{{
+import sys
+import numpy as np
+#import os, time # needed for this:
+#time_str = time.strftime("%b %e %Y %H:%M:%S", time.gmtime(os.lstat(sys.argv[0]).st_mtime));
+#print "###",os.path.basename(sys.argv[0]),"build:",time_str;
+
+from optparse import OptionParser
+parser = OptionParser(usage="%prog [options] [<inputFile.thetaMeans>]+\n\n\
+    This program reads the supplied .thetaMeans files and, using either information from .prob files or the --Nmap option, generates read counts for each input file provided.")
+parser.add_option("-o", "--outFile", dest="out", help="Output file", type="string")
+parser.add_option("-v", "--verbose", default=False, dest="verbose", action="store_true", help="Verbose output")
+parser.add_option("-p", "--probDir", dest="probDir", help="Directory with .prob files. The program will look in here for files with same name except fot extension .prob in order to find out total-aligned-read counts for each experiment.", type="string")
+parser.add_option("-n", "--Nmap", dest="Nmap", help = "Comma separated list of total aligned-read-counts for each experiment.",type="string");
+def verbose(str):
+ if options.verbose:
+ print str;
+(options, args) = parser.parse_args()
+
+if len(args)==0:
+ sys.exit("Please supply .thetaMeans filenames as arguments.");
+if not options.out:
+ sys.exit("Please supply output file");
+if (not options.probDir) and (not options.Nmap):
+ sys.exit("Please use either --Nmap or --probDir.");
+#}}}
+
+if options.Nmap:
+ try:
+ N = [ float(it) for it in options.Nmap.split(",")]
+ if len(N) != len(args):
+ raise;
+ except:
+ sys.exit("Unable to turn '"+options.Nmap+"' into "+str(len(args))+" numbers.");
+else:
+ N = []
+ for arg in args:
+ fn = arg.split("/")[-1];
+ if fn[-11:] == '.thetaMeans':
+ fn = options.probDir +"/"+fn[:-11]+".prob";
+ else:
+ fn = options.probDir +"/"+fn+".prob";
+ try:
+ inF = open(fn);
+ except:
+ sys.exit("Unable to open file: "+fn);
+ print "Reading file: ",fn;
+ Nmap = 0;
+ for line in inF:
+ if line[0]!="#": break;
+ ls=line.split();
+ for i in xrange(len(ls)-1):
+ if ls[i] == "Nmap": Nmap = int(ls[i+1]);
+ inF.close();
+ if Nmap <= 0:
+ sys.exit("Unable to find valid Nmap in: "+fn);
+ N.append(Nmap);
+
+
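+# Editorial note: the count estimate written below is round(theta_mean * Nmap)
+# per transcript and file; e.g. theta_mean = 0.00025 with Nmap = 1e6 gives 250.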
+means = [np.transpose(np.loadtxt(arg))[1] for arg in args];
+print "Files:";
+for j in xrange(len(args)):
+ print " ",args[j],N[j];
+
+try:
+ outF = open(options.out,"w");
+except:
+ sys.exit("Unable to open output file: ",options.out);
+
+for i in xrange(len(means[0])):
+ for j in xrange(len(means)):
+ outF.write(str(long(round(means[j][i]*N[j])))+" ");
+ outF.write("\n");
+
+outF.close();
+
+
diff --git a/getFoldChange.cpp b/getFoldChange.cpp
new file mode 100644
index 0000000..7f1a345
--- /dev/null
+++ b/getFoldChange.cpp
@@ -0,0 +1,112 @@
+/*
+ *
+ * Compute Fold Change between expression samples.
+ *
+ *
+ */
+#include <cmath>
+#include <iostream>
+
+using namespace std;
+
+#include "PosteriorSamples.h"
+#include "ArgumentParser.h"
+#include "common.h"
+
+
+int main(int argc,char* argv[]){
+ string programDescription=
+"Computes log_2 Fold Change from MCMC expression samples.\n\
+ [sampleFiles] should contain transposed MCMC samples from replicates.\n\
+ (use --log option if they are not logged)";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFiles]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionB("l","log","log",0,"Use logged values.");
+// args.addOptionS("t","type","type",0,"Type of variance, possible values: [sample,sqDif] for sample variance or sqared difference.","sample");
+ if(!args.parse(argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+ bool doLog=args.flag("log");
+ if(doLog){
+ if(args.verbose)cout<<"Will log expression samples to produce log_2 Fold Chnage."<<endl;
+ }else{
+ if(args.verbose)cout<<"Assuming samples are logged, producing log_2 Fold Change."<<endl;
+ }
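+  // Editorial note: with expression samples e1, e2 the reported value is
+  //   log2(e2/e1) = (ln(e2) - ln(e1)) / ln(2),
+  // so for pre-logged samples only the difference divided by ln(2) is needed.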
+
+ long i,j,r,N,RN,M=0,C;
+
+ Conditions cond;
+ if(! (cond.init("NONE", args.args(), &C, &M, &N))){
+ cerr<<"ERROR: Main: Failed loading MCMC samples."<<endl;
+ return 0;
+ }
+ RN=cond.getRN();
+ if((RN>2)&&(C!=2)){//{{{
+ cout<<"Please specify exactly 2 conditions when using more than two sample files.\n";
+ cout<<" such as: [sample Files from first condition] C [sample files from second condition]"<<endl;
+ return 0;
+ }//}}}
+ if(args.verbose)cout<<"Samples: "<<N<<" transcripts: "<<M<<endl;
+
+ ofstream outFile(args.getS("outFileName").c_str());
+ if(! outFile.is_open()){
+ cerr<<"ERROR: Main: File write failed!"<<endl;
+ return 0;
+ }
+ outFile<<"# log_2 Fold Change in expression."<<endl;
+ outFile<<"# files: ";
+ for(r=0;r<2;r++)outFile<<args.args()[r]<<" ";
+ outFile<<endl;
+ outFile<<"# T (M rows,N cols)"<<endl;
+ outFile<<"# M "<<M<<endl;
+ outFile<<"# N "<<N<<endl;
+ vector<double> tr,tr2,res(N);
+ double l2=log(2.0);
+ long RC;
+ for(j=0;j<M;j++){
+ if(args.verbose)progressLog(j,M);
+ if(RN==2){
+ if(cond.getTranscript(0,j,tr)&&cond.getTranscript(1,j,tr2)){
+ for(i=0;i<N;i++){
+          if(doLog)outFile<<log(tr2[i]/tr[i])/l2<<" ";
+          else outFile<<(tr2[i]-tr[i])/l2<<" ";
+ }
+ outFile<<endl;
+ }else{
+ cerr<<"Failed loading "<<j<<" transcript."<<endl;
+ }
+ }else{
+      // Comparing arithmetic means of logged samples, which correspond to geometric means of the original samples.
+ res.assign(N,0);
+ RC = cond.getRC(1);
+ for(r=0;r< RC;r++){
+ if(cond.getTranscript(1,r,j,tr)){
+ for(i=0;i<N;i++)
+ if(doLog)res[i]+=log(tr[i])/RC;
+ else res[i]+=tr[i]/RC;
+ }else{
+ cerr<<"Failed loading "<<j<<" transcript from condition 1 replicate "<<r<<endl;
+ }
+ }
+ RC = cond.getRC(0);
+ for(r=0;r<RC;r++){
+ if(cond.getTranscript(0,r,j,tr)){
+ for(i=0;i<N;i++)
+ if(doLog)res[i]-=log(tr[i])/RC;
+ else res[i]-=tr[i]/RC;
+ }else{
+ cerr<<"Failed loading "<<j<<" transcript from condition 0 replicate "<<r<<endl;
+ }
+ }
+ for(i=0;i<N;i++)
+ outFile<<res[i]/l2<<" ";
+ outFile<<endl;
+ }
+ }
+ cond.close();
+
+ outFile.close();
+ if(args.verbose)cout<<"DONE"<<endl;
+ return 0;
+}
diff --git a/getGeneExpression.cpp b/getGeneExpression.cpp
new file mode 100644
index 0000000..80fb989
--- /dev/null
+++ b/getGeneExpression.cpp
@@ -0,0 +1,120 @@
+/*
+ *
+ * Produce overall gene expression
+ *
+ */
+#include<cmath>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "misc.h"
+#include "PosteriorSamples.h"
+#include "TranscriptInfo.h"
+
+#include "common.h"
+
+extern "C" int getGeneExpression(int *argc,char* argv[]){
+ string programDescription=
+"Computes expression of whole genes.\n\
+ [samplesFile] should contain transposed MCMC samples which will be transformed into gene expression samples.";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[samplesFile]",1);
+ args.addOptionS("t","trInfoFile","trInfoFileName",1,"Name of the transcript file.");
+ args.addOptionB("a","adjustByLength","adjust",0,"Adjust expression by transcripts length.");
+ args.addOptionB("","theta2rpkm","rpkm",0,"Transform transcript expression in theta to gene expression in RPKM.");
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionB("l","log","log",0,"Output logged values.");
+ args.addOptionS("T","trMap","trMapFile",0,"Name of the file containing transcript to gene mapping.");
+ args.addOptionS("G","geneList","geneListFile",0,"Name of the file containing list of gene names (one for each transcript).");
+ args.addOptionB("","updateTrFile","updateTrFile",0,"Update trInfoFile if new gene names were provided (with trMapFile or geneListFile).");
+ args.addOptionS("g","geneInfoFile","geneInfoFile",0,"Name of while to which gene information will be saved.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+ bool doLog,doAdjust=args.flag("adjust")||args.flag("rpkm"),doRPKM=args.flag("rpkm");
+ doLog = ns_genes::getLog(args);
+
+ long N=0,M=0,G;
+ TranscriptInfo trInfo;
+ PosteriorSamples samples;
+ if(!ns_genes::prepareInput(args, &trInfo, &samples, &M, &N, &G))return 1;
+ if(!ns_genes::updateGenes(args, &trInfo, &G))return 1;
+ if(args.verb())messageF("Genes: %ld\n",G);
+ if(!ns_genes::checkGeneCount(G,M))return 1;
+ if(args.flag("updateTrFile") && (args.isSet("trMapFile") || args.isSet("geneListFile"))){
+ if(args.verb())message("Updating transcript info file with new gene names.\n");
+ if(!trInfo.writeInfo(args.getS("trInfoFileName"), true)){
+ if(args.verb())warning("Main: Updating trInfoFile failed.\n");
+ }
+ }
+ if(args.isSet("geneInfoFile")){
+ if(args.verb())message("Saving gene information into: %s.\n",args.getS("geneInfoFile").c_str());
+ if(!trInfo.writeGeneInfo(args.getS("geneInfoFile"))){
+ warning("Main: Writing gene information failed.\n");
+ }
+ }
+
+ ofstream outFile;
+   if(!ns_misc::openOutput(args, &outFile))return 1;
+   // Write output header {{{
+ outFile<<"# from: "<<args.args()[0]<<"\n# samples of gene expression\n";
+ if(args.verbose)message("Genes will be ordered as they first appear in %s.\n",(args.getS("trInfoFileName")).c_str());
+ outFile<<"# Genes will be ordered as they first appear in "<<args.getS("trInfoFileName")<<"\n";
+ if(doRPKM)outFile<<"# data in RPKM\n";
+ if(doLog)outFile<<"# L \n";
+ outFile<<"# T (M rows,N cols)\n";
+ outFile<<"# G = M "<<G<<"\n# N "<<N<<endl;
+ // Set precision.
+ outFile.precision(9);
+ outFile<<scientific;
+ // }}}
+ vector< vector<double> > trs;
+ vector<long double> normals(N,0);
+ long double sum;
+ long i,j,g,gM,m;
+ if(doAdjust){
+ vector<double> tr(M);
+ if(args.verbose)message("Computing normalization constants, because of length adjustment.\n");
+ for(j=0;j<M;j++){
+ if(args.verbose)progressLog(j,M);
+ samples.getTranscript(j,tr);
+ for(i=0;i<N;i++)
+ normals[i] += tr[i]/trInfo.L(j);
+ }
+ }
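+   // Editorial note: normals[i] accumulates sum_j theta_j(i)/L_j over all M
+   // transcripts, so below (theta_m(i)/L_m)/normals[i] is the length-adjusted
+   // relative expression of transcript m in sample i, summing to 1 over m.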
+ if(args.verbose)message("Computing gene expression.\n");
+ for(g=0;g<G;g++){
+ if(args.verbose)progressLog(g,G);
+ gM = trInfo.getGtrs(g).size();
+ if((long)trs.size()<gM)trs.resize(gM);
+ for(j=0;j<gM;j++){
+ m = trInfo.getGtrs(g)[j];
+ samples.getTranscript( m , trs[j]);
+ }
+ for(i=0;i<N;i++){
+ sum = 0;
+ for(j=0;j<gM;j++){
+ if(doAdjust&&(normals[i]>0)){
+ m = trInfo.getGtrs(g)[j];
+ sum+=(trs[j][i] / trInfo.L(m)) / normals[i];
+ }else{
+ sum+=trs[j][i];
+ }
+ }
+ if(doRPKM)sum=sum*10e9;
+ if(doLog)sum=log(sum);
+ outFile<<sum<<" ";
+ }
+ outFile<<endl;
+ }
+ outFile.close();
+ if(args.verbose)message("DONE\n");
+ return 0;
+}
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return getGeneExpression(&argc,argv);
+}
+#endif
diff --git a/getPPLR.cpp b/getPPLR.cpp
new file mode 100644
index 0000000..2fb6d95
--- /dev/null
+++ b/getPPLR.cpp
@@ -0,0 +1,152 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "common.h"
+#include "misc.h"
+#include "PosteriorSamples.h"
+
+int main(int argc,char* argv[]){
+ string programDescription=
+"Computes PPLR from MCMC expression samples.\n"
+" (the probability of second condition being up-regulated)\n"
+" Also computes log2 fold change with confidence intervals, and condition mean log expression.\n"
+" [sampleFiles] should contain transposed MCMC samples from different conditions.";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFile-C1] [sampleFile-C1]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionB("","inputIsLogged","logged",0,"Indicate that the input expression estimates are on log scale. (Not necessary to use with data generated by BitSeq-0.5.0 and above.)");
+ args.addOptionB("d","distribution","distribution",0,"Produce whole distribution of differences.");
+ args.addOptionS("s","selectFile","selectFileName",0,"File containing list of selected transcript IDs (zero based), only these will be reported. Only works with --distribution option.");
+ args.addOptionD("","subSample","subSample",0,"Sub-sample the distributions using a given fraction of expression samples.",1.0);
+ if(!args.parse(argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+
+ long i,m,N,M;
+ bool getAll=false, doLog = true;
+ vector<long> trSelect;
+ if(! args.isSet("selectFileName")){
+ getAll=true;
+ }else{
+ ifstream selectF (args.getS("selectFileName").c_str());
+ if(! selectF.is_open()){
+ cerr<<"ERROR: Main: Failed loading selected transcripts."<<endl;
+ return 1;
+ }
+ selectF>>m;
+ while(selectF.good()){
+ trSelect.push_back(m);
+ selectF>>m;
+ }
+ selectF.close();
+ sort(trSelect.begin(),trSelect.end());
+ }
+
+ Conditions cond;
+ if(! cond.init("NONE", args.args(), &M, &N)){
+ cerr<<"ERROR: Main: Failed loading conditions."<<endl;
+ return 1;
+ }
+ if(cond.logged() || args.flag("logged")) {
+ doLog = false;
+ if(args.verbose)cout<<"Assuming values are logged already."<<endl;
+ }else {
+ doLog = true;
+ if(args.verbose)cout<<"Will use logged values."<<endl;
+ }
+ if(args.verbose)cout<<"M "<<M<<" N "<<N<<endl;
+ ofstream outFile(args.getS("outFileName").c_str());
+ if(! outFile.is_open()){
+ cerr<<"ERROR: Main: File write probably failed!"<<endl;
+ return 1;
+ }
+ if(getAll){
+ trSelect.resize(M);
+ for(i=0;i<M;i++)trSelect[i]=i;
+ }
+
+ vector<vector<double> > tr(2);
+ vector<double> difs;
+ long subN = N;
+ double frac = args.getD("subSample");
+ if((frac > 0) && (frac < 1))subN = (long)(N * frac);
+ if(subN<1){
+ cerr<<"ERROR: The fraction of samples for sub-sampling is too small."<<endl;
+ return 1;
+ }
+ if((args.getD("subSample")!=1) && args.verbose){
+ cout<<"Using "<<subN<<" samples for sub-sampling."<<endl;
+ }
+ double pplr,mu_0,mu_1,log2FC,ciLow,ciHigh;
+ if(! args.flag("distribution")){
+ if(args.verbose)cout<<"Counting PPLR"<<endl;
+ outFile<<"# Computed PPLR, log2 fold change with 95\% confidence intervals, condition mean log expression."<<endl;
+ outFile<<"# M "<<M<<"\n# columns:"<<endl;
+ outFile<<"# PPLR log2FoldChange ConfidenceLow ConfidenceHigh MeanLogExpressionC1 MeanLogExpressionC2"<<endl;
+ for(m=0;m<M;m++){
+ if(args.verbose)progressLog(m,M);
+ cond.getTranscript(0,m,tr[0],subN);
+ cond.getTranscript(1,m,tr[1],subN);
+ difs.resize(subN);
+ pplr = log2FC = mu_0 = mu_1 = 0;
+ for(i=0;i<subN;i++){
+ if(doLog){
+ if((tr[0][i] <= 0) || (tr[1][i] <= 0)){
+ cerr<<"ERROR: Found non-positive expression (transcript: "<<m<<").\n"
+ " The expression is probably in log scale already.\n"
+ " Please check your data and use --inputIsLogged if that is the case."
+ <<endl;
+ return 1;
+ }
+ tr[1][i] = log(tr[1][i]);
+ tr[0][i] = log(tr[0][i]);
+ }
+ if(tr[1][i]>tr[0][i])pplr+=1;
+ difs[i]=tr[1][i]-tr[0][i];
+ log2FC+=tr[1][i]-tr[0][i];
+ mu_0 += tr[0][i];
+ mu_1 += tr[1][i];
+ }
+ pplr /= subN;
+ mu_0 /= subN;
+ mu_1 /= subN;
+ log2FC /= subN*log(2);
+ ns_misc::computeCI(95, &difs, &ciLow, &ciHigh);
+ ciLow /= log(2);
+ ciHigh /= log(2);
+ outFile<<pplr<<" "<<log2FC<<" "<<ciLow<<" "<<ciHigh<<" "<<mu_0<<" "<<mu_1<<endl;
+ }
+ }else{
+ if(args.verbose)cout<<"Computing Log Ratio distribution"<<endl;
+ long selectM = trSelect.size();
+ outFile<<"# Log Ratio distribution"<<endl;
+ outFile<<"# T "<<endl;
+ outFile<<"# M "<<selectM<<endl;
+ outFile<<"# N "<<subN<<endl;
+ outFile<<"# first column - transcript number (zero based)"<<endl;
+ for(m=0;m<selectM;m++){
+         if(selectM>10)progressLog(m,selectM);
+ cond.getTranscript(0,trSelect[m],tr[0],subN);
+ cond.getTranscript(1,trSelect[m],tr[1],subN);
+ outFile<<trSelect[m]<<" ";
+ for(i=0;i<subN;i++){
+ if(doLog){
+ tr[1][i] = log(tr[1][i]);
+ tr[0][i] = log(tr[0][i]);
+ }
+ outFile<<tr[1][i]-tr[0][i]<<" ";
+ }
+ outFile<<endl;
+ }
+ }
+ outFile.close();
+ cond.close();
+ return 0;
+}
diff --git a/getVariance.cpp b/getVariance.cpp
new file mode 100644
index 0000000..e2bdba2
--- /dev/null
+++ b/getVariance.cpp
@@ -0,0 +1,167 @@
+/*
+ *
+ * Compute posterior variance of samples.
+ *
+ *
+ */
+#include<cmath>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "misc.h"
+#include "PosteriorSamples.h"
+
+#include "common.h"
+
+extern "C" int getVariance(int *argc,char* argv[]){
+ string programDescription=
+"Estimates variance of MCMC samples from 1 or multiple replicates.\n\
+ [sample Files] should contain transposed MCMC samples from replicates.";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[sampleFiles]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionB("l","log","log",0,"Use logged values.");
+ args.addOptionS("t","type","type",0,"Type of variance, possible values: [sample,sqDif] for sample variance or squared difference.","sample");
+ args.addOptionS("","norm","normalization",0,"Normalization constants for each input file provided as comma separated list of doubles (e.g. 1.0017,1.0,0.9999 ).");
+ if(!args.parse(*argc,argv)){return 0;}
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ bool doLog=args.flag("log"),logged=false;
+ // }}}
+
+ long i,j,r,N,RN,M=0;
+
+ Conditions cond;
+ if(! (cond.init("NONE", args.args(), &M, &N))){
+ error("Main: Failed loading MCMC samples.\n");
+ return 1;
+ }
+ if(doLog){
+ logged = true;
+ if(cond.logged()){
+ doLog=false;
+ if(args.verbose)message("Samples are already logged, computing mean.\n");
+ }else{
+ if(args.verbose)message("Using logged values.\n");
+ }
+ }else{
+ if(args.verbose)message("NOT using logged values.\n");
+ if(cond.logged())logged=true;
+ }
+ if(args.isSet("normalization")){
+ if(! cond.setNorm(args.getTokenizedS2D("normalization"))){
+ error("Main: Applying normalization constants failed.\n");
+ return 1;
+ }
+ }
+ RN=cond.getRN();
+ if((args.getS("type")=="sqDif")&&(RN>2)&&(args.verbose)){//{{{
+ i=0;
+ while(args.args()[i]=="C")i++;
+ message("using only: %s ",(args.args()[i]).c_str());
+ i++;
+ while(args.args()[i]=="C")i++;
+ message("%s\n",(args.args()[i]).c_str());
+ }//}}}
+ if(args.verbose)message("replicates: %ld samples: %ld transcripts: %ld\n",RN,N,M);
+
+ ofstream outFile(args.getS("outFileName").c_str());
+ if(! outFile.is_open()){
+ error("Main: File write failed!\n");
+ return 1;
+ }
+ vector<double> mean(M),var(M);
+ vector<double> tr,tr2;
+ double m,mSq,count,sqDif;
+ bool good=true;
+ if(args.getS("type")=="sample"){ //{{{
+ for(j=0;j<M;j++){
+ if((j%10000==0)&&(j>0)&&args.verbose)message("%ld\n",j);
+
+ m = mSq = count = 0;
+ for(r=0;r<RN;r++){
+ if(cond.getTranscript(r,j,tr,N/RN)){
+ for(i=0;i<N/RN;i++){
+ if(doLog){
+ tr[i]=tr[i]<=0?ns_misc::LOG_ZERO:log(tr[i]);
+ }
+ m+=tr[i];
+ mSq += tr[i]*tr[i];
+ }
+ count+=N/RN;
+ }else{
+ warning("Error at %ld %ld\n",j,r);
+ }
+ // message("%ld %ld\n",m,count);
+ }
+ if(count==0){
+ warning("no samples for transcript: %ld.\n",j);
+ mean[j] = -47;
+ var[j] = -47;
+ }else{
+ mean[j] = m / count;
+ var[j] = mSq/count - m*m/(count*count);
+ }
+ }//}}}
+ }else{ // "sqDif" {{{
+ for(j=0;j<M;j++){
+ if((j%10000==0)&&(j>0)&&args.verbose)message("%ld\n",j);
+      m = sqDif = 0; good = true; // reset: a failed load would otherwise skip all later transcripts
+ if(RN==1){
+ if(! cond.getTranscript(0,j,tr,N)){
+ mean[j] = -47;
+ var[j] = -47;
+ good=false;
+ continue;
+ }
+ tr2.resize(N/2);
+ for(i=0;i<N/2;i++)
+ tr2[i]=tr[i+N/2];
+ }else{
+ if(! (cond.getTranscript(0,j,tr,N/2)&&
+ cond.getTranscript(1,j,tr2,N/2))){
+ mean[j] = -47;
+ var[j] = -47;
+ good=false;
+ continue;
+ }
+ }
+ if(good){
+ for(i=0;i<N/2;i++){
+ if(doLog){
+ tr[i]=tr[i]<=0?ns_misc::LOG_ZERO:log(tr[i]);
+ tr2[i]=tr2[i]<=0?ns_misc::LOG_ZERO:log(tr2[i]);
+ }
+ m+=tr[i]+tr2[i];
+ sqDif+=(tr[i]-tr2[i])*(tr[i]-tr2[i]);
+ }
+ mean[j] = m / N;
+ var[j] = sqDif / N; // == ( sqDif / (N/2) ) / 2
+ }
+ }
+ } //}}}
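+   // Editorial note: "sample" reports the pooled moment estimate
+   //   var = E[x^2] - (E[x])^2
+   // over all replicates, while "sqDif" splits the samples into two halves
+   // (or uses two replicates) and reports mean((x - y)^2)/2 over sample
+   // pairs, an alternative spread estimate based on squared differences.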
+ cond.close();
+
+ outFile<<"# Transcripts mean expression and "<<args.getS("type")<<" variance."<<endl;
+ outFile<<"# files: ";
+ for(r=0;r<RN;r++)outFile<<args.args()[r]<<" ";
+ outFile<<endl;
+ if(logged)outFile<<"# L -> values logged"<<endl;
+ outFile<<"# M "<<M<<endl;
+ (outFile<<scientific).precision(9);
+ for(i=0;i<M;i++){
+ if((mean[i]==-47)&&(var[i]==-47))outFile<<"NaN 0 "<<endl;
+ else outFile<<mean[i]<<" "<<var[i]<<endl;
+ }
+ outFile.close();
+ if(args.verbose)message("DONE\n");
+ return 0;
+}
+
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return getVariance(&argc,argv);
+}
+#endif
diff --git a/getWithinGeneExpression.cpp b/getWithinGeneExpression.cpp
new file mode 100644
index 0000000..ca7846f
--- /dev/null
+++ b/getWithinGeneExpression.cpp
@@ -0,0 +1,248 @@
+/*
+ *
+ * Produce relative expression within gene
+ *
+ */
+#include<cmath>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "misc.h"
+#include "PosteriorSamples.h"
+#include "TranscriptInfo.h"
+
+#include "common.h"
+
+namespace ns_withinGene {
+
+// Read transcripts of gene g.
+void readTranscripts(long g, const TranscriptInfo &trInfo, PosteriorSamples *samples, long *gM, vector< vector<double> > *trs);
+
+// Adjust expression samples by transcript length.
+void adjustExpression(long g, const TranscriptInfo &trInfo, vector< vector<double> > *trs);
+
+// Compute sum of samples of transcripts from one gene.
+void getSum(long gM, long N, const vector< vector<double> > &trs, vector<double> *sum);
+
+// Update 'mean' and squareSum with new value.
+void updateSummaries(double x, long double *mean, long double *sqSum, double norm = 1, bool doLog = false);
+
+// Append samples of a transcript into output file.
+void writeTr(long N, const vector<double> &tr, ofstream *outFile);
+
+} // namespace ns_withinGene
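+
+/* Editorial note: per MCMC sample i, the relative expression of transcript m
+ * within its gene g is computed below as
+ *
+ *   rel_m(i) = x_m(i) / sum_{m' in g} x_{m'}(i)
+ *
+ * (or log x_m(i) - log sum ... when --log is used), with an optional 1/L_m
+ * length adjustment applied to every x before the ratio is taken.
+ */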
+
+extern "C" int getWithinGeneExpression(int *argc,char* argv[]){
+ string programDescription=
+"Computes relative expression of transcripts within genes.\n\
+ [samplesFile] should contain transposed MCMC expression samples.\n\
+ program can produce means and variance and write them into [sumFile]\n\
+ or individual MCMC samples which are written into [outFile].";
+ // Set options {{{
+ ArgumentParser args(programDescription,"[samplesFile]",1);
+ args.addOptionS("t","trInfoFile","trInfoFileName",1,"Name of the transcript file.");
+ args.addOptionB("a","adjustByLength","adjust",0,"Adjust expression by transcripts length.");
+ args.addOptionS("o","outFile","outFileName",0,"Name of the output file.");
+ args.addOptionS("s","sumFile","sumFileName",0,"Name of summarization file where true mean, true variance and relative mean and relative variance are saved.");
+ args.addOptionB("l","log","log",0,"Use logged values.");
+ args.addOptionS("T","trMap","trMapFile",0,"Name of the file containing transcript to gene mapping.");
+ args.addOptionS("G","geneList","geneListFile",0,"Name of the file containing list of gene names (one for each transcript).");
+ args.addOptionB("","groupByGene","groupByGene",0,"Group transcripts by genes (this can change the default order of output.");
+ args.addOptionB("","updateTrFile","updateTrFile",0,"Update trInfoFile if new gene names were provided (with trMapFile or geneListFile).");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ // }}}
+ bool doLog,doOut=args.isSet("outFileName"),doSummaries=args.isSet("sumFileName");
+ if(! (doOut || doSummaries)){
+ error("Main: Have to specify at least one of --outFile/--sumFile.\n");
+ return 1;
+ }
+ doLog = ns_genes::getLog(args);
+
+ long N=0,M=0,G;
+ TranscriptInfo trInfo;
+ PosteriorSamples samples;
+ if(!ns_genes::prepareInput(args, &trInfo, &samples, &M, &N, &G))return 1;
+ if(!ns_genes::updateGenes(args, &trInfo, &G))return 1;
+ if(args.verb())messageF("Genes: %ld\n",G);
+ if(!ns_genes::checkGeneCount(G,M))return 1;
+ if(args.flag("updateTrFile") && (args.isSet("trMapFile") || args.isSet("geneListFile"))){
+ if(args.verb())message("Updating transcript info file with new gene names.\n");
+ if(!trInfo.writeInfo(args.getS("trInfoFileName"), true)){
+ if(args.verb())warning("Main: Updating trInfoFile failed.\n");
+ }
+ }
+
+ ofstream outFile,sumFile;
+ if(doOut){
+      if(!ns_misc::openOutput(args, &outFile))return 1;
+ // Write output header {{{
+ outFile<<"# from: "<<args.args()[0]<<"\n# samples of within gene expression\n";
+ if(! trInfo.genesOrdered()){
+ if(args.flag("groupByGene")){
+ warning("Main: Transcripts in output file will be reordered and grouped by genes.\n");
+ outFile<<"# WARNING: transcripts in output file are reordered and grouped by genes.\n";
+ }else{
+ warning("Main: Transcripts are not grouped by genes.\n"
+ " The transcript order will be kept the same but computation will be slower.\n");
+ }
+ }
+ if(doLog)outFile<<"# L \n";
+ outFile<<"# T (M rows,N cols)\n";
+ outFile<<"# M "<<M<<"\n# N "<<N<<endl;
+ // Set precision.
+ (outFile<<scientific).precision(9);
+ // }}}
+ }
+ if(doSummaries){
+ if(!ns_misc::openOutput(args.getS("sumFileName"), &sumFile))return 1;
+ sumFile<<"# from: "<<args.args()[0]<<"\n# <mean> <variance> <mean of within gene expression> <variance of within gene expression>\n# M "<<M<<endl;
+ // Set precision.
+ (sumFile<<scientific).precision(9);
+ }
+ vector<long double> mean(M,0),mean2(M,0),sqSum(M,0),sqSum2(M,0);
+ vector< vector<double> > trs;
+ vector<double> sum;
+   // Normalisation constants are 1 by default, or equivalently 0 in the log case.
+ vector<double> normals(N,(int)(!doLog));
+ long i,j,g,gM,m;
+ if(args.flag("adjust")&&(doSummaries)){
+ // 'normals' are only precomputed so that non-relative mean and variance are computed from
+ // length adjusted and normalised expression.
+ vector<double> tr(M);
+ if(args.verbose)message("Computing normalization constants, because of length adjustment.\n");
+ normals.assign(N,0);
+ for(j=0;j<M;j++){
+ if(args.verbose)progressLog(j,M);
+ samples.getTranscript(j,tr);
+ for(i=0;i<N;i++)
+ normals[i] += tr[i]/trInfo.L(j);
+ }
+ if(doLog)for(i=0;i<N;i++)normals[i] =
+ (normals[i] != 0) ? log(normals[i]) : ns_misc::LOG_ZERO;
+ }
+ if(args.verbose)message("Computing within gene relative expression.\n");
+ g = -2;
+ if(!args.flag("groupByGene")){
+ long curJ=0;
+ // Here we iterate over transcripts:
+   //  For each transcript: load all transcripts of its gene.
+   //  If the gene is the same as for the previous transcript, reuse the loaded information.
+ for(m=0;m<M;m++){
+ if(args.verbose)progressLog(m,M);
+ if(trInfo.geId(m) == g){
+ for(j=0;j<gM;j++)if(trInfo.getGtrs(g)[j] == m){curJ = j; break;}
+ }else{
+ g = trInfo.geId(m);
+ ns_withinGene::readTranscripts(g, trInfo, &samples, &gM, &trs);
+ curJ = 0;
+ for(j=0;j<gM;j++)if(trInfo.getGtrs(g)[j] == m){curJ = j; break;}
+ if(args.flag("adjust"))ns_withinGene::adjustExpression(g, trInfo, &trs);
+ ns_withinGene::getSum(gM, N, trs, &sum);
+ }
+ for(i=0;i<N;i++){
+ if(doLog)trs[curJ][i] = log(trs[curJ][i]);
+ if(doSummaries) ns_withinGene::updateSummaries(trs[curJ][i], &(mean[m]), &(sqSum[m]), normals[i], doLog);
+ if(doLog)trs[curJ][i] -= log(sum[i]);
+ else trs[curJ][i] /= sum[i];
+ if(doSummaries) ns_withinGene::updateSummaries(trs[curJ][i], &mean2[m], &sqSum2[m]);
+ }
+ if(doOut){
+ ns_withinGene::writeTr(N, trs[curJ], &outFile);
+ }
+ }
+ }else{
+ // Here we iterate over genes:
+ // Calculate values for all their transcripts
+ // Write all transcripts of current gene
+ for(g=0;g<G;g++){
+ if(args.verbose)progressLog(g,G);
+ ns_withinGene::readTranscripts(g, trInfo, &samples, &gM, &trs);
+ if(args.flag("adjust"))ns_withinGene::adjustExpression(g, trInfo, &trs);
+ ns_withinGene::getSum(gM, N, trs, &sum);
+ for(i=0;i<N;i++){
+ for(j=0;j<gM;j++){
+ m = trInfo.getGtrs(g)[j];
+ if(doLog)trs[j][i] = log(trs[j][i]);
+ if(doSummaries) ns_withinGene::updateSummaries(trs[j][i], &mean[m], &sqSum[m], normals[i], doLog);
+ if(doLog)trs[j][i] -= log(sum[i]);
+ else trs[j][i] /= sum[i];
+ if(doSummaries) ns_withinGene::updateSummaries(trs[j][i], &mean2[m], &sqSum2[m]);
+ }
+ }
+ if(doOut){
+ for(j=0;j<gM;j++){
+ ns_withinGene::writeTr(N, trs[j], &outFile);
+ }
+ }
+ }
+ }
+ if(doOut)outFile.close();
+ if(doSummaries){
+ long double var,var2;
+ for(m=0;m<M;m++){
+ mean[m] /= N;
+ var = sqSum[m]/N - mean[m]*mean[m];
+ mean2[m] /= N;
+ var2 = sqSum2[m]/N - mean2[m]*mean2[m];
+ sumFile<<mean[m]<<" "<<var<<" "<<mean2[m]<<" "<<var2<<endl;
+ }
+ sumFile.close();
+ }
+ if(args.verbose)message("DONE\n");
+ return 0;
+}
+
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return getWithinGeneExpression(&argc,argv);
+}
+#endif
+
+
+namespace ns_withinGene {
+
+void readTranscripts(long g, const TranscriptInfo &trInfo, PosteriorSamples *samples, long *gM, vector< vector<double> > *trs){//{{{
+ *gM = trInfo.getGtrs(g).size();
+ if((long)trs->size() < *gM)trs->resize(*gM);
+ for(long j = 0; j < *gM; j++){
+ samples->getTranscript( trInfo.getGtrs(g)[j] , (*trs)[j]);
+ }
+}// }}}
+
+void adjustExpression(long g, const TranscriptInfo &trInfo, vector< vector<double> > *trs){//{{{
+ long N,gM = trInfo.getGtrs(g).size();
+ double l;
+ for(long j=0; j<gM; j++){
+ l = trInfo.L(trInfo.getGtrs(g)[j]);
+ N = (*trs)[j].size();
+ for(long n=0; n<N; n++){
+ (*trs)[j][n] /= l;
+ }
+ }
+}// }}}
+
+void getSum(long gM, long N, const vector< vector<double> > &trs, vector<double> *sum){//{{{
+ sum->assign(N,0);
+ for(long j=0; j<gM; j++)
+ for(long n=0; n<N; n++)(*sum)[n] += trs[j][n];
+}// }}}
+
+void updateSummaries(double x, long double *mean, long double *sqSum, double norm, bool doLog){//{{{
+ if(doLog) x -= norm;
+ else x = (norm != 0) ? x/norm : x;
+
+ *mean += x;
+ *sqSum += x*x;
+}// }}}
+
+void writeTr(long N, const vector<double> &tr, ofstream *outFile){//{{{
+ for(long n=0; n<N-1; n++)
+ (*outFile)<<tr[n]<<" ";
+ (*outFile)<<tr[N-1]<<endl;
+}// }}}
+
+} // namespace ns_withinGene
+
diff --git a/lowess.cpp b/lowess.cpp
new file mode 100644
index 0000000..efdb215
--- /dev/null
+++ b/lowess.cpp
@@ -0,0 +1,511 @@
+/*
+ * c++ implementation of Lowess weighted regression by
+ * Peter Glaus http://www.cs.man.ac.uk/~glausp/
+ *
+ *
+ * Based on fortran code by Cleveland downloaded from:
+ * http://netlib.org/go/lowess.f
+ * original author:
+* wsc at research.bell-labs.com Mon Dec 30 16:55 EST 1985
+* W. S. Cleveland
+* Bell Laboratories
+* Murray Hill NJ 07974
+ *
+ * See original documentation below the code for details.
+ *
+ */
+#include<algorithm>
+#include<cmath>
+#include<fstream>
+
+using namespace std;
+
+#include "lowess.h"
+#include "common.h"
+
+void lowess(const vector<double> &x, const vector<double> &y, double f, long nsteps, vector<double> &ys){//{{{
+ vector<double> rw,res;
+ lowess(x,y,f,nsteps,0.,ys,rw,res);
+}//}}}
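+
+/* Usage note (editorial sketch): x must be sorted in increasing order before
+ * calling lowess(); f is the fraction of points per local fit and nsteps the
+ * number of robustness iterations, e.g.
+ *
+ *   vector<double> x = {1,2,3,4,5}, y = {2.0,2.1,10.0,2.2,2.3}, ys;
+ *   lowess(x, y, 0.5, 2, ys);   // robust fit; ys holds the smoothed values
+ *
+ * matching the F/NSTEPS arguments described in the Cleveland documentation
+ * below.
+ */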
+void lowess(const vector<double> &x, const vector<double> &y, double f, long nsteps, double delta, vector<double> &ys, vector<double> &rw, vector<double>&res){ //{{{
+ long n=(long)x.size();
+ bool ok=false;
+ long nleft,nright, i, j, iter, last, m1, m2, ns;
+ double cut, cmad, r, d1, d2, c1, c9, alpha, denom;
+ if((n==0)||((long)y.size()!=n)) return;
+ ys.resize(n);
+ rw.resize(n);
+ res.resize(n);
+ if(n==1){
+ ys[0]=y[0];
+ return;
+ }
+ // ns - at least 2, at most n
+ ns = max(min((long)(f*n),n),(long)2);
+ for(iter=0;iter<nsteps+1; iter++){
+      // robustness iterations
+ nleft = 0;
+ nright = ns-1;
+ // index of last estimated point
+ last = -1;
+ // index of current point
+ i=0;
+ do{
+ while(nright<n-1){
+ // move <nleft,nright> right, while radius decreases
+ d1 = x[i]-x[nleft];
+ d2 = x[nright+1] - x[i];
+ if(d1<=d2)break;
+ nleft++;
+ nright++;
+ }
+ // fit value at x[i]
+ lowest(x,y,x[i],ys[i],nleft,nright,res,iter>0,rw,ok);
+ if(!ok) ys[i]=y[i];
+ if(last<i-1){
+ // interpolate skipped points
+ if(last<0){
+ warning("Lowess: out of range.\n");
+ }
+ denom = x[i] - x[last];
+ for(j=last+1;j<i;j++){
+ alpha = (x[j]-x[last])/denom;
+ ys[j] = alpha * ys[i] + (1.0-alpha)*ys[last];
+ }
+ }
+ last = i;
+ cut = x[last]+delta;
+ for(i=last+1;i<n;i++){
+ if(x[i]>cut)break;
+ if(x[i]==x[last]){
+ ys[i]=ys[last];
+ last=i;
+ }
+ }
+ i=max(last+1,i-1);
+ }while(last<n-1);
+ for(i=0;i<n;i++)
+ res[i] = y[i]-ys[i];
+ if(iter==nsteps)break ;
+ for(i=0;i<n;i++)
+ rw[i]=abs(res[i]);
+ sort(rw.begin(),rw.end());
+ m1 = n/2+1;
+ m2 = n-m1;
+ m1 --;
+ cmad = 3.0 *(rw[m1]+rw[m2]);
+ c9 = .999*cmad;
+ c1 = .001*cmad;
+ for(i=0;i<n;i++){
+ r = abs(res[i]);
+ if(r<=c1) rw[i]=1;
+ else if(r>c9) rw[i]=0;
+ else rw[i] = (1.0-(r/cmad)*(r/cmad))*(1.0-(r/cmad)*(r/cmad));
+ }
+ }
+}//}}}
+
+void lowest(const vector<double> &x, const vector<double> &y, double xs, double &ys, long nleft, long nright, vector<double> &w, bool userw, vector<double> &rw, bool &ok){//{{{
+ long n = (long)x.size();
+ long nrt, j;
+ double a, b, c, h, r, h1, h9, range;
+ range = x[n-1]-x[0];
+ h = max(xs-x[nleft],x[nright]-xs);
+ h9 = 0.999*h;
+ h1 = 0.001*h;
+ // sum of weights
+ a = 0;
+ for(j=nleft;j<n;j++){
+ // compute weights (pick up all ties on right)
+ w[j]=0.;
+ r = abs(x[j]-xs);
+ if(r<=h9){
+ // small enough for non-zero weight
+ if(r>h1) w[j] = (1.0-(r/h)*(r/h)*(r/h))*(1.0-(r/h)*(r/h)*(r/h))*(1.0-(r/h)*(r/h)*(r/h));
+ else w[j] = 1.;
+ if(userw) w[j] *= rw[j];
+ a += w[j];
+ }else if(x[j]>xs) break; // get out at first zero wt on right
+ }
+ nrt = j-1;
+ // rightmost pt (may be greater than nright because of ties)
+ if(a<=0.) ok = false;
+ else{
+ // weighted least squares
+ ok = true;
+ // normalize weights
+ for(j=nleft;j<=nrt;j++)
+ w[j] /= a;
+ if(h>0.){
+ // use linear fit
+ a = 0.;
+ for(j=nleft;j<=nrt;j++)
+ a += w[j]*x[j]; // weighted centre of values
+ b = xs-a;
+ c = 0;
+ for(j=nleft;j<=nrt;j++)
+ c += w[j]*(x[j]-a)*(x[j]-a);
+ if(sqrt(c)>0.001*range){
+ // points are spread enough to compute slope
+ b /= c;
+ for(j=nleft;j<=nrt;j++)
+ w[j] *= (1.0+b*(x[j]-a));
+ }
+ }
+ ys = 0;
+ for(j=nleft;j<=nrt;j++)
+ ys += w[j]*y[j];
+ }
+}//}}}
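+
+/* Editorial note: lowest() assigns each neighbour the tricube weight
+ *
+ *   w(r) = (1 - (r/h)^3)^3   for r <= h,   w(r) = 0 otherwise,
+ *
+ * where r = |x_j - xs| and h is the distance to the farthest point in the
+ * window; the robustness weights rw multiply in on later iterations.
+ */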
+
+/* {{{ Documentation
+* wsc at research.bell-labs.com Mon Dec 30 16:55 EST 1985
+* W. S. Cleveland
+* Bell Laboratories
+* Murray Hill NJ 07974
+*
+* outline of this file:
+* lines 1-72 introduction
+* 73-177 documentation for lowess
+* 178-238 ratfor version of lowess
+* 239-301 documentation for lowest
+* 302-350 ratfor version of lowest
+* 351-end test driver and fortran version of lowess and lowest
+*
+* a multivariate version is available by "send dloess from a"
+*
+* COMPUTER PROGRAMS FOR LOCALLY WEIGHTED REGRESSION
+*
+* This package consists of two FORTRAN programs for
+* smoothing scatterplots by robust locally weighted
+* regression, or lowess. The principal routine is LOWESS
+* which computes the smoothed values using the method
+* described in The Elements of Graphing Data, by William S.
+* Cleveland (Wadsworth, 555 Morego Street, Monterey,
+* California 93940).
+*
+* LOWESS calls a support routine, LOWEST, the code for
+* which is included. LOWESS also calls a routine SORT, which
+* the user must provide.
+*
+* To reduce the computations, LOWESS requires that the
+* arrays X and Y, which are the horizontal and vertical
+* coordinates, respectively, of the scatterplot, be such that
+* X is sorted from smallest to largest. The user must
+* therefore use another sort routine which will sort X and Y
+* according to X.
+* To summarize the scatterplot, YS, the fitted values,
+* should be plotted against X. No graphics routines are
+* available in the package and must be supplied by the user.
+*
+* The FORTRAN code for the routines LOWESS and LOWEST has
+* been generated from higher level RATFOR programs
+* (B. W. Kernighan, ``RATFOR: A Preprocessor for a Rational
+* Fortran,'' Software Practice and Experience, Vol. 5 (1975),
+* which are also included.
+*
+* The following are data and output from LOWESS that can
+* be used to check your implementation of the routines. The
+* notation (10)v means 10 values of v.
+*
+*
+*
+*
+* X values:
+* 1 2 3 4 5 (10)6 8 10 12 14 50
+*
+* Y values:
+* 18 2 15 6 10 4 16 11 7 3 14 17 20 12 9 13 1 8 5 19
+*
+*
+* YS values with F = .25, NSTEPS = 0, DELTA = 0.0
+* 13.659 11.145 8.701 9.722 10.000 (10)11.300 13.000 6.440 5.596
+* 5.456 18.998
+*
+* YS values with F = .25, NSTEPS = 0 , DELTA = 3.0
+* 13.659 12.347 11.034 9.722 10.511 (10)11.300 13.000 6.440 5.596
+* 5.456 18.998
+*
+* YS values with F = .25, NSTEPS = 2, DELTA = 0.0
+* 14.811 12.115 8.984 9.676 10.000 (10)11.346 13.000 6.734 5.744
+* 5.415 18.998
+*
+*
+*
+*
+* LOWESS
+*
+*
+*
+* Calling sequence
+*
+* CALL LOWESS(X,Y,N,F,NSTEPS,DELTA,YS,RW,RES)
+*
+* Purpose
+*
+* LOWESS computes the smooth of a scatterplot of Y against X
+* using robust locally weighted regression. Fitted values,
+* YS, are computed at each of the values of the horizontal
+* axis in X.
+*
+* Argument description
+*
+* X = Input; abscissas of the points on the
+* scatterplot; the values in X must be ordered
+* from smallest to largest.
+* Y = Input; ordinates of the points on the
+* scatterplot.
+* N = Input; dimension of X,Y,YS,RW, and RES.
+* F = Input; specifies the amount of smoothing; F is
+* the fraction of points used to compute each
+* fitted value; as F increases the smoothed values
+* become smoother; choosing F in the range .2 to
+* .8 usually results in a good fit; if you have no
+* idea which value to use, try F = .5.
+* NSTEPS = Input; the number of iterations in the robust
+* fit; if NSTEPS = 0, the nonrobust fit is
+* returned; setting NSTEPS equal to 2 should serve
+* most purposes.
+* DELTA = input; nonnegative parameter which may be used
+* to save computations; if N is less than 100, set
+* DELTA equal to 0.0; if N is greater than 100 you
+* should find out how DELTA works by reading the
+* additional instructions section.
+* YS = Output; fitted values; YS(I) is the fitted value
+* at X(I); to summarize the scatterplot, YS(I)
+* should be plotted against X(I).
+* RW = Output; robustness weights; RW(I) is the weight
+* given to the point (X(I),Y(I)); if NSTEPS = 0,
+* RW is not used.
+* RES = Output; residuals; RES(I) = Y(I)-YS(I).
+*
+*
+* Other programs called
+*
+* LOWEST
+* SSORT
+*
+* Additional instructions
+*
+* DELTA can be used to save computations. Very roughly the
+* algorithm is this: on the initial fit and on each of the
+* NSTEPS iterations locally weighted regression fitted values
+* are computed at points in X which are spaced, roughly, DELTA
+* apart; then the fitted values at the remaining points are
+* computed using linear interpolation. The first locally
+* weighted regression (l.w.r.) computation is carried out at
+* X(1) and the last is carried out at X(N). Suppose the
+* l.w.r. computation is carried out at X(I). If X(I+1) is
+* greater than or equal to X(I)+DELTA, the next l.w.r.
+* computation is carried out at X(I+1). If X(I+1) is less
+* than X(I)+DELTA, the next l.w.r. computation is carried out
+* at the largest X(J) which is greater than or equal to X(I)
+* but is not greater than X(I)+DELTA. Then the fitted values
+* for X(K) between X(I) and X(J), if there are any, are
+* computed by linear interpolation of the fitted values at
+* X(I) and X(J). If N is less than 100 then DELTA can be set
+* to 0.0 since the computation time will not be too great.
+* For larger N it is typically not necessary to carry out the
+* l.w.r. computation for all points, so that much computation
+* time can be saved by taking DELTA to be greater than 0.0.
+* If DELTA = Range (X)/k then, if the values in X were
+* uniformly scattered over the range, the full l.w.r.
+* computation would be carried out at approximately k points.
+* Taking k to be 50 often works well.
+*
+* Method
+*
+* The fitted values are computed by using the nearest neighbor
+* routine and robust locally weighted regression of degree 1
+* with the tricube weight function. A few additional features
+* have been added. Suppose r is FN truncated to an integer.
+* Let h be the distance to the r-th nearest neighbor
+* from X(I). All points within h of X(I) are used. Thus if
+* the r-th nearest neighbor is exactly the same distance as
+* other points, more than r points can possibly be used for
+* the smooth at X(I). There are two cases where robust
+* locally weighted regression of degree 0 is actually used at
+* X(I). One case occurs when h is 0.0. The second case
+* occurs when the weighted standard error of the X(I) with
+* respect to the weights w(j) is less than .001 times the
+* range of the X(I), where w(j) is the weight assigned to the
+* j-th point of X (the tricube weight times the robustness
+* weight) divided by the sum of all of the weights. Finally,
+* if the w(j) are all zero for the smooth at X(I), the fitted
+* value is taken to be Y(I).
+*
+*
+*
+*
+* subroutine lowess(x,y,n,f,nsteps,delta,ys,rw,res)
+* real x(n),y(n),ys(n),rw(n),res(n)
+* logical ok
+* if (n<2){ ys(1) = y(1); return }
+* ns = max0(min0(ifix(f*float(n)),n),2) # at least two, at most n points
+* for(iter=1; iter<=nsteps+1; iter=iter+1){ # robustness iterations
+* nleft = 1; nright = ns
+* last = 0 # index of prev estimated point
+* i = 1 # index of current point
+* repeat{
+* while(nright<n){
+* # move nleft, nright to right if radius decreases
+* d1 = x(i)-x(nleft)
+* d2 = x(nright+1)-x(i)
+* # if d1<=d2 with x(nright+1)==x(nright), lowest fixes
+* if (d1<=d2) break
+* # radius will not decrease by move right
+* nleft = nleft+1
+* nright = nright+1
+* }
+* call lowest(x,y,n,x(i),ys(i),nleft,nright,res,iter>1,rw,ok)
+* # fitted value at x(i)
+* if (!ok) ys(i) = y(i)
+* # all weights zero - copy over value (all rw==0)
+* if (last<i-1) { # skipped points -- interpolate
+* denom = x(i)-x(last) # non-zero - proof?
+* for(j=last+1; j<i; j=j+1){
+* alpha = (x(j)-x(last))/denom
+* ys(j) = alpha*ys(i)+(1.0-alpha)*ys(last)
+* }
+* }
+* last = i # last point actually estimated
+* cut = x(last)+delta # x coord of close points
+* for(i=last+1; i<=n; i=i+1){ # find close points
+* if (x(i)>cut) break # i one beyond last pt within cut
+* if(x(i)==x(last)){ # exact match in x
+* ys(i) = ys(last)
+* last = i
+* }
+* }
+* i=max0(last+1,i-1)
+* # back 1 point so interpolation within delta, but always go forward
+* } until(last>=n)
+* do i = 1,n # residuals
+* res(i) = y(i)-ys(i)
+* if (iter>nsteps) break # compute robustness weights except last time
+* do i = 1,n
+* rw(i) = abs(res(i))
+* call sort(rw,n)
+* m1 = 1+n/2; m2 = n-m1+1
+* cmad = 3.0*(rw(m1)+rw(m2)) # 6 median abs resid
+* c9 = .999*cmad; c1 = .001*cmad
+* do i = 1,n {
+* r = abs(res(i))
+* if(r<=c1) rw(i)=1. # near 0, avoid underflow
+* else if(r>c9) rw(i)=0. # near 1, avoid underflow
+* else rw(i) = (1.0-(r/cmad)**2)**2
+* }
+* }
+* return
+* end
+*
+*
+*
+*
+* LOWEST
+*
+*
+*
+* Calling sequence
+*
+* CALL LOWEST(X,Y,N,XS,YS,NLEFT,NRIGHT,W,USERW,RW,OK)
+*
+* Purpose
+*
+* LOWEST is a support routine for LOWESS and ordinarily will
+* not be called by the user. The fitted value, YS, is
+* computed at the value, XS, of the horizontal axis.
+* Robustness weights, RW, can be employed in computing the
+* fit.
+*
+* Argument description
+*
+*
+* X = Input; abscissas of the points on the
+* scatterplot; the values in X must be ordered
+* from smallest to largest.
+* Y = Input; ordinates of the points on the
+* scatterplot.
+* N = Input; dimension of X,Y,W, and RW.
+* XS = Input; value of the horizontal axis at which the
+* smooth is computed.
+* YS = Output; fitted value at XS.
+* NLEFT = Input; index of the first point which should be
+* considered in computing the fitted value.
+* NRIGHT = Input; index of the last point which should be
+* considered in computing the fitted value.
+* W = Output; W(I) is the weight for Y(I) used in the
+* expression for YS, which is the sum from
+* I = NLEFT to NRIGHT of W(I)*Y(I); W(I) is
+* defined only at locations NLEFT to NRIGHT.
+* USERW = Input; logical variable; if USERW is .TRUE., a
+* robust fit is carried out using the weights in
+* RW; if USERW is .FALSE., the values in RW are
+* not used.
+* RW = Input; robustness weights.
+* OK = Output; logical variable; if the weights for the
+* smooth are all 0.0, the fitted value, YS, is not
+* computed and OK is set equal to .FALSE.; if the
+* fitted value is computed OK is set equal to .TRUE.
+*
+*
+* Method
+*
+* The smooth at XS is computed using (robust) locally weighted
+* regression of degree 1. The tricube weight function is used
+* with h equal to the maximum of XS-X(NLEFT) and X(NRIGHT)-XS.
+* Two cases where the program reverts to locally weighted
+* regression of degree 0 are described in the documentation
+* for LOWESS.
+*
+*
+*
+*
+* subroutine lowest(x,y,n,xs,ys,nleft,nright,w,userw,rw,ok)
+* real x(n),y(n),w(n),rw(n)
+* logical userw,ok
+* range = x(n)-x(1)
+* h = amax1(xs-x(nleft),x(nright)-xs)
+* h9 = .999*h
+* h1 = .001*h
+* a = 0.0 # sum of weights
+* for(j=nleft; j<=n; j=j+1){ # compute weights (pick up all ties on right)
+* w(j)=0.
+* r = abs(x(j)-xs)
+* if (r<=h9) { # small enough for non-zero weight
+* if (r>h1) w(j) = (1.0-(r/h)**3)**3
+* else w(j) = 1.
+* if (userw) w(j) = rw(j)*w(j)
+* a = a+w(j)
+* }
+* else if(x(j)>xs)break # get out at first zero wt on right
+* }
+* nrt=j-1 # rightmost pt (may be greater than nright because of ties)
+* if (a<=0.0) ok = FALSE
+* else { # weighted least squares
+* ok = TRUE
+* do j = nleft,nrt
+* w(j) = w(j)/a # make sum of w(j) == 1
+* if (h>0.) { # use linear fit
+* a = 0.0
+* do j = nleft,nrt
+* a = a+w(j)*x(j) # weighted center of x values
+* b = xs-a
+* c = 0.0
+* do j = nleft,nrt
+* c = c+w(j)*(x(j)-a)**2
+* if(sqrt(c)>.001*range) {
+* # points are spread out enough to compute slope
+* b = b/c
+* do j = nleft,nrt
+* w(j) = w(j)*(1.0+b*(x(j)-a))
+* }
+* }
+* ys = 0.0
+* do j = nleft,nrt
+* ys = ys+w(j)*y(j)
+* }
+* return
+* end
+*
+}}}*/
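+
+/* A minimal usage sketch of the C++ wrapper declared in lowess.h
+ * (an illustration only, assuming x is sorted ascending; whether ys/rw/res
+ * must be pre-sized depends on the implementation):
+ *
+ *   #include "lowess.h"
+ *   vector<double> x, y;          // scatterplot; x ascending
+ *   vector<double> ys, rw, res;   // smooth, robustness weights, residuals
+ *   double f = 0.5;               // smoothing fraction; .2-.8 usually fits well
+ *   long nsteps = 2;              // robustness iterations
+ *   double delta = (x.back() - x.front()) / 50.0; // use 0.0 for n < 100
+ *   lowess(x, y, f, nsteps, delta, ys, rw, res);  // ys[i] is the smooth at x[i]
+ */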
diff --git a/lowess.h b/lowess.h
new file mode 100644
index 0000000..f20a518
--- /dev/null
+++ b/lowess.h
@@ -0,0 +1,30 @@
+/*
+ * c++ implementation of Lowess weighted regression by
+ * Peter Glaus http://www.cs.man.ac.uk/~glausp/
+ *
+ *
+ * Based on fortran code by Cleveland downloaded from:
+ * http://netlib.org/go/lowess.f
+ * original author:
+* wsc at research.bell-labs.com Mon Dec 30 16:55 EST 1985
+* W. S. Cleveland
+* Bell Laboratories
+* Murray Hill NJ 07974
+ *
+ * See original documentation in the .cpp file for details.
+ *
+ */
+#ifndef LOWESS_H
+#define LOWESS_H
+
+#include<vector>
+
+using namespace std;
+
+void lowess(const vector<double> &x, const vector<double> &y, double f, long nsteps, double delta, vector<double> &ys, vector<double> &rw, vector<double> &res);
+
+void lowess(const vector<double> &x, const vector<double> &y, double f, long nsteps, vector<double> &ys);
+
+void lowest(const vector<double> &x, const vector<double> &y, double xs, double &ys, long nleft, long nright, vector<double> &w,bool userw, vector<double> &rw, bool &ok);
+
+#endif
diff --git a/misc.cpp b/misc.cpp
new file mode 100644
index 0000000..256dd63
--- /dev/null
+++ b/misc.cpp
@@ -0,0 +1,240 @@
+#include <algorithm>
+#include <ctime>
+#include <cmath>
+
+#include "misc.h"
+
+#include "FileHeader.h"
+
+#include "common.h"
+
+namespace ns_math {
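+// Both helpers factor out the largest exponent before calling exp(), so the
+// intermediate values cannot overflow and precision is preserved.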
+double logAddExp(double a, double b){ //{{{
+ if(a>b){
+ return a+log1p(exp(b-a));
+ }else {
+ return b+log1p(exp(a-b));
+ }
+} //}}}
+double logSumExp(const vector<double> &vals, long st, long en){ //{{{
+ if(st<0)st = 0;
+ if((en == -1) || (en > (long)vals.size())) en = vals.size();
+ if(st >= en)return 0;
+ double sumE = 0, m = *max_element(vals.begin() + st,vals.begin() + en);
+ for(long i = st; i < en; i++)
+ sumE += exp(vals[i] - m);
+ return m + log(sumE);
+} //}}}
+} // namespace ns_math
+
+namespace ns_expression {
+
+string getOutputType(const ArgumentParser &args, const string &defaultType){ //{{{
+ string type = ns_misc::toLower(args.getS("outputType"));
+ if((type!="theta") && (type!="rpkm") && (type!="counts") && (type!="tau")){
+ type = defaultType;
+ warning("Using output type %s.",type.c_str());
+ }
+ return type;
+} //}}}
+} // namespace ns_expression
+
+namespace ns_misc {
+long getSeed(const ArgumentParser &args){//{{{
+ long seed;
+ if(args.isSet("seed"))seed=args.getL("seed");
+ else seed = time(NULL);
+ if(args.verbose)message("seed: %ld\n",seed);
+ return seed;
+}//}}}
+bool openOutput(const ArgumentParser &args, ofstream *outF){//{{{
+ outF->open(args.getS("outFileName").c_str());
+ if(!outF->is_open()){
+ error("Main: Output file open failed.\n");
+ return false;
+ }
+ return true;
+}//}}}
+bool openOutput(const string &name, ofstream *outF) {//{{{
+ outF->open(name.c_str());
+ if(!outF->is_open()){
+ error("Main: File '%s' open failed.\n",name.c_str());
+ return false;
+ }
+ return true;
+}//}}}
+
+bool readConditions(const ArgumentParser &args, long *C, long *M, long *N, Conditions *cond){//{{{
+ if(! cond->init("NONE", args.args(), C, M, N)){
+ error("Main: Failed loading MCMC samples.\n");
+ return false;
+ }
+ if(args.isSet("normalization")){
+ if(! cond->setNorm(args.getTokenizedS2D("normalization"))){
+ error("Main: Applying normalization constants failed.\n");
+ return false;
+ }
+ }
+ if(!cond->logged() && args.verb()){
+ message("Samples are not logged. (will log for you)\n");
+ message("Using %lg as minimum instead of log(0).\n",LOG_ZERO);
+ }
+ if(args.verb())message("Files with samples loaded.\n");
+ return true;
+}//}}}
+
+void computeCI(double cf, vector<double> *difs, double *ciLow, double *ciHigh){//{{{
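+ // E.g. cf=95 keeps the central 95%: ciLow is the 2.5th percentile and
+ // ciHigh the 97.5th percentile of the sorted values.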
+ cf = (100 - cf) / 2.0;
+ double N = difs->size();
+ sort(difs->begin(),difs->end());
+ *ciLow = (*difs)[(long)(N/100.*cf)];
+ *ciHigh = (*difs)[(long)(N-N/100.*cf)];
+}//}}}
+
+string toLower(string str){//{{{
+ for(size_t i=0;i<str.size();i++)
+ if((str[i]>='A')&&(str[i]<='Z'))str[i]=str[i]-'A'+'a';
+ return str;
+}//}}}
+
+vector<string> tokenize(const string &input,const string &space){//{{{
+ vector<string> ret;
+ long pos=0,f=0,n=input.size();
+ while((pos<n)&&(f<n)&&(f>=0)){
+ f=input.find(space,pos);
+ if(f==pos)pos++;
+ else{
+ if((f<n)&&(f>=0)){
+ ret.push_back(input.substr(pos,f-pos));
+ pos=f+1;
+ }
+ }
+ }
+ if(pos<n)ret.push_back(input.substr(pos,n-pos));
+ return ret;
+} //}}}
+} // namespace ns_misc
+
+namespace ns_genes {
+bool getLog(const ArgumentParser &args){// {{{
+ if(args.flag("log")){
+ if(args.verb())message("Using logged values.\n");
+ return true;
+ }
+ if(args.verb())message("NOT using logged values.\n");
+ return false;
+}// }}}
+
+bool prepareInput(const ArgumentParser &args, TranscriptInfo *trInfo, PosteriorSamples *samples, long *M, long *N, long *G){// {{{
+ if(! trInfo->readInfo(args.getS("trInfoFileName"))) return false;
+ *G = trInfo->getG();
+ if((! samples->initSet(M,N,args.args()[0]))||(*M<=0)||(*N<=0)){
+ error("Main: Failed loading MCMC samples.\n");
+ return false;
+ }
+ if(*M!=trInfo->getM()){
+ error("Main: Number of transcripts in the info file and samples file are different: %ld vs %ld\n",trInfo->getM(),*M);
+ return false;
+ }
+ if(args.verb())messageF("Transcripts: %ld\n",*M);
+ return true;
+}// }}}
+
+bool updateGenes(const ArgumentParser &args, TranscriptInfo *trInfo, long *G){//{{{
+ if(!(args.isSet("trMapFile") || args.isSet("geneListFile")))return true;
+ if(args.isSet("trMapFile") && args.isSet("geneListFile")){
+ error("Main: Please provide only one of trMapFile and geneListFile, both serve the same function.\n");
+ return false;
+ }
+ bool isMap;
+ ifstream mapFile;
+ if(args.isSet("trMapFile")){
+ isMap = true;
+ mapFile.open(args.getS("trMapFile").c_str());
+ }else {
+ isMap = false;
+ mapFile.open(args.getS("geneListFile").c_str());
+ }
+ if(!mapFile.is_open()){
+ if(isMap){
+ error("Main: Failed reading file with transcript to gene mapping.\n");
+ }else{
+ error("Main: Failed reading file with gene names.\n");
+ }
+ return false;
+ }
+ map<string,string> trMap;
+ vector<string> geneList;
+ string trName,geName;
+ while(mapFile.good()){
+ while(mapFile.good() && (mapFile.peek()=='#'))
+ mapFile.ignore(100000000,'\n');
+ if(!mapFile.good()) break;
+ mapFile>>geName;
+ if(isMap){
+ mapFile>>trName;
+ }
+ if(!mapFile.fail()){
+ if(isMap){
+ trMap[trName]=geName;
+ }else{
+ geneList.push_back(geName);
+ }
+ }
+ mapFile.ignore(100000000,'\n');
+ }
+ mapFile.close();
+ bool succ;
+ if(isMap)succ = trInfo->updateGeneNames(trMap);
+ else succ = trInfo->updateGeneNames(geneList);
+ if(!succ){
+ error("Main: Filed setting gene information.\n");
+ return false;
+ }
+ *G = trInfo->getG();
+ return true;
+}//}}}
+
+bool checkGeneCount(long G, long M){//{{{
+ if((G != 1) && (G != M)) return true;
+ if(G==1){
+ error("Main: All transcripts share just one gene.\n");
+ }else{
+ error("Main: There are no transcripts sharing one gene.\n");
+ }
+ message("Please provide valid transcript to gene mapping (trMapFile or geneListFile).\n"
+ " (trMap file should contain rows in format: <geneName> <transcriptName>.)\n"
+ " (geneList file should contain rows with gene names, one per transcript.)\n");
+ return false;
+}//}}}
+} // namespace ns_genes
+
+namespace ns_params {
+bool readParams(const string &name, vector<paramT> *params, ofstream *outF){//{{{
+ long parN;
+ ifstream parFile(name.c_str());
+ FileHeader fh(&parFile);
+ if(!fh.paramsHeader(&parN, outF)){
+ error("Main: Problem loading parameters file %s\n",name.c_str());
+ return false;
+ }
+ // Vector of parameters: (mean expression, (alpha, beta) )
+ paramT param;
+ while(parFile.good()){
+ while((parFile.good())&&(parFile.peek()=='#')){
+ parFile.ignore(10000000,'\n');
+ }
+ parFile>>param.alpha>>param.beta>>param.expr;
+ if(parFile.good())
+ params->push_back(param);
+ parFile.ignore(10000000,'\n');
+ }
+ if((parN>0)&&(parN != (long)params->size())){
+ warning("Main: declared number of parameters does not match number of lines read (%ld %ld).\n", parN, (long)params->size());
+ }
+ fh.close();
+ sort(params->begin(),params->end());
+ return true;
+}//}}}
+
+} // namespace ns_params
diff --git a/misc.h b/misc.h
new file mode 100644
index 0000000..23ccb38
--- /dev/null
+++ b/misc.h
@@ -0,0 +1,84 @@
+#ifndef MISC_H
+#define MISC_H
+
+#include<fstream>
+
+#include "ArgumentParser.h"
+#include "PosteriorSamples.h"
+#include "TranscriptInfo.h"
+
+namespace ns_math {
+
+// For a=log(x), b=log(y); compute log(x+y).
+double logAddExp(double a, double b);
+
+// For vals_i = log(x_i); compute log(sum(x_i)) for st<=i<en.
+double logSumExp(const vector<double> &vals, long st = 0, long en = -1);
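+
+// A minimal usage sketch (values are illustrative):
+//   double s = ns_math::logAddExp(1000.0, 1000.0); // = 1000+log(2); naive exp() would overflow
+//   vector<double> v(3, log(0.5));
+//   double t = ns_math::logSumExp(v);              // = log(1.5)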
+
+}
+
+namespace ns_expression {
+
+// Return output type based on the command line argument (one of theta/rpkm/counts/tau).
+string getOutputType(const ArgumentParser &args, const string &defaultType = "rpkm");
+}
+
+namespace ns_misc {
+
+// Value to use instead of log(0).
+const double LOG_ZERO=-100;
+
+// Return seed; either using seed set in args, or by using time(NULL) as seed.
+long getSeed(const ArgumentParser &args);
+
+// Open output file based on standard argument --outFile=<outFileName>.
+bool openOutput(const ArgumentParser &args, ofstream *outF);
+// Open output file of a given name.
+bool openOutput(const string &name, ofstream *outF);
+
+// Reads and initializes files containing samples for each condition and each replicate.
+bool readConditions(const ArgumentParser &args, long *C, long *M, long *N, Conditions *cond);
+
+// Compute confidence intervals.
+void computeCI(double cf, vector<double> *difs, double *ciLow, double *ciHigh);
+
+// Convert string into lower case.
+string toLower(string str);
+
+// Tokenize string into vector of strings based on separator.
+vector<string> tokenize(const string &input,const string &space = " ");
+}
+
+namespace ns_genes {
+// Return true if -l/--log is set.
+bool getLog(const ArgumentParser &args);
+
+// Initializes samples reader, trInfo and sets M,N,G.
+// Return false if reading failed or the number of transcripts does not match.
+bool prepareInput(const ArgumentParser &args, TranscriptInfo *trInfo, PosteriorSamples *samples, long *M, long *N, long *G);
+
+// Tries reading Transcript->Gene mapping from arguments provided (trMapFile or geneListFile)
+// and updates the gene info.
+bool updateGenes(const ArgumentParser &args, TranscriptInfo *trInfo, long *G);
+
+// Check whether the gene count is reasonable (G!=1 && G!=M)
+// and write appropriate error messages.
+bool checkGeneCount(long G, long M);
+} // namespace ns_genes
+
+namespace ns_params{
+
+struct paramT {//{{{
+ double expr, alpha, beta;
+ bool operator< (const paramT &p2) const{
+ return expr<p2.expr;
+ }
+};//}}}
+
+// Read hyperparameters from a file specified by file name.
+// If outF is not NULL, it copies header from input file to outF.
+// The vector is sorted by expression at the end.
+bool readParams(const string &name, vector<paramT> *params, ofstream *outF = NULL);
+
+}
+#endif
diff --git a/parameters1.txt b/parameters1.txt
new file mode 100644
index 0000000..55c6b25
--- /dev/null
+++ b/parameters1.txt
@@ -0,0 +1,32 @@
+# parameters:
+# if this parameters file is used ( -p parameters1.txt ) then these values override the command line arguments --MCMC_*
+
+# length of burnIn
+burnIn 1000
+
+# initial number of samples; doubles every time targetScaleReduction is not met, until it reaches samplesNmax
+samplesN 1000
+# max number of samples generated in one iteration
+# after generating samplesNmax samples, the program finishes even if some transcripts have not met the targetScaleReduction criterion
+samplesNmax 30000
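+# (e.g. with samplesN 1000 the sampler generates 1000, 2000, 4000, ... samples
+#  per iteration until samplesNmax or targetScaleReduction is reached)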
+
+# number of samples actually recorded
+samplesSave 500
+
+# number of parallel chains
+chainsN 4
+
+# target scale reduction for the parameters
+# this applies only when option --scaleReduction is used
+# this parameter decides the end of sampling
+# if you want to end the simulation, increase it; this file is re-read every time sampling finishes the k-th iteration of (2^(k-1))*samplesN samples
+#targetScaleReduction 1.2
+
+
+# parameters for the prior distributions
+#dirAlpha 1
+#dirBeta 1
+#betaAlpha 10
+#betaBeta 2
+
+
diff --git a/parseAlignment.cpp b/parseAlignment.cpp
new file mode 100644
index 0000000..63e19c7
--- /dev/null
+++ b/parseAlignment.cpp
@@ -0,0 +1,612 @@
+// DECLARATIONS: {{{
+#include<cmath>
+#include<set>
+
+using namespace std;
+
+#include "ArgumentParser.h"
+#include "misc.h"
+#include "MyTimer.h"
+#include "ReadDistribution.h"
+#include "TranscriptExpression.h"
+#include "TranscriptInfo.h"
+#include "TranscriptSequence.h"
+
+#include "common.h"
+//}}}
+
+//#define DEBUG_AT(x) message(x)
+#define DEBUG_AT(x)
+
+namespace ns_parseAlignment {
+class TagAlignment{//{{{
+ protected:
+ int_least32_t trId;
+// bool strand; // true = forward; false = reverse
+ double prob,lowProb;
+ public:
+ TagAlignment(long t=0,double p = 0,double lp = 0){
+ trId=(int_least32_t)t;
+// strand=s;
+ prob=p;
+ lowProb=lp;
+ }
+ long getTrId()const {return trId;}
+ double getProb()const {return prob;}
+ double getLowProb()const {return lowProb;}
+ void setProb(double p){prob=p;}
+}; //}}}
+
+// Check if next fragment is different.
+bool nextFragDiffers(const ns_rD::fragmentP curF, const ns_rD::fragmentP nextF, bool mateNamesDiffer);
+// String comparison allowing last cmpEPS bases different as long as length
+// is the same.
+long readNameCmp(const char *str1, const char *str2);
+// Read Fragment from SAM file.
+// Copies data from 'next' fragment into 'cur' fragment and reads new fragment information into 'next'.
+// Fragment is either both paired-ends or just single read.
+bool readNextFragment(samfile_t* samData, ns_rD::fragmentP &cur, ns_rD::fragmentP &next);
+
+// Determine input format base either on --format flag or on the file extension.
+// Sets format to bam/sam and returns true, or returns false if format is unknown.
+bool setInputFormat(const ArgumentParser &args, string *format);
+
+bool openSamFile(const string &name, const string &inFormat, samfile_t **samFile);
+
+bool initializeInfoFile(const ArgumentParser &args, samfile_t *samFile, TranscriptInfo **trInfo, long *M);
+} // namespace ns_parseAlignment
+
+extern "C" int parseAlignment(int *argc,char* argv[]){
+string programDescription =
+"Pre-computes probabilities of (observed) reads' alignments.\n\
+ [alignment file] should be in either SAM or BAM format.\n";
+ TranscriptInfo *trInfo=NULL;
+ TranscriptSequence *trSeq=NULL;
+ TranscriptExpression *trExp=NULL;
+ MyTimer timer;
+ timer.start();
+ timer.start(7);
+ long Ntotal = 0, Nmap = 0, M=0, i;
+ string inFormat;
+ samfile_t *samData=NULL;
+ ReadDistribution readD;
+ ns_rD::fragmentP curF = new ns_rD::fragmentT, nextF = new ns_rD::fragmentT, validAF = new ns_rD::fragmentT;
+ // This could be changed to either GNU's hash_set or C++11's unordered_set, once it's safe.
+ set<string> ignoredReads;
+ long ignoredMaxAlignments = 0, ignoredSingletons = 0;
+ // Intro: {{{
+ // Set options {{{
+ ArgumentParser args(programDescription,"[alignment file]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
+ args.addOptionS("f","format","format",0,"Input format: either SAM, BAM.");
+ args.addOptionS("t","trInfoFile","trInfoFileName",0,"File to save transcript information extracted from [BS]AM file and reference.");
+ //args.addOptionS("t","trInfoFile","trInfoFileName",0,"If transcript(reference sequence) information is contained within SAM file, program will write this information into <trInfoFile>, otherwise it will look for this information in <trInfoFile>.");
+ args.addOptionS("s","trSeqFile","trSeqFileName",1,"Transcript sequence in FASTA format --- for non-uniform read distribution estimation.");
+ args.addOptionS("","trSeqHeader","trSeqHeader",0,"Transcript sequence header format enables gene name extraction (standard/gencode).","standard");
+ args.addOptionS("e","expressionFile","expFileName",0,"Transcript relative expression estimates --- for better non-uniform read distribution estimation.");
+ args.addOptionL("N","readsN","readsN",0,"Total number of reads. This is not necessary if [SB]AM contains also reads with no valid alignments.");
+ args.addOptionS("","failed","failed",0,"File name where to save names of reads that failed to align.");
+ args.addOptionB("","uniform","uniform",0,"Use uniform read distribution.");
+ args.addOptionD("","lenMu","lenMu",0,"Set mean of log fragment length distribution. (l_frag ~ LogNormal(mu,sigma^2))");
+ args.addOptionD("","lenSigma","lenSigma",0,"Set sigma^2 (or variance) of log fragment length distribution. (l_frag ~ LogNormal(mu,sigma^2))");
+ args.addOptionS("","distributionFile","distributionFileName",0,"Name of file to which read-distribution should be saved.");
+ args.addOptionL("P","procN","procN",0,"Maximum number of threads to be used. This provides speedup mostly when using non-uniform read distribution model (i.e. no --uniform flag).",4);
+ args.addOptionB("V","veryVerbose","veryVerbose",0,"Very verbose output.");
+ args.addOptionL("","noiseMismatches","numNoiseMismatches",0,"Number of mismatches to be considered as noise.",ns_rD::LOW_PROB_MISSES);
+ args.addOptionL("l","limitA","maxAlignments",0,"Limit maximum number of alignments per read. (Reads with more alignments are skipped.)");
+ args.addOptionB("","unstranded","unstranded",0,"Paired read are not strand specific.");
+ args.addOptionB("","show1warning","show1warning",0,"Show first alignments that are considered wrong (TID unknown, TID mismatch, wrong strand).");
+ args.addOptionB("","excludeSingletons","excludeSingletons",0,"Exclude single mate alignments for paired-end reads.");
+ args.addOptionB("","mateNamesDiffer","mateNamesDiffer",0,"Mates from paired-end reads have different names.");
+ if(!args.parse(*argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+ readD.setProcN(args.getL("procN"));
+ if(args.flag("show1warning"))readD.showFirstWarnings();
+ // }}}
+ if(!ns_parseAlignment::setInputFormat(args, &inFormat))return 1;
+ if(!ns_parseAlignment::openSamFile(args.args()[0], inFormat, &samData))return 1;
+ if(!ns_parseAlignment::initializeInfoFile(args, samData, &trInfo, &M))return 1;
+ // Read expression and initialize transcript sequence {{{
+ if(args.verbose)message("Initializing fasta sequence reader.\n");
+ // Initialize fasta sequence reader.
+ trSeq = new TranscriptSequence();
+ if(args.getLowerS("trSeqHeader") == "gencode"){
+ trSeq->readSequence(args.getS("trSeqFileName"), GENCODE);
+ }else{
+ trSeq->readSequence(args.getS("trSeqFileName"), STANDARD);
+ }
+ // Check numbers for transcripts match.
+ if(trSeq->getM() != M){
+ error("Main: Number of transcripts in the alignment file and the sequence file are different: %ld vs %ld\n",M,trSeq->getM());
+ return 1;
+ }
+ // Check that length of each transcript matches.
+ for(i=0;i<M;i++){
+ if(trInfo->L(i) != (long)(trSeq->getTr(i).size())){
+ error("Main: Transcript info length and sequence length of transcript %ld DO NOT MATCH! (%ld %d)\n",i,trInfo->L(i),(int)(trSeq->getTr(i).size()));
+ return 1;
+ }
+ }
+ // If there were gene names in transcript sequence, assign them to transcript info.
+ if(trSeq->hasGeneNames() && (trSeq->getG()>1)){
+ if(trInfo->getG() == 1){
+ // If just one gene present, then assign gene names.
+ if(args.verbose)message("Found gene names in sequence file, updating transcript information.\n");
+ trInfo->updateGeneNames(trSeq->getGeneNames());
+ }else{
+ // If there is more than one gene name already, don't fix.
+ if(trInfo->getG() != trSeq->getG()){
+ warning("Main: Different number of genes detected in transcript information and sequence file (%ld %ld).\n You might want to check your data.\n", trInfo->getG(), trSeq->getG());
+ }
+ }
+ }
+ // If format is GENCODE and transcript names were extracted, update.
+ if((args.getLowerS("trSeqHeader") == "gencode")&&(trSeq->hasTrNames())){
+ if(args.flag("veryVerbose"))message("Updating transcript names.\n");
+ if(!trInfo->updateTrNames(trSeq->getTrNames())){
+ if(args.flag("veryVerbose"))warning("Transcript names update failed.\n");
+ }
+ }
+ if(!args.flag("uniform")){
+ // Try loading expression file from previous estimation for non-uniform read model.
+ if(args.isSet("expFileName")){
+ if(args.verbose)message("Loading transcript initial expression data.\n");
+ trExp = new TranscriptExpression(args.getS("expFileName"), GUESS);
+ if(trExp->getM() != M){
+ error("Main: Number of transcripts in the alignment file and the expression file are different: %ld vs %ld\n",M,trExp->getM());
+ return 1;
+ }
+ }
+ }
+ //}}}
+ timer.split(0,'m');
+ //}}}
+
+ // Estimating probabilities {{{
+ bool analyzeReads = false;
+
+ if(args.isSet("lenMu") && args.isSet("lenSigma")){
+ readD.setLength(args.getD("lenMu"),args.getD("lenSigma"));
+ }else{
+ analyzeReads = true;
+ }
+ if(args.flag("uniform")){
+ if(args.verbose)message("Using uniform read distribution.\n");
+ readD.initUniform(M,trInfo,trSeq,args.flag("veryVerbose"));
+ }else{
+ if(args.verbose)message("Estimating non-uniform read distribution.\n");
+ readD.init(M,trInfo,trSeq,trExp,args.flag("unstranded"),args.flag("veryVerbose"));
+ if(args.flag("veryVerbose"))message(" ReadDistribution initialization done.\n");
+ analyzeReads = true;
+ }
+ if(args.isSet("numNoiseMismatches")){
+ readD.setLowProbMismatches(args.getL("numNoiseMismatches"));
+ }
+ // fill in "next" fragment:
+ // Counters for all, Good Alignments; and weird alignments
+ long observeN, pairedGA, firstGA, secondGA, singleGA, weirdGA, allGA, pairedBad;
+ bool storedValidA = false;
+ long RE_noEndInfo, RE_weirdPairdInfo, RE_nameMismatch;
+ long maxAlignments = 0;
+ if(args.isSet("maxAlignments") && (args.getL("maxAlignments")>0))
+ maxAlignments = args.getL("maxAlignments");
+ // start counting (and possibly estimating):
+ observeN = pairedGA = firstGA = secondGA = singleGA = weirdGA = pairedBad = 0;
+ RE_noEndInfo = RE_weirdPairdInfo = RE_nameMismatch = 0;
+ // fill in "next" fragment:
+ ns_parseAlignment::readNextFragment(samData, curF, nextF);
+ while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
+ R_INTERUPT;
+ if( !(curF->first->core.flag & BAM_FUNMAP) ){
+ // (at least) The first read was mapped.
+ if( curF->paired ) {
+ // Fragment's both reads are mapped as a pair.
+ // Check mates' names.
+ if((ns_parseAlignment::readNameCmp(bam1_qname(curF->first), bam1_qname(curF->second))==0) ||
+ (args.flag("mateNamesDiffer"))){
+ pairedGA++;
+ }else{
+ pairedBad++;
+ if(RE_nameMismatch == 0){
+ warning("Paired read name mismatch: %s %s\n",bam1_qname(curF->first), bam1_qname(curF->second));
+ }
+ RE_nameMismatch++;
+ if(RE_nameMismatch>10)break;
+ }
+ }else {
+ if (curF->first->core.flag & BAM_FPAIRED) {
+ // Read was part of pair (meaning that the other is unmapped).
+ if (curF->first->core.flag & BAM_FREAD1) {
+ firstGA++;
+ } else if (curF->first->core.flag & BAM_FREAD2) {
+ secondGA++;
+ } else weirdGA ++;
+ } else {
+ // Read is single end, with valid alignment.
+ singleGA++;
+ }
+ }
+ // Unless pairedBad>0 the alignment is valid.
+ // If excludeSingletons is set, only use paired alignment and alignments of single-end reads.
+ if((!storedValidA) &&
+ (((!args.flag("excludeSingletons")) && (pairedBad == 0)) ||
+ (pairedBad + firstGA + secondGA + weirdGA == 0))){
+ validAF->copyFragment(curF);
+ storedValidA=true;
+ }
+ }
+ // Next fragment is different.
+ if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer"))){
+ Ntotal++;
+ allGA = singleGA + pairedGA + firstGA +secondGA+ weirdGA;
+ if( allGA == 0 ){ // No good alignment.
+ // Just in case:
+ storedValidA=false;
+ pairedBad = 0;
+ continue;
+ }
+ Nmap ++;
+ if(weirdGA)RE_noEndInfo++;
+ if((singleGA>0) && (pairedGA>0)) RE_weirdPairdInfo++;
+ // If it's good uniquely aligned fragment/read, add it to the observation.
+ if(( allGA == 1) && analyzeReads && (pairedBad == 0) && storedValidA){
+ if(readD.observed(validAF))observeN++;
+ }else if(maxAlignments && (allGA>maxAlignments)) {
+ // This read will be ignored.
+ ignoredReads.insert(bam1_qname(curF->first));
+ ignoredMaxAlignments++;
+ Nmap --;
+ }else if(args.flag("excludeSingletons") && (pairedGA + singleGA == 0)){
+ // When excluding singletons only alignments of full pair or single-end read count.
+ ignoredReads.insert(bam1_qname(curF->first));
+ ignoredSingletons++;
+ Nmap --;
+ }
+ pairedGA = firstGA = secondGA = singleGA = weirdGA = pairedBad = 0;
+ storedValidA = false;
+ }
+ }
+ if(RE_nameMismatch>10){
+ error("Names of paired mates didn't match at least 10 times.\n"
+ " Something is possibly wrong with your data or the reads have to be renamed.\n");
+ return 1;
+ }
+ message("Reads: all(Ntotal): %ld mapped(Nmap): %ld\n",Ntotal,Nmap);
+ if(args.verbose)message(" %ld reads were used to estimate empirical distributions.\n",observeN);
+ if(ignoredMaxAlignments>0)message(" %ld reads are skipped due to having more than %ld alignments.\n",ignoredMaxAlignments, maxAlignments);
+ if(ignoredSingletons>0)message(" %ld reads skipped due to having just single mate alignments.\n",ignoredSingletons);
+ if(RE_noEndInfo)warning(" %ld reads that were paired, but do not have \"end\" information.\n (is your alignment file valid?)\n", RE_noEndInfo);
+ if(RE_weirdPairdInfo)warning(" %ld reads that were reported as both paired and single end.\n (is your alignment file valid?)\n", RE_weirdPairdInfo);
+ readD.writeWarnings();
+ if(args.flag("veryVerbose"))timer.split(0,'m');
+ // Normalize read distribution:
+ if(args.flag("veryVerbose"))message("Normalizing read distribution.\n");
+ readD.normalize();
+ if(args.isSet("distributionFileName")){
+ readD.logProfiles(args.getS("distributionFileName"));
+ }
+ timer.split(0,'m');
+ // }}}
+
+ // Writing probabilities: {{{
+ // Re-opening alignment file
+ if(!ns_parseAlignment::openSamFile(args.args()[0], inFormat, &samData))return 1;
+ if(args.verbose)message("Writing alignment probabilities.\n");
+ double prob,probNoise,minProb;
+ prob = probNoise = 0;
+ set<string> failedReads;
+ vector<ns_parseAlignment::TagAlignment> alignments;
+ // Open and initialize output file {{{
+ ofstream outF(args.getS("outFileName").c_str());
+ if(!outF.is_open()){
+ error("Main: Unable to open output file.\n");
+ return 1;
+ }
+ outF<<"# Ntotal "<<Ntotal<<"\n# Nmap "<<Nmap<<"\n# M "<<M<<endl;
+ outF<<"# LOGFORMAT (probabilities saved on log scale.)\n# r_name num_alignments (tr_id prob )^*{num_alignments}"<<endl;
+ outF.precision(9);
+ outF<<scientific;
+ // }}}
+
+ // start reading:
+ timer.start(1);
+ bool invalidAlignment = false;
+ long readC, pairedN, singleN, firstN, secondN, weirdN, invalidN, noN;
+ readC = pairedN = singleN = firstN = secondN = weirdN = invalidN = noN = 0;
+ RE_nameMismatch = 0 ;
+ // fill in "next" fragment:
+ ns_parseAlignment::readNextFragment(samData, curF, nextF);
+ while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
+ R_INTERUPT;
+ // Skip all alignments of this read.
+ if(ignoredReads.count(bam1_qname(curF->first))>0){
+ DEBUG_AT(" ignore\n");
+ // Read reads while the name is the same.
+ while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
+ DEBUG_AT(" ignore\n");
+ if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer")))
+ break;
+ }
+ readC++;
+ if(args.verbose){ if(progressLog(readC,Ntotal,10,' '))timer.split(1,'m');}
+ continue;
+ }
+ if( !(curF->first->core.flag & BAM_FUNMAP) ){
+ DEBUG_AT("M");
+ // (at least) The first read was mapped.
+ // Check mates' names.
+ if(curF->paired && (ns_parseAlignment::readNameCmp(bam1_qname(curF->first), bam1_qname(curF->second))!=0) && (!args.flag("mateNamesDiffer"))){
+ if(RE_nameMismatch == 0){
+ warning("Paired read name mismatch: %s %s\n",bam1_qname(curF->first), bam1_qname(curF->second));
+ }
+ RE_nameMismatch++;
+ if(RE_nameMismatch>10)break;
+ invalidAlignment = true;
+ }else if((!args.flag("excludeSingletons")) || curF->paired || (! (curF->first->core.flag & BAM_FPAIRED))){
+ // We only calculate probabilities and add alignments if:
+ // (singletons are not excluded) OR (it is a proper paired alignment) OR (it is a single-end read)
+ if(readD.getP(curF, prob, probNoise)){
+ // We calculated valid probabilities for this alignment.
+ // Add alignment:
+ alignments.push_back(ns_parseAlignment::TagAlignment(curF->first->core.tid+1, prob, probNoise));
+ // Update counters:
+ if( curF->paired ) {
+ // Fragment's both reads are mapped as a pair.
+ pairedN++;
+ DEBUG_AT(" P\n");
+ }else {
+ if (curF->first->core.flag & BAM_FPAIRED) {
+ // Read was part of pair (meaning that the other is unmapped).
+ if (curF->first->core.flag & BAM_FREAD1) {
+ firstN++;
+ DEBUG_AT(" 1\n");
+ } else if (curF->first->core.flag & BAM_FREAD2) {
+ secondN++;
+ DEBUG_AT(" 2\n");
+ } else {
+ weirdN ++;
+ DEBUG_AT(" W\n");
+ }
+ } else {
+ // Read is single end, with valid alignment.
+ singleN++;
+ DEBUG_AT(" S\n");
+ }
+ }
+ } else {
+ // Calculation of alignment probabilities failed.
+ invalidAlignment = true;
+ }
+ }
+ }else DEBUG_AT("UNMAP\n");
+ // next fragment has different name
+ if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer"))){
+ DEBUG_AT(" last\n");
+ readC++;
+ if(args.verbose){ if(progressLog(readC,Ntotal,10,' '))timer.split(1,'m');}
+ if(!alignments.empty()){
+ outF<<bam1_qname(curF->first)<<" "<<alignments.size()+1;
+ minProb = 1;
+ for(i=0;i<(long)alignments.size();i++){
+ if(minProb>alignments[i].getLowProb())minProb = alignments[i].getLowProb();
+ outF<<" "<<alignments[i].getTrId()
+// <<" "<<getStrandC(alignments[i].getStrand())
+ <<" "<<alignments[i].getProb();
+ }
+ outF<<" 0 "<<minProb<<endl;
+ alignments.clear();
+ }else{
+ // read has no valid alignments:
+ if(invalidAlignment){
+ // If there were invalid alignments, write a mock record in order to keep Nmap consistent.
+ invalidN++;
+ outF<<bam1_qname(curF->first)<<" 1 0 0"<<endl;
+ }else {
+ noN++;
+ }
+ if(args.isSet("failed")){
+ // Save failed reads.
+ failedReads.insert(bam1_qname(curF->first));
+ if(curF->paired)failedReads.insert(bam1_qname(curF->second));
+ }
+ }
+ invalidAlignment = false;
+ }
+ }
+ if(RE_nameMismatch>10){
+ error("Names of paired mates didn't match at least 10 times.\n"
+ " Something is possibly wrong with your data or the reads have to be renamed.\n");
+ return 1;
+ }
+ outF.close();
+ timer.split(0,'m');
+ if(args.verbose){
+ message("Analyzed %ld reads:\n",readC);
+ if(ignoredMaxAlignments>0)message(" %ld ignored due to --limitA flag\n",ignoredMaxAlignments);
+ if(invalidN>0)message(" %ld had only invalid alignments (see warnings)\n",invalidN);
+ if(noN>0)message(" %ld had no alignments\n",noN);
+ message("The rest had %ld alignments:\n",pairedN+singleN+firstN+secondN+weirdN);
+ if(pairedN>0)message(" %ld paired alignments\n",pairedN);
+ if(firstN+secondN+weirdN>0)
+ message(" %ld half alignments (paired-end mates aligned independently)\n",firstN+secondN+weirdN);
+ if(singleN>0)message(" %ld single-read alignments\n",singleN);
+ //flushStdout();
+ messageFlush();
+ }else {
+ messageF("Alignments: %ld.\n",pairedN+singleN+firstN+secondN+weirdN);
+ }
+ readD.writeWarnings();
+ if(args.flag("veryVerbose")){
+ message("Number of weights cached: %ld\n",readD.getWeightNormCount());
+ }
+ // Deal with reads that failed to align {{{
+ if(args.isSet("failed")){
+ outF.open(args.getS("failed").c_str());
+ if(outF.is_open()){
+ for(set<string>::iterator setIt=failedReads.begin(); setIt!=failedReads.end();setIt++)
+ outF<<*setIt<<endl;
+ outF.close();
+ }
+ } //}}}
+ // Compute effective length and save transcript info {{{
+ if(args.isSet("trInfoFileName")){
+ if(args.verbose)messageF("Computing effective lengths.\n");
+ trInfo->setEffectiveLength(readD.getEffectiveLengths());
+ if(! trInfo->writeInfo(args.getS("trInfoFileName"))){
+ warning("Main: File %s probably already exists.\n"
+ " Will save new transcript info into %s-NEW.\n",(args.getS("trInfoFileName")).c_str(),(args.getS("trInfoFileName")).c_str());
+ if(! trInfo->writeInfo(args.getS("trInfoFileName")+"-NEW", true)){ // DO OVERWRITE
+ warning("Main: Writing into %s failed!.",(args.getS("trInfoFileName")+"-NEW").c_str());
+ }
+ }else {
+ if(args.verbose)message("Transcript information saved into %s.\n",(args.getS("trInfoFileName")).c_str());
+ }
+ if(args.verbose)timer.split(0,'m');
+ } //}}}
+ // Close, free and write failed reads if filename provided {{{
+ delete curF;
+ delete nextF;
+ delete validAF;
+ delete trInfo;
+ delete trSeq;
+ delete trExp;
+ samclose(samData);
+ // }}}
+ // }}}
+ if(args.verbose)message("DONE. ");
+ timer.split(7,'m');
+ return 0;
+}
+
+#ifndef BIOC_BUILD
+int main(int argc,char* argv[]){
+ return parseAlignment(&argc,argv);
+}
+#endif
+
+namespace ns_parseAlignment {
+
+bool nextFragDiffers(const ns_rD::fragmentP curF, const ns_rD::fragmentP nextF, bool mateNamesDiffer){//{{{
+ if(readNameCmp(bam1_qname(curF->first), bam1_qname(nextF->first))==0) return false;
+ if(nextF->paired && mateNamesDiffer && (readNameCmp(bam1_qname(curF->first), bam1_qname(nextF->second))==0)) return false;
+ return true;
+}//}}}
+
+long readNameCmp(const char *str1, const char *str2){//{{{
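+ // E.g. "read17/1" and "read17/2" compare equal (mate suffixes), while
+ // "read17" and "read18" do not.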
+ // Check the first character (so that we can look back later).
+ if(*str1 != *str2)return *str1 - *str2;
+ while(*str1 || *str2){
+ if(*str1 != *str2){
+ // They can differ in the last character if it's preceded by '/', ':' or '_'.
+ if(*str1 && *str2 && (!*(str1+1)) && (!*(str2+1)) &&
+ ((*(str1-1) == '/') || (*(str1-1) == ':') || (*(str1-1) == '_'))){
+ return 0;
+ }
+ return *str1 - *str2;
+ }
+ str1++;
+ str2++;
+ }
+ return 0;
+}//}}}
+
+bool readNextFragment(samfile_t* samData, ns_rD::fragmentP &cur, ns_rD::fragmentP &next){//{{{
+ static ns_rD::fragmentP tmpF = NULL;
+ bool currentOK = true;
+ // switch current to next:
+ tmpF = cur;
+ cur = next;
+ next = tmpF;
+ // check if current fragment is valid
+ if( !cur->first->data || ( *(cur->first->data) == '\0')){
+ // current fragment is invalid
+ currentOK = false;
+ }
+ // try reading next fragment:
+ if(samread(samData,next->first)<0){
+ // read failed: set next reads name to empty string
+ *(next->first->data) = '\0';
+ return currentOK;
+ }
+ // Read proper pairs OR pairs with both mates unmapped into one fragment.
+ if((next->first->core.flag & BAM_FPROPER_PAIR) ||
+ ((next->first->core.flag & BAM_FPAIRED) &&
+ (next->first->core.flag & BAM_FUNMAP) &&
+ (next->first->core.flag & BAM_FMUNMAP))){
+ next->paired = true;
+ // Try reading second mate.
+ if(samread(samData,next->second)<0) next->paired = false;
+ }else{
+ next->paired = false;
+ }
+ /* Note:
+ * Relying on BAM_FREAD2 as being the last read of template probably does not work.
+ */
+ return currentOK;
+}//}}}
+
+bool setInputFormat(const ArgumentParser &args, string *format){//{{{
+ if(args.isSet("format")){
+ *format = args.getLowerS("format");
+ if((*format =="sam")||(*format == "bam")){
+ return true;
+ }
+ warning("Unknown format '%s'.\n",format->c_str());
+ }
+ string fileName = args.args()[0];
+ string extension = fileName.substr(fileName.rfind(".")+1);
+ *format = ns_misc::toLower(extension);
+ if((*format =="sam")||(*format == "bam")){
+ if(args.verb())message("Assuming alignment file in '%s' format.\n",format->c_str());
+ return true;
+ }
+ message("Unknown extension '%s'.\n",extension.c_str());
+ error("Couldn't determine the type of input file, please use --format and check your input.\n");
+ return false;
+}//}}}
+
+bool openSamFile(const string &name, const string &inFormat, samfile_t **samFile){//{{{
+ if(*samFile != NULL)samclose(*samFile);
+ if(inFormat=="bam") *samFile = samopen(name.c_str(), "rb" , NULL);
+ else *samFile = samopen(name.c_str(), "r" , NULL);
+ if(*samFile == NULL){
+ error("Failed re-reading alignments.\n");
+ return false;
+ }
+ return true;
+}//}}}
+
+bool initializeInfoFile(const ArgumentParser &args, samfile_t *samFile, TranscriptInfo **trInfo, long *M){//{{{
+ if((samFile->header == NULL)||(samFile->header->n_targets == 0)){
+ if(! args.isSet("trInfoFileName")){
+ error("Main: alignment file does not contain header, or the header is empty.\n"
+ " Please either include header in alignment file or provide transcript information file.\n"
+ " (option --trInfoFile, file should contain lines with <gene name> <transcript name> <transcript length>.\n");
+ return false;
+ }else{
+ if(args.verb())message("Using %s for transcript information.\n",(args.getS("trInfoFileName")).c_str());
+ if((*trInfo = new TranscriptInfo(args.getS("trInfoFileName"))) && (*trInfo)->isOK()){
+ *M=(*trInfo)->getM();
+ }else {
+ error("Main: Can't get transcript information.\n");
+ return false;
+ }
+ }
+ }else{
+ if(args.verbose)message("Using alignments' header for transcript information.\n");
+ *M = samFile->header->n_targets;
+ vector<string> trNames(*M);
+ vector<long> trLengths(*M);
+ for(long i=0;i<*M;i++){
+ trNames[i] = samFile->header->target_name[i];
+ trLengths[i] = samFile->header->target_len[i];
+ }
+ *trInfo = new TranscriptInfo();
+ if(! (*trInfo)->setInfo(vector<string>(*M,"none"), trNames, trLengths)){
+ error("TranscriptInfo not initialized.\n");
+ return false;
+ }
+ }
+ return true;
+}//}}}
+
+} // namespace ns_parseAlignment
diff --git a/parseAlignment.py b/parseAlignment.py
new file mode 100755
index 0000000..13ecc18
--- /dev/null
+++ b/parseAlignment.py
@@ -0,0 +1,482 @@
+#!/usr/bin/python
+# Initialization {{{
+import sys
+import numpy as np
+def normpdf(x,m,s):
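+ # Gaussian density; the constant 2.5066282746310002 is sqrt(2*pi)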
+ return 1./(s*2.5066282746310002)*np.exp(-1./(2.0*s*s)*(x-m)**2.)
+import os, time # needed for this:
+time_str = time.strftime("%b %e %Y %H:%M:%S", time.gmtime(os.lstat(sys.argv[0]).st_mtime));
+print "###",os.path.basename(sys.argv[0]),"build:",time_str;
+# {{{ parse arguments and set filenames
+from optparse import OptionParser
+parser = OptionParser(usage="%prog [options]\n -a and -t are necessary\n -e is advised")
+parser.add_option("-T", "--transcriptPrefix", dest="tPref", help="Prefix of transcript names within MAP file (e.g. hg19_ensGene_ for ensembl genes from UCSC)", type="string")
+parser.add_option("-p", "--prefix", dest="pref", help="Experiment prefix, use same prefix for all files (.map, .tr, .prob)", type="string")
+parser.add_option("-a", "--alignmentFile", dest="aFile", help="Alignments file name", type="string")
+parser.add_option("-A", "--alignmentFileType", dest="aType", default="bowtie", help="Alignments file type", type="string")
+parser.add_option("-t", "--transcriptFile", dest="tFile", help="File with with list of transcripts (second column) and their lengths (third column, used later).", type="string")
+parser.add_option("-o", "--out", dest="oFile", help="Output name (should end with .prob).", type="string")
+parser.add_option("-N", "--totalN", dest = "totalN", help="Total number of reads. If <name>.map.bowtieLog does not exist this number has to be provided", type="int")
+parser.add_option("-i", "--inputType", dest = "inputType", help="Input file type determines the assignemnt of probability for each read (fastq, fastq33, fasta)", default="fastq");
+parser.add_option("-v", "--verbose", default=False, dest="verbose", action="store_true", help="Verbose output")
+parser.add_option("--vv", default=False, dest="veryVerbose", action="store_true", help="Very verbose output")
+parser.add_option("--paired", default=False, dest="paired", action="store_true", help="Flag fo paired alignemnts")
+parser.add_option("--IamSure", default=False, dest="amSure", action="store_true", help="I am sure I want to use this.")
+
+
+(options, args) = parser.parse_args()
+
+if not options.amSure:
+ sys.exit("Please use new implementation of parsing algorithm \"parseAlignment\". If you really want to use this program use the option --IamSure.");
+
+
+if options.tPref !=None:
+ prefixL = len(options.tPref);
+else:
+ prefixL = 0;
+
+if options.pref :
+ aFileName=options.pref+".map"
+ oFileName=options.pref+".prob"
+ tFileName=options.pref+".tr"
+else:
+ if not options.aFile:
+ sys.exit("Need alignemnt file name.");
+ if not options.oFile:
+ sys.exit("Need output file name.");
+ if not options.tFile:
+ sys.exit("Need transcript file name.");
+if options.aFile:
+ aFileName=options.aFile
+if options.oFile:
+ oFileName=options.oFile
+if options.tFile:
+ tFileName=options.tFile;
+#}}}
+#{{{ get total number of reads, possibly from <file>.map.bowtieLog
+Ntotal = 0
+if options.totalN :
+ Ntotal = options.totalN;
+else:
+ try:
+ bLog = open(aFileName+".bowtieLog");
+ for line in bLog:
+ if line.find("# reads processed:")>-1:
+ Ntotal = int( line[line.find("# reads processed:")+18:].split()[0] );
+ # in other words, take the first word after "reads processed:" and convert it to Ntotal
+ break;
+ bLog.close();
+ if Ntotal <= 0:
+ sys.exit("File read, but Ntotal was "+str(Ntotal));
+ except:
+ sys.exit( "Was not able to read file "+aFileName+".bowtieLog . Please provide number of reads (-N atribute) or the log file.")
+#}}}
+def nuc2i(str):#{{{
+ if str.lower() == "a": return 0;
+ if str.lower() == "c": return 1;
+ if str.lower() == "g": return 2;
+ if str.lower() == "t": return 3;
+ return 4;
+#}}}
+def verbose(str):#{{{
+ if options.verbose:
+ print str;
+#}}}
+verbose("Using files:\n "+aFileName+" for reading alignments\n "+oFileName+" for writing probabilities\n "+tFileName+" for writing transcript info");
+# {{{ reading transcript info
+try:
+ tFile = open(tFileName,"r")
+except:
+ sys.exit("Unable to open transcript file: "+tFileName+" .");
+
+trMap=dict()
+i=0;
+for line in tFile:
+ if line[0] == '#': continue;
+ trMap[line.split()[1]]=i+1;
+ #trMap[line.split()[1][prefixL:]]=i+1;
+ i+=1;
+trN=i;
+tFile.close();
+#}}}
+# {{{ open output file
+try:
+ oFile = open(oFileName,"w");
+except:
+ sys.exit("Unable to open output file: "+oFileName+" .");
+#}}}
+#{{{ open alignment file and check number of columns
+if options.aType != "bowtie":
+ sys.exit("Unrecognized alignment type.");
+try:
+ aFile = open(aFileName,"r")
+except:
+ sys.exit("Unable to open alignments file: "+aFileName+" .");
+
+alignment=aFile.readline().rstrip().split("\t");
+columnN=len(alignment)+1; # expect no mismatch info
+try:
+ x = int(alignment[columnN-2]); # this works if last column is NOT mismatch info
+except:
+ columnN -= 1; # otherwise decrease number of columns
+colS = columnN - 8; # if 8 columns, no shift necessary
+verbose("columns: "+str(columnN));
+aFile.seek(0);
+#}}}
+# }}}
+
+if options.inputType=="fasta": #{{{
+ minReadLength=25;
+ pseudoCount = 1.0;
+ nucProb = [[[pseudoCount for i in range(5)] for k in range(5)] for j in range(minReadLength)];
+ noiseProb = [pseudoCount for i in range(5)];
+
+ verbose("Estimating mismatch probability."); # {{{
+ readId=""
+ mismatch=""
+ hadMismatches=True;
+ readN = 0;
+ verbose("Use all reads, not only unique.");
+ for line in aFile:
+ alignment=line.rstrip().split("\t");
+
+ readSeq=alignment[4+colS]
+ if alignment[1+colS]=="-":
+ readSeq = readSeq[::-1];
+
+ if alignment[0] != readId or readSeq != seq:
+ readId=alignment[0];
+ readN+=1;
+ if not hadMismatches:
+ while len(seq) > len(nucProb):
+ nucProb.append([[pseudoCount for i in range(5)] for k in range(5)]);
+ for i in range(len(seq)):
+ nuc1 = nuc2i(seq[i]);
+ nucProb[i][nuc1][nuc1]+=1;
+
+ hadMismatches=False;
+ seq = readSeq;
+ mismatch=""
+ for nuc in seq:
+ noiseProb[nuc2i(nuc)]+=1
+
+ if len(alignment)==columnN:
+ if alignment[columnN-1] != mismatch:
+ while len(seq) > len(nucProb):
+ nucProb.append([[pseudoCount for i in range(5)] for k in range(5)]);
+ for i in range(len(seq)):
+ nuc1 = nuc2i(seq[i]);
+ nucProb[i][nuc1][nuc1]+=1;
+ hadMismatches = True;
+
+ mismatch=alignment[columnN-1]
+ mismatchArray = mismatch.split(",");
+ for mis in mismatchArray:
+ pos = int( mis.split(":")[0] );
+ nuc1 = nuc2i( mis.split(":")[1].split(">")[0] );
+ nuc2 = nuc2i( mis.split(":")[1].split(">")[1] );
+ # while pos >= len(nucProb):
+ # nucProb.append([[pseudoCount for i in range(5)] for k in range(5)]);
+ nucProb[pos][ nuc2 ][ nuc2 ]-=1;
+ if nucProb[pos][nuc2][nuc2]<1 : print pos,nuc2,seq,mismatch;
+ nucProb[pos][ nuc1 ][ nuc2 ]+=1;
+ # }}}
+ """verbose("Using only unique reads");#{{{
+for line in aFile:
+ alignment=line.split();
+ if alignment[4] != seq:
+ seq=alignment[4]
+
+ if mismatch != "":
+ mismatchArray = mismatch.split(",");
+ for mis in mismatchArray:
+ pos = int( mis.split(":")[0] );
+ nuc1 = mis.split(":")[1].split(">")[0];
+ nuc2 = mis.split(":")[1].split(">")[1];
+ while pos <= len(nucProb):
+ nusProb.append([[pseudoCount for i in range(5)] for k in range(5)]);
+ nucProb[pos][ nuc2i(nuc1) ][ nuc2i(nuc2) ]+=1;
+ if len(alignment>7):
+ mismatch=alignment[7];
+
+ for nuc in seq:
+ noiseProb[nuc2i(nuc)]+=1
+ else:
+ mismatch=""
+#}}}"""
+ verbose("Estimating probability of noise from aligned reads.") #{{{
+ total=sum(noiseProb);
+ for i in range(5):
+ noiseProb[i] /= total;
+
+ verbose("Estimating nucleotide mismatch matrix.");
+ for i in range(len(nucProb)):
+ for j in range(5):
+ total = sum( nucProb[i][j] );
+ for k in range(5):
+ nucProb[i][j][k] /= total;
+
+ if options.veryVerbose:
+ print "Noise probabilities: ";
+ print " ",;
+ print noiseProb;
+ print "Nucleotide mismatch matrix:";
+ for i in range(len(nucProb)):
+ print "Position ",i,":\n ",;
+ print nucProb[i];
+ #}}}
+ verbose("Writing alignment probabilities"); # {{{
+ aFile.seek(0);
+
+ alignment=aFile.readline().rstrip().split("\t");
+ readId=alignment[0];
+ if alignment[1+colS] == "+":
+ seq=alignment[4+colS];
+ else:
+ seq=alignment[4+colS][::-1];
+ prob = 1.0;
+ for nuc in seq:
+ prob *= noiseProb[nuc2i(nuc)];
+ alignments=[(0,alignment[1+colS],prob)];
+
+ aFile.seek(0);
+ alN = 0;
+ oFile.write("# Ntotal "+str(Ntotal)+"\n");
+ oFile.write("# Nmap "+str(readN)+"\n");
+
+ for line in aFile:
+ alignment=line.rstrip().split("\t");
+ alN+=1;
+
+ readSeq=alignment[4+colS]
+ if alignment[1+colS]=="-":
+ readSeq = readSeq[::-1];
+
+ # write old and init new reads
+ if readId!=alignment[0] or readSeq!=seq:
+ readId = readId.replace(" ","_");
+ oFile.write(readId+" "+str(len(alignments))+" alignments:");
+ for align in alignments:
+ oFile.write(" " + str(align[0]) + " " + align[1] + " " + str(align[2]));
+
+ oFile.write("\n");
+
+ readId=alignment[0];
+ seq = readSeq;
+ del alignments[:]
+ prob = 1.0;
+ for nuc in seq:
+ prob *= noiseProb[nuc2i(nuc)];
+ alignments.append((0,alignment[1+colS],prob));
+
+ # set transcript id
+ if alignment[2+colS][prefixL:] in trMap:
+ trans = trMap[ alignment[2+colS][prefixL:] ];
+ else:
+ trans = 0;
+ print "Transcript '"+alignment[2+colS]+"' or '"+alignment[2+colS][prefixL:]+"' was not found in the transcript file.";
+ #print alignment;
+ # calculate probabilities
+ prob=1.0;
+ for i in range(len(seq)):
+ nuc1 = nuc2i(seq[i]);
+ prob *= nucProb[i][nuc1][nuc1];
+
+ if len(alignment)==columnN:
+ mismatch=alignment[columnN-1]
+ mismatchArray = mismatch.split(",");
+ for mis in mismatchArray:
+ pos = int( mis.split(":")[0] );
+ nuc1 = nuc2i( mis.split(":")[1].split(">")[0] );
+ nuc2 = nuc2i( mis.split(":")[1].split(">")[1] );
+ prob /= nucProb[pos][ nuc2 ][ nuc2 ];
+ prob *= nucProb[pos][ nuc1 ][ nuc2 ];
+ # add new alignment to list
+ alignments.append( (trans, alignment[1+colS], prob) );
+ # if len(alignments)>2 and alignments[len(alignments)-1][2]!=alignments[len(alignments)-2][2]:
+ # print readId;
+
+
+ readId = readId.replace(" ","_");
+ oFile.write(readId+" "+str(len(alignments))+" alignments:");
+ for align in alignments:
+ oFile.write(" " + str(align[0]) + " " + str(align[1]) + " " + str(align[2]));
+ oFile.write("\n");
+ # }}}
+# end if options.inputType=="fasta" }}}
+else:
+ # {{{ qTOp functions
+ if options.inputType=="fastq": Qshift=64;
+ if options.inputType=="fastq33": Qshift=33;
+ phredWarning = False;
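+ # qTOp(Q): probability that the base call is correct, p = 1 - 10^(-phred/10).
+ # qTOpInvert(Q) returns (1-p)/p, used to turn a base's match probability p
+ # into the mismatch probability 1-p.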
+ def qTOp(Q):
+ global phredWarning # without this, the assignments below would make the flag local
+ phredS = float(ord(Q)-Qshift);
+ if phredS<0:
+ if not phredWarning:
+ print "WARNING: Phred score too low (",int(phredS),") perhaps use --inputType fastq33.";
+ phredWarning=True;
+ elif phredS>65:
+ if not phredWarning:
+ print "NOTE: Phred score unnaturally high (",int(phredS),") check your input type and perhaps set --inputType fastq.";
+ phredWarning=True;
+ return 1-10**( phredS / -10);
+ def qTOpInvert(Q):
+ p = 1-10**(float(ord(Q)-Qshift) / -10);
+ if p==0: return 1;
+ return (1-p)/p;
+ #}}}
+ # {{{ counting reads
+ readN = 0
+ rId = "";
+ seq = "";
+ phread = "";
+ aFile.seek(0);
+ frags=[]
+ while True:
+ line = aFile.readline();
+ if line == "": break; # empty line means end of file
+ if options.paired:
+ line2=aFile.readline();
+
+ alignment=line.rstrip().split("\t");
+ readId=alignment[0];
+ readSeq=alignment[4+colS]
+ readPhread=alignment[5+colS];
+
+ if readId != rId or readSeq != seq or readPhread != phread:
+ readN+=1;
+ rId=readId;
+ seq=readSeq;
+ phread=readPhread;
+ if options.paired:
+ frags.append( int(line2.rstrip().split("\t")[3+colS]) - int(alignment[3+colS]) );
+ if options.paired:
+ fragMu = np.mean(frags)
+ fragStD = np.std(frags)
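+ # fragMu/fragStD parameterize the Gaussian fragment-length model evaluated
+ # by normpdf() when scoring paired alignments below.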
+ # }}}
+ verbose("Writing alignment probabilities");
+ aFile.seek(0);
+ #{{{ read first read identifiers
+ alignment=aFile.readline().rstrip().split("\t");
+ readId=alignment[0];
+ if alignment[1+colS] == "+":
+ seq=alignment[4+colS];
+ phread=alignment[5+colS]
+ else:
+ seq=alignment[4+colS][::-1];
+ phread=alignment[5+colS][::-1]
+ prob=1.0;
+ for Q in phread:
+ prob *= qTOp(Q);
+
+ if options.paired: # second mate
+ align2 = aFile.readline().rstrip().split("\t");
+ fragL = int( align2[3+colS]) - int(alignment[3+colS]);
+ prob *= normpdf(fragL,fragMu,fragStD);
+ if align2[1+colS] == "+":
+ phread2=align2[5+colS]
+ else:
+ phread2=align2[5+colS][::-1]
+ for Q in phread2:
+ prob *= qTOp(Q);
+
+ alignments=[]
+ aFile.seek(0);
+ #}}}
+ alN = 0;
+ oFile.write("# Ntotal "+str(Ntotal)+"\n");
+ oFile.write("# Nmap "+str(readN)+"\n");
+
+ while True:
+ line=aFile.readline();
+ if line == "": break; # empty line means end of file
+ alignment=line.rstrip().split("\t");
+
+ alN+=1;
+
+ readSeq=alignment[4+colS]
+ readPhread=alignment[5+colS]
+ if alignment[1+colS]=="-":
+ readPhread = readPhread[::-1]
+ readSeq = readSeq[::-1];
+ if options.paired:
+ align2 = aFile.readline().rstrip().split("\t")
+ r2Phread = align2[5+colS]
+ if align2[1+colS]=="-":
+ r2Phread = r2Phread[::-1];
+ else: r2Phread = "";
+
+ # write old and init new reads
+ if readId!=alignment[0] or readSeq!=seq or readPhread!=phread:
+ readId = readId.replace(" ","_");
+ oFile.write(readId+" "+str(len(alignments)+1)+" alignments:");
+ minProb = 1;
+ for align in alignments:
+ if minProb > align[2]: minProb=align[2];
+ oFile.write(" " + str(align[0]) + " " + align[1] + " " + str(align[2]));
+ oFile.write(" 0 + " + str(minProb*qTOpInvert(phread[0])*qTOpInvert(phread[1])*qTOpInvert(phread[2])));
+      # add noise alignment with 3 extra mismatches on the first three bases
+ oFile.write("\n");
+
+ readId=alignment[0];
+ seq = readSeq;
+ phread=readPhread;
+ del alignments[:]
+ prob=1.0;
+ for Q in phread:
+ prob *= qTOp(Q);
+ if options.paired:
+ fragL = int(align2[3+colS])-int(alignment[3+colS]);
+ prob *= normpdf(fragL, fragMu, fragStD);
+ phread2=r2Phread;
+ for Q in phread2:
+ prob *= qTOp(Q);
+ # set transcript id
+ if alignment[2+colS][prefixL:] in trMap:
+ trans = trMap[ alignment[2+colS][prefixL:] ];
+ else:
+ trans = 0;
+ print "Transcript '"+alignment[2+colS]+"' or '"+alignment[2+colS][prefixL:]+"' was not found in the transcript file.";
+ #print alignment;
+ # calculate probabilities
+ probLoc = prob;
+ if len(alignment)==columnN:
+ mismatch=alignment[columnN-1]
+ mismatchArray = mismatch.split(",");
+ for mis in mismatchArray:
+ try:
+ pos = int( mis.split(":")[0] );
+ except:
+ pos=0;
+ print 'X',mis,'X',alignment;
+ probLoc = probLoc * qTOpInvert(phread[pos]);
+ if options.paired and len(align2)==columnN:
+ mismatch=align2[columnN-1]
+ mismatchArray = mismatch.split(",");
+ for mis in mismatchArray:
+ try:
+ pos = int( mis.split(":")[0] );
+ except:
+ pos=0;
+      print 'X',mis,'X',align2;
+ probLoc = probLoc * qTOpInvert(phread2[pos]);
+
+ # add new alignment to list
+ alignments.append( (trans, alignment[1+colS], probLoc) );
+ # if len(alignments)>2 and alignments[len(alignments)-1][2]!=alignments[len(alignments)-2][2]:
+ # print readId;
+
+ readId = readId.replace(" ","_");
+ oFile.write(readId+" "+str(len(alignments)+1)+" alignments:");
+ minProb = 1;
+ for align in alignments:
+ if minProb > align[2]: minProb=align[2];
+ oFile.write(" " + str(align[0]) + " " + str(align[1]) + " " + str(align[2]));
+ oFile.write(" 0 + " + str(minProb*qTOpInvert(phread[0])*qTOpInvert(phread[1])*qTOpInvert(phread[2])));
+  # add noise alignment with 3 extra mismatches on the first three bases
+ oFile.write("\n");
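+  # Each output line has the form
+  #   <readId> <alignment count> alignments: <trId> <strand> <prob> ... 0 + <noiseProb>
+  # where transcript id 0 stands for the noise transcript (unmatched
+  # transcript names are also mapped to 0) and the noise probability is the
+  # read's smallest alignment probability penalised by three extra mismatches.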
+
+
+print "Processed:\n ",alN,"alignments + (",readN,"noise alignments)\n ",readN,"reads\n ",trN,"transcripts\nTotal reads: ",Ntotal,"\n";
+aFile.close();
+oFile.close();
diff --git a/releaseDo.sh b/releaseDo.sh
new file mode 100755
index 0000000..f670359
--- /dev/null
+++ b/releaseDo.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Script that automates creating a new BitSeq release.
+# Copies the relevant files listed in releaseList, uses _release_Makefile as
+# the new Makefile (make sure it is correct) and copies the asa103, boost and
+# samtools directories.
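+#
+# Example (hypothetical directory name): ./releaseDo.sh bitseq-0.7.5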
+
+if [ $# -ne 1 ]
+then
+  echo "releaseDo.sh [dirName]"
+ exit
+fi
+
+DIR=$1
+
+if [ -d $DIR ]
+then
+  echo "Directory $DIR already exists!";
+ exit
+fi
+
+mkdir $DIR
+
+# Cleanup:
+make clean-all
+
+if [[ -d .svn ]]
+then
+ svn export asa103 $DIR/asa103
+ svn export boost $DIR/boost
+ svn export samtools $DIR/samtools
+else
+  echo "Copying asa103, boost and samtools to '$DIR'."
+  cp -rv asa103 boost samtools $DIR
+fi
+
+cp -v _release_Makefile $DIR/Makefile
+
+cp -v $( cat releaseList ) $DIR
+
+echo "==================" >> $DIR/README
+date >> $DIR/README
+if [[ -d .svn ]]
+then
+ svn info | grep -e "^Revision:" >> $DIR/README
+else
+ git log -1 | grep "commit" >> $DIR/README
+fi
+
+echo "REMINDER:"
+echo "The Makefile contains the current BitSeq version; update it if you haven't already."
diff --git a/releaseList b/releaseList
new file mode 100644
index 0000000..328fa0f
--- /dev/null
+++ b/releaseList
@@ -0,0 +1,55 @@
+ArgumentParser.cpp
+ArgumentParser.h
+CollapsedSampler.cpp
+CollapsedSampler.h
+common.cpp
+common.h
+convertSamples.cpp
+estimateDE.cpp
+estimateExpression.cpp
+estimateHyperPar.cpp
+estimateVBExpression.cpp
+extractSamples.cpp
+FileHeader.cpp
+FileHeader.h
+getFoldChange.cpp
+getGeneExpression.cpp
+getPPLR.cpp
+getVariance.cpp
+getWithinGeneExpression.cpp
+GibbsParameters.cpp
+GibbsParameters.h
+GibbsSampler.cpp
+GibbsSampler.h
+lowess.cpp
+lowess.h
+misc.cpp
+misc.h
+MyTimer.cpp
+MyTimer.h
+parseAlignment.cpp
+PosteriorSamples.cpp
+PosteriorSamples.h
+ReadDistribution.cpp
+ReadDistribution.h
+Sampler.cpp
+Sampler.h
+SimpleSparse.cpp
+SimpleSparse.h
+TagAlignments.cpp
+TagAlignments.h
+TranscriptExpression.cpp
+TranscriptExpression.h
+TranscriptInfo.cpp
+TranscriptInfo.h
+TranscriptSequence.cpp
+TranscriptSequence.h
+transposeFiles.cpp
+transposeFiles.h
+transposeLargeFile.cpp
+VariationalBayes.cpp
+VariationalBayes.h
+extractTranscriptInfo.py
+getCounts.py
+parameters1.txt
+README
diff --git a/tagAlignment.h b/tagAlignment.h
new file mode 100644
index 0000000..6fcf143
--- /dev/null
+++ b/tagAlignment.h
@@ -0,0 +1,37 @@
+#ifndef TAGALIGNMENT_H
+#define TAGALIGNMENT_H
+
+
+
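+// TagAlignment stores one read-to-transcript alignment: the transcript
+// index and the alignment probability. TagAlignment2 extends it with
+// lowProb, a second, lower probability estimate for the same alignment;
+// the commented-out 'strand' members appear to be left over from an
+// earlier version.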
+class TagAlignment{
+ protected:
+ long trId;
+// bool strand; // true = forward; false = reverse
+ long double prob;
+ public:
+ TagAlignment(long t=0,long double p = 0){
+ trId=t;
+// strand=s;
+ prob=p;
+ }
+ long getTrId()const {return trId;}
+ double getProb()const {return prob;}
+ void setProb(double p){prob=p;}
+};
+
+class TagAlignment2: public TagAlignment {
+ private:
+ long double lowProb;
+ public:
+ //TagAlignment(long t=0,bool s=true,long double p = 0,long double lp = 0){
+ TagAlignment2(long t=0,long double p = 0,long double lp = 0){
+ trId=t;
+// strand=s;
+ prob=p;
+ lowProb = lp;
+ }
+ double getLowProb()const {return lowProb;}
+};
+
+
+#endif
diff --git a/transposeFiles.cpp b/transposeFiles.cpp
new file mode 100644
index 0000000..3f15c1a
--- /dev/null
+++ b/transposeFiles.cpp
@@ -0,0 +1,146 @@
+#include<cstdlib>
+#include<fstream>
+#include<iomanip>
+#include<vector>
+
+using namespace std;
+
+#include "FileHeader.h"
+#include "transposeFiles.h"
+
+#include "common.h"
+
+bool transposeFiles(vector<string> inFileNames, string outFileName, bool verbose, string header){
+ long M=0,fileN=1,i,j,bufMax,bufN,m,n,totalN,maxN=0,f;
+ bool trans=false,transposed=false;
+ vector<long> N;
+ bufMax=BUFFER_DEFAULT;
+
+ ofstream outFile(outFileName.c_str());
+ if(!outFile.is_open()){//{{{
+ error("TransposeFile: Unable to open output file\n");
+      return false;
+ }//}}}
+ //{{{ Opening input
+ fileN = inFileNames.size();
+ ifstream *inFile = new ifstream[fileN];
+ totalN=0;
+ FileHeader fh;
+ for(i=0;i<fileN;i++){
+ inFile[i].open(inFileNames[i].c_str());
+ fh.setFile(&inFile[i]);
+ m = n = 0;
+ if((!fh.samplesHeader(&n,&m,&trans)) || (m == 0) || (n == 0)){
+ error("TransposeFile: Unable to read header of file: %s\n",(inFileNames[i]).c_str());
+ return false;
+ }
+ if(N.size()==0){
+ M=m;
+ transposed=trans;
+ maxN=n;
+ }else if((M!=m)||(transposed!=trans)){
+ error("TransposeFile: Different number of transcripts or file %s is in wrong format.\n",(inFileNames[i]).c_str());
+ return false;
+ }
+ outFile<<"# "<<inFileNames[i]<<" "<<n<<endl;
+ N.push_back(n);
+ if(n>maxN)maxN=n;
+ totalN+=n;
+ }
+ if(bufMax>M)bufMax=M;
+ //}}}
+
+   outFile<<header;
+ if(!trans)
+ outFile<<"# T (M rows,N cols)";
+ else
+ outFile<<"# (N rows,M cols)";
+ outFile<<"\n# M "<<M<<"\n# N "<<totalN<<endl;
+ outFile.precision(9);
+ outFile<<scientific;
+ if(verbose)message("Transposing files:\n Samples: %ld Transcripts: %ld Buffer size: %ld\n",totalN,M,bufMax);
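+   // Both branches below transpose in blocks: up to bufMax values per row
+   // are read in one pass, and the stream position of each partially read
+   // row is remembered with tellg() and restored with seekg() on the next
+   // pass, so the whole matrix never has to fit in memory at once.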
+ if(!trans){ // {{{
+ vector< vector<long> > seeks(fileN,vector<long>(maxN,-1));
+ vector<vector<string> > valueBuf(bufMax,vector<string>(totalN));
+ long lastBuf = 0, done=0;
+ bufN=bufMax;
+ if(verbose)messageF("(r");
+ for(f=0;f<fileN;f++){
+ for(i=0;i<N[f];i++){
+ for(j=0;j<bufN;j++) inFile[f]>>valueBuf[j][lastBuf];
+ lastBuf++;
+ seeks[f][i]=inFile[f].tellg();
+ inFile[f].ignore(10000000,'\n');
+ }
+ }
+ if(verbose)messageF(">w.");
+ for(j=0;j<bufN;j++){
+ for(i=0;i < lastBuf - 1;i++)
+ outFile<<valueBuf[j][i]<<" ";
+ // Write last value without space.
+ outFile<<valueBuf[j][i]<<endl;
+ }
+ lastBuf=0;
+ done=bufN;
+ while(done<M){
+ bufN=bufMax;
+ if(M-done<bufMax)bufN=M-done;
+ if(verbose)messageF("r");
+ for(f=0;f<fileN;f++){
+ for(i=0;i<N[f];i++){
+ inFile[f].seekg(seeks[f][i]);
+ for(j=0;j<bufN;j++) inFile[f]>>valueBuf[j][lastBuf];
+ lastBuf++;
+ seeks[f][i]=inFile[f].tellg();
+ }
+ }
+ if(verbose)messageF(">w.");
+ for(j=0;j<bufN;j++){
+ for(i=0;i < lastBuf - 1;i++)
+ outFile<<valueBuf[j][i]<<" ";
+ // Write last value without space.
+ outFile<<valueBuf[j][i]<<endl;
+ }
+ lastBuf=0;
+ done+=bufN;
+ }
+ for(f=0;f<fileN;f++)inFile[f].close();
+ if(verbose)message(")\n");
+ } // }}}
+ else{ // if(trans) {{{
+ vector<long> seeks(M,-1);
+ vector<vector<string> > valueBuf(M,vector<string>(bufMax));
+ long done;
+ if(verbose)message("(");
+ for(f=0;f<fileN;f++){
+ seeks.assign(M,-1);
+ done = 0;
+ while(done<N[f]){
+ bufN=bufMax;
+ if(bufN>N[f]-done)bufN=N[f]-done;
+ if(verbose)messageF("r");
+ for(j=0;j<M;j++){
+ if(seeks[j]!=-1)inFile[f].seekg(seeks[j]);
+ for(i=0;i<bufN;i++){
+ inFile[f]>>valueBuf[j][i];
+ }
+ seeks[j]=inFile[f].tellg();
+ if((j+1<M)&&(seeks[j+1]==-1))inFile[f].ignore(100000000,'\n');
+ }
+ if(verbose)messageF(">w.");
+ for(i=0;i<bufN;i++){
+ for(j=0;j < M - 1;j++)
+ outFile<<valueBuf[j][i]<<" ";
+ // Write last value without space.
+ outFile<<valueBuf[j][i]<<endl;
+ }
+ done+=bufN;
+ }
+ inFile[f].close();
+ }
+ if(verbose)message(")\n");
+ } //}}}
+ delete[] inFile;
+ outFile.close();
+ return true;
+}
diff --git a/transposeFiles.h b/transposeFiles.h
new file mode 100644
index 0000000..da9ff70
--- /dev/null
+++ b/transposeFiles.h
@@ -0,0 +1,4 @@
+
+#define BUFFER_DEFAULT 20000
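+// Default per-pass block size for transposeFiles(); it caps one dimension
+// of the in-memory value buffer.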
+
+bool transposeFiles(vector<string> inFileNames, string outFileName, bool verbose, string header = "");
diff --git a/transposeLargeFile.cpp b/transposeLargeFile.cpp
new file mode 100644
index 0000000..aba31a9
--- /dev/null
+++ b/transposeLargeFile.cpp
@@ -0,0 +1,22 @@
+#include "ArgumentParser.h"
+#include "transposeFiles.h"
+#include "common.h"
+
+int main(int argc,char* argv[]){
+ string programDescription =
+"Transposes [input files] into [outFileName] so that there are M lines with N columns each.";
+ ArgumentParser args(programDescription,"[input files]",1);
+ args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
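+   // Example invocation (hypothetical file names):
+   //   transposeLargeFile -o merged_T.txt samples_1.txt samples_2.txt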
+ if(!args.parse(argc,argv))return 0;
+ if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
+
+ if(transposeFiles(args.args(),args.getS("outFileName"),args.verbose)){
+ if(args.verbose)message("DONE.\n");
+ return 0;
+ }else{
+ error("Failed.\n");
+ return 1;
+ }
+}
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bitseq.git