AUTHORS | 14 +
LICENSE | 143 +
Makefile | 68 +
README | 33 +
README.install | 13 +
README.sources | 43 +
chaining/Makefile | 51 +
chaining/file.h | 733 +++
chaining/filters.h | 331 ++
chaining/graph.h | 578 ++
chaining/lcbchecks.h | 905 +++
chaining/mincut.h | 975 ++++
chaining/synchain-mugsy.cpp | 2237 ++++++++
delta-dups.sh | 21 +
fixMAFnames.pl | 28 +
labelblocks.pl | 59 +
maf2fasta.pl | 73 +
maf2gp.pl | 75 +
maf2synchain.pl | 115 +
mapping/AlignmentTree.pm | 1476 +++++
mapping/IntervalTree.pm | 154 +
mapping/Makefile | 20 +
mapping/README | 39 +
mapping/README.example | 40 +
mapping/bsmlindex.pl | 67 +
mapping/chadoindex.pl | 4 +
mapping/featureindex.pl | 119 +
mapping/intersect.pl | 58 +
mapping/mafindex.pl | 139 +
mapping/mapfeatures.pl | 3865 +++++++++++++
mapping/mugsy-annotator | 48 +
mapping/mugsyindex.pl | 38 +
mapping/mugsymapper | 34 +
mapping/query.pl | 19 +
mapping/reportvariants.pl | 118 +
mapping/testitree.pl | 327 ++
mapping/xmfaindex.pl | 145 +
mugsy | 1013 ++++
mugsy-seqan/projects/library/apps/Makefile | 48 +
mugsy-seqan/projects/library/apps/mugsy/mugsy.cpp | 6035 ++++++++++++++++++++
.../projects/library/apps/mugsy/rna_alphabet.h | 305 +
.../projects/library/apps/mugsy/transformcoords.h | 36 +
mugsyWGA | 1 +
mugsyenv.sh | 8 +
mumi.sh | 140 +
mumi_fasta.sh | 95 +
plot.pl | 403 ++
splitmaf.pl | 48 +
synchain-mugsy | 1 +
util/mafgrep.pl | 55 +
util/mafstats.pl | 600 ++
util/reportvariants.pl | 118 +
xmfa2maf.pl | 116 +
54 files changed, 22243 insertions(+)
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..9fe6679
--- /dev/null
@@ -0,0 +1,14 @@
+Sam Angiuoli <angiuoli at cs.umd.edu>
+Mugsy utilizes Seqan 1.2 (Doring et al. BMC Bioinformatics. 2008) and
+MUMmer 3.20 (Kurtz, S et al. Genome Biology. 2004). The version of
+these sources with modifications used for Mugsy and respective license
+and copyright files are available in SVN at http://mugsy.sf.net.
+Mugsy: http://mugsy.sf.net
+Seqan: http://www.seqan.de
+Mummer: http://mummer.sourceforge.net
+November 2010
new file mode 100644
index 0000000..45fec2a
--- /dev/null
@@ -0,0 +1,16 @@
+*Mugsy 1.2.3 (12/21/2011)
+-Raised hard-coded max genome limit to 256
+-Recompile with more portable options
+-Fixes to correct reporting of unaligned seqs at beginning of contigs and in some repeats
+-Performance improvement for draft genomes
+*Mugsy 1.2.2 (5/25/2011)
+-Compilation and portability improvements
+*Mugsy 1.2.1 (12/16/2010)
+-Raised hard-coded max genome limit to 128. Plans to remove this limit in the future
+-Documentation updates and portability fixes
+*Mugsy 1.2 (11/1/2010)
+-First public release
+Citation: Angiuoli SV, Salzberg SL. Mugsy: Fast multiple alignment of closely related whole genomes. Bioinformatics. 2010 Dec 9.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..03d7dc6
--- /dev/null
@@ -0,0 +1,143 @@
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c7fcfab
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,68 @@
+#Set release name or install directory
+# NOTE(review): RELEASE_NAME and INSTALL_DIR are used below but no assignment is
+# visible in this hunk -- confirm they are defined here or passed on the command line.
+# Default goal: depends on the three build targets named below.
+all: nucmer synchain_mugsy mugsy_seqan
+# Depends on both install targets, then packs ${INSTALL_DIR} into ${RELEASE_NAME}.tgz.
+install: mugsy_install mummer_install
+ tar cvzf ${RELEASE_NAME}.tgz ${INSTALL_DIR}
+# NOTE(review): the three sub-make commands below presumably belong to the nucmer,
+# synchain_mugsy and mugsy_seqan targets listed by `all`; their target header lines
+# are not visible in this hunk -- confirm against the original file.
+ make -C MUMmer3.20 all
+ make -C chaining synchain-mugsy
+ make -C mugsy-seqan Project=mugsy
+# NOTE(review): presumably the mugsy_install recipe (header line not visible here).
+# Creates the install directory, copies mugsyenv.sh and rewrites its MUGSY_INSTALL
+# export to point at ${INSTALL_DIR}, then copies the wrapper, binaries and helper scripts.
+ mkdir -p ${INSTALL_DIR}
+ install mugsyenv.sh ${INSTALL_DIR}
+ perl -pi -e 's|export MUGSY_INSTALL=.*|export MUGSY_INSTALL=${INSTALL_DIR}|' ${INSTALL_DIR}/mugsyenv.sh
+ install mugsy ${INSTALL_DIR}
+ install mugsyWGA ${INSTALL_DIR}
+ install synchain-mugsy ${INSTALL_DIR}
+ install maf2fasta.pl ${INSTALL_DIR}
+ install xmfa2maf.pl ${INSTALL_DIR}
+ install fixMAFnames.pl ${INSTALL_DIR}
+ install splitmaf.pl ${INSTALL_DIR}
+ install plot.pl ${INSTALL_DIR}
+ install delta-dups.sh ${INSTALL_DIR}
+# Documentation files are installed read-only for group/other (mode 644).
+ install -m 644 README ${INSTALL_DIR}
+ install -m 644 CHANGELOG ${INSTALL_DIR}
+# NOTE(review): LICENSE is installed twice in a row; the second line is redundant
+# (or was meant to install a different file) -- confirm.
+ install -m 644 LICENSE ${INSTALL_DIR}
+ install -m 644 LICENSE ${INSTALL_DIR}
+ install -m 644 README.install ${INSTALL_DIR}
+# NOTE(review): presumably the mummer_install recipe (header line not visible here).
+# Copies the bundled MUMmer 3.20 executables and the Foundation.pm script module.
+ mkdir -p ${INSTALL_DIR}/MUMmer3.20/scripts
+ install MUMmer3.20/delta-filter ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/gaps ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/mgaps ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/delta2maf ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/aux_bin/postnuc ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/aux_bin/prenuc ${INSTALL_DIR}/MUMmer3.20/
+# install MUMmer3.20/src/tigr/show-coords ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/mummer ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/mummerplot ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/nucmer ${INSTALL_DIR}/MUMmer3.20/
+ install MUMmer3.20/scripts/Foundation.pm ${INSTALL_DIR}/MUMmer3.20/scripts
+# The targets below are disabled: the mapping tools and labelblocks.pl are not installed.
+# mapping_install:
+# @install -d perllibs ${INSTALL_DIR}
+# @install mapping/mafindex.pl ${INSTALL_DIR}
+# @install mapping/AlignmentTree.pm ${INSTALL_DIR}
+# @install mapping/IntervalTree.pm ${INSTALL_DIR}
+# @install mapping/featureindex.pl ${INSTALL_DIR}
+# @install mapping/mapfeatures.pl ${INSTALL_DIR}
+# multiz_install:
+# @install labelblocks.pl ${INSTALL_DIR}
diff --git a/README b/README
new file mode 100644
index 0000000..7ae6c88
--- /dev/null
+++ b/README
@@ -0,0 +1,33 @@
+Mugsy - multiple whole genome alignment tool
+Angiuoli SV, Salzberg SL. Mugsy: Fast multiple alignment of closely related whole genomes. Bioinformatics. 2010 Dec 9.
+See README.install for installation
+To run
+%source mugsyenv.sh
+%mugsy --help
+Mugsy generates MAF formatted multiple alignments from FASTA inputs. The
+wrapper script 'mugsy' invokes all the steps to calculate the multiple alignment.
+Example invocation
+% mugsy --directory /local/scratch --prefix mygenomes genome1.fsa genome2.fsa genome3.fsa
+The core executables are
+mugsyWGA - whole genome aligner based on Seqan::TCoffee
+synchain-mugsy - segmentation program to produce locally collinear
+blocks (LCBs) from a set of anchors
+nucmer - 3.20 release bundled for convenience with new utility
+delta2maf and modified delta-filter to add support for reporting duplications
diff --git a/README.install b/README.install
new file mode 100644
index 0000000..2373ecd
--- /dev/null
+++ b/README.install
@@ -0,0 +1,13 @@
+The x86-64 tar contains pre-compiled binaries for 64-bit x86 machines running Linux.
+Untar the release in the target installation area
+ tar xvzf mugsy-x86-64-vXrX.tgz -C /path/to/install_dir/
+Edit MUGSY_INSTALL in mugsyenv.sh and set to absolute path of the
+installation directory
+In bash, run
+ source mugsyenv.sh
diff --git a/README.sources b/README.sources
new file mode 100644
index 0000000..61bcd65
--- /dev/null
+++ b/README.sources
@@ -0,0 +1,43 @@
+To build, run make all
+Original Mugsy sources for LCB identification code
+Requires the Boost library
+Mugsy includes some 3rd party sources to build
+A copy of the original sources from http://mummer.sourceforge.net.
+Modifications include delta-filter -b for reporting duplications and
+new utility delta2maf
+A copy of the original sources from the Seqan library and
+build environment that is required to build mugsyWGA
+The original sources were obtained from here http://www.seqan.de/
+New sources for mugsyWGA are in projects/library/apps/mugsy
+Additional changes to support reversals include these sources
+A library of maf conversion utilities was built from Multiz
+downloaded from here http://www.bx.psu.edu/miller_lab/multiz-tba.012109.tar.gz
+built with
+libmaf.a: mz_scores.o charvec.o nib.o seq.o multi_util.o maf.o util.o
+ ar rsc $@ mz_scores.o charvec.o nib.o seq.o multi_util.o maf.o util.o
diff --git a/chaining/Makefile b/chaining/Makefile
new file mode 100644
index 0000000..d57ebf1
--- /dev/null
+++ b/chaining/Makefile
@@ -0,0 +1,51 @@
+#-- Imported variables from top level makefile
+# Compiler flags. NOTE(review): the Boost include path is a hard-coded, site-specific
+# absolute path (Boost 1.38); builds on other machines need to override CPPFLAGS.
+CPPFLAGS = -I /usr/local/projects/angiuoli/boost/include/boost-1_38 -pedantic -ftemplate-depth-200 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -O3
+#-Wall -mfpmath=sse -msse2
+#-march=nocona -mfpmath=sse -msse2
+# -march=pentium4
+#CPPFLAGS = -ggdb -pg
+#CPPFLAGS = -I /usr/local/projects/angiuoli/boost/include/boost-1_38 -pg
+# NOTE(review): the bodies and closing `endif` lines of the two conditionals below are
+# not visible in this hunk -- presumably they set defaults for BIN_DIR / AUX_BIN_DIR; confirm.
+ifndef BIN_DIR
+ifndef AUX_BIN_DIR
+# Command templates: compile one object, link a binary into BIN_DIR, link a binary into
+# AUX_BIN_DIR. OBJ_RULE and BIN_RULE pass -static and use the same hard-coded Boost
+# location as CPPFLAGS; BIN_RULE links bgl-viz and the gcc41 multi-threaded boost_graph.
+OBJ_RULE = $(CXX) $(CPPFLAGS) $< -static -I /usr/local/projects/angiuoli/boost/include/boost-1_38 -c -o $@
+BIN_RULE = $(CXX) $(CPPFLAGS) $^ -static -L /usr/local/projects/angiuoli/boost/lib/ -lbgl-viz -lboost_graph-gcc41-mt -o $(BIN_DIR)/$@; \
+ chmod 755 $(BIN_DIR)/$@
+AUX_BIN_RULE = $(CXX) $(CPPFLAGS) $^ -o $(AUX_BIN_DIR)/$@; \
+ chmod 755 $(AUX_BIN_DIR)/$@
+# List of programs built by `all`.
+ALL := synchain-mugsy
+#-- PHONY rules --#
+.PHONY: all clean
+all: $(ALL)
+# NOTE(review): as shown, the three rm lines below attach to `all`, which would delete
+# the freshly built outputs; a `clean:` target header (declared .PHONY above) appears
+# to be missing between `all` and these lines -- confirm against the original file.
+ rm -f *.o *~
+ cd $(BIN_DIR); rm -f $(ALL)
+ cd $(AUX_BIN_DIR); rm -f $(ALL)
+#-- not so PHONY rules --#
+# NOTE(review): no recipe lines are visible for the two rules below; presumably they
+# invoke $(OBJ_RULE) and $(BIN_RULE) respectively -- confirm.
+synchain-mugsy.o: synchain-mugsy.cpp graph.h filters.h lcbchecks.h file.h mincut.h
+synchain-mugsy: synchain-mugsy.o
diff --git a/chaining/file.h b/chaining/file.h
new file mode 100644
index 0000000..6513777
--- /dev/null
+++ b/chaining/file.h
@@ -0,0 +1,733 @@
+//File IO
+/*Block format is 6 column, space separated:
+anchor seqindex genomeindex orient beg end */
+/**
+ * Read a space-separated 6-column anchor table from `in` into graph `g`.
+ *
+ * Columns: anchor name, sequence name, genome name, orientation (+ or -),
+ * begin, end. A token that fails to parse leaves the column counter where it
+ * is, so a row that never reaches six parsed fields is ignored.
+ *
+ * For every complete row:
+ *  - the sequence name is mapped to an index in `sequence2index`
+ *    (new names get index == map size after insertion, i.e. 1-based);
+ *  - the anchor is mapped to a vertex in `name2vertex` (created on first use);
+ *  - (sequence index, orientation) is added to the vertex's vertex_orient set;
+ *  - (begin,end) is stored in `coordinates` under (vertex, sequence index) and
+ *    vertex_len is set to end-begin.
+ * Afterwards, for every known sequence, the vertices with coordinates on that
+ * sequence are sorted with coordsorder_vertex and an edge is added (or an
+ * existing edge is annotated) when the gap to the reference vertex is
+ * <= `distance`.
+ *
+ * @param in              input stream holding the table
+ * @param g               graph that receives vertices and edges
+ * @param name2vertex     anchor name -> vertex lookup, updated in place
+ * @param genome2index    not used by this function (the genome column is parsed only)
+ * @param sequence2index  sequence name -> index lookup, updated in place
+ * @param coordinates     (vertex, sequence index) -> (begin,end), updated in place
+ * @param distance        maximum gap for which an edge is created
+ *
+ * NOTE(review): in the text this was restored from, the edge-building loops sit
+ * inside the per-line while loop and `prevvertex` is only ever assigned the
+ * first vertex of the sorted list. Both are kept as found -- confirm intent.
+ */
+void read_blocks(std::istream &in,
+                 Graph & g,
+                 NameVertexMap & name2vertex,
+                 NameLabelMap & genome2index,
+                 NameLabelMap & sequence2index,
+                 VertexLabelIntervalMap & coordinates,
+                 int distance){
+  NameLabelMap::iterator pos1;
+  NameVertexMap::iterator pos;
+  VertexLabelIntervalMap::iterator pos2;
+  bool inserted;
+  Label seqindex=0;
+  Vertex news;
+  Edge e1;
+  int edges=0;
+  bool found;
+  std::string line;
+  typedef tokenizer<char_separator<char> > Tok;
+  int field=0;
+  VertexName sname=0;
+  std::string sequence,genome;
+  Orientation sorient=false;
+  Coordinate sbeg=0,send=0;
+  int dist=0;
+  std::string sorientstr;
+  property_map < Graph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+  property_map < Graph, vertex_len_t >::type lenmap = get(vertex_len,g);
+#if defined(STORE_EDGE_LABELS)
+  property_map < Graph, edge_label_t >::type labelmap = get(edge_label,g);
+#endif
+  property_map < Graph, edge_labelmask_t >::type elabelmaskmap = get(edge_labelmask,g);
+  while (getline(in, line)) {
+    Tok tok(line, char_separator<char>(" "));
+    field=0;
+    for (Tok::iterator id = tok.begin(); id != tok.end(); ++id) {
+      //Parse failures are swallowed on purpose: field is not advanced, so the
+      //row ends up incomplete and is ignored below
+      switch(field){
+      case 0:
+        try{
+          sname = lexical_cast<VertexName>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 1:
+        try{
+          sequence = lexical_cast<std::string>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 2:
+        try{
+          genome = lexical_cast<std::string>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 3:
+        try {
+          sorientstr = lexical_cast<std::string>(*id);
+          if(sorientstr == "+"){
+            sorient = true;
+          }
+          else{
+            assert(sorientstr=="-");
+            sorient = false;
+          }
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 4:
+        try {
+          sbeg = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 5:
+        try {
+          send = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      }
+      //6 column table
+      if(field==6){
+        //Set either returns existing index or inserts
+        tie(pos1, inserted) = sequence2index.insert(std::make_pair(sequence, 0));
+        if (inserted) {
+          pos1->second = sequence2index.size();
+        }
+        seqindex = pos1->second;
+        assert(seqindex>=0&&seqindex<BITMAX);
+        //Add source vertex
+        //Set either returns existing vertex or inserts
+        tie(pos, inserted) = name2vertex.insert(std::make_pair(sname, Vertex()));
+        if (inserted) {
+          news = add_vertex(VertexProperties(sname),g);
+          pos->second = news;
+        } else{
+          news = pos->second;
+        }
+        //Add oriented label
+        orientmap[news].insert(make_pair(seqindex,sorient));
+        //Save coordinates
+        assert(sbeg!=send);
+        tie(pos2, inserted) = coordinates.insert(std::make_pair(make_pair(news,seqindex), std::make_pair(sbeg,send)));
+        if(lenmap[news]>0){
+          assert(lenmap[news]==pos2->second.second-pos2->second.first);
+        }
+        lenmap[news] = pos2->second.second-pos2->second.first;
+      }
+      else{
+        //Ignoring line
+      }
+    }
+    //iterate over all seqs
+    NameLabelMap::iterator sit,sit_end;
+    sit_end = sequence2index.end();
+    for(sit = sequence2index.begin();sit!=sit_end;++sit){
+      Label seqidx = sit->second;
+      //store all vertices with this seqlabel
+      list<Vertex> sortedV;
+      boost::graph_traits<Graph>::vertex_iterator vit,vit_end;
+      vit_end = vertices(g).second;
+      VertexIntervalMap currcoords;
+      for(vit=vertices(g).first;vit!=vit_end;++vit){
+        VertexLabelIntervalMap::iterator cit = coordinates.find(std::make_pair(*vit,seqidx));
+        if(cit!=coordinates.end()){
+          sortedV.push_back(*vit);
+          currcoords.insert(std::make_pair(*vit,
+                                           cit->second));
+        }
+      }
+      //sort
+      sortedV.sort(coordsorder_vertex(&currcoords));
+      Vertex currvertex,prevvertex;
+      list<Vertex>::iterator it;
+      for(it=sortedV.begin();it!=sortedV.end();++it){
+        currvertex = *it;
+        if(it==sortedV.begin()){
+          prevvertex=currvertex;
+        }
+        else{
+          dist = abs(coordinates[std::make_pair(prevvertex,seqidx)].second - coordinates[std::make_pair(currvertex,seqidx)].first);
+          //Add edge if
+          assert(dist>=0);
+          if(dist <= distance){
+            //Edges are labelled with seqidx, the sequence being walked here.
+            //(seqindex still holds the sequence of the row parsed last and
+            //must not be used for labelling.)
+            tie(e1,found) = edge(prevvertex,currvertex, g);
+            if(found){
+              //existing edge between prevvertex--currvertex
+              //add attributes from Graph g edge,e to Graph gcomp edge,e1
+#if defined(STORE_EDGE_LABELS)
+              labelmap[e1].insert(std::make_pair(seqidx,dist));
+#endif
+              elabelmaskmap[e1].set(seqidx,1);
+            }
+            else{
+              //Code to handle directed graph where
+              //reverse orientation
+              //TODO
+              //Need to consider case where
+              //this edge is mis-oriented introducing an artificial breakpoint
+              //in the chain
+              tie(e1,found) = edge(currvertex,prevvertex,g);
+              if(found){
+#if defined(STORE_EDGE_LABELS)
+                labelmap[e1].insert(std::make_pair(seqidx,dist));
+#endif
+                elabelmaskmap[e1].set(seqidx,1);
+              }
+              else{
+#if defined(STORE_EDGE_LABELS)
+                LabelMap labels;
+                labels[seqidx] = dist;
+                tie(e1, inserted) = add_edge(prevvertex,currvertex,EdgeProperties(labels),g);
+#else
+                tie(e1, inserted) = add_edge(prevvertex,currvertex,EdgeProperties(),g);
+#endif
+                assert(inserted);
+                elabelmaskmap[e1].set(seqidx,1);
+              }
+            }
+            edges++;
+          }
+        }
+      }
+    }
+  }
+}
+/*Projection input is 11 column, space separated:
+anchor1 anchor2 seqindex dist genomeindex orient1 orient2 beg1 end1 beg2 end2
+0 1 0 0 0 + + 0 196 196 15348
+1 3 0 1 0 + + 196 15348 15349 20373 */
+/**
+ * Read a space-separated pairwise projection table from `in` into graph `g`.
+ *
+ * Columns (11): anchor1 anchor2 sequence dist genome orient1 orient2
+ * beg1 end1 beg2 end2. Rows that parse to exactly 5, 7 or 11 fields are
+ * accepted; a 5-field row prints "Incomplete file " and assumes both
+ * orientations are '+'. A token that fails to parse leaves the column counter
+ * unchanged, so a row that never reaches one of those counts is ignored.
+ *
+ * For every accepted row whose two anchor spans are both >= `minanchor`:
+ *  - the sequence and genome names are mapped to indices (new names get
+ *    index == map size after insertion, i.e. 1-based);
+ *  - both anchors are mapped to vertices (created on first use) and their
+ *    genome, oriented-label and label property sets are updated;
+ *  - the sequence -> genome association is recorded in `sequence2genome`;
+ *  - for 11-field rows the coordinates are stored and vertex_len is raised to
+ *    the largest span seen for the vertex;
+ *  - when dist <= `distance`, an edge anchor1--anchor2 is created, or the
+ *    existing edge (in either direction) is annotated with the genome index.
+ *
+ * @param in              input stream holding the table
+ * @param g               graph that receives vertices and edges
+ * @param name2vertex     anchor name -> vertex lookup, updated in place
+ * @param genome2index    genome name -> index lookup, updated in place
+ * @param sequence2index  sequence name -> index lookup, updated in place
+ * @param coordinates     (vertex, sequence index) -> (begin,end), updated in place
+ * @param sequence2genome sequence index -> genome index lookup, updated in place
+ * @param distance        maximum `dist` for which an edge is created
+ * @param minanchor       minimum span required of both anchors in a row
+ */
+void read_pairwiseprojection(std::istream &in,
+                             Graph & g,
+                             NameVertexMap & name2vertex,
+                             NameLabelMap & genome2index,
+                             NameLabelMap & sequence2index,
+                             VertexLabelIntervalMap & coordinates,
+                             SequenceGenomeMap & sequence2genome,
+                             int distance,
+                             int minanchor){
+  NameVertexMap::iterator pos;
+  NameLabelMap::iterator pos1;
+  VertexLabelIntervalMap::iterator pos2;
+  SequenceGenomeMap::iterator pos3;
+  bool inserted;
+  Label seqindex=0,genomeindex=0;
+  Vertex news, newt;
+  Edge e1;
+  int edges=0;
+  bool found;
+  std::string line;
+  typedef tokenizer<char_separator<char> > Tok;
+  int field=0;
+  VertexName sname=0,tname=0;
+  std::string sequence,genome;
+  Orientation sorient=false,torient=false;
+  Coordinate sbeg=0,send=0,tbeg=0,tend=0;
+  int dist=0;
+  std::string sorientstr;
+  std::string torientstr;
+  property_map < Graph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+  property_map < Graph, vertex_label_t >::type vlabelmap = get(vertex_label,g);
+  property_map < Graph, vertex_genome_t >::type genomemap = get(vertex_genome,g);
+  property_map < Graph, vertex_len_t >::type lenmap = get(vertex_len,g);
+  property_map < Graph, edge_labelmask_t >::type elabelmaskmap = get(edge_labelmask,g);
+#if defined(STORE_EDGE_LABELS)
+  property_map < Graph, edge_label_t >::type labelmap = get(edge_label,g);
+#endif
+  while (getline(in, line)) {
+    //std::cerr << line << std::endl;
+    Tok tok(line, char_separator<char>(" "));
+    field=0;
+    for (Tok::iterator id = tok.begin(); id != tok.end(); ++id) {
+      //Parse failures are swallowed on purpose: field is not advanced, so the
+      //row ends up with an unexpected field count and is ignored below
+      switch(field){
+      case 0:
+        try{
+          sname = lexical_cast<VertexName>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 1:
+        try{
+          tname = lexical_cast<VertexName>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 2:
+        try{
+          sequence = lexical_cast<std::string>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 3:
+        try{
+          dist = lexical_cast<long int>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 4:
+        try{
+          genome = lexical_cast<std::string>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 5:
+        try {
+          sorientstr = lexical_cast<std::string>(*id);
+          if(sorientstr == "+"){
+            sorient = true;
+          }
+          else{
+            assert(sorientstr=="-");
+            sorient = false;
+          }
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 6:
+        try {
+          torientstr = lexical_cast<std::string>(*id);
+          if(torientstr == "+"){
+            torient = true;
+          }
+          else{
+            assert(torientstr=="-");
+            torient = false;
+          }
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 7:
+        try {
+          sbeg = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 8:
+        try {
+          send = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 9:
+        try {
+          tbeg = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      case 10:
+        try {
+          tend = lexical_cast<Coordinate>(*id);
+          field++;
+        }
+        catch (const std::exception &){
+        }
+        break;
+      }
+    }
+    //5 column table is minimum input
+    //7 column table includes orientations
+    //11 column table includes coordinates
+    if(field==5 || field==7 || field==11){
+      if(field==5){
+        cerr << "Incomplete file "<< endl;
+        sorient = true;
+        torient = true;
+      }
+      if(abs(tend-tbeg)>=minanchor && abs(send-sbeg)>=minanchor){
+        //Set either returns existing index or inserts
+        tie(pos1, inserted) = sequence2index.insert(std::make_pair(sequence, 0));
+        if (inserted) {
+          pos1->second = sequence2index.size();
+        }
+        seqindex = pos1->second;
+        //assert(seqindex>=0&&seqindex<BITMAX);
+        //Add source vertex
+        //Set either returns existing vertex or inserts
+        tie(pos, inserted) = name2vertex.insert(std::make_pair(sname, Vertex()));
+        if (inserted) {
+          news = add_vertex(VertexProperties(sname),g);
+          pos->second = news;
+        } else{
+          news = pos->second;
+        }
+        //Add genome
+        tie(pos1, inserted) = genome2index.insert(std::make_pair(genome, 0));
+        if (inserted) {
+          pos1->second = genome2index.size();
+        }
+        genomeindex = pos1->second;
+        assert(genomeindex>=0&&genomeindex<BITMAX);
+        //Save sequence2genome lookup
+        //TODO
+        //Use genomeindex
+        //TEMP HACK to support 1 seq per genome
+        //genomeindex=seqindex;
+        //</HACK>
+        genomemap[news].insert(genomeindex);
+        tie(pos3, inserted) = sequence2genome.insert(std::make_pair(seqindex, genomeindex));
+        if (!inserted) {
+          //A sequence seen before must map to the same genome as before.
+          //This is a comparison: the previous '=' silently overwrote
+          //genomeindex with the stored value instead of checking it.
+          assert(seqindex==pos3->first);
+          assert(genomeindex==pos3->second);
+        }
+        //Add oriented label
+        orientmap[news].insert(make_pair(seqindex,sorient));
+        vlabelmap[news].insert(seqindex);
+        //Add target vertex
+        tie(pos, inserted) = name2vertex.insert(std::make_pair(tname, Vertex()));
+        if (inserted) {
+          newt = add_vertex(VertexProperties(tname),g);
+          pos->second = newt;
+        } else{
+          newt = pos->second;
+        }
+        genomemap[newt].insert(genomeindex);
+        orientmap[newt].insert(make_pair(seqindex,torient));
+        vlabelmap[newt].insert(seqindex);
+        //Save coordinates
+        if(field==11){
+          assert(sbeg!=send);
+          assert(tbeg!=tend);
+          tie(pos2, inserted) = coordinates.insert(std::make_pair(make_pair(news,seqindex), std::make_pair(sbeg,send)));
+          //std::cerr << "Coords source V:" << news << " " << sbeg << "-" << send << std::endl;
+          //Keep the longest span recorded for the vertex
+          if(lenmap[news]>0){
+            //assert(lenmap[news]==pos2->second.second-pos2->second.first);
+            lenmap[news] = (lenmap[news]>pos2->second.second-pos2->second.first) ? lenmap[news] : pos2->second.second-pos2->second.first;
+          }
+          else{
+            lenmap[news] = pos2->second.second-pos2->second.first;
+          }
+          tie(pos2, inserted) = coordinates.insert(std::make_pair(make_pair(newt,seqindex), std::make_pair(tbeg,tend)));
+          //std::cerr << "Coords target V:" << newt << " " << tbeg << "-" << tend << std::endl;
+          if(lenmap[newt]>0){
+            //assert(lenmap[newt]==pos2->second.second-pos2->second.first);
+            lenmap[newt] = (lenmap[newt]>pos2->second.second-pos2->second.first) ? lenmap[newt] : pos2->second.second-pos2->second.first;
+          }
+          else{
+            lenmap[newt] = pos2->second.second-pos2->second.first;
+          }
+        }
+        //Add edge if
+        assert(dist>=0);
+        if(dist <= distance){
+          tie(e1,found) = edge(news,newt, g);
+          if(found){
+            //existing edge between news--newt
+            //add attributes from Graph g edge,e to Graph gcomp edge,e1
+#if defined(STORE_EDGE_LABELS)
+            labelmap[e1].insert(std::make_pair(genomeindex,dist));
+#endif
+            elabelmaskmap[e1].set(genomeindex,1);
+          }
+          else{
+            //Code to handle directed graph where
+            //reverse orientation
+            //TODO
+            //Need to consider case where
+            //this edge is mis-oriented introducing an artificial breakpoint
+            //in the chain
+            tie(e1,found) = edge(newt,news,g);
+            if(found){
+#if defined(STORE_EDGE_LABELS)
+              labelmap[e1].insert(std::make_pair(genomeindex,dist));
+#endif
+              elabelmaskmap[e1].set(genomeindex,1);
+            }
+            else{
+#if defined(STORE_EDGE_LABELS)
+              LabelMap labels;
+              labels[genomeindex] = dist;
+              tie(e1, inserted) = add_edge(news,newt,EdgeProperties(labels),g);
+#else
+              tie(e1, inserted) = add_edge(news,newt,EdgeProperties(),g);
+#endif
+              assert(inserted);
+              elabelmaskmap[e1].set(genomeindex,1);
+            }
+#ifdef DEBUG
+            std::cerr << "Added edge " << sname << " " << tname << std::endl;
+#endif
+          }
+          edges++;
+        }
+        else{
+#ifdef DEBUG
+          std::cerr << "Skipping edge dist>distance " << line << std::endl;
+#endif
+        }
+      }
+      else{
+#ifdef DEBUG
+        std::cerr << "Skipping short anchor " << line << std::endl;
+#endif
+        //Skipping short anchor
+      }
+    }
+    else{
+#ifdef DEBUG
+      std::cerr << "Ignoring line " << line << std::endl;
+#endif
+      //Ignoring line
+    }
+  }
+}
+//Add this to the dot file to force drawing of labels in large graphs
+//node [fontsize="9",margin="0.0,0.0",fixedsize=true];
+/**
+ * Write graph `g` to the file `fname` in Graphviz dot format.
+ *
+ * Vertex labels hold the vertex name, the vertex descriptor and the component
+ * id cc[v]; when `expandlabel` is set, one line per oriented sequence label is
+ * added with its orientation and coordinates. Vertices found in `maskedLCBs`
+ * are drawn as diamonds, all others as circles. Edges found in `maskedEdges`
+ * (in either direction) are drawn "dashed,bold", all others "solid"; the edge
+ * colour is derived from the edge_category property.
+ *
+ * @param g            graph (or graph view) to write
+ * @param fname        output file name
+ * @param cc           per-vertex component id, indexed by vertex descriptor
+ * @param coordinates  (vertex, sequence index) -> (begin,end) lookup
+ * @param maskedEdges  (source,target) pairs to highlight
+ * @param maskedLCBs   vertices to highlight
+ * @param edgelabelmap property map registered as the edge "label" attribute
+ * @param expandlabel  add per-sequence coordinate lines to vertex labels
+ *
+ * NOTE(review): coordinates is read with operator[]; if it is a std::map this
+ * inserts a default entry for missing keys -- confirm that is acceptable.
+ */
+template<typename TGraph, typename Tedgelabelmap>
+  void do_write_graphviz(TGraph &g,
+                         std::string fname,
+                         std::vector<int> & cc,
+                         VertexLabelIntervalMap & coordinates,
+                         EdgeSet & maskedEdges,
+                         VertexSet & maskedLCBs,
+                         Tedgelabelmap & edgelabelmap,
+                         bool expandlabel){
+  typedef typename TGraph::vertex_descriptor TVertex;
+  typedef typename TGraph::edge_descriptor TEdge;
+  //property_map < Graph,edge_stringname_t >::type edgelabelmap;// = get(edge_stringname, g);
+  typename property_map < TGraph,edge_category_t >::type ecatmap = get(edge_category,g);
+  //Set up dynamic properties for graphviz
+  boost::dynamic_properties dp;
+  dp.property("id", get(vertex_name, g));
+  std::map<TEdge,std::string> edgecatmap;
+  //Need to set edge, vertex labels
+  //Build label map
+  std::map<TVertex, std::string> vertexlabelmap;
+  std::map<TEdge, std::string> efmap;
+  std::map<TVertex, std::string> vfmap;
+  std::map<TVertex, std::string> shapemap;
+  std::map<TEdge, std::string> linemap;
+  //std::map< Graph::edge_descriptor, std::string> edgelabelmap;
+  for(typename boost::graph_traits<TGraph>::vertex_iterator
+        vit = vertices(g).first;vit!=vertices(g).second;++vit){
+    //Use the graph's own descriptor type so the template also works for
+    //graph types whose vertex_descriptor is not Vertex
+    TVertex v = *vit;
+    std::ostringstream labelstring;
+    labelstring << get(vertex_name,g,v) << " "
+                << v
+                << " "
+                << "CC" << cc[v];
+    //if(get(vertex_relorder,g,v)){
+    //labelstring << " TL" << get(vertex_relorder,g,v);
+    //}
+    labelstring << "\\n";
+    if(expandlabel){
+      OrientedLabelSet olabel = get(vertex_orient,g,v);
+      for(OrientedLabelSet::iterator it = olabel.begin();it!=olabel.end();++it){
+        //TODO support genomeidx
+        labelstring << "S" << it->first << ":" << (it->second ? '+' : '-')
+                    << ":"
+                    << coordinates[std::make_pair(v,it->first)].first << "-" << coordinates[std::make_pair(v,it->first)].second << "\\n";
+      }
+    }
+    labelstring << v << "\\n";
+    vertexlabelmap[v]=labelstring.str();
+    vfmap[v]="6";
+    if(maskedLCBs.find(v)!=maskedLCBs.end()){
+      shapemap[v]="diamond";
+    }
+    else{
+      shapemap[v]="circle";
+    }
+  }
+  for(typename boost::graph_traits<TGraph>::edge_iterator
+        eit = edges(g).first;eit!=edges(g).second;++eit){
+    TEdge e = *eit;
+    std::ostringstream labelstring;
+    unsigned int numset=0;
+#if defined(STORE_EDGE_LABELS)
+    LabelMap currlm = get(edge_label,g,e);
+    //labelstring << "MASKS:" << get(edge_labelmask,g,e) << "\\n";
+    for(LabelMap::iterator it = currlm.begin(); it!=currlm.end(); ++it){
+      labelstring << it->first << ":" << it->second << " ";
+      numset++;
+    }
+    assert(numset==currlm.size());
+#endif
+    if(maskedEdges.find(std::make_pair(source(e,g),target(e,g)))!=maskedEdges.end()
+       || maskedEdges.find(std::make_pair(target(e,g),source(e,g)))!=maskedEdges.end()){
+      //only true if g is of type filteredgraph assert(false);
+      linemap[e] = "dashed,bold";
+    }
+    else{
+      linemap[e] = "solid";
+    }
+    //edgelabelmap[e] = labelstring.str();
+    efmap[e]="6";
+    switch (ecatmap[e]){
+    case RED: //default
+      edgecatmap[e] = "red";
+      break;
+    case GREEN:
+      edgecatmap[e] = "green";
+      break;
+    case BLUE: //cut by mincut
+      edgecatmap[e] = "blue";
+      break;
+    case ORANGERED: //introduced via a merge
+      //NOTE(review): category ORANGERED is drawn in yellow -- confirm intended
+      edgecatmap[e] = "yellow";
+      break;
+    case PURPLE: //change in relative orientation
+      edgecatmap[e] = "purple";
+      break;
+    case CYAN:
+      edgecatmap[e] = "cyan";
+      break;
+    default:
+      assert(false);
+      break;
+    }
+  }
+  boost::associative_property_map< std::map<TVertex, std::string> >
+    vlabel_map(vertexlabelmap);
+  //boost::associative_property_map< std::map<Vertex, std::string> >
+  //vfontmap(vfmap);
+  //boost::associative_property_map< std::map<Edge, std::string> >
+  //efontmap(efmap);
+  boost::associative_property_map< std::map<TVertex, std::string> >
+    bshapemap(shapemap);
+  boost::associative_property_map< std::map<TEdge, std::string> >
+    blinemap(linemap);
+  //boost::associative_property_map< std::map<TEdge, Tedgelabel> >
+  //elmap(edgelabelmap);
+  boost::associative_property_map< std::map<TEdge, std::string> >
+    ecmap(edgecatmap);
+  dp.property("label",vlabel_map);
+  //dp.property("label",edgelabelmap);
+  dp.property("label",edgelabelmap);
+  dp.property("shape",bshapemap);
+  dp.property("style",blinemap);
+  //dp.property("fontsize",vfontmap);
+  //dp.property("fontsize",efontmap);
+  //NOTE(review): the edge colour map is registered twice, as in the original
+  dp.property("color",ecmap);
+  dp.property("color",ecmap);
+  //dp.property("rankdir","LR");
+  //dp.property("rotate","90");
+  //Open file
+  std::ofstream gout;
+  gout.open(fname.c_str());
+  if(!gout){
+    //Nothing could be written anyway; report instead of failing silently
+    std::cerr << "Unable to open " << fname << " for writing" << std::endl;
+    return;
+  }
+  std::string node_id("id");
+  std::map<std::string,std::string> graph_attr, vertex_attr, edge_attr;
+  graph_attr["rankdir"] = "LR";
+  //Must be a string literal: assigning the integer 90 to a std::string stores
+  //the single character with code 90 ('Z'), not the text "90"
+  graph_attr["rotate"] = "90";
+  write_graphviz(gout, g,
+                 dynamic_vertex_properties_writer(dp,node_id),
+                 dynamic_properties_writer(dp),
+                 make_graph_attributes_writer(graph_attr,vertex_attr,edge_attr));
+  //graph::detail::node_id_property_map<Vertex>(dp,node_id));
+  gout.close();
+}
+/**
+ * Convenience overload of do_write_graphviz without an edge label map.
+ *
+ * Builds an empty Edge -> string map, wraps it in an associative property map
+ * and forwards every argument to the full overload, so no edge receives a
+ * label from this call. `expandlabel` defaults to true.
+ */
+template<typename TGraph>
+void do_write_graphviz(TGraph &g,
+                       std::string fname,
+                       std::vector<int> & cc,
+                       VertexLabelIntervalMap & coordinates,
+                       EdgeSet & maskedEdges,
+                       VertexSet & maskedLCBs,
+                       bool expandlabel=true){
+  typedef std::map<Edge,std::string> EmptyLabelStore;
+  EmptyLabelStore nolabels;
+  boost::associative_property_map< EmptyLabelStore > nolabelmap(nolabels);
+  do_write_graphviz(g,
+                    fname,
+                    cc,
+                    coordinates,
+                    maskedEdges,
+                    maskedLCBs,
+                    nolabelmap,
+                    expandlabel);
+}
diff --git a/chaining/filters.h b/chaining/filters.h
new file mode 100644
index 0000000..ca99697
--- /dev/null
+++ b/chaining/filters.h
@@ -0,0 +1,331 @@
+// Graph filters
+//Filter edges and exclude all edges that don't match sequences in matchlabel
+//Fast edge filter using bit masks predefined on mindist in setedgesmask
+//Edge predicate: accepts an edge only when every bit set in matchlabels is
+//also set in the mask stored for that edge in m_label.
+template <typename EdgeLabelMap, typename LabelContainer>
+struct distance_label_filter_bv {
+  distance_label_filter_bv() { }
+  distance_label_filter_bv(EdgeLabelMap label, LabelContainer &lc)
+    : m_label(label), matchlabels(lc) { }
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    //matchlabels must be a subset of the edge's mask
+    return (m_label[e]&matchlabels)==matchlabels;
+  }
+  EdgeLabelMap m_label;
+  LabelContainer matchlabels;
+//Edge predicate: accepts an edge when every label attached to the edge is
+//also present in matchlabels.
+//labelmap    - property map from an edge to its label container
+//matchlabels - labels to test against; must be non-empty
+//std::includes requires both ranges to be sorted consistently
+template <typename TEdgeLabelMap, typename TLabelSet>
+struct edge_label_filter {
+  edge_label_filter() { }
+  edge_label_filter(TEdgeLabelMap l,
+                    TLabelSet s)
+    : labelmap(l),
+      matchlabels(s){}
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    assert(matchlabels.size()!=0);
+    //Return a value on every path: flowing off the end of a non-void
+    //function when the labels do not match is undefined behavior
+    return std::includes(matchlabels.begin(),matchlabels.end(),
+                         labelmap[e].begin(),labelmap[e].end());
+  }
+  TEdgeLabelMap labelmap;
+  TLabelSet matchlabels;
+//Edge predicate: keeps an edge when at least one of its endpoints is a
+//member of the vertex set pointed to by m_snodes.
+template <typename TGraph>
+struct snode_efilter {
+  //Initializers are listed in member declaration order
+  snode_efilter()
+    :m_snodes(NULL),G(NULL)
+  {}
+  snode_efilter(std::set<typename TGraph::vertex_descriptor> *m,TGraph *gin)
+    :m_snodes(m),G(gin)
+  {}
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    //The target is only looked up when the source is not in the set
+    return m_snodes->find(source(e,*G))!=m_snodes->end()
+      || m_snodes->find(target(e,*G))!=m_snodes->end();
+  }
+  std::set<typename TGraph::vertex_descriptor> *m_snodes;
+  TGraph *G;
+//Filter vertices and exclude all vertices that do not contain a subset of sequences defined by matchlabels
+//and (optional) orientation matchorient
+//Passing an empty matchorient container disables filtering by orientation
+//Implemented using bitmasks preset based on distance within setvertexmasks
+//Refactor to another container besides BitMask and ensure proper filtering
+//Vertex predicate on bit masks. A vertex passes when its label mask contains
+//every bit of matchlabels and its orientation mask, restricted to
+//matchlabels, equals matchorient or matchrevorient. An orientation mask with
+//no bits set makes the corresponding test pass unconditionally.
+template <typename VertexLabelMaskMap, typename VertexOrientMaskMap, typename OrientContainer>
+struct orient_filter_bv {
+  orient_filter_bv() { }
+  orient_filter_bv(VertexLabelMaskMap label,
+                   OrientContainer &lc,
+                   VertexOrientMaskMap omask,
+                   OrientContainer &oc,
+                   OrientContainer &roc)
+    : m_label(label),
+      matchlabels(lc),
+      m_orientmask(omask),
+      matchorient(oc),
+      matchrevorient(roc) { }
+  template <typename Vertextype>
+  bool operator()(const Vertextype& v) const {
+    assert(matchlabels!=0);
+    //Reject vertices that lack any of the required labels
+    if(!((m_label[v]&matchlabels)==matchlabels)){
+      return false;
+    }
+    OrientContainer orientmask = (m_orientmask[v]&matchlabels);
+    //!matchorient.any() means all - orients, which is disallowed
+    //Using this state to specify shortcircuit "ignore orient mask"
+    if(!matchorient.any() || orientmask==matchorient){
+      return true;
+    }
+    //Otherwise fall back to the reverse orientation pattern
+    return !matchrevorient.any() || orientmask==matchrevorient;
+  }
+  VertexLabelMaskMap m_label;
+  OrientContainer matchlabels;
+  VertexOrientMaskMap m_orientmask;
+  OrientContainer matchorient;
+  OrientContainer matchrevorient;
+template <typename TVertexLabelMap, typename TLabelSet>
+struct vertex_label_filter {
+ vertex_label_filter() { }
+ vertex_label_filter(TVertexLabelMap l,
+ TLabelSet s)
+ : labelmap(l),
+ matchlabels(s){}
+ template <typename Vertextype>
+ bool operator()(const Vertextype& v) const {
+ assert(matchlabels.size()!=0);
+ //assert((labelmap[v].find(*(matchlabels.begin())) != labelmap[v].end())
+ //==
+ //(std::includes(labelmap[v].begin(),labelmap[v].end(),matchlabels.begin(),matchlabels.end())));
+ if(labelmap[v].find(*(matchlabels.begin())) != labelmap[v].end()){
+ //std::includes(labelmap[v].begin(),labelmap[v].end(),matchlabels.begin(),matchlabels.end())){
+ return true;
+ }
+ else{
+ return false;
+ }
+ }
+ TVertexLabelMap labelmap;
+ TLabelSet matchlabels;
+//Filter edges and exclude all edges in EdgeSet
+//where EdgeSet is a pair of vertices
+//maskededges - set of (vertex,vertex) pairs naming the edges to hide
+//G           - graph used to resolve the endpoints of an edge
+//A default-constructed filter holds NULL pointers and must be assigned from
+//a fully constructed filter before it is applied
+template <typename TGraph>
+struct synbp_edge_filter {
+  typedef typename boost::graph_traits<TGraph>::edge_descriptor Edge;
+  typedef typename boost::graph_traits<TGraph>::vertex_descriptor Vertex;
+  //Initialize both pointers so a default-constructed filter never carries
+  //an indeterminate maskededges pointer
+  synbp_edge_filter()
+    :maskededges(NULL),G(NULL)
+  {}
+  synbp_edge_filter(EdgeSet *m, TGraph *gin)
+    :maskededges(m),G(gin)
+  {}
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    //TODO
+    //This additional check target,source should not be necessary
+    const bool masked =
+      maskededges->find(std::make_pair(source(e,*G),target(e,*G)))!=maskededges->end()
+      || maskededges->find(std::make_pair(target(e,*G),source(e,*G)))!=maskededges->end();
+    return !masked;
+  }
+  EdgeSet *maskededges;
+  TGraph *G;
+//Filter edges and exclude all incident edges to vertices in VertexSet
+//maskedvertices - vertices whose incident edges are hidden
+//G              - graph used to resolve the endpoints of an edge
+//A default-constructed filter holds NULL pointers and must be assigned from
+//a fully constructed filter before it is applied
+template <typename TGraph>
+struct LCB_edge_filter {
+  typedef typename boost::graph_traits<TGraph>::edge_descriptor Edge;
+  typedef typename boost::graph_traits<TGraph>::vertex_descriptor Vertex;
+  //Initialize both pointers so a default-constructed filter never carries
+  //an indeterminate maskedvertices pointer
+  LCB_edge_filter()
+    :maskedvertices(NULL),G(NULL)
+  {}
+  LCB_edge_filter(VertexSet *m, TGraph *gin)
+    :maskedvertices(m),G(gin)
+  {}
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    //An edge is hidden as soon as either endpoint is masked
+    const bool touchesMasked =
+      maskedvertices->find(source(e,*G))!=maskedvertices->end()
+      || maskedvertices->find(target(e,*G))!=maskedvertices->end();
+    return !touchesMasked;
+  }
+  VertexSet *maskedvertices;
+  TGraph *G;
+//Filter vertices and exclude all vertices in VertexSet
+//maskedvertices - vertices to hide
+//A default-constructed filter holds a NULL pointer and must be assigned from
+//a fully constructed filter before it is applied
+template <typename TGraph>
+struct LCB_vertex_filter {
+  //Initialize the pointer so a default-constructed filter never carries an
+  //indeterminate value
+  LCB_vertex_filter()
+    :maskedvertices(NULL)
+  {}
+  LCB_vertex_filter(VertexSet *m)
+    :maskedvertices(m)
+  {}
+  template <typename Vertextype>
+  bool operator()(const Vertextype& v) const {
+    //Keep the vertex only when it is absent from the masked set
+    return maskedvertices->find(v)==maskedvertices->end();
+  }
+  VertexSet *maskedvertices;
+//Create compound edge filter by chaining two edge filters
+//An edge passes only when both filter1 and filter2 accept it
+template <typename TFilter1, typename TFilter2>
+struct compound_edge_filter{
+  compound_edge_filter(){}
+  compound_edge_filter(TFilter1 &tf1, TFilter2 &tf2)
+    :filter1(tf1),filter2(tf2)
+  {}
+  template <typename Edgetype>
+  bool operator()(const Edgetype& e) const {
+    //Each filter is evaluated at most once per edge; filter2 is skipped
+    //when filter1 rejects. The former asserts only restated this
+    //expression and re-ran both filters several times per call.
+    return filter1(e) && filter2(e);
+  }
+  TFilter1 filter1;
+  TFilter2 filter2;
+//Create compound vertex filter by chaining two vertex filters
+//A vertex passes only when both filter1 and filter2 accept it
+template <typename TFilter1, typename TFilter2>
+struct compound_vertex_filter{
+  compound_vertex_filter(){}
+  compound_vertex_filter(TFilter1 &tf1, TFilter2 &tf2)
+    :filter1(tf1),filter2(tf2)
+  {}
+  template <typename Vertextype>
+  bool operator()(const Vertextype& v) const {
+    //Each filter is evaluated at most once per vertex; filter2 is skipped
+    //when filter1 rejects. The former asserts only restated this
+    //expression and re-ran both filters several times per call.
+    return filter1(v) && filter2(v);
+  }
+  TFilter1 filter1;
+  TFilter2 filter2;
+//Define LCB as a set of connected vertices
+typedef std::vector<Vertex> LCB;
+typedef std::map<std::pair<unsigned int,Label>,Interval > LCBLabelIntervalMap;
+//Graph types
+//Synteny graph contains connected subgraphs, each an LCB
+typedef filtered_graph<Graph,
+ synbp_edge_filter<Graph> > FilterSynGraph;
+typedef filtered_graph<Graph,
+ LCB_edge_filter<Graph>,
+ LCB_vertex_filter<Graph> > FilterLCBGraph;
+//Synteny graph that supports masking/filtering of LCBs
+typedef filtered_graph<Graph,
+ compound_edge_filter<LCB_edge_filter<Graph>, synbp_edge_filter<Graph> >,
+ LCB_vertex_filter<Graph> > LCBSynFilterGraph;
+//Less-than comparison for pair-like values: orders by .first and breaks
+//ties on .second
+template<typename TPos>
+class poscmp
+  poscmp()
+  {}
+  bool operator()( const TPos &e1, const TPos &e2 ) const {
+    //Different positions: the position alone decides
+    if(!(e1.first==e2.first)){
+      return e1.first < e2.first;
+    }
+    //Same position: fall back to the second member
+    return e1.second < e2.second;
+  }
+//Orders integer keys by the value stored for them in *lenmap, smallest
+//first. Both keys are expected to be present in the map.
+class lencmp
+  lencmp(std::map<int,int> & m)
+    :lenmap(&m)
+  {}
+  bool operator()( const int i1, const int i2) const {
+    std::map<int,int>::iterator len1 = lenmap->find(i1);
+    std::map<int,int>::iterator len2 = lenmap->find(i2);
+    assert(len1!=lenmap->end());
+    assert(len2!=lenmap->end());
+    return len1->second < len2->second;
+  }
+  std::map<int,int> *lenmap;
diff --git a/chaining/graph.h b/chaining/graph.h
new file mode 100644
index 0000000..205bd93
--- /dev/null
+++ b/chaining/graph.h
@@ -0,0 +1,578 @@
+//BGL requires crazy amount of code to define custom graph properties
+//Edge properties
+struct label_t {
+ typedef edge_property_tag kind;
+struct genome_t {
+ typedef edge_property_tag kind;
+//TODO f
+//Make edge_labelmask set based
+//on genomeidx rather than seqidx
+struct labelmask_t{
+ typedef edge_property_tag kind;
+struct visted_t{
+ typedef edge_property_tag kind;
+struct stringname_t{
+ typedef edge_property_tag kind;
+//Vertex properties
+struct orient_t{
+ typedef vertex_property_tag kind;
+struct orientmask_t{
+ typedef vertex_property_tag kind;
+struct vlabelmask_t{
+ typedef vertex_property_tag kind;
+struct chains_t{
+ typedef vertex_property_tag kind;
+struct relorder_t{
+ typedef vertex_property_tag kind;
+//vertex_name_t and edge_weight_t are already defined by default
+enum edge_label_t { edge_label = 10001 };
+enum edge_labelmask_t { edge_labelmask = 10004 };
+enum edge_visited_t { edge_visited = 10005 };
+enum edge_stringname_t { edge_stringname = 10011 };
+enum edge_category_t { edge_category = 10012 };
+enum vertex_orient_t { vertex_orient = 10006 };
+enum vertex_label_t { vertex_label = 10016 };
+enum vertex_genome_t { vertex_genome = 10015 };
+enum vertex_vlabelmask_t { vertex_vlabelmask = 10007 };
+enum vertex_orientmask_t { vertex_orientmask = 10008 };
+enum vertex_len_t { vertex_len = 10009 };
+enum vertex_relorder_t { vertex_relorder = 10010 };
+namespace boost {
+ BOOST_INSTALL_PROPERTY(edge, labelmask);
+ BOOST_INSTALL_PROPERTY(edge, visited);
+ BOOST_INSTALL_PROPERTY(edge, stringname);
+ BOOST_INSTALL_PROPERTY(edge, category);
+ BOOST_INSTALL_PROPERTY(vertex, relorder);
+ BOOST_INSTALL_PROPERTY(vertex, label);
+ BOOST_INSTALL_PROPERTY(vertex, vlabelmask);
+ BOOST_INSTALL_PROPERTY(vertex, orientmask);
+ BOOST_INSTALL_PROPERTY(vertex, orient);
+ BOOST_INSTALL_PROPERTY(vertex, genome);
+//End custom properties code
+//Label is an index that corresponds to a genome sequence for complete
+//genomes or a species for incomplete genomes. using a short limits
+//to 65,535 labels in an attempt to save some space
+//typedef unsigned short int Label;
+typedef unsigned int Label;
+//DNA sequence orientation -,+ == false,true
+typedef bool Orientation;
+//Label,distance map specifies the proximity between two
+//anchors/blocks along a sequence whose index is Label
+typedef std::map<Label,int> LabelMap;
+//typedef __gnu_cxx::hash_map<Label,int> LabelMap;
+//Set of labels
+//typedef std::set<Label> LabelSet;
+typedef boost::unordered_set<Label> LabelSet;
+//MAXGENOMES,BITMAX: Critical parameters that limit the number of genome labels
+//bitset is used for fast pattern matching of subsets
+//Increasing the size of this parameter will degrade performance
+//even for small numbers of sequences
+//TODO: Refactor. Compare or replace with use of STL includes,set_intersection,...
+//or use boost::dynamic_bitset
+typedef std::bitset<BITMAX> BitMask;
+typedef pair<Label,Orientation> OrientedLabel;
+//Hash an OrientedLabel by its label only; the orientation does not
+//contribute, so both orientations of a label hash to the same value
+struct orientedlabelhasher {
+ size_t operator()(const OrientedLabel& v) const { return hash<Label>()(v.first); }
+//faster for bitsets that can fit in ulong
+//otherwise will throw an overflow exception
+struct bitsethasher_ulong {
+ size_t operator()(const BitMask& v) const { return hash<long unsigned int>()(v.to_ulong()); }
+//general for bitsets of any size
+struct bitsethasher_string {
+ size_t operator()(const BitMask& v) const { return hash<std::string>()(v.to_string()); }
+//Less-than ordering for OrientedLabel: by label, then by orientation
+struct orientedlabelcmp {
+  bool operator()( const OrientedLabel& s1, const OrientedLabel& s2 ) const {
+    //Distinct labels: the label decides
+    if(!(s1.first == s2.first)){
+      return s1.first < s2.first;
+    }
+    //Same label: order by orientation
+    return s1.second < s2.second;
+  }
+//Equality predicate for OrientedLabel: true when both the label and the
+//orientation of the two pairs compare equal
+struct hasheq
+ bool operator()(const OrientedLabel& s1, const OrientedLabel& s2) const
+ {
+ return s1==s2;
+ }
+//Edges in the anchor graph are marked with a set of labels called an
+//OrientedLabelSet. The cardinality of this set is the number of
+//member sequences. Each sequence is labeled by an integer index (type
+//Label) and a boolean orientation (type Orientation) which are paired
+//to define an OrientedLabel.
+//TODO objects of this type are copied all over the place in the current impl
+//need to refactor to improve performance
+typedef std::set<OrientedLabel,orientedlabelcmp > OrientedLabelSet;
+//typedef boost::unordered_set<OrientedLabel> OrientedLabelSet;
+//typedef __gnu_cxx::hash_set<OrientedLabel,orientedlabelhasher> OrientedLabelSet;
+//Names of blocks/anchors and graph vertices: VertexName, VertexID
+//The program input includes a set of anchors across two or more
+//genomes. These anchors are also referred to as blocks. Each block
+//is stored as a vertex in a graph. The identifier provided for each
+//block in the user provided input is stored as the VertexName. It is
+//assumed that the VertexName is a unique identifier for a block. An
+//additional internal identifier for each block, VertexID, is used by
+//the graph library. For a given block, the VertexName and VertexID
+//may not be equivalent. It is also possible to change the typedef
+//for VertexName to std::string to support string names for blocks.
+typedef unsigned int VertexName;
+typedef unsigned int VertexID; //TODO, reconcile,replace with Graph::vertex_descriptor
+//Coordinates and Intervals
+typedef int Coordinate;
+typedef std::pair<Coordinate,Coordinate> Interval;
+//typedef BitMask SeqSet;
+typedef OrientedLabelSet SeqSet;
+//bit set per genome
+typedef LabelSet GenomeSet;
+//BGL requires properties in this nested format
+typedef property<vertex_genome_t, GenomeSet> VertexGenome;
+typedef property<vertex_relorder_t, int, VertexGenome> VertexRelOrder;
+typedef property<vertex_len_t, int, VertexRelOrder> VertexLen;
+typedef property<vertex_orientmask_t, BitMask, VertexLen> VertexOrientMask;
+typedef property<vertex_vlabelmask_t, BitMask, VertexOrientMask> VertexLabelMask;
+typedef property<vertex_orient_t, SeqSet, VertexLabelMask> VertexOrientedLabel;
+typedef property<vertex_label_t, LabelSet, VertexOrientedLabel> VertexLabel;
+typedef property<vertex_name_t, VertexName, VertexLabel> VertexProperties;
+//Define graph properties
+//Replace edge properties, such as EdgeMask and LabelMap, with index to save space
+//typedef property<edge_category_t, std::string, property<edge_weight_t,int> > EdgeCategory;
+//typedef property<edge_stringname_t, std::string> EdgeStringName;
+//typedef property<edge_visited_t, bool, EdgeCategory > EdgeVisited;
+//typedef property<edge_category_t, std::string > EdgeCategory;
+//BLACK - collinear and syntenic edge between segments, indegree==outdegree==1
+//RED - collinear edge that traverses a syntenic breakpoint, degree!=1
+//PURPLE - non-collinear edge indicating possible reversal, change in orientation
+//GREEN - non-collinear edge indicative of some other flux
+//BLUE - edge removed during a mincut to split an LCB
+//ORANGERED - previously masked edge that was reintroduced during a merge
+//YELLOW - new edge introduced during a mask short, merge adjacent iteration
+typedef property<edge_category_t, EdgeCats > EdgeCategory;
+#if defined(STORE_EDGE_LABELS)
+typedef property<edge_labelmask_t, GenomeSet, EdgeCategory > EdgeMask;
+typedef property<edge_label_t, LabelMap, EdgeMask > EdgeProperties;
+typedef property<edge_labelmask_t, BitMask,EdgeCategory > EdgeProperties;
+//Determine if edge,vertex storage is better as vecS,listS,multisetS
+typedef boost::adjacency_list <
+ vecS, // Store out-edges of each vertex in a std::vector
+ vecS, // Store vertex set in a std::vector
+ bidirectionalS, // Directed graph with access to in_edges as well as out_edges
+ VertexProperties, // vertex properties
+ EdgeProperties // edge properties
+ > Graph;
+typedef Graph::vertex_descriptor Vertex;
+typedef Graph::edge_descriptor Edge;
+typedef std::map<std::string,Label> NameLabelMap;
+typedef std::map<Label, std::string> LabelNameMap;
+typedef std::map<VertexName, Vertex> NameVertexMap;
+typedef std::map<Label,Label> SequenceGenomeMap;
+//typedef std::map<pair<Vertex,Label>,Interval > VertexLabelIntervalMap;
+typedef boost::unordered_map<std::pair<Vertex,Label>,Interval > VertexLabelIntervalMap;
+typedef boost::unordered_map<Vertex,Interval > VertexIntervalMap;
+typedef adjacency_list_traits < setS, vecS, directedS > LTraits;
+#ifdef DEBUG
+//Add edgecategory for printing
+typedef adjacency_list <
+ listS, //to allow for removal of edges
+ vecS,
+ directedS,
+ property < vertex_name_t, VertexName,
+ property < vertex_index_t, long,
+ property < vertex_color_t, boost::default_color_type,
+ property < vertex_distance_t, long,
+ property < vertex_predecessor_t, LTraits::edge_descriptor > > > > >,
+ property < edge_capacity_t, long,
+ property < edge_residual_capacity_t, long,
+ property < edge_reverse_t, LTraits::edge_descriptor,
+ EdgeCategory > > > > LGraph;
+typedef adjacency_list <
+ listS, //to allow for removal of edges
+ vecS,
+ directedS,
+ property < vertex_name_t, VertexName,
+ property < vertex_index_t, long,
+ property < vertex_color_t, boost::default_color_type,
+ property < vertex_distance_t, long,
+ property < vertex_predecessor_t, LTraits::edge_descriptor > > > > >,
+ property < edge_capacity_t, long,
+ property < edge_residual_capacity_t, long,
+ property < edge_reverse_t, LTraits::edge_descriptor > > > > LGraph;
+typedef LGraph::vertex_descriptor LVertex;
+typedef LGraph::edge_descriptor EVertex;
+typedef std::set<std::pair<Vertex,Vertex> > EdgeSet;
+//typedef boost::unordered_set<std::pair<Vertex,Vertex> > EdgeSet;
+typedef std::set<Vertex> VertexSet;
+//typedef boost::unordered_set<Vertex> VertexSet;
+struct iloc{
+ int first;
+ int second;
+ int blocknum;
+//Write the current local wall-clock time to stderr as "TIME h:m:s".
+//Nothing is printed when the time cannot be converted.
+void printtime(){
+  time_t now;
+  time(&now);
+  //A single conversion is enough; localtime() returns NULL on failure
+  struct tm *current = localtime(&now);
+  if(current==NULL){
+    return;
+  }
+  std::cerr << "TIME " << current->tm_hour << ":" << current->tm_min << ":" << current->tm_sec << std::endl;
+//Orders vertex pairs by the value stored for them in *distmap, largest
+//first. Both pairs are expected to be present in the map.
+class cutsdist
+  cutsdist(std::map<std::pair<LVertex,LVertex>,unsigned int > *m)
+    :distmap(m)
+  {}
+  bool operator()( const std::pair<LVertex,LVertex>& v1, const std::pair<LVertex,LVertex>& v2 ) const {
+    std::map<std::pair<LVertex,LVertex>,unsigned int >::iterator d1 = distmap->find(v1);
+    std::map<std::pair<LVertex,LVertex>,unsigned int >::iterator d2 = distmap->find(v2);
+    assert(d1!=distmap->end());
+    assert(d2!=distmap->end());
+    return d1->second > d2->second;
+  }
+  std::map<std::pair<LVertex,LVertex>,unsigned int > *distmap;
+//Orders vertices by the start (.first) of their interval on sequence
+//currentSeq, smallest first. Both vertices are expected to have an entry
+//for that sequence in *coords.
+class coordsorder
+  coordsorder(VertexLabelIntervalMap *c, Label s)
+    :coords(c),currentSeq(s)
+  {}
+  bool operator()( const LVertex& v1, const LVertex& v2 ) const {
+    VertexLabelIntervalMap::iterator c1 = coords->find(std::make_pair(v1,currentSeq));
+    VertexLabelIntervalMap::iterator c2 = coords->find(std::make_pair(v2,currentSeq));
+    assert(c1!=coords->end());
+    assert(c2!=coords->end());
+    //.first is fmin coordinate
+    return c1->second.first < c2->second.first;
+  }
+  VertexLabelIntervalMap *coords;
+  Label currentSeq;
+//Orders vertices by the start (.first) of their interval in *coords,
+//smallest first. Both vertices are expected to be present in the map.
+class coordsorder_vertex
+  coordsorder_vertex(VertexIntervalMap *c)
+    :coords(c)
+  {}
+  bool operator()( const LVertex& v1, const LVertex& v2 ) const {
+    VertexIntervalMap::iterator c1 = coords->find(v1);
+    VertexIntervalMap::iterator c2 = coords->find(v2);
+    assert(c1!=coords->end());
+    assert(c2!=coords->end());
+    //.first is fmin coordinate
+    return c1->second.first < c2->second.first;
+  }
+  VertexIntervalMap *coords;
+//Orders vertices by the value stored for them in *matchmap, largest first.
+//Both vertices are expected to be present in the map.
+class matchmaporder
+  matchmaporder(std::map<Vertex,int> *c)
+    :matchmap(c)
+  {}
+  bool operator()( const Vertex& v1, const Vertex& v2 ) const {
+    std::map<Vertex,int>::iterator m1 = matchmap->find(v1);
+    std::map<Vertex,int>::iterator m2 = matchmap->find(v2);
+    assert(m1!=matchmap->end());
+    assert(m2!=matchmap->end());
+    return m1->second > m2->second;
+  }
+  std::map<Vertex,int> *matchmap;
+//Write every member of the set to stderr as "label:orientation ", using
+//the numeric label
+void printlabel(OrientedLabelSet & i){
+  OrientedLabelSet::iterator cur = i.begin();
+  while(cur!=i.end()){
+    cerr << cur->first << ":" << cur->second << " ";
+    ++cur;
+  }
+//Write every member of the set to stderr as "name:orientation ", where the
+//name is looked up in index2sequence with operator[]
+void printlabel(OrientedLabelSet & i, LabelNameMap & index2sequence){
+  OrientedLabelSet::iterator cur = i.begin();
+  while(cur!=i.end()){
+    cerr << index2sequence[cur->first] << ":" << cur->second << " ";
+    ++cur;
+  }
+//For every edge and every oriented label of its source vertex that the
+//target vertex also carries, compute the gap between the two vertices on
+//that sequence and, when the gap is <= distance, set the bit of the
+//corresponding genome in the edge_labelmask property of the edge.
+//g               - graph whose edge_labelmask property is updated
+//distance        - largest gap for which the genome bit is set
+//coordinates     - (vertex,sequence) -> interval map, read with find() and
+//                  operator[]
+//sequence2genome - sequence label -> genome label; every sequence label
+//                  found on a source vertex must be present (asserted)
+template<class TGraph, class CoordMap, class SequenceGenomeMap>
+void setedgemasks(TGraph & g, int distance, CoordMap & coordinates, SequenceGenomeMap & sequence2genome){
+ typename property_map < TGraph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+#if defined(STORE_EDGE_LABELS)
+ typename property_map < TGraph, edge_label_t >::type elabelmap = get(edge_label,g);
+ typename property_map < TGraph, edge_labelmask_t >::type elabelmaskmap = get(edge_labelmask,g);
+ typename boost::graph_traits<TGraph>::edge_iterator eit,eit_end;
+ eit_end = edges(g).second;
+ for(eit = edges(g).first;eit!=eit_end;++eit){
+ Edge e = *eit;
+ Vertex sv = source(e,g);
+ Vertex tv = target(e,g);
+ //add extra edge labels
+ OrientedLabelSet::iterator it_end = orientmap[sv].end();
+ for(OrientedLabelSet::iterator it = orientmap[sv].begin();it != it_end;++it){
+ Label seqidx = it->first;
+ assert(sequence2genome.find(seqidx)!=sequence2genome.end());
+ Label genomeidx = sequence2genome[seqidx];
+ Orientation orient = it->second;
+ Orientation rorient = (orient) ? false : true;
+ //If sv--tv connected in label seqidx
+ //Need to check original and reverse orientations of tv to ensure a match to sv
+ //NOTE(review): the reverse-orientation lookup below is keyed by genomeidx
+ //while the entries of this set are read as sequence labels above - confirm
+ if(orientmap[tv].find(*it) != orientmap[tv].end()
+ || orientmap[tv].find(std::make_pair(genomeidx,rorient)) != orientmap[tv].end()){
+ //vertices share label
+ //If edge label does not include seqidx, then we need to update
+ //if(elabelmap[*eit].find(genomeidx)==elabelmap[*eit].end()){
+ if(!elabelmaskmap[*eit].test(genomeidx)){
+ assert(!elabelmaskmap[*eit].test(genomeidx));
+ int dist=-1;
+ //Need to check if coordinates for seqidx
+ if(coordinates.find(std::make_pair(source(*eit,g),seqidx))!=coordinates.end()
+ && coordinates.find(std::make_pair(target(*eit,g),seqidx))!=coordinates.end()){
+ //NOTE(review): the + and - orientation branches below compute dist with
+ //the same expressions
+ if(it->second==true){
+ if(coordinates[std::make_pair(target(*eit,g),seqidx)].first >= coordinates[std::make_pair(source(*eit,g),seqidx)].second){
+ dist = coordinates[std::make_pair(target(*eit,g),seqidx)].first - coordinates[std::make_pair(source(*eit,g),seqidx)].second;
+ }
+ else{
+ dist = coordinates[std::make_pair(source(*eit,g),seqidx)].first - coordinates[std::make_pair(target(*eit,g),seqidx)].second;
+ }
+ //assert(dist>=0);
+ //A negative gap is clamped to 0
+ if(dist<0){
+ //std::cout << source(e,g) << "-" << target(e,g) << " "
+ //<< coordinates[std::make_pair(source(*eit,g),seqidx)].first << "-" << coordinates[std::make_pair(source(*eit,g),seqidx)].second
+ //<< " "
+ //<< coordinates[std::make_pair(target(*eit,g),seqidx)].first << "-" << coordinates[std::make_pair(target(*eit,g),seqidx)].second
+ //<< " " << dist << ":COORDS " << std::endl;
+ dist=0;
+ }
+ }
+ else{
+ if(coordinates[std::make_pair(target(*eit,g),seqidx)].first >= coordinates[std::make_pair(source(*eit,g),seqidx)].second){
+ dist = coordinates[std::make_pair(target(*eit,g),seqidx)].first - coordinates[std::make_pair(source(*eit,g),seqidx)].second;
+ }
+ else{
+ dist = coordinates[std::make_pair(source(*eit,g),seqidx)].first - coordinates[std::make_pair(target(*eit,g),seqidx)].second;
+ }
+ if(dist<0){
+ //std::cout << source(e,g) << "-" << target(e,g) << " "
+ //<< coordinates[std::make_pair(source(*eit,g),seqidx)].first << "-" << coordinates[std::make_pair(source(*eit,g),seqidx)].second
+ //<< " "
+ //<< coordinates[std::make_pair(target(*eit,g),seqidx)].first << "-" << coordinates[std::make_pair(target(*eit,g),seqidx)].second
+ //<< " " << dist << ":COORDS" << std::endl;
+ dist=0;
+ }
+ //assert(dist>=0);
+ }
+ assert(dist>=0);
+ if(dist<=distance){
+#if defined(STORE_EDGE_LABELS)
+ elabelmap[*eit].insert(std::make_pair(genomeidx,dist));
+ elabelmaskmap[*eit].set(genomeidx,1);
+ }
+ else{
+ //elabelmaskmap[*eit].set(genomeidx,0);
+ }
+ }
+ else{
+ }
+ }
+ }
+ }
+#if defined(STORE_EDGE_LABELS)
+ //Consistency check: rebuild the mask from the stored labels and compare it
+ //with the edge_labelmask property
+ //NOTE(review): labelmap is not declared in this function; the label map
+ //declared above is named elabelmap - confirm
+ BitMask mask;
+ unsigned int numset=0;
+ for(LabelMap::iterator i1 = labelmap[*eit].begin();i1 != labelmap[*eit].end();++i1){
+ if(i1->second<=distance){
+ assert(i1->first>=0&&i1->first<BITMAX);
+ mask.set(i1->first,1);
+ numset++;
+ }
+ }
+ assert(numset==labelmap[*eit].size());
+ assert(mask.any());
+ if(mask!=elabelmaskmap[*eit]){
+ std::cerr << "Mask " << mask << std::endl;
+ std::cerr << "EdgeMask " << elabelmaskmap[*eit] << std::endl;
+ }
+ assert(mask==elabelmaskmap[*eit]);
+ //put(edge_labelmask,g,*eit,mask);
+ }
+//Sets the following graph vertex properties: vertex_vlabelmask, vertex_orientmask
+//g               - graph whose vertex_vlabelmask and vertex_orientmask
+//                  properties are updated from its vertex_orient property
+//sequence2genome - sequence label -> genome label; read with operator[], so
+//                  a missing sequence label is inserted with a default value
+template<class TGraph>
+void setvertexmasks(TGraph & g, SequenceGenomeMap & sequence2genome){
+ typename property_map < TGraph, vertex_orient_t >::type vmap = get(vertex_orient,g);
+ typename property_map < TGraph, vertex_vlabelmask_t >::type lmaskmap = get(vertex_vlabelmask,g);
+ typename property_map < TGraph, vertex_orientmask_t >::type omaskmap = get(vertex_orientmask,g);
+ typename boost::graph_traits<TGraph>::vertex_iterator vit_end = vertices(g).second;
+ for(typename boost::graph_traits<TGraph>::vertex_iterator
+ vit = vertices(g).first;vit!=vit_end;++vit){
+ Vertex v = *vit;
+ OrientedLabelSet::iterator o_end = vmap[v].end();
+ //One pass over the oriented labels of the vertex; bits are only ever set,
+ //never cleared
+ for(OrientedLabelSet::iterator o=vmap[v].begin();o!=o_end;++o){
+ Label seqidx = o->first;
+ Label genomeidx = sequence2genome[seqidx];
+ //The genome label is used as a bit index and must fit in the mask
+ assert(genomeidx>=0&&genomeidx<BITMAX);
+ //set lmask for each genome
+ lmaskmap[v].set(genomeidx,1);
+ //set omask for all + orientation
+ if(o->second == true){//+ orientation
+ omaskmap[v].set(genomeidx,1);
+ }
+ }
+#ifdef DEBUG
+ cerr << "VERTEX: " << get(vertex_name,g,v) << endl;
+ cerr << "VERTEXIDX: " << v << endl;
+ cerr << "LMASK :" << lmaskmap[v] << endl;
+ cerr << "OMASK :" << omaskmap[v] << endl;
+ }
+//Shift interval coordinates so that the sequences belonging to one genome
+//occupy consecutive ranges of a single coordinate space.
+//coordinates     - (vertex,sequence) -> interval map; replaced by the
+//                  shifted intervals on return
+//sequence2genome - sequence label -> genome label
+//Each sequence is offset by the sum of the largest end coordinates of the
+//sequences listed before it for the same genome. A sequence that does not
+//appear in sequence2genome keeps an offset of 0.
+void updateCoordinates(VertexLabelIntervalMap & coordinates,
+                       SequenceGenomeMap & sequence2genome){
+  //Largest end coordinate seen for each sequence
+  std::map<Label,Coordinate> maxcoord;
+  for(VertexLabelIntervalMap::iterator it=coordinates.begin();it!=coordinates.end();++it){
+    Label seqidx = it->first.second;
+    assert(it->second.first<=it->second.second);
+    Coordinate & currmax = maxcoord[seqidx];
+    if(it->second.second > currmax){
+      currmax = it->second.second;
+    }
+  }
+  //Group the sequences by genome
+  std::map<Label,std::vector<Label> > genome2sequence;
+  for(SequenceGenomeMap::iterator sit=sequence2genome.begin();sit!=sequence2genome.end();++sit){
+    genome2sequence[sit->second].push_back(sit->first);
+  }
+  //Running offset of each sequence within its genome, keyed by sequence label
+  std::map<Label,Coordinate> seqoffset;
+  for(std::map<Label,std::vector<Label> >::iterator git=genome2sequence.begin();git!=genome2sequence.end();++git){
+    Coordinate curroffset=0;
+    for(std::vector<Label>::iterator sit=git->second.begin();sit!=git->second.end();++sit){
+      Label seqidx = *sit;
+      seqoffset[seqidx] = curroffset;
+      curroffset = curroffset+maxcoord[seqidx];
+    }
+  }
+  VertexLabelIntervalMap newcoordinates;
+  for(VertexLabelIntervalMap::iterator it=coordinates.begin();it!=coordinates.end();++it){
+    Label seqidx = it->first.second;
+    Vertex v = it->first.first;
+    //seqoffset is filled per sequence above, so it is read with the sequence
+    //label; reading it with the genome label gave every sequence of a
+    //genome the same offset
+    Coordinate offset = seqoffset[seqidx];
+    Coordinate newbeg=offset+it->second.first;
+    Coordinate newend=offset+it->second.second;
+    //update coordinate map
+    std::pair<Vertex,Label> key = std::make_pair(v,seqidx);
+    //Build the value as an Interval so the coordinates are not converted
+    //through the pair<Vertex,Label> type on the way into the map
+    Interval value = std::make_pair(newbeg,newend);
+    newcoordinates[key]=value;
+    assert(newcoordinates[key].first==newbeg);
+    assert(newcoordinates[key].second==newend);
+  }
+  coordinates=newcoordinates;
diff --git a/chaining/lcbchecks.h b/chaining/lcbchecks.h
new file mode 100644
index 0000000..ca7a074
--- /dev/null
+++ b/chaining/lcbchecks.h
@@ -0,0 +1,905 @@
+//Return true if, for every label of s1 that is also found in s2, the gap between v1 and v2 on that sequence is <= maxgap; return false at the first gap > maxgap
+bool isLabelMaxGap(Vertex v1,
+ Vertex v2,
+ OrientedLabelSet & s1,
+ OrientedLabelSet & s2,
+ VertexLabelIntervalMap & coordinates, //keyed by (vertex,seqidx); the value is an interval pair (first,second)
+ unsigned int maxgap,
+ SequenceGenomeMap & sequence2genome){ //sequence2genome is only referenced in commented-out code below
+ OrientedLabelSet::iterator s1_it_end = s1.end();
+ for(OrientedLabelSet::iterator s1_it=s1.begin();s1_it!=s1_it_end;++s1_it){
+ Label seqidx = s1_it->first;
+ //Label genomeidx = sequence2genome[seqidx];
+ std::list<Vertex> sortedV;
+ VertexIntervalMap currcoords;
+ //only consider labels of s1 that s2.find() also locates in s2
+ OrientedLabelSet::iterator s2_it = s2.find(*s1_it);
+ if(s2_it != s2.end()){
+ assert(seqidx==s2_it->first);
+ sortedV.push_back(v1);
+ sortedV.push_back(v2);
+ currcoords.insert(std::make_pair(v1,
+ coordinates[std::make_pair(v1,seqidx)])); //NOTE(review): operator[] may default-insert a missing (vertex,seqidx) key -- confirm both vertices always have coordinates on shared sequences
+ currcoords.insert(std::make_pair(v2,
+ coordinates[std::make_pair(v2,seqidx)]));
+ //sort(sortedV.begin(),sortedV.end(),coordsorder(&coordinates,seqidx));
+ //sortedV.sort(coordsorder(&coordinates,seqidx));
+ sortedV.sort(coordsorder_vertex(&currcoords)); //comparator is defined elsewhere; presumably orders the two vertices by coordinate -- confirm
+ int prevcoord=-1; //-1 means "no previous vertex seen yet"
+ int currstart,currend;
+ std::list<Vertex>::iterator vit_end = sortedV.end();
+ for(std::list<Vertex>::iterator vit = sortedV.begin();vit!=vit_end;++vit){
+ assert(coordinates.find(std::make_pair(*vit,seqidx)) != coordinates.end());
+ assert(coordinates.find(std::make_pair(*vit,seqidx))->second == currcoords.find(*vit)->second);
+ tie(currstart,currend) = currcoords[*vit];
+ if(prevcoord==-1){
+ assert(vit==sortedV.begin());
+ }
+ else{
+ //assert(coordinates.find(std::make_pair(*(vit-1),seqidx)) != coordinates.end());
+ //assert(coordinates.find(std::make_pair(*(vit-1),seqidx))->second.second == prevcoord);
+ assert(currstart<currend);
+ //assert(currstart>=prevcoord);
+ int dist = currstart-prevcoord; //gap = start of this vertex minus end of the previous vertex in sorted order
+ if(dist>(int)maxgap){
+ return false;
+ }
+ }
+ prevcoord=currend; //remember the end coordinate for the next iteration
+ }
+ }
+ }
+ return true;
+//Return true if there is no change in orientation between labels s1
+//and s2. This implies a collinear relationship between s1 and s2,
+//meaning there are no rearrangements between the labels in s1 and
+//s2. Labels are comprised of a pair (seq,orient). There are 2
+//possibilities for a collinear relationship:
+// orient(s1 in S)==orient(s2 in S), or
+// revorient(s1 in S)==orient(s2 in S)
+//In other words, this function checks both the stored orientation of
+//s1 vs. s2 and rev(s1) vs s2 for all genomes shared between s1 and
+//s2 and returns true if either comparison is collinear (i.e., no change in orientation)
+inline bool isLabelCollinear(OrientedLabelSet & s1,
+ OrientedLabelSet & s2,
+ SequenceGenomeMap & sequence2genome){
+ //Implemented using bitmasks indexed by genome: the *lmask bits record which
+ //genomes are present, the *omask bits which of them have orient==true
+ BitMask s1lmask,s2lmask;
+ BitMask s1omask,s2omask,s1omaskrev;
+ OrientedLabelSet::iterator s1_it_end = s1.end();
+ for(OrientedLabelSet::iterator s1_it=s1.begin();s1_it!=s1_it_end;++s1_it){
+ Label seqidx = s1_it->first;
+ Label genomeidx = sequence2genome[seqidx]; //the bit position is the genome of the sequence, not the sequence itself
+ assert(genomeidx>=0&&genomeidx<BITMAX);
+ s1lmask.set(genomeidx,1);
+ //set omask for all + orientation
+ if(s1_it->second == true){
+ s1omask.set(genomeidx,1);
+ }
+ }
+ s1omaskrev=s1omask;
+ s1omaskrev.flip(); //reversed orientation of s1; it is masked with sharedlabels before being compared
+ OrientedLabelSet::iterator s2_it_end = s2.end();
+ for(OrientedLabelSet::iterator s2_it=s2.begin();s2_it!=s2_it_end;++s2_it){
+ Label seqidx = s2_it->first;
+ Label genomeidx = sequence2genome[seqidx];
+ assert(genomeidx>=0&&genomeidx<BITMAX);
+ s2lmask.set(genomeidx,1);
+ //set omask for all + orientation
+ if(s2_it->second == true){
+ s2omask.set(genomeidx,1);
+ }
+ }
+ //Shared labels obtain by intersection using bitwise AND
+ BitMask sharedlabels = (s1lmask&s2lmask);
+#if defined(V_DEBUG)
+ BitMask difflabels = (s1lmask^s2lmask)&(s1lmask|s2lmask);
+ cout << "S1MASK: " << s1lmask << endl;
+ cout << "S2MASK: " << s2lmask << endl;
+ cout << "SHARED: " << sharedlabels << endl;
+ cout << "DIFF: " << difflabels << endl;
+ cout << "O1MASK: " << s1omask << endl;
+ cout << "O2MASK: " << s2omask << endl;
+ cout << "O1MASKa: " << (s1omask&sharedlabels) << endl;
+ cout << "O2MASKa: " << (s2omask&sharedlabels) << endl;
+ if((s1omask&sharedlabels)==(s2omask&sharedlabels) || //same orientation on every shared genome, or ...
+ (s1omaskrev&sharedlabels)==(s2omask&sharedlabels)){ //... s1 reversed matches s2 on every shared genome
+ return true;
+ }
+ else{
+ return false;
+ }
+//Orientation test on precomputed masks. Restricted to the bits set in
+//sharedlabels, returns true when s1omask equals s2omask either as given or
+//after inverting every bit of s1omask (i.e. reversing s1); false otherwise.
+inline bool isLabelCollinearMask(BitMask & sharedlabels, BitMask & s1omask, BitMask & s2omask){
+  const BitMask s2shared = s2omask&sharedlabels;
+  //Same orientation on all shared labels
+  if((s1omask&sharedlabels)==s2shared){
+    return true;
+  }
+  //Otherwise compare against s1 with its orientation reversed
+  BitMask s1flipped = s1omask;
+  s1flipped.flip();
+  return (s1flipped&sharedlabels)==s2shared;
+}
+//Return true if no gap between consecutive vertices of the LCB exceeds maxgap on any checked sequence; also returns false as soon as two different sequences of the same genome are found in the LCB
+template<typename TGraph> inline
+bool checkLCBGaps(TGraph & g,
+ LCB & lcb,
+ std::vector<int> & ccvmap, //not referenced in this function
+ VertexLabelIntervalMap & coordinates, //keyed by (vertex,seqidx); the value is an interval pair (first,second)
+ unsigned int maxgap,
+ SequenceGenomeMap & sequence2genome){
+ bool shortcircuit=true; //never changed below, so the function returns at the first bad gap
+ bool badGap=false;
+ LabelSet seqidxSet;
+ std::set<Vertex> mmV;
+ std::map<Label,std::set<Label> > seqspergenomeMap; //tracks the number of seqs per genome in an LCB
+ std::map<Label,std::set<Label> >::iterator gpos;
+ bool inserted;
+ typename property_map <TGraph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+ LCB::iterator lit_end = lcb.end();
+ for(LCB::iterator lit=lcb.begin();lit!=lit_end;++lit){
+ Vertex v = *lit;
+ //OrientedLabelSet o1 = get(vertex_orient, g, v);
+ OrientedLabelSet::iterator oit_end = orientmap[v].end();
+ for(OrientedLabelSet::iterator oit=orientmap[v].begin();oit!=oit_end;++oit){ //all seqs on the vertex
+ Label seqidx = oit->first;
+ seqidxSet.insert(seqidx);
+ tie(gpos, inserted) = seqspergenomeMap.insert(std::make_pair(sequence2genome[seqidx],std::set<Label>()));
+ gpos->second.insert(seqidx);
+ if(gpos->second.size()>1){ //a second distinct sequence from the same genome fails the check outright
+ return false;
+ }
+ }
+ }
+ LabelSet::iterator it2_end = seqidxSet.end();
+ for(LabelSet::iterator it2 = seqidxSet.begin(); it2 != it2_end; ++it2){
+ Label seqidx = *it2;
+ //Label genomeidx = sequence2genome[seqidx];
+ std::list<Vertex> sortedV;
+ unsigned int spanlen=0;
+ //std::map<pair<Vertex,Label>,Interval > currcoords;
+ VertexIntervalMap currcoords;
+ LCB::iterator lit_end = lcb.end();
+ for(LCB::iterator lit=lcb.begin();lit!=lit_end;++lit){
+ Vertex v = *lit;
+ VertexLabelIntervalMap::iterator cit = coordinates.find(std::make_pair(v,seqidx));
+ if(cit!=coordinates.end()){ //only vertices that have coordinates on this sequence take part
+ assert(cit->second.first<cit->second.second);
+ sortedV.push_back(v);
+ currcoords.insert(std::make_pair(v,
+ cit->second));
+ spanlen = spanlen + get(vertex_len,g,v); //spanlen sums vertex_len over the vertices carrying this sequence
+ }
+ }
+#ifdef DEBUG
+ std::cerr << "checkLCBGaps seqidx: " << seqidx << " span length " << spanlen << " MINSPANLEN " << MINSPANLEN << std::endl;
+ if(spanlen >= MINSPANLEN){
+ sortedV.sort(coordsorder_vertex(&currcoords)); //comparator is defined elsewhere; presumably orders vertices by coordinate on this sequence -- confirm
+ int prevcoord=-1; //-1 means "no previous vertex seen yet"
+ int currstart,currend;
+ Vertex prevvertex=0;
+ std::list<Vertex>::iterator vit_end = sortedV.end();
+ for(std::list<Vertex>::iterator vit = sortedV.begin();vit!=vit_end;++vit){
+ assert(coordinates.find(std::make_pair(*vit,seqidx)) != coordinates.end());
+ assert(coordinates[std::make_pair(*vit,seqidx)] == currcoords[*vit]);
+ tie(currstart,currend) = currcoords[*vit];
+ if(prevcoord==-1){
+ assert(vit==sortedV.begin());
+ prevvertex=*vit;
+ }
+ else{
+ assert(coordinates.find(std::make_pair(prevvertex,seqidx)) != coordinates.end());
+ assert(coordinates.find(std::make_pair(prevvertex,seqidx))->second.second == prevcoord);
+ assert(currstart<currend);
+ int dist = currstart-prevcoord; //gap = start of this vertex minus end of the previous vertex in sorted order
+#ifdef DEBUG
+ std::cerr << "Checking dist:" << dist << " > " << maxgap
+ << " between V:" << get(vertex_name,g,prevvertex) << " " << currstart << "-" << currend
+ << " and V:" << get(vertex_name,g,*vit)
+ << " on seqidx:" << seqidx << std::endl;
+ if(dist>(int)maxgap){
+ badGap=true;
+#ifdef DEBUG
+ std::cerr << "Large gap dist:" << dist << " > " << maxgap
+ << " between V:" << get(vertex_name,g,prevvertex)
+ << " and V:" << get(vertex_name,g,*vit)
+ << " on seqidx:" << seqidx << std::endl;
+ if(shortcircuit){ //always taken because shortcircuit is fixed to true above
+ return !badGap;
+ }
+ else{
+ mmV.insert(*vit); //would collect every offending vertex if short-circuiting were disabled
+ }
+ //std::cerr << "NO SHORT CIRCUIT" << std::endl;
+ }
+ }
+ prevvertex=*vit;
+ prevcoord=currend;
+ }
+ }
+ else{
+ //std::cerr << "Skipping check of seqidx: " << seqidx << " span length " << spanlen << " MINSPANLEN " << MINSPANLEN << std::endl;
+ }
+#ifdef DEBUG
+ if(mmV.size()>0){
+ std::cerr << "Num bad vertices:" << mmV.size()
+ << " LCB size:" << lcb.size() << std::endl;
+ }
+ }
+ return !badGap;
+//Bitmask implementation: true if orientation masks lcbo1 and lcbo2 are collinear (as stored, or with one side flipped) over the labels set in both lcbl1 and lcbl2
+inline bool checkLCBOrient(BitMask & lcbl1,
+ BitMask & lcbl2,
+ BitMask & lcbo1,
+ BitMask & lcbo2){
+#ifdef DEBUG
+ std::cerr << "LCBOrient l1:" << lcbl1 << " l2:" << lcbl2 << std::endl
+ << "LCBOrient o1:" << lcbo1 << " o2:" << lcbo2 << std::endl;
+ BitMask sharedlabels = (lcbl1&lcbl2); //labels present on both sides
+#ifdef DEBUG
+ std::cerr << "LCBOrient shared:" << sharedlabels << std::endl;
+ //Make sure this function is working symmetrically, i.e. swapping the two orientation masks gives the same answer
+ assert(isLabelCollinearMask(sharedlabels,lcbo1,lcbo2)==isLabelCollinearMask(sharedlabels,lcbo2,lcbo1));
+ if(isLabelCollinearMask(sharedlabels,lcbo1,lcbo2)){
+#ifdef DEBUG
+ std::cerr << "Match" << std::endl;
+ return true;
+ }
+ else{
+#ifdef DEBUG
+ std::cerr << "MisMatch" << std::endl;
+ return false;
+ }
+//Compare the cached orientation of two LCBs. lcborientmap[idx] is read as a
+//pair (first=label mask, second=orientation mask); all four masks are
+//restricted to longlabelmask before delegating to the BitMask overload.
+template<typename TLCBOrientMap>
+bool checkLCBOrient(TLCBOrientMap & lcborientmap,
+                    typename TLCBOrientMap::key_type &lcbidx1,
+                    typename TLCBOrientMap::key_type &lcbidx2,
+                    BitMask &longlabelmask){
+  //Named locals are required: the callee takes its masks by non-const reference
+  BitMask labels1=lcborientmap[lcbidx1].first&longlabelmask;
+  BitMask orient1=lcborientmap[lcbidx1].second&longlabelmask;
+  BitMask labels2=lcborientmap[lcbidx2].first&longlabelmask;
+  BitMask orient2=lcborientmap[lcbidx2].second&longlabelmask;
+  return checkLCBOrient(labels1,labels2,orient1,orient2);
+}
+//Return true if no pair of vertices in the LCB has mismatched orientations.
+//Implemented with bitmasks: lcbmask1 and lcbmask2 are restricted to
+//longlabelmask, and the orientation masks of every pair of vertices are
+//compared through the BitMask overload of checkLCBOrient.
+//Performance note: profiling shows this check is the primary performance
+//bottleneck. Caching the orientation of the LCB once (instead of checking
+//all pairs for consistency) was tried first but, surprisingly, did not
+//appear to improve performance.
+//  g              graph providing the vertex_orientmask property
+//  lcb            vertices to check; must support random access (it+1)
+//  lcbmask1/2     label masks handed to the pairwise comparison
+//  longlabelmask  only labels set in this mask take part in the comparison
+template<typename TGraph> inline
+bool checkLCBOrient(TGraph & g,
+                    LCB & lcb,
+                    BitMask &lcbmask1,
+                    BitMask &lcbmask2,
+                    BitMask &longlabelmask){
+  typename property_map < TGraph, vertex_orientmask_t >::type orientmaskmap = get(vertex_orientmask,g);
+  BitMask l1 = lcbmask1&longlabelmask;
+  BitMask l2 = lcbmask2&longlabelmask;
+  LCB::iterator it_end = lcb.end();
+  for(LCB::iterator it = lcb.begin();it!=it_end;++it){
+    //The BitMask overload takes non-const references, so the masked
+    //orientations must be named objects rather than temporaries
+    BitMask o1 = orientmaskmap[*it]&longlabelmask;
+    for(LCB::iterator it2 = it+1;it2!=it_end;++it2){
+      BitMask o2 = orientmaskmap[*it2]&longlabelmask;
+      if(!checkLCBOrient(l1,l2,o1,o2)){
+#ifdef DEBUG
+        std::cerr << "SAM Mismatch " << *it << "-" << *it2 << std::endl;
+#endif
+        return false;
+      }
+    }
+  }
+  return true;
+}
+//Orientation check over all labels: builds a mask with every bit set and
+//forwards to the overload that takes an explicit longlabelmask.
+template<typename TGraph> inline
+bool checkLCBOrient(TGraph & g,
+                    LCB & lcb,
+                    SequenceGenomeMap & sequence2genome){
+  BitMask alllabels;
+  alllabels.set();
+  return checkLCBOrient(g,lcb,alllabels,sequence2genome);
+}
+template<typename TGraph> inline
+bool checkLCBOrient(TGraph & g, //Pairwise check: true if every pair of vertices in lcb is collinear on the labels they share within longlabelmask
+ LCB & lcb,
+ BitMask & longlabelmask, //only labels set in this mask take part in the comparison
+ SequenceGenomeMap & sequence2genome){ //not referenced in this function
+ LCB::iterator it,it2,it_end,it2_end;
+ //Use orientmap so that we can pass by reference using lvalue []
+ typename property_map < TGraph, vertex_orient_t >::type orientmap = get(vertex_orient,g); //not used below
+ typename property_map < TGraph, vertex_orientmask_t >::type orientmaskmap = get(vertex_orientmask,g);
+ typename property_map < TGraph, vertex_vlabelmask_t >::type labelmaskmap = get(vertex_vlabelmask,g);
+ typename property_map < TGraph, vertex_label_t >::type labelset = get(vertex_label,g); //not used below
+ typename property_map < TGraph, vertex_len_t >::type lenmap = get(vertex_len,g); //not used below
+ it_end = lcb.end();
+ it2_end = lcb.end();
+ for(it = lcb.begin();it!=it_end;++it){
+ BitMask o1 = orientmaskmap[*it];
+ BitMask l1 = labelmaskmap[*it];
+ for(it2 = it+1;it2!=it2_end;++it2){ //each unordered pair is visited once
+ BitMask sharedlabels = ((l1&labelmaskmap[*it2])&longlabelmask); //labels on both vertices, restricted to longlabelmask
+ if(isLabelCollinearMask(sharedlabels,o1,orientmaskmap[*it2])){
+ }
+ else{
+#ifdef DEBUG
+ std::cerr << "SAM Mismatch " << *it << "-" << *it2 << std::endl;
+ return false; //first non-collinear pair decides the result
+ }
+ }
+ }
+ return true;
+template<typename TGraph, typename TLCBOrientMap> inline
+bool checkLCBOrient_old(TGraph & g, //Older pairwise orientation check: true if every pair of vertices in lcb is collinear on their shared labels
+ LCB & lcb,
+ int lcbidx, //only referenced in a commented-out assert below
+ TLCBOrientMap & lcborientmap){ //only referenced in a commented-out assert below
+ bool shortcircuit=true; //never changed below, so the function returns at the first mismatch
+ //Check for label orientation mismatches
+ bool mmOrient=false;
+ LCB::iterator it,it2,it_end,it2_end;
+ EdgeSet mmV; //mismatching vertex pairs, reported by the DEBUG code further down
+ //Use orientmap so that we can pass by reference using lvalue []
+ typename property_map < TGraph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+ typename property_map < TGraph, vertex_orientmask_t >::type orientmaskmap = get(vertex_orientmask,g);
+ typename property_map < TGraph, vertex_vlabelmask_t >::type labelmaskmap = get(vertex_vlabelmask,g);
+ it_end = lcb.end();
+ it2_end = lcb.end();
+ for(it = lcb.begin();it!=it_end;++it){
+ BitMask o1 = orientmaskmap[*it];
+ BitMask l1 = labelmaskmap[*it];
+ for(it2 = it+1;it2!=it2_end;++it2){
+ BitMask sharedlabels = (l1&labelmaskmap[*it2]); //labels present on both vertices
+#ifdef DEBUG
+ std::cerr << "LCBOrientS l1:" << l1 << " l2:" << labelmaskmap[*it2] << std::endl
+ << "LCBOrientS o1:" << o1 << " o2:" << orientmaskmap[*it2] << std::endl;
+ std::cerr << "LCBOrientS shared:" << sharedlabels << std::endl;
+ //assert(checkLCBOrient(lcborientmap[lcbidx].second,lcborientmap[lcbidx].second,o1,orientmaskmap[*it2])
+ //== isLabelCollinearMask(sharedlabels,o1,orientmaskmap[*it2]));
+ if(isLabelCollinearMask(sharedlabels,o1,orientmaskmap[*it2])){
+#ifdef DEBUG
+ std::cerr << "Match" << std::endl;
+ }
+ else{
+#ifdef DEBUG
+ std::cerr << "MisMatch" << std::endl;
+ mmV.insert(std::make_pair(*it,*it2));
+ mmOrient=true;
+ if(shortcircuit){
+ return !mmOrient;
+ }
+ }
+ }
+ }
+#ifdef DEBUG
+ for(EdgeSet::iterator it = mmV.begin();it!=mmV.end();++it){
+ OrientedLabelSet l1 = get(vertex_orient, g, it->first);
+ OrientedLabelSet l2 = get(vertex_orient, g, it->second);
+ Edge e1;
+ bool found;
+ tie(e1,found) = edge(it->first,it->second,g);
+ if(found){
+ //assert(get(edge_category,g,e1)==PURPLE);
+ }
+ tie(e1,found) = edge(it->first,it->second,g); //NOTE(review): identical to the lookup above; possibly meant to query (second,first) -- confirm
+ if(found){
+ //assert(get(edge_category,g,e1)==PURPLE);
+ }
+ std::cerr << "Orient Mismatch V1:" << get(vertex_name,g,it->first) << " L1:";
+ printlabel(l1);
+ std::cerr << " V2:" << get(vertex_name,g,it->second) << " L2:";
+ printlabel(l2);
+ std::cerr << std::endl;
+ }
+ return !mmOrient;
+//True only when the three labels are all equal, i.e. v1==v2 and v1==e.
+template<typename TLabel>
+bool sameLabel(TLabel v1, TLabel v2, TLabel e){
+  return (v1==v2) && (v1==e);
+}
+//Orientation masks o1 and o2 agree when they are equal, or when o2 equals
+//o1 with every bit inverted and then restricted to the label mask v1.
+//The AND with v1 is needed because flip() inverts all bits of the mask,
+//not only those of the labels in v1.
+template<typename TLabel>
+bool sameOrient(TLabel o1, TLabel o2, TLabel v1){
+  if(o1==o2){
+    return true;
+  }
+  BitMask reversed=o1;
+  reversed.flip();
+  reversed=reversed&v1;
+  return o2==reversed;
+}
+//First determine the orientation of the sequences in the LCB
+//Then orient each vertex in the LCB
+// BitMask s1omaskrev = s1omask;
+// s1omaskrev.flip();
+// if((s1omask&sharedlabels)==(s2omask&sharedlabels) ||
+// (s1omaskrev&sharedlabels)==(s2omask&sharedlabels)){
+// return true;
+// }
+// else{
+// return false;
+//Need to assign LCBs a true orientation for each seq
+//This is not as simple as always setting a bit for every + seq
+//because blocks within an LCB can be reversed
+//Need to mark each block and whether it is flipped wrt to the LCB orientation
+//Attempt to mark this flipped state when adding the block to the LCB2
+//Convenience overload: computes the label mask and orientation mask of one
+//LCB and returns them as a pair (first=label mask, second=orientation mask).
+//badV is passed through to the six-argument overload, which fills it.
+template<typename TGraph>
+inline std::pair<BitMask,BitMask> setLCBOrient(TGraph & g,
+                                               LCB & lcb,
+                                               std::vector<Vertex> & badV,
+                                               SequenceGenomeMap & sequence2genome){
+  //Both masks start out empty, which the six-argument overload asserts
+  std::pair<BitMask,BitMask> masks;
+  setLCBOrient(g,masks.first,masks.second,lcb,badV,sequence2genome);
+  return masks;
+}
+template<typename TGraph>
+inline void setLCBOrient(TGraph & g, //Fills labelmask with the union of the vertex label masks of lcb and builds orientmask vertex by vertex; vertices that fit neither as stored nor reversed go to badV
+ BitMask & labelmask, //out: must be empty on entry (asserted below)
+ BitMask & orientmask, //out: must be empty on entry (asserted below)
+ LCB & lcb,
+ std::vector<Vertex> & badV, //out: cleared first, then receives the misoriented vertices
+ SequenceGenomeMap & sequence2genome){ //not referenced in this function
+#ifdef DEBUG
+ std::cerr << "Setting orientation for lcb with " << lcb.size() << " vertices" << std::endl;
+ badV.clear();
+ assert(!orientmask.any());
+ assert(!labelmask.any());
+ typename property_map < TGraph, vertex_vlabelmask_t >::type lmaskmap = get(vertex_vlabelmask,g);
+ typename property_map < TGraph, vertex_orientmask_t >::type omaskmap = get(vertex_orientmask,g);
+ typename property_map < TGraph, vertex_len_t >::type lenmap = get(vertex_len,g);
+ typename property_map < TGraph, vertex_label_t >::type labelset = get(vertex_label,g); //not used below
+ //Save label mask and determine evaluation order for orientmask
+ std::list<Vertex> sortedV;
+ std::map<Label,int> spans; //not used below
+ std::map<Label,int>::iterator sit; //not used below
+ //bool found;
+ std::map<Vertex,int> vorientmatchcount; //vertex -> accumulated weight of compatible comparisons (see loop below)
+ std::map<Vertex,int>::iterator mit;
+ LCB::iterator it,it_end,it2,it2_end;
+ it_end = lcb.end();
+ it2_end = lcb.end();
+ BitMask omask;
+ BitMask lmask;
+ BitMask sharedlabels;
+ //Foreach vertex in the LCB, sum the number of bases with compatible orientation labeling
+ for(it = lcb.begin();it!=it_end;++it){
+ assert(lmaskmap[*it].any());//at least one seq > MINSPANLEN
+ labelmask = labelmask|lmaskmap[*it];
+ sortedV.push_back(*it);
+ //Check num bp from other vertices compatible within the LCBs
+ omask = omaskmap[*it];
+ lmask = lmaskmap[*it];
+ int len = lenmap[*it];
+ mit = vorientmatchcount.find(*it);
+ for(it2 = lcb.begin();it2!=it2_end;++it2){
+ if(it2!=it){
+ sharedlabels = (lmask&lmaskmap[*it2]);
+ if(isLabelCollinearMask(sharedlabels,omask,omaskmap[*it2])){
+ //Update count of bp
+ if(mit==vorientmatchcount.end()){
+ vorientmatchcount[*it]=len; //NOTE(review): the weight added is this vertex's own length, once per collinear partner -- confirm that is the intended measure
+ mit = vorientmatchcount.find(*it);
+ }
+ else{
+ mit->second = mit->second+len;
+ }
+ }
+ }
+ }
+ }
+ //Determine orientation compatible with most bp in the LCB
+ //
+ //Sort by vorientmatchcount so that we consider most frequent orientations first
+ //before alternative orientations
+ //TODO, confirm this is actually working
+ sortedV.sort(matchmaporder(&vorientmatchcount));
+#ifdef DEBUG
+ std::cerr << "LCB labelmask :" << labelmask << std::endl;
+ BitMask currlabelmask; //labels of the vertices processed so far; empty before the first vertex
+ //Realize speedup if < 64 genomes
+#if BITMAX > 64
+ boost::unordered_set<BitMask,bitsethasher_string> altOrients;
+ boost::unordered_set<BitMask,bitsethasher_ulong> altOrients;
+ std::set<Vertex> badVS;
+ std::list<Vertex>::iterator svit,svit_end;
+ svit_end = sortedV.end();
+ for(svit=sortedV.begin();svit!=svit_end;++svit){
+ if((omaskmap[*svit]&currlabelmask) //Vertex orients for seqs in LCB
+ ==(orientmask&lmaskmap[*svit])){ //LCB orients on current vertex
+ orientmask = orientmask|omaskmap[*svit]; //stored orientation agrees (trivially so for the first vertex, where both sides are empty): merge it
+ }
+ else{
+ BitMask s1omaskrev = omaskmap[*svit];
+ s1omaskrev.flip();
+ s1omaskrev = s1omaskrev&lmaskmap[*svit]; //reversed orientation, restricted to the labels on this vertex
+ if((s1omaskrev&currlabelmask)
+ ==(orientmask&lmaskmap[*svit])){
+ orientmask = orientmask|s1omaskrev; //reversed orientation agrees: merge the reversed mask
+ }
+ else{
+ //Alternative orientation
+ //Sorting on bp above ensures this alternative is congruent with
+ //fewer bp than at least one other alternative orientation for the LCB
+ badV.push_back(*svit);
+#ifdef DEBUG
+ badVS.insert(*svit);
+ altOrients.insert((omaskmap[*svit]&labelmask));
+ //(1)Simple breakpoints
+ //A simple breakpoint occurs when a pair of vertices in the block have incompatible orientations
+ //For example, consider an LCB with vertices 1(a+b+) and 2(a+b-)
+ //(2)Compound orientation breakpoints
+ //A compound breakpoint occurs when a combination of
+ //vertices has an orientation that is incompatible with
+ //other vertices in a block
+ //For example, consider an LCB with vertices 1(a+b+,d-) 2(b+c-) 3(a+b+c-) 4(c-,d+)
+ //All pairwise comparisons are congruent in orientation if reversals are allowed
+ //But not all multiway combinations are congruent because
+ // 1+2 requires (a+b+c-d-)
+ //while 3+4 requires (a+b+c-d+)
+ //producing an incompatibility in the orientations of d
+ //when considered in the context of the other
+ //vertices in the block
+ //Save a map v->totallcbsize congruent with orientmask(v)
+ //Sort by totallcbsize in decreasing order
+ //Build LCB mask in sorted order
+ //This ensures orientations congruent with the most bp are considered first
+ //Each alternative orientation encountered is a compound breakpoint
+ //Count length/number of vertices congruent with this alternative orientation
+#ifdef DEBUG
+ std::cerr << "Mismatched orient for vertex " << get(vertex_name,g,*svit) << " with vertex/lcb/shared labels " <<std::endl
+ << " vertex_l :" << lmaskmap[*svit] << std::endl
+ << " lcb_l :" << currlabelmask << std::endl
+ << " shared_l :" << (lmaskmap[*svit]&currlabelmask) << std::endl
+ << " lcb_o :" << orientmask << std::endl
+ << " vertex_o :" << (omaskmap[*svit]&currlabelmask) << std::endl
+ << " vertex_ro :" << (s1omaskrev&currlabelmask) << std::endl
+ << " shared_o :" << (orientmask&lmaskmap[*svit]) << std::endl;
+ //assert(false);
+ std::cerr << "Num bp matching orient len:" << vorientmatchcount[*svit] << std::endl;
+ }
+ }
+ //Update label mask
+ currlabelmask = currlabelmask|lmaskmap[*svit];
+ }
+#ifdef DEBUG
+ std::cerr << "LCB orientmask:" << orientmask << std::endl;
+ //Check that orientmask is only set on member sequences in the LCB
+ assert((orientmask&labelmask)==orientmask);
+ /*
+ if(0 && DEBUG){
+ for(it = lcb.begin();it!=it_end;++it){
+ //Label for vertex contains strict subset of seqs as LCB
+ assert(((lmaskmap[*it]&labelmask))==lmaskmap[*it]);
+ assert(((omaskmap[*it]&labelmask))==((omaskmap[*it]&lmaskmap[*it]&labelmask)));
+ assert(((omaskmap[*it]&labelmask))==((omaskmap[*it]&lmaskmap[*it])));
+ //Check that vertex omask matches lcb omask for member sequences on the vertex
+ if((omaskmap[*it]&labelmask)==(orientmask&lmaskmap[*it])){
+ if(badVS.find(*it)!=badVS.end()){
+ assert(false);
+ }
+ }
+ else{
+ BitMask s1omaskrev = omaskmap[*it];
+ s1omaskrev.flip();
+ s1omaskrev = s1omaskrev&lmaskmap[*it];
+ if((s1omaskrev&labelmask)==(orientmask&lmaskmap[*it])){
+ if(badVS.find(*it)!=badVS.end()){
+ assert(false);
+ }
+ }
+ else{
+ if(badVS.find(*it)==badVS.end()){
+ std::cerr << "Vertex " << get(vertex_name,g,*it) << " does not match LCB orientation " << std::endl
+ << "vertex_o: " << (omaskmap[*it]&labelmask) << std::endl
+ << "vertex_ro:" << (s1omaskrev&labelmask) << std::endl
+ << "lcb_o: " << (orientmask&lmaskmap[*it]) << std::endl;
+ assert(false);
+ }
+ }
+ }
+ }
+ }
+ */
+ if(badV.size()>0){
+#ifdef DEBUG
+ std::cerr << "LCB has " << badV.size() << "/" << lcb.size()
+ << " misoriented vertices. Max num alternative orients " << altOrients.size() << std::endl;
+ }
+#ifdef DEBUG
+ std::cerr << "Setting orientation done" << std::endl;
+template<typename TGraph, typename TLCBMap, typename TComponentMap>
+void setLCBOrient(TGraph & g, TLCBMap & lcborientmap, TComponentMap & componentMap, SequenceGenomeMap & sequence2genome){ //Rebuilds lcborientmap: entry i holds the (label mask, orientation mask) pair of the i-th LCB in componentMap
+#ifdef DEBUG
+ std::cerr <<"Resetting lcborientmap" << std::endl;
+ lcborientmap.clear();
+ typename property_map < TGraph, vertex_vlabelmask_t >::type lmaskmap = get(vertex_vlabelmask,g); //not used below
+ typename property_map < TGraph, vertex_orientmask_t >::type omaskmap = get(vertex_orientmask,g); //not used below
+ int lcbidx=0; //keys are assigned in iteration order of componentMap
+ std::vector<Vertex> badV; //scratch list passed to the per-LCB overload; its contents are not read here
+ typename TComponentMap::iterator lit,lit_end;
+ lit_end=componentMap.end();
+ for(lit = componentMap.begin();lit!=lit_end;++lit){
+ lcborientmap[lcbidx] = setLCBOrient(g,*lit,badV,sequence2genome);
+ //At least one genome label must be set
+ assert(lcborientmap[lcbidx].first.any());
+ lcbidx++;
+ }
+//Build a mask of the genomes that are sufficiently covered by an LCB.
+//For every vertex of lcb, lenmap[vertex] is added to the span of each genome
+//that has a sequence label on that vertex (labels come from labelset[vertex]
+//and are mapped to genomes through sequence2genome). A genome's bit is set
+//in the returned mask only if its total span is strictly greater than
+//MINSPANLEN.
+//  lcb             container of vertices, iterated with TLCB's own iterator
+//  lenmap          vertex -> length, read with operator[]
+//  labelset        vertex -> LabelSet of sequence labels, read with operator[]
+//  sequence2genome sequence label -> genome label; every label is asserted
+//                  to be present
+//  MINSPANLEN      threshold; with the default 0 every genome with a
+//                  positive span is kept
+template<typename TLCB,
+         typename TLenMap,
+         typename TLabelMap>
+BitMask setSpanMask(TLCB & lcb,
+                    TLenMap & lenmap,
+                    TLabelMap & labelset,
+                    SequenceGenomeMap & sequence2genome,
+                    int MINSPANLEN=0){
+  std::map<Label,int> spans;
+  //Use the container's own iterator type so the template is not tied to LCB
+  typename TLCB::iterator it_end = lcb.end();
+  for(typename TLCB::iterator it = lcb.begin();it!=it_end;++it){
+    const int len = lenmap[*it];
+    //Bind once so that begin() and end() refer to the same label set
+    const LabelSet & labels = labelset[*it];
+    for(LabelSet::const_iterator lit = labels.begin();lit != labels.end();++lit){
+      assert(sequence2genome.find(*lit)!=sequence2genome.end());
+      //operator[] value-initialises the span of a new genome to 0
+      spans[sequence2genome[*lit]] += len;
+    }
+  }
+  //Only consider genomes with span > MINSPANLEN
+  BitMask longlabelmask;
+  for(std::map<Label,int>::iterator sit = spans.begin();sit!=spans.end();++sit){
+    if(sit->second>MINSPANLEN){
+      longlabelmask.set(sit->first);
+    }
+  }
+  return longlabelmask;
+}
+//LCB reporting functions
+unsigned int get_LCB_length(LCB & lcb, //Returns the largest (max-min) coordinate span over the sequences of the LCB whose genome span is >= minlength, or 0 if there is none; also records the per-sequence span in lcbcoords
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type orientmap, //passed by value
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lenmap, //passed by value
+ VertexLabelIntervalMap & coordinates, //keyed by (vertex,seqidx); the value is an interval pair (first,second)
+ LCBLabelIntervalMap & lcbcoords, //out: entry (lcbidx,seqidx) is replaced with (min,max) for each reported sequence
+ unsigned int lcbidx,
+ int & totallen, //in/out: interval lengths are added to the caller's value; it is not reset here
+ SequenceGenomeMap & sequence2genome,
+ int minlength=0
+ ){
+ std::map<unsigned int,unsigned int> mincoordsbyseq;
+ std::map<unsigned int,unsigned int> maxcoordsbyseq;
+ std::map<Label,int> spans; //genome -> sum of lenmap over the vertex labels belonging to that genome
+ OrientedLabelSet alllabel;
+ LCB::iterator it;
+ for(it = lcb.begin();it!=lcb.end();++it){ //first pass: accumulate the span of every genome
+ OrientedLabelSet::iterator it2_end = orientmap[*it].end();
+ for(OrientedLabelSet::iterator it2 = orientmap[*it].begin();it2!=it2_end;++it2){
+ assert(it2->first>=0);
+ Label seqidx = it2->first;
+ Label genomeidx = sequence2genome[seqidx];
+ if(spans.find(genomeidx)==spans.end()){
+ spans.insert(std::make_pair(genomeidx,lenmap[*it]));
+ }
+ else{
+ spans[genomeidx] = spans[genomeidx] + lenmap[*it];
+ }
+ }
+ }
+ for(it = lcb.begin();it!=lcb.end();++it){ //second pass: min/max coordinates of each sequence whose genome span is long enough
+ OrientedLabelSet::iterator it2_end = orientmap[*it].end();
+ for(OrientedLabelSet::iterator it2 = orientmap[*it].begin();it2!=it2_end;++it2){
+ assert(it2->first>=0);
+ Label seqidx = it2->first;
+ Label genomeidx = sequence2genome[seqidx];
+ assert(spans.find(genomeidx)!=spans.end());
+ if(spans[genomeidx] >= minlength){
+ if(mincoordsbyseq.find(seqidx)==mincoordsbyseq.end()){
+ mincoordsbyseq[seqidx] = std::numeric_limits<unsigned int>::max(); //sentinel: any real coordinate is smaller
+ }
+ if(maxcoordsbyseq.find(seqidx)==maxcoordsbyseq.end()){
+ maxcoordsbyseq[seqidx] = std::numeric_limits<unsigned int>::min(); //sentinel: 0 for unsigned int
+ }
+ alllabel.insert(OrientedLabel(seqidx,true)); //orientation flag is fixed to true; only the seqidx part is read back below
+ VertexLabelIntervalMap::iterator vit = coordinates.find(std::make_pair(*it,seqidx));
+ assert(vit!=coordinates.end());
+ assert(vit->second.second>vit->second.first);
+ totallen+=abs(vit->second.second-vit->second.first);
+ mincoordsbyseq[seqidx] = ((unsigned int)vit->second.first<mincoordsbyseq[seqidx]) ? vit->second.first : mincoordsbyseq[seqidx];
+ maxcoordsbyseq[seqidx] = ((unsigned int)vit->second.second>maxcoordsbyseq[seqidx]) ? vit->second.second : maxcoordsbyseq[seqidx];
+ assert(mincoordsbyseq[seqidx] != std::numeric_limits<unsigned int>::max());
+ assert(maxcoordsbyseq[seqidx] != std::numeric_limits<unsigned int>::min());
+ //std::cerr << "seq:" << seqidx << " " << *it << " " << maxcoordsbyseq[seqidx] << " " << mincoordsbyseq[seqidx] << std::endl;
+ }
+ }
+ }
+ if(alllabel.size()==0){ //no sequence passed the minlength filter
+ return 0;
+ }
+ //Save spanning coords for lcb in lcbcoords
+ OrientedLabelSet::iterator it2_end = alllabel.end();
+ for(OrientedLabelSet::iterator it2 = alllabel.begin();it2!=it2_end;++it2){
+ Label seqidx = it2->first;
+ Label genomeidx = sequence2genome[seqidx];
+ assert(spans[genomeidx] >= minlength);
+ LCBLabelIntervalMap::iterator it=lcbcoords.find(std::make_pair(lcbidx,seqidx));
+ if(it!=lcbcoords.end()){
+ lcbcoords.erase(it); //drop the existing entry so the insert below stores the new span
+ }
+ assert(mincoordsbyseq[seqidx]<maxcoordsbyseq[seqidx]);
+ lcbcoords.insert(std::make_pair(std::make_pair(lcbidx,seqidx),std::make_pair(mincoordsbyseq[seqidx],maxcoordsbyseq[seqidx])));
+ }
+ //Maximum span of all seqs
+ unsigned int maxminlen = 0;
+ OrientedLabelSet::iterator it_end = alllabel.end();
+ for(OrientedLabelSet::iterator it = alllabel.begin();it!=it_end;++it){
+ Label seqidx = it->first;
+ Label genomeidx = sequence2genome[seqidx];
+ assert(spans[genomeidx] >= minlength);
+ assert(maxcoordsbyseq[seqidx]>mincoordsbyseq[seqidx]);
+ if(maxcoordsbyseq[seqidx]>mincoordsbyseq[seqidx]){
+ unsigned int len = (unsigned int)(maxcoordsbyseq[seqidx] - mincoordsbyseq[seqidx]);
+ //std::cerr << "seq:" << seqidx << " " << maxcoordsbyseq[seqidx] << " " << mincoordsbyseq[seqidx] << " " << len << std::endl;
+ maxminlen = len>maxminlen ? len : maxminlen;
+ }
+ else{
+ //std::cerr << "??seq:" << seqidx << " " << maxcoordsbyseq[seqidx] << " " << mincoordsbyseq[seqidx] << std::endl;
+ assert(false);
+ }
+ }
+ assert(maxminlen>0);
+ //std::cerr << maxminlen << std::endl;
+ return maxminlen;
diff --git a/chaining/mincut.h b/chaining/mincut.h
new file mode 100644
index 0000000..cbce460
--- /dev/null
+++ b/chaining/mincut.h
@@ -0,0 +1,975 @@
+//Interpret the anchor graph as a flow network: for each LCB in componentMap a local capacity graph (currlcbg) is built over its vertices
+//Use mincut,max-flow between added super-source/super-sink vertices to partition the graph to fulfill these criteria
+//1)gaps <= distance (consecutive anchors on a sequence that are further apart than 'distance' get a source/sink pair)
+//2)no conflicting orientations (NOTE(review): the only code for this criterion visible here sits inside a block comment further down)
+//3)at most one sequence per genome (important for draft data)
+//Cut edges are tagged as BLUE in the input graph and recorded in maskedEdges. NOTE(review): the #endif lines matching the #ifdef DEBUG / #ifdef CALCFLOW directives below, and the closing brace of the function, are not visible in this chunk; confirm against the full header which statements each directive guards.
+template<typename TGraph, typename TGraph2>
+int breakLCBmincutconnect(std::vector<LCB > &componentMap, //LCBs to process; each one is iterated as a collection of vertices of g
+ std::vector<int> &ccvmap, //indexed by vertex; out-edges whose endpoints have different entries are skipped as outside the LCB
+ EdgeSet&maskedEdges, //output: receives the vertex pair of every edge that gets cut
+ TGraph g, //anchor graph, taken by value. NOTE(review): put(edge_category,g,...) below writes through this copy; presumably TGraph is a lightweight view - confirm
+ TGraph2 fglcbsyn, //graph whose out_edges define connectivity and in which cut edges are looked up
+ unsigned int distance, //gap threshold: a source/sink pair is created when consecutive anchors on a sequence are more than this far apart
+ VertexLabelIntervalMap &coordinates, //(vertex,seqidx) -> (start,end) interval; start<end is asserted where it is read
+ std::set<Label> &seqidxSet, //only consulted by an assert that every label found in an LCB is present in it
+ NameVertexMap &name2vertex, //vertex name -> vertex of g
+ SequenceGenomeMap & sequence2genome, //sequence label -> genome label
+ int filenumoffset=0){ //added to cutcount when naming the gout.preflow*/gout.flow* graphviz files
+ bool found=false;
+ int lcbcount=0;
+ bool reusesupernodes=false; //never set to true in the visible code, so a fresh source/sink pair is created for every gap
+ //int SEARCH_RADIUS=std::numeric_limits<unsigned int>::max();
+ int DEFAULT_CAP=1; //only referenced in trailing comments in the visible body
+ int numcuts=0;
+ int cutcount=0;
+ //Determine cuts over each LCB
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+#ifdef DEBUG
+ std::cerr << "mincut LCB count:" << lcbcount << " num vertices " << it->size() << std::endl;
+ lcbcount++;
+ std::map<VertexName, LVertex> currlcbv;
+ std::map<VertexName, LVertex>::iterator pos;
+ std::map<LVertex,Vertex> vmap;
+ typename property_map < TGraph, vertex_orientmask_t >::type orientmaskmap = get(vertex_orientmask,g);
+ typename property_map < TGraph, vertex_vlabelmask_t >::type labelmaskmap = get(vertex_vlabelmask,g);
+ typename property_map < TGraph, vertex_orient_t>::type orientmap = get(vertex_orient,g);
+ std::map<std::pair<VertexName,VertexName>,std::pair<LVertex,LVertex> > cuts;
+ std::map<std::pair<LVertex,LVertex>,std::pair<VertexName,VertexName> > revcuts;
+ std::set<std::pair<LVertex,LVertex> > cutsnodes;
+ std::map<std::pair<LVertex,LVertex>,unsigned int> cutsdistmap;
+ std::map<LVertex,int> supernodes;
+ //std::set<LVertex> supernodes2;
+ int snodeedges=0;
+ //The connectivity graph used for max flow, min cut
+ LGraph currlcbg;
+ int supercount=0;
+ property_map < LGraph, edge_capacity_t >::type
+ capacity = get(edge_capacity, currlcbg);
+ property_map < LGraph, edge_reverse_t >::type
+ rev = get(edge_reverse, currlcbg);
+ property_map < LGraph, edge_residual_capacity_t >::type
+ residual_capacity = get(edge_residual_capacity, currlcbg);
+ //The set of vertices for the LCB
+ LCB blockV = *it;
+ //Get sequence labels for this lcb
+ //Slows us down for complete genomes, but helps us with perf for draft genomes
+ std::set<Label> currseqidxSet;
+ typename property_map < TGraph, vertex_label_t >::type vlabelmap = get(vertex_label,g);
+ std::map<Label,std::set<Label> > seqspergenomeMap; //tracks the number of seqs per genome in an LCB
+ std::map<Label,std::set<Label> >::iterator gpos;
+ std::set<Label>::iterator spos,spos2;
+ std::map<Label,std::set<LVertex> > seqsvertex;
+ std::set<LVertex>::iterator vpos,vpos2;
+ bool inserted;
+ for(LCB::iterator vit = blockV.begin();vit!=blockV.end();++vit){
+ for(LabelSet::iterator sit = vlabelmap[*vit].begin();sit!=vlabelmap[*vit].end();++sit){
+ currseqidxSet.insert(*sit);
+ tie(gpos, inserted) = seqspergenomeMap.insert(std::make_pair(sequence2genome[*sit],std::set<Label>()));
+ gpos->second.insert(*sit);
+ }
+ }
+ std::vector<LGraph::edge_descriptor> disconnecting_set; //declared once per LCB and never cleared in the visible code, so it accumulates across all cuts of this LCB
+ for(std::set<Label>::iterator it2 = currseqidxSet.begin(); it2 != currseqidxSet.end(); ++it2){
+ assert(seqidxSet.find(*it2) != seqidxSet.end());
+ Label seqidx = *it2;
+ //Label genomeidx = sequence2genome[seqidx];
+ std::list<LVertex> sortedV;
+ unsigned int spanlen=0;
+ //need to create custom coords map for LGraph
+ VertexIntervalMap currcoords;
+ //Create special graph currlcbv to represent the current lcb
+ //createLCBGraph(g,currlcbv,blockV);
+ for(LCB::iterator vit = blockV.begin();vit!=blockV.end();++vit){
+ Vertex v=*vit;
+ VertexName sname = get(vertex_name,g,v);
+ LVertex news;
+ //
+ //Insert vertex into currlcbg if needed
+ tie(pos, inserted) = currlcbv.insert(std::make_pair(sname, LVertex()));
+ if(inserted){
+ news = add_vertex(sname,currlcbg);
+ currlcbv[sname]=news;
+ vmap[news]=v;
+ }
+ else{
+ news = pos->second;
+ }
+ //
+ //Save coordinate information for news
+ if(coordinates.find(std::make_pair(v,seqidx))!=coordinates.end()){
+ assert(coordinates.find(std::make_pair(v,seqidx))->second.first<coordinates.find(std::make_pair(v,seqidx))->second.second);
+ sortedV.push_back(news);
+ spanlen = spanlen + get(vertex_len,g,v);
+#ifdef DEBUG
+ std::cerr << "seqidx: " << seqidx << " V:" << get(vertex_name,g,v)
+ << " len:" << get(vertex_len,g,news) << " spanlen:" << spanlen << std::endl; //NOTE(review): reads vertex_len of g with the currlcbg vertex 'news', whereas the accumulation above uses v - confirm intended
+ currcoords.insert(std::make_pair(news,
+ coordinates.find(std::make_pair(v,seqidx))->second));
+ seqsvertex[seqidx].insert(news);
+ }
+ //
+ //Add all edges for news
+ //First make sure target vertex is part of currlcbg
+ //graph_traits<LCBSynFilterGraph>::out_edge_iterator out_i, out_end;
+ typename graph_traits<TGraph2>::out_edge_iterator out_i, out_end;
+ for(tie(out_i, out_end) = out_edges(v, fglcbsyn); out_i != out_end; ++out_i){
+ if(ccvmap[v] != ccvmap[target(*out_i,g)]){
+ std::cerr << "Skipping edge, outside LCB" << std::endl;
+ continue;
+ }
+ VertexName tname = get(vertex_name,g,target(*out_i,g));
+ LVertex newt;
+ tie(pos, inserted) = currlcbv.insert(std::make_pair(tname, LVertex()));
+ if(inserted){
+ newt = add_vertex(tname,currlcbg);
+ currlcbv[tname]=newt;
+ vmap[newt]=target(*out_i,g);
+ }
+ else{
+ newt = pos->second;
+ }
+ //Now add the forward and reverse edges
+ //and flow properties
+ LGraph::edge_descriptor e1,e2;
+ tie(e1, inserted) = edge(news,newt,currlcbg); //'inserted' receives edge()'s found flag here
+ if(!inserted){
+ tie(e1, inserted) = edge(newt,news,currlcbg);
+ assert(!inserted);
+ tie(e1, inserted) = add_edge(news,newt,currlcbg);
+ assert(inserted);
+ tie(e2, inserted) = add_edge(newt,news,currlcbg);
+ assert(inserted);
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ //Capacity is set as number of labels on the edge
+ BitMask emask = get(edge_labelmask,g,*out_i);
+ //int minlen = (get(vertex_len,g,news) < get(vertex_len,g,newt)) ? get(vertex_len,g,news) : get(vertex_len,g,newt);
+ //int ecapacity = emask.count() * minlen;
+ int ecapacity = emask.count();
+ assert(ecapacity>=1);
+#ifdef DEBUG
+ std::cerr << "mincutlcbg " << sname << "-" << tname << " capacity:" << ecapacity << std::endl;
+ capacity[e1]=ecapacity;//DEFAULT_CAP;
+ capacity[e2]=ecapacity;//DEFAULT_CAP;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ }
+ }
+ //Condition (1) split gaps
+ if(num_vertices(currlcbg)>0
+ && num_edges(currlcbg)>0
+ && sortedV.size()>0
+ && spanlen>=MINSPANLEN){ //Check that span of seqs > MINSPANLEN to avoid breaking LCBs based on inconsistent short fragments
+ assert(num_vertices(currlcbg)>0);
+ assert(num_edges(currlcbg)>0);
+ assert(num_vertices(currlcbg)>=sortedV.size());
+ assert(currcoords.size()==sortedV.size());
+ //Project order onto seq
+ sortedV.sort(coordsorder_vertex(&currcoords));
+ int prevcoord=-1; //-1 marks "no previous anchor seen yet" for this sequence
+ int currpos=0;
+ LVertex prevvertex=0,currvertex=0;
+ VertexName prevname=0,currname=0;
+ LVertex currvertexlcb;
+ std::vector<int> lcbcc(num_vertices(currlcbg));
+ prevname=0;
+ prevcoord=-1;
+#ifdef DEBUG
+ std::cerr << "Order by seqidx:" << seqidx << std::endl;
+ for(std::list<LVertex>::iterator vit = sortedV.begin();vit!=sortedV.end();++vit){
+ currvertex=*vit;
+ currname = get(vertex_name,currlcbg,currvertex);
+ assert(get(vertex_name,currlcbg,currvertex)==get(vertex_name,g,vmap[currvertex]));
+ currvertexlcb = currlcbv[currname];
+ assert(coordinates.find(std::make_pair(name2vertex[currname],seqidx))!=coordinates.end());
+ //assert(coordinates.find(std::make_pair(name2vertex[currname],seqidx))->second==currcoords.find(std::make_pair(currvertex,seqidx))->second);
+ int currstart,currend;
+ tie(currstart,currend) = coordinates.find(std::make_pair(name2vertex[currname],seqidx))->second;
+ if(prevcoord==-1){
+ assert(vit==sortedV.begin());
+ }
+ else{
+ //assert(*(vit-1)==prevvertex);
+ //assert(currstart>=prevcoord);
+ int dist = currstart-prevcoord; //gap between the previous anchor's end and this anchor's start
+#ifdef DEBUG
+ std::cerr << "seqidx:" << seqidx << " dist:" << dist << " "
+ << prevname << "-" << currname << " "
+ << prevvertex << "-" << currvertex
+ << " coords " << prevcoord << "-" << currstart
+ << " spanlenonseq: " << spanlen
+ << " numV: " << num_vertices(currlcbg) << std::endl;
+ if(dist>(int)distance){
+ //Since the vertices are sorted by genomic position.
+ //All vertices begin()->currVertex are also at a dist>distance
+#ifdef DEBUG
+ std::cerr << "Found GAP " << dist << ">" << distance << std::endl;
+#ifdef CALCFLOW
+ ;
+ boost::graph_traits<LGraph>::edge_descriptor de2;
+ tie(de2,found) = edge(currvertex,prevvertex,currlcbg);
+ if(!found){
+ tie(de2,found) = edge(prevvertex,currvertex,currlcbg);
+ if(found){
+ disconnecting_set.push_back(de2);
+ }
+ }
+ else{
+ disconnecting_set.push_back(de2);
+ }
+ //Convert into multi-source multi-sink problem
+ //Add super-source and super sink nodes
+ LVertex ssource,ssink;
+ if(reusesupernodes &&
+ cuts.find(std::make_pair(prevname,currname)) != cuts.end()){
+ ssource = cuts.find(std::make_pair(prevname,currname))->second.first;
+ ssink = cuts.find(std::make_pair(prevname,currname))->second.second;
+ //std::cerr << "Found prev source sink " << ssource << "-" << ssink << std::endl;
+ assert(cutsdistmap.find(std::make_pair(ssource,ssink))!=cutsdistmap.end());
+ if(dist<(int)cutsdistmap[std::make_pair(ssource,ssink)]){
+ cutsdistmap[std::make_pair(ssource,ssink)]=dist;
+ }
+ }
+ else{
+ ssource = add_vertex(std::numeric_limits<int>::max()-supercount,currlcbg); //super nodes are named downward from INT_MAX
+ currlcbv[std::numeric_limits<int>::max()-supercount]=ssource;
+ ssink = add_vertex(std::numeric_limits<int>::max()-supercount-1,currlcbg);
+ currlcbv[std::numeric_limits<int>::max()-supercount-1]=ssink;
+ supercount+=2;
+ cuts[std::make_pair(prevname,currname)] = std::make_pair(ssource,ssink);
+ revcuts[std::make_pair(ssource,ssink)] = std::make_pair(prevname,currname);
+ cutsdistmap[std::make_pair(ssource,ssink)]=dist;
+ }
+#ifdef DEBUG
+ std::cerr << "Source,sink " << ssource << "-" << ssink << " for cut " << prevname << "-" << currname << endl;
+ std::list<LVertex>::iterator sinkend = sortedV.end();
+ std::list<LVertex>::iterator vit3 = vit;
+ //std::vector<LVertex>::iterator sinkend = (int(SEARCH_RADIUS+currpos)<(int)sortedV.size()) ? vit+SEARCH_RADIUS : sortedV.end();
+ for(;vit3!=sinkend;++vit3){ //attach the current vertex and every later one in sorted order to the sink
+ //addFlowEdge(ssink,*vit3,currlcbg);
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssink,*vit3,currlcbg);
+ if(inserted){
+ //std::cerr << "Adding edge " << get(vertex_name,currlcbg,*vit3) << " --> sink:" << ssink << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(*vit3,ssink,currlcbg);
+ assert(inserted);
+ snodeedges++;
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=0;
+ capacity[e2]=std::numeric_limits<int>::max();
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(*vit3,ssink,currlcbg);
+ assert(!inserted);
+ }
+ }
+ std::list<LVertex>::iterator sourceend = sortedV.begin();
+ //std::list<LVertex>::iterator sourceend = ((int)(currpos-SEARCH_RADIUS)>0) ? vit-SEARCH_RADIUS : sortedV.begin();
+ //std::cerr << "Curr pos " << currpos << std::endl;
+ std::list<LVertex>::iterator vit2=vit;
+ for(--vit2;vit2!=sourceend;--vit2){ //walk backwards from the vertex before vit; the loop stops before begin(), which the following if handles
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssource,*vit2,currlcbg);
+ if(inserted){
+ //std::cerr << "Adding edge source:" << ssource << " --> " << get(vertex_name,currlcbg,*vit2) << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(*vit2,ssource,currlcbg);
+ snodeedges++;
+ assert(inserted);
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=std::numeric_limits<int>::max();
+ capacity[e2]=0;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(*vit2,ssource,currlcbg);
+ assert(!inserted);
+ }
+ }
+ if(vit2==sortedV.begin()){
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssource,*vit2,currlcbg);
+ if(inserted){
+ //std::cerr << "Adding edge source:" << ssource << " --> " << get(vertex_name,currlcbg,*vit2) << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(*vit2,ssource,currlcbg);
+ snodeedges++;
+ assert(inserted);
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=std::numeric_limits<int>::max();
+ capacity[e2]=0;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(*vit2,ssource,currlcbg);
+ assert(!inserted);
+ }
+ }
+ if(cutsnodes.find(std::make_pair(ssource,ssink))==cutsnodes.end()){
+ supernodes[ssource]++;
+ supernodes[ssink]++;
+ //supernodes2.insert(ssource);
+ //supernodes2.insert(ssink);
+ cutsnodes.insert(std::make_pair(ssource,ssink));
+ }
+ }
+ }
+ currpos++;
+ currvertex=*vit;
+ currname = get(vertex_name,currlcbg,currvertex);
+ assert(currvertex==(*vit));
+ prevvertex = currvertex;
+ //max coord of block
+ prevcoord = currend;
+ prevname = currname;
+ }
+ }
+ else{
+ //std::cerr << "Skipping merge on seq:" << seqidx
+ //<< " spanlen:" << spanlen << " < " << MINSPANLEN << std::endl;
+ }
+ }
+#ifdef DEBUG
+ std::cerr << "Graph built for lcbidx:" << lcbcount << " V:" << num_vertices(currlcbg) << " E:" << num_edges(currlcbg) << std::endl;
+ //
+ //Condition (2) - conflicting orientation
+ //Check orientation on this one seq
+ //Condition (3) - multiple seqs per genome
+ //Need to break LCBs that have multiple seqs from the same genome
+ LVertex prevvertex=0,currvertex=0;
+ VertexName prevname=0,currname=0;
+ LVertex currvertexlcb,prevvertexlcb;
+ for(gpos = seqspergenomeMap.begin();gpos!=seqspergenomeMap.end();++gpos){
+ if(gpos->second.size()>1){
+#ifdef DEBUG
+ std::cerr << "LCB with multiple seqs " << gpos->second.size() << " from same genome, splitting" << std::endl;
+ std::vector<Label> seqs;
+ for(spos = gpos->second.begin();spos!=gpos->second.end();++spos){//each seq1
+ seqs.push_back(*spos);
+#ifdef DEBUG
+ std::cerr << "Seqs " << *spos << std::endl;
+ assert(sequence2genome[*spos]==gpos->first);
+ }
+ std::vector<LVertex> compv;
+ //Split sequences from the same genome
+ for(std::vector<Label>::iterator spos1 = seqs.begin();spos1!=seqs.end();++spos1){
+ //std::cerr << "S1" << *spos1 << std::endl;
+ assert(seqsvertex.find(*spos1)!=seqsvertex.end());
+ for(std::vector<Label>::iterator spos2 = spos1+1;spos2!=seqs.end();++spos2){
+ //std::cerr << "S2" << *spos2 << std::endl;
+ assert(seqsvertex.find(*spos2)!=seqsvertex.end());
+ assert(spos1!=spos2);
+ for(vpos = seqsvertex[*spos1].begin();vpos != seqsvertex[*spos1].end();++vpos){//each vertex seq1
+ compv.push_back(*vpos);
+ currvertex = *vpos;
+ currname = get(vertex_name,currlcbg,currvertex);
+ currvertexlcb = currlcbv[currname];
+ //std::cerr << *vpos << " name:" << currname << std::endl;
+ assert(get(vertex_name,currlcbg,currvertex)==get(vertex_name,g,vmap[currvertex]));
+ for(vpos2 = seqsvertex[*spos2].begin();vpos2 != seqsvertex[*spos2].end();++vpos2){//each vertex seq2
+ prevvertex = *vpos2;
+#ifdef CALCFLOW
+ ;
+ boost::graph_traits<LGraph>::edge_descriptor de2;
+ tie(de2,found) = edge(currvertex,prevvertex,currlcbg);
+ if(!found){
+ tie(de2,found) = edge(prevvertex,currvertex,currlcbg);
+ if(found){
+ disconnecting_set.push_back(de2);
+ }
+ }
+ else{
+ disconnecting_set.push_back(de2);
+ }
+ prevname = get(vertex_name,currlcbg,prevvertex);
+ prevvertexlcb = get(vertex_name,currlcbg,prevvertex); //NOTE(review): stores a vertex name in an LVertex, unlike currvertexlcb which is looked up through currlcbv - confirm intended
+ //std::cerr << *vpos2 << " name:" << prevname << std::endl;
+ LVertex ssource,ssink;
+ ssource = add_vertex(std::numeric_limits<int>::max()-supercount,currlcbg);
+ currlcbv[std::numeric_limits<int>::max()-supercount]=ssource;
+ ssink = add_vertex(std::numeric_limits<int>::max()-supercount-1,currlcbg);
+ currlcbv[std::numeric_limits<int>::max()-supercount-1]=ssink;
+ supercount+=2;
+ cuts[std::make_pair(prevname,currname)] = std::make_pair(ssource,ssink);
+ revcuts[std::make_pair(ssource,ssink)] = std::make_pair(prevname,currname);
+ cutsdistmap[std::make_pair(ssource,ssink)]=0;
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssink,currvertex,currlcbg);
+ if(inserted){
+#ifdef DEBUG
+ std::cerr << "Added edge sink for multiple anchors same genome:" << currvertex << " <-- " << ssink << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(currvertex,ssink,currlcbg);
+ assert(inserted);
+ snodeedges++;
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=0;
+ capacity[e2]=std::numeric_limits<int>::max();
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(currvertex,ssink,currlcbg);
+ assert(!inserted);
+ }
+ tie(e1, inserted) = add_edge(ssource,prevvertex,currlcbg);
+ if(inserted){
+#ifdef DEBUG
+ std::cerr << "Adding edge source:" << ssource << " --> " << prevvertex << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(prevvertex,ssource,currlcbg);
+ snodeedges++;
+ assert(inserted);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=std::numeric_limits<int>::max();
+ capacity[e2]=0;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(prevvertex,ssource,currlcbg);
+ assert(!inserted);
+ }
+ //Add ssource,ssink to cutset
+ if(cutsnodes.find(std::make_pair(ssource,ssink))==cutsnodes.end()){
+ supernodes[ssource]++;
+ supernodes[ssink]++;
+ cutsnodes.insert(std::make_pair(ssource,ssink));
+ }
+ }
+ }
+ }
+ }
+ /*
+ //Check for misoriented vertices within an LCB and break
+ for(std::vector<LVertex>::iterator vpos = compv.begin();vpos != compv.end();++vpos){//each vertex seq1
+ for(std::vector<LVertex>::iterator vpos2 = vpos+1;vpos2 != compv.end();++vpos2){//each vertex seq1
+ //Mismatched orient
+ BitMask sharedlabels = (labelmaskmap[vmap[*vpos]]&labelmaskmap[vmap[*vpos2]]);
+ assert(isLabelCollinearMask(sharedlabels,
+ orientmaskmap[vmap[*vpos]],
+ orientmaskmap[vmap[*vpos2]])
+ ==
+ isLabelCollinear(orientmap[vmap[*vpos]],
+ orientmap[vmap[*vpos2]],
+ sequence2genome));
+ if(! isLabelCollinearMask(sharedlabels,orientmaskmap[vmap[*vpos]],orientmaskmap[vmap[*vpos2]])){
+ std::cerr << "Breaking vertices with incompatible labeling " << vmap[*vpos] << "--" << vmap[*vpos2] << std::endl;
+ currvertex = *vpos;
+ currname = get(vertex_name,currlcbg,currvertex);
+ currvertexlcb = currlcbv[currname];
+ std::cerr << *vpos << " name:" << currname << std::endl;
+ assert(get(vertex_name,currlcbg,currvertex)==get(vertex_name,g,vmap[currvertex]));
+ prevvertex = *vpos2;
+ prevname = get(vertex_name,currlcbg,prevvertex);
+ prevvertexlcb = get(vertex_name,currlcbg,prevvertex);
+ std::cerr << *vpos2 << " name:" << prevname << std::endl;
+ LVertex ssource,ssink;
+ ssource = add_vertex(std::numeric_limits<int>::max()-supercount,currlcbg);
+ currlcbv[std::numeric_limits<int>::max()-supercount]=ssource;
+ ssink = add_vertex(std::numeric_limits<int>::max()-supercount-1,currlcbg);
+ currlcbv[std::numeric_limits<int>::max()-supercount-1]=ssink;
+ supercount+=2;
+ cuts[std::make_pair(prevname,currname)] = std::make_pair(ssource,ssink);
+ revcuts[std::make_pair(ssource,ssink)] = std::make_pair(prevname,currname);
+ cutsdistmap[std::make_pair(ssource,ssink)]=0;
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssink,currvertex,currlcbg);
+ if(inserted){
+ std::cerr << "Added edge sink:" << currvertex << " <-- " << ssink << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(currvertex,ssink,currlcbg);
+ std::cerr << "Added edge sink:" << currvertex << " <-- " << ssink << std::endl;
+ assert(inserted);
+ snodeedges++;
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=0;
+ capacity[e2]=std::numeric_limits<int>::max();
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(currvertex,ssink,currlcbg);
+ assert(!inserted);
+ }
+ std::cerr << "Added sink" << std::endl;
+ tie(e1, inserted) = add_edge(ssource,prevvertex,currlcbg);
+ if(inserted){
+ std::cerr << "Adding edge source:" << ssource << " --> " << prevvertex << std::endl;
+ snodeedges++;
+ tie(e2, inserted) = add_edge(prevvertex,ssource,currlcbg);
+ snodeedges++;
+ assert(inserted);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=std::numeric_limits<int>::max();
+ capacity[e2]=0;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ else{
+ tie(e2, inserted) = add_edge(prevvertex,ssource,currlcbg);
+ assert(!inserted);
+ }
+ //Add ssource,ssink to cutset
+ if(cutsnodes.find(std::make_pair(ssource,ssink))==cutsnodes.end()){
+ supernodes[ssource]++;
+ supernodes[ssink]++;
+ cutsnodes.insert(std::make_pair(ssource,ssink));
+ }
+ }
+ else{
+ std::cerr << "Compatible labeling " << vmap[*vpos] << "--" << vmap[*vpos2] << std::endl;
+ }
+ }
+ }
+ */
+ }
+ }
+ //
+ //
+ //Write graph
+ std::vector<int> ccvmap; //empty; this local hides the ccvmap parameter within its scope
+ VertexSet maskedLCBs; //empty
+ ;
+ do_write_graphviz(currlcbg, std::string("gout.preflow"+lexical_cast<std::string>(cutcount+filenumoffset)+".dot"),ccvmap,coordinates,maskedEdges,maskedLCBs,capacity,false);
+ std::cerr << "Writing " << std::string("gout.preflow"+lexical_cast<std::string>(cutcount+filenumoffset)+".dot") << std::endl;
+ LGraph::edge_iterator ei,e_end;
+ //property_map < LGraph, edge_reverse_t >::type revtest = get(edge_reverse,currlcbg);
+ //for(tie(ei, e_end) = edges(currlcbg); ei != e_end; ++ei) {
+ //assert(revtest[revtest[*ei]] == *ei); //check if the reverse edge map is build up properly
+ //}
+ //Evaluation order of cuts can matter
+ //TODO, try smallest->largest and largest->smallest
+ std::vector<std::pair<LVertex,LVertex> > cutsnodesuniq;
+ for(std::set<pair<LVertex,LVertex> >::iterator cit = cutsnodes.begin(); cit!= cutsnodes.end();++cit){
+ cutsnodesuniq.push_back(*cit);
+ }
+ sort(cutsnodesuniq.begin(),cutsnodesuniq.end(),cutsdist(&cutsdistmap)); //ordering is delegated to the cutsdist comparator, which is handed cutsdistmap
+ for(std::vector<std::pair<LVertex,LVertex> >::iterator cit = cutsnodesuniq.begin(); cit!= cutsnodesuniq.end();++cit){
+ LVertex ssource = cit->first;
+ LVertex ssink = cit->second;
+#ifdef DEBUG
+ std::cerr << "Attempting split " << get(vertex_name,currlcbg,ssource) << "(" << supernodes[ssource] << ")"
+ << "-" << get(vertex_name,currlcbg,ssink) << "(" << supernodes[ssink] << ")"
+ << " due to edge " << revcuts[std::make_pair(ssource,ssink)].first
+ << "-" << revcuts[std::make_pair(ssource,ssink)].second
+ << " dist:" << cutsdistmap[std::make_pair(ssource,ssink)] << std::endl;
+ assert(supernodes[ssource]>0);
+ assert(supernodes[ssink]>0);
+ std::vector<default_color_type> color(num_vertices(currlcbg));
+ std::vector<LGraph::edge_descriptor> pred(num_vertices(currlcbg));
+ assert(num_edges(currlcbg)>0);
+ std::set<LVertex> S_star;
+ property_map < LGraph, vertex_index_t >::type
+ idx = get(vertex_index, currlcbg);
+ property_map < LGraph, vertex_distance_t >::type
+ distance = get(vertex_distance, currlcbg); //this property map hides the unsigned int 'distance' parameter inside the cut loop
+ capacity = get(edge_capacity, currlcbg);
+ rev = get(edge_reverse, currlcbg);
+ residual_capacity = get(edge_residual_capacity, currlcbg);
+#ifdef CALCFLOW
+ long flow = edmonds_karp_max_flow(currlcbg, ssource, ssink, capacity, residual_capacity, rev, &color[0], &pred[0]);
+ ++cutcount;
+ //kolmogorov is faster but its not clear how to find the disconnecting set
+ //long flow = kolmogorov_max_flow(currlcbg, capacity, residual_capacity, rev, &pred[0], &color[0],distance,idx,ssource,ssink);
+ //long flow = edmonds_karp_max_flow(currlcbg, ssource, ssink);
+ //long flow = push_relabel_max_flow(currlcbg, ssource, ssink);
+ //long flow = kolmogorov_max_flow(currlcbg, ssource, ssink);
+ /*
+ //Testing trimming graph of all but the current supernode source and sink
+ typedef std::set<LVertex> SuperNodeMap;
+ typedef filtered_graph<LGraph,
+ snode_efilter<LGraph>,snode_vfilter<LGraph> > FLGraph;
+ snode_efilter<LGraph> efilter(&supernodes2,&currlcbg);
+ snode_vfilter<LGraph> vfilter(&supernodes2);
+ FLGraph filtlcbg(currlcbg, efilter, vfilter);
+ supernodes2.erase(supernodes2.find(ssource));
+ supernodes2.erase(supernodes2.find(ssink));
+ long flow = kolmogorov_max_flow(filtlcbg, capacity, residual_capacity, rev, &pred[0], &color[0],distance,idx,ssource,ssink);
+ */
+ //Check flow since we may have already introduced a break
+ if(flow>0){
+ assert(flow>0);
+ graph_traits<LGraph>::out_edge_iterator ei, ei_end;
+ graph_traits<LGraph>::vertex_iterator vi, vi_end;
+ typedef color_traits<default_color_type> Color;
+ for(tie(vi,vi_end) = vertices(currlcbg);vi!=vi_end;++vi){
+ if(color[*vi]!=Color::white()){ //S_star collects the non-white vertices, leaving out super nodes while reusesupernodes is false
+ if(reusesupernodes || supernodes.find(*vi)==supernodes.end()){
+ S_star.insert(*vi);
+ }
+ }
+ }
+ for( std::set<LVertex>::iterator si = S_star.begin();si!=S_star.end();++si){
+ for(tie(ei,ei_end) = out_edges(*si,currlcbg);ei!=ei_end;++ei){
+ if(S_star.find(target(*ei,currlcbg))==S_star.end()){ //edge leaves S_star: candidate for the disconnecting set
+ if(reusesupernodes || supernodes.find(target(*ei,currlcbg))==supernodes.end()){
+ disconnecting_set.push_back(*ei);
+ #ifdef DEBUG
+ std::cerr << "Disconnecting set " << get(vertex_name,currlcbg,source(*ei,currlcbg)) << "-" << get(vertex_name,currlcbg,target(*ei,currlcbg)) << std::endl;
+ put(edge_category,currlcbg,*ei,BLUE);
+ #endif
+ }
+ }
+ }
+ }
+ #ifdef DEBUG
+ std::cerr << " flow:" << flow << std::endl;
+ #endif
+ int flow=0; //NOTE(review): second declaration of flow; presumably the non-CALCFLOW branch whose #else is not visible here - confirm
+#ifdef DEBUG
+ std::cerr << "Disconnecting set size:" << disconnecting_set.size() << std::endl;
+ //Write graph
+ std::vector<int> ccvmap; //empty
+ VertexSet maskedLCBs; //empty
+ ;
+ do_write_graphviz(currlcbg, std::string("gout.flow"+lexical_cast<std::string>(cutcount+filenumoffset)+".dot"),ccvmap,coordinates,maskedEdges,maskedLCBs,capacity,false);
+ std::cerr << "Writing " << std::string("gout.flow"+lexical_cast<std::string>(cutcount+filenumoffset)+".dot") << std::endl;
+ for(std::vector<LGraph::edge_descriptor>::iterator ei=disconnecting_set.begin();ei!=disconnecting_set.end();++ei){
+ boost::graph_traits<LGraph>::edge_descriptor e2;
+ //This edge may have been cut previously so
+ //first check if it is still present in the connectivity graph
+ tie(e2,found) = edge(source(*ei,currlcbg),target(*ei,currlcbg),currlcbg);
+ if(found){
+ LGraph::edge_descriptor maxe = *ei;
+ Vertex cuts = name2vertex[get(vertex_name,currlcbg,source(maxe,currlcbg))]; //this local Vertex hides the 'cuts' map declared at the top of the LCB loop
+ assert(vmap[source(maxe,currlcbg)]==cuts);
+ Vertex cutt = name2vertex[get(vertex_name,currlcbg,target(maxe,currlcbg))];
+ assert(vmap[target(maxe,currlcbg)]==cutt);
+ Edge e1;
+ tie(e1,found) = edge(cuts,cutt,fglcbsyn);
+ if(found){
+ assert(revcuts.find(std::make_pair(ssource,ssink))!=revcuts.end());
+#ifdef DEBUG
+ std::cerr << "cut " << get(vertex_name,currlcbg,source(maxe,currlcbg))
+ << "-" << get(vertex_name,currlcbg,target(maxe,currlcbg))
+ << " " << cuts << "-" << cutt
+ << std::endl;
+ if((get(vertex_name,currlcbg,target(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].first
+ && get(vertex_name,currlcbg,source(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].second)
+ ||
+ (get(vertex_name,currlcbg,source(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].first
+ && get(vertex_name,currlcbg,target(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].second)
+ ){
+ std::cerr << "Split is trivial" << std::endl;
+ }
+ else{
+ std::cerr << "Split is non-local " << std::string("gout.flow"+lexical_cast<std::string>(cutcount+filenumoffset)) << " " << " supernodes:" << supernodes.size() << " " << "cut_set:" << disconnecting_set.size() << std::endl;
+ }
+ maskedEdges.insert(std::make_pair(cuts,cutt));
+ put(edge_category,g,e1,BLUE);
+ }
+ else{
+ assert(revcuts.find(std::make_pair(ssource,ssink))!=revcuts.end());
+ tie(e1,found) = edge(cutt,cuts,fglcbsyn);
+ if(found){
+#ifdef DEBUG
+ std::cerr << "cut " << get(vertex_name,currlcbg,target(maxe,currlcbg))
+ << "-" << get(vertex_name,currlcbg,source(maxe,currlcbg))
+ << " " << cuts << "-" << cutt
+ << std::endl;
+ if((get(vertex_name,currlcbg,target(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].first
+ && get(vertex_name,currlcbg,source(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].second)
+ ||
+ (get(vertex_name,currlcbg,source(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].first
+ && get(vertex_name,currlcbg,target(maxe,currlcbg)) == revcuts[std::make_pair(ssource,ssink)].second)
+ ){
+ std::cerr << "Split is trivial" << std::endl;
+ }
+ else{
+ std::cerr << "Split is non-local " << std::string("gout.flow"+lexical_cast<std::string>(cutcount+filenumoffset)) << " " << " supernodes:" << supernodes.size() << " " << "cut_set:" << disconnecting_set.size() << std::endl;
+ }
+ maskedEdges.insert(std::make_pair(cutt,cuts));
+ put(edge_category,g,e1,BLUE);
+ }
+ }
+ tie(e1,found) = edge(cutt,cuts,fglcbsyn); //NOTE(review): these asserts expect the edge just masked to be gone from fglcbsyn; presumably fglcbsyn filters on maskedEdges or edge_category - confirm
+ assert(!found);
+ tie(e1,found) = edge(cuts,cutt,fglcbsyn);
+ assert(!found);
+ //Removing the edges from the graph can help short circuit future runs of the maxflow/mincut algorithm
+ //The check above for flow>0 ensures subsequent cuts are only considered if there is still
+ //connectivity in the graph
+ //TODO, consider changing capacity to zero instead of removing for perf boost
+ //#ifdef DEBUG
+ //save edges so we can visualize
+ //#else
+ remove_edge(rev[maxe],currlcbg);
+ remove_edge(maxe,currlcbg);
+ //#endif
+ }
+ else{
+ //assert(false);
+ }
+ }
+#ifdef DEBUG
+ //Check to make sure the cuts eliminated all the flow
+ //This is for testing only
+ flow = kolmogorov_max_flow(currlcbg, capacity, residual_capacity, rev, &pred[0], &color[0],distance,idx,ssource,ssink);
+ std::cerr << "Remaining flow " << flow << std::endl;
+ assert(flow==0);
+ numcuts+=disconnecting_set.size();
+#ifdef CALCFLOW
+ }
+ else{
+ //Previous cut already broke flow between ssource-ssink. No further cuts needed
+ }
+ //supernodes2.insert(ssource);
+ //supernodes2.insert(ssink);
+ //Remove supernodes only if
+ //they are no longer referenced
+ /*
+ This doesn't work as expected. Can't get the vertex and
+ associated edges to properly clear to property maps
+ supernodes[ssource]--;
+ supernodes[ssink]--;
+ if(supernodes[ssource]==0){
+ supernodes.erase(supernodes.find(ssource));
+ clear_vertex(ssource,currlcbg);
+ remove_vertex(ssource,currlcbg);
+ }
+ if(supernodes[ssink]==0){
+ supernodes.erase(supernodes.find(ssink));
+ clear_vertex(ssink,currlcbg);
+ remove_vertex(ssink,currlcbg);
+ }
+ LGraph::edge_iterator ei,e_end;
+ property_map < LGraph, edge_reverse_t >::type rev2 = get(edge_reverse, currlcbg);
+ for(tie(ei, e_end) = edges(currlcbg); ei != e_end; ++ei) {
+ std::cerr << *ei << std::endl;
+ //This will segfault after removed nodes
+ assert(rev2[rev2[*ei]]==*ei);
+ }
+ */
+ }
+ }
+ return numcuts; //numcuts is only ever increased by disconnecting_set.size(), once per processed source/sink pair
+template<typename TGraph, typename TFGraph, typename LCBGraph, typename VertexMap> //createLCBGraph: copy one LCB into a flow graph with paired forward/reverse edges. NOTE(review): the closing brace of this function is not visible in this chunk
+void createLCBGraph(TGraph & g, TFGraph & fglcbsyn, LCBGraph & currlcbg, LCB & lcb, VertexMap & vmap){ //g: source of vertex names; fglcbsyn: supplies the out-edges; currlcbg: graph being filled; lcb: vertices to copy; vmap: filled with new vertex -> original vertex
+ int DEFAULT_CAP=1; //capacity assigned to both directions of every copied edge
+ property_map < LGraph, edge_capacity_t >::type //NOTE(review): the property maps are typed on LGraph rather than on the LCBGraph template parameter - confirm the two are always the same type
+ capacity = get(edge_capacity, currlcbg);
+ property_map < LGraph, edge_reverse_t >::type
+ rev = get(edge_reverse, currlcbg);
+ property_map < LGraph, edge_residual_capacity_t >::type
+ residual_capacity = get(edge_residual_capacity, currlcbg);
+ std::map<VertexName, LVertex> currlcbv; //vertex name -> vertex already created in currlcbg
+ std::map<VertexName, LVertex>::iterator pos;
+ bool inserted;
+ for(LCB::iterator vit = lcb.begin();vit!=lcb.end();++vit){
+ Vertex v=*vit;
+ VertexName sname = get(vertex_name,g,v);
+ LVertex news;
+ //
+ //Insert vertex into currlcbg if needed
+ tie(pos, inserted) = currlcbv.insert(std::make_pair(sname, LVertex()));
+ if(inserted){
+ news = add_vertex(sname,currlcbg);
+ currlcbv[sname]=news;
+ vmap[news]=v;
+ }
+ else{
+ news = pos->second;
+ }
+ //
+ //Add all edges for news
+ //First make sure target vertex is part of currlcbg
+ graph_traits<LCBSynFilterGraph>::out_edge_iterator out_i, out_end; //NOTE(review): iterator type is hard-wired to LCBSynFilterGraph instead of using TFGraph
+ for(tie(out_i, out_end) = out_edges(v, fglcbsyn); out_i != out_end; ++out_i){ //every out-edge target is added; there is no check here that the target belongs to lcb
+ VertexName tname = get(vertex_name,g,target(*out_i,g));
+ LVertex newt;
+ tie(pos, inserted) = currlcbv.insert(std::make_pair(tname, LVertex()));
+ if(inserted){
+ newt = add_vertex(tname,currlcbg);
+ currlcbv[tname]=newt;
+ vmap[newt]=target(*out_i,g);
+ }
+ else{
+ newt = pos->second;
+ }
+ //Now add the forward and reverse edges
+ //and flow properties
+ LGraph::edge_descriptor e1,e2;
+ tie(e1, inserted) = edge(news,newt,currlcbg); //'inserted' receives edge()'s found flag here
+ if(!inserted){ //only when no news->newt edge exists yet
+ tie(e1, inserted) = edge(newt,news,currlcbg);
+ assert(!inserted);
+ tie(e1, inserted) = add_edge(news,newt,currlcbg);
+ assert(inserted);
+ tie(e2, inserted) = add_edge(newt,news,currlcbg);
+ assert(inserted);
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=DEFAULT_CAP;
+ capacity[e2]=DEFAULT_CAP;
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
+ }
+ }
+ }
+template<typename TGraph, //NOTE(review): addFlowEdge looks like an unfinished extraction of the super-sink edge insertion done inline in breakLCBmincutconnect (its only visible call is commented out) - confirm whether it is compiled at all
+ typename TEdge>
+void addFlowEdge(TGraph & g,
+ TEdge & ss,
+ TEdge & e,
+ rev, //NOTE(review): no type is given for this parameter
+ residual_capacity){ //NOTE(review): no type is given for this parameter; the closing braces of this function are not visible in this chunk
+ graph_traits < LGraph >::edge_descriptor e1,e2;
+ tie(e1, inserted) = add_edge(ssink,*vit3,currlcbg); //ssink, vit3, currlcbg and inserted are not declared in the visible part of this function; the parameters g, ss and e are not used
+ if(inserted){
+ //std::cerr << "Adding edge " << get(vertex_name,currlcbg,*vit3) << " --> sink:" << ssink << std::endl;
+ snodeedges++; //snodeedges is likewise not declared here
+ tie(e2, inserted) = add_edge(*vit3,ssink,currlcbg);
+ assert(inserted);
+ snodeedges++;
+ //put(edge_reverse,currlcbg,e1,e2);
+ //put(edge_reverse,currlcbg,e2,e1);
+ rev[e1] = e2;
+ assert(rev[e1]==e2);
+ rev[e2] = e1;
+ assert(rev[e2]==e1);
+ capacity[e1]=0; //sink -> vertex direction carries no capacity; capacity is not declared here either
+ capacity[e2]=std::numeric_limits<int>::max(); //vertex -> sink direction gets the largest int capacity
+ residual_capacity[e1]=0;
+ residual_capacity[e2]=0;
diff --git a/chaining/synchain-mugsy.cpp b/chaining/synchain-mugsy.cpp
new file mode 100644
index 0000000..669f017
--- /dev/null
+++ b/chaining/synchain-mugsy.cpp
@@ -0,0 +1,2237 @@
+//USAGE:mugsy-chaining max-distance min-lcblen [min-lcblenstats] < anchors.projection
+//Mugsy chaining algorithm to partition a graph of multi-genome anchors
+//into collinear "syntenic" segments
+//Projection format is
+//anchor1 anchor2 seqindex dist genomeindex orient1 orient2 beg1 end1 beg2 end2
+//0 1 0 0 0 + + 0 196 196 15348
+//1 3 0 1 0 + + 196 15348 15349 20373
+//The anchor graph is a directed graph where each vertex is a
+//multi-genome anchor and each edge connects adjacent anchors on one
+//or more genomes. This input projection already should list anchors
+//that are adjacent on a genome within distance $max-distance. The
+//anchor graph will be built such that edges are stored for adjacent
+//anchors in at least one genome.
+//A series of heuristics is applied to identify paths in the graph
+//that correspond to collinear regions, ignoring micro-rearrangements shorter than min-lcblen.
+//The regions may be overlapping with the degree of overlap determined
+//by max-distance
+//General outline
+//(1)Build anchor graph
+//(2)Initial clustering
+//(2.1) Identify vertices with more than 2 edges and mask all incident
+//edges in the graph. These are syntenic breakpoints. Some will be
+//micro- events that we ignore later
+//(2.2) Calculate connected components. The remaining edges correspond
+//to vertices with exactly two edges and comprise runs of synteny.
+//(2.3) Run mincut to break paths that traverse breakpoints. Edges
+//indicate synteny on some genomes but do not ensure all incident
+//anchors are syntenic. We use a maxflow-mincut procedure to determine
+//which edges to break such that the LCBs respect max-distance and do
+//not include inversions.
+//(2.4) Merge adjacent LCBs. The procedures of (2.2) and (2.3) will
+//over-partition the graph. Merge adjacent LCBs that have compatible
+//anchor orientations and respect max-distance
+//(2.5) Mask short LCBs after merge. LCBs < minlen after merging are
+//masked from the graph. Next, the vertices are projected along each
+//of the member sequences and additional edges are added to the anchor
+//graph. The clustering of (2.1) and (2.2) is repeated to identify a
+//new set of LCBs. This step allows for ignoring short LCBs that may
+//be breaking synteny.
+//(2.6) Run mincut to restore invariants.
+//(2.7) Merge
+//At this step the initial LCBs have been computed. Two additional iterations of masking short LCBs and merging are run to try to cluster additional bps.
+//S. Angiuoli - UMD CS, 2009
+#define NDEBUG
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string>
+#include <fstream>
+#include <vector>
+#include <queue>
+#include <list>
+#include <bitset>
+#include <algorithm>
+#include <ext/hash_set> //__gnu_cxx namespace
+//#include <tr1/unordered_set>
+#include <boost/graph/graphviz.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/adjacency_matrix.hpp>
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/filtered_graph.hpp>
+#include <boost/graph/graph_utility.hpp>
+#include <boost/graph/connected_components.hpp>
+#include <boost/graph/strong_components.hpp>
+#include <boost/graph/topological_sort.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/graph/dijkstra_shortest_paths.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/graph/properties.hpp>
+#include <boost/property_map.hpp>
+#include <boost/graph/breadth_first_search.hpp>
+#include <boost/graph/edmonds_karp_max_flow.hpp>
+#include <boost/graph/kolmogorov_max_flow.hpp>
+#include <boost/graph/push_relabel_max_flow.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/read_dimacs.hpp>
+#include <boost/graph/graph_utility.hpp>
+// Archivers
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/config.hpp>
+#include <boost/pending/queue.hpp>
+using namespace boost;
+using namespace std;
+//Maximum number of input genomes
+//There is no limit on the number of sequences per genome
+//Used to set size of std::bitset<> only
+//TODO, replace with boost::dynamic_bitset to avoid setting a limit
+#define MAXGENOMES 256
+//Print LCB stats
+#define LCBSTATS 1
+//Print timings for subsets
+#define TIMING 1
+//Use max-flow,min-cut
+#define CALCFLOW
+//Debug creates
+//#define DEBUG 1
+// print sequences and coords in graphviz output
+//#define PRINTSEQS
+// draw flow network, cannot be combined with printseqs
+//#define PRINTFLOW
+//Defining this option removes all edges labelled in a single sequence
+//only. Such edges can link to unaligned or non-syntenic regions.
+#define TRIMEDGES
+//This is useful for simplifying the graph and improves performance for draft genomes but changes
+//the algorithm
+//When defined, sequence specific indels > distance parameter will break blocks automatically
+//When undefined, such indels are broken only during mincut
+//Mugsy codes
+#include "graph.h"
+#include "filters.h"
+#include "file.h"
+#include "lcbchecks.h" //isLabelCollinear,isLabelMaxGap,checkLCBGaps,checkLCBOrient,sameLabel,sameOrient,setLCBOrient
+#include "mincut.h" //breakLCBmincutconnect
+//variables for testing
+//Define to store and print edge labels w/ distances
+//in dot output
+//Undefine for release to save space
+//#define STORE_EDGE_LABELS 0
+//#define V_DEBUG 0
+//Remove misoriented vertices from an LCB
+//
+//setLCBOrient() fills badV for this lcb (presumably with the vertices whose
+//orientation disagrees with the rest of the LCB; confirm in lcbchecks.h).
+//Every vertex in badV is then
+// - added to maskedLCBs, and
+// - cut off from the graph: each of its out-edges and in-edges is added to
+//   maskedEdges as a (source,target) pair and its edge_category is set to CYAN.
+//
+//g                graph holding the LCB; edge categories are updated in place
+//lcb              vertices of the LCB to check
+//maskedLCBs       receives the vertices reported in badV
+//maskedEdges      receives all edges incident to those vertices
+//sequence2genome  forwarded to setLCBOrient
+template<typename TGraph>
+void fixMisOrientedLCBs(TGraph &g,
+                        LCB & lcb,
+                        VertexSet &maskedLCBs,
+                        EdgeSet &maskedEdges,
+                        SequenceGenomeMap & sequence2genome){
+#ifdef DEBUG
+  std::cerr << "Trimming misoriented vertices in lcb with " << lcb.size() << " vertices" << std::endl;
+#endif
+  std::vector<Vertex> badV;
+  setLCBOrient(g,lcb,badV,sequence2genome);
+  for(std::vector<Vertex>::iterator it=badV.begin();it!=badV.end();++it){
+    maskedLCBs.insert(*it);
+    //Mask everything leaving the vertex ...
+    typename graph_traits<TGraph>::out_edge_iterator ei, edge_end;
+    for(tie(ei,edge_end) = out_edges(*it,g);ei!=edge_end;++ei){
+      maskedEdges.insert(std::make_pair(source(*ei,g),target(*ei,g)));
+      put(edge_category,g,*ei,CYAN);
+    }
+    //... and everything entering it
+    typename graph_traits<TGraph>::in_edge_iterator ei2, edge_end2;
+    for(tie(ei2,edge_end2) = in_edges(*it,g);ei2!=edge_end2;++ei2){
+      maskedEdges.insert(std::make_pair(source(*ei2,g),target(*ei2,g)));
+      put(edge_category,g,*ei2,CYAN);
+    }
+  }
+}
+//Add adjacency edges between alignment blocks that are neighbours on a
+//sequence but currently belong to different components (LCBs).
+//
+//For every sequence label found on the vertices of g, the vertices carrying
+//that label are sorted with coordsorder(&coordinates,seqidx) and each pair of
+//consecutive vertices in different components is examined. A pair gets an
+//edge label (and, if needed, a new edge) only when the gap between them is
+//<= distance and the label/orientation/gap checks on the two vertices and on
+//the union of their two LCBs all pass.
+//
+//g               graph that is read; edge label masks of existing edges are
+//                updated through its edge_labelmask property map
+//baseg           graph on which add_edge() is called for new edges
+//seqidxSet       not referenced in the code visible here
+//coordinates     (vertex,seqidx) -> interval; .first of the next vertex minus
+//                .second of the current one is used as the gap
+//                (presumably begin/end coordinates -- confirm in graph.h)
+//lcborientmap    forwarded to checkLCBOrient with the two component ids
+//distance        maximum gap allowed for a new edge
+//maskedEdges     only referenced inside a commented-out block below
+//ccvmap          vertex -> component id
+//componentMap    component id -> LCB
+//sequence2genome sequence index -> genome index
+//
+//NOTE(review): in this view the #ifdef DEBUG, #ifdef NDEBUG and
+//#if defined(STORE_EDGE_LABELS) regions below have no visible #else/#endif
+//and the function has no visible closing brace. Confirm against the original
+//file before compiling; the comments below describe the statements as if
+//each conditional only guarded the debug/label statement that follows it.
+template<typename TGraph, typename TGraphB, typename TLCBMap>
+void updateAdjacency(TGraph &g,
+ TGraphB &baseg,
+ std::set<Label> &seqidxSet,
+ VertexLabelIntervalMap &coordinates,
+ TLCBMap & lcborientmap,
+ unsigned int distance,
+ EdgeSet&maskedEdges,
+ std::vector<int> & ccvmap,
+ std::vector<LCB> & componentMap,
+ SequenceGenomeMap & sequence2genome){
+ //Graph of LCBs filtered by sequences
+ typedef typename property_map<TGraph, vertex_vlabelmask_t>::type VertexLabelMask;
+ typedef typename property_map<TGraph, vertex_orientmask_t>::type VertexOrientMask;
+ typedef typename property_map<TGraph, edge_labelmask_t>::type EdgeLabelMask;
+ typename property_map < TGraph, vertex_orientmask_t >::type orientmaskmap = get(vertex_orientmask,g);
+ typename property_map < TGraph, vertex_vlabelmask_t >::type labelmaskmap = get(vertex_vlabelmask,g);
+ typename property_map < TGraph, vertex_orient_t>::type orientmap = get(vertex_orient,g);
+ typename property_map < TGraph, edge_labelmask_t >::type elabelmaskmap = get(edge_labelmask,g);
+ typename property_map < TGraph, vertex_label_t >::type labelmap = get(vertex_label,g);
+ typename property_map < TGraph, vertex_len_t >::type lenmap = get(vertex_len,g);
+ typename property_map < TGraph, vertex_genome_t >::type genomemap = get(vertex_genome,g);
+ //Variables
+ //
+ Edge e1;
+ bool found;
+ unsigned int numnewedges=0;
+ //sort by coordinates by position on each sequence in the graph
+#ifdef DEBUG
+ std::cerr << "Updating adjacency edges in alignment graph" << std::endl;
+ //TODO looping over all seqs is a bottleneck for draft genomes
+ //Refactor by looping over graph and saving map [seqidx]->[vertex set]
+ //Step 1: group the vertices of g by every sequence label they carry
+ std::map<Label,std::vector<typename TGraph::vertex_descriptor> > seqVertexMap;
+ for(typename boost::graph_traits<TGraph>::vertex_iterator
+ vit = vertices(g).first;vit!=vertices(g).second;++vit){
+ //assert(labelmap.find(*vit)!=labelmap.end());
+ assert(labelmap[*vit].size() > 0);
+ for(LabelSet::iterator sit = labelmap[*vit].begin();sit!=labelmap[*vit].end();++sit){
+ seqVertexMap[*sit].push_back(*vit);
+ }
+ }
+ //Step 2: sequences whose vertices sum to a vertex_len of 0 are skipped later
+ std::set<Label> skipseqs;
+ for(typename std::map<Label,std::vector<typename TGraph::vertex_descriptor> >::iterator mit = seqVertexMap.begin();mit!=seqVertexMap.end();++mit){
+ unsigned int spanlen=0;
+ Label seqidx = mit->first;
+ for(typename std::vector<typename TGraph::vertex_descriptor>::iterator vit=mit->second.begin();vit!=mit->second.end();++vit){
+ typename TGraph::vertex_descriptor v = *vit;
+ assert(coordinates.find(std::make_pair(v,seqidx))!=coordinates.end());
+ if(coordinates.find(std::make_pair(v,seqidx))!=coordinates.end()){
+ spanlen = spanlen + get(vertex_len,g,v);
+ }
+ else{
+ assert(false);
+ }
+ }
+ if(spanlen==0){
+ skipseqs.insert(mit->first);
+ }
+ }
+ //Step 3: per sequence, walk the vertices in coordinate order and look for
+ //neighbouring vertices that sit in different components
+ for(typename std::map<Label,std::vector<typename TGraph::vertex_descriptor> >::iterator mit = seqVertexMap.begin();mit!=seqVertexMap.end();++mit){
+ Label seqidx = mit->first;
+ assert(sequence2genome.find(seqidx)!=sequence2genome.end());
+ Label genomeidx = sequence2genome[seqidx];
+ if(skipseqs.find(seqidx)==skipseqs.end()){
+ //sort(sortedV.begin(),sortedV.end(),coordsorder(&coordinates,seqidx));
+ sort(mit->second.begin(),mit->second.end(),coordsorder(&coordinates,seqidx));
+ //
+ //(8.1)Check and add any new edges between adjacent alignment blocks in an LCB
+ //NOTE(review): the iterator type below is std::vector<Vertex>::iterator while
+ //the container holds TGraph::vertex_descriptor; this assumes both are the same type
+ for(std::vector<Vertex>::iterator it2 = mit->second.begin();it2!=mit->second.end();++it2){
+ if(it2+1!=mit->second.end() && ccvmap[*it2]!=ccvmap[*(it2+1)]){//only consider new edges that bridge clusters
+ //check still on same sequence,genome
+ assert(labelmap[*it2].find(seqidx) != labelmap[*it2].end());
+ assert(genomemap[*it2].find(genomeidx) != genomemap[*it2].end());
+ assert(labelmap[*(it2+1)].find(seqidx) != labelmap[*(it2+1)].end());
+ assert(genomemap[*(it2+1)].find(genomeidx) != genomemap[*(it2+1)].end());
+ assert(coordinates.find(std::make_pair(*(it2+1),seqidx)) != coordinates.end());
+ assert(coordinates.find(std::make_pair(*it2,seqidx)) != coordinates.end());
+ //new edge exists only if dist < distance threshold
+ //int dist = abs(coordinates[std::make_pair(*it2,seqidx)].second - coordinates[std::make_pair(*(it2+1),seqidx)].first);
+ //dist is signed: it is negative when the next interval starts before the current one ends
+ int dist = coordinates[std::make_pair(*(it2+1),seqidx)].first - coordinates[std::make_pair(*it2,seqidx)].second;
+#ifdef NDEBUG
+ BitMask sharedlabels = (labelmaskmap[*it2]&labelmaskmap[*(it2+1)]);
+ assert(isLabelCollinearMask(sharedlabels,
+ orientmaskmap[*it2],
+ orientmaskmap[*(it2+1)])
+ ==
+ isLabelCollinear(orientmap[*it2],
+ orientmap[*(it2+1)],
+ sequence2genome));
+ //Additional checks to ensure that we only add "good" edges, between vertices on the same genomes
+ if(dist <= (int)distance
+ && isLabelCollinear(orientmap[*it2],
+ orientmap[*(it2+1)],
+ sequence2genome)
+ && isLabelMaxGap(*it2,*(it2+1),orientmap[*it2],orientmap[*(it2+1)],coordinates,distance,sequence2genome)){
+ //make sure that we do not introduce a rearrangment
+ //make sure that we do not introduce a long gap
+ //newlcb is the union of the two components the pair belongs to
+ LCB newlcb;
+ newlcb.insert(newlcb.end(),componentMap[ccvmap[*it2]].begin(),componentMap[ccvmap[*it2]].end());
+ newlcb.insert(newlcb.end(),componentMap[ccvmap[*(it2+1)]].begin(),componentMap[ccvmap[*(it2+1)]].end());
+ //
+ BitMask longlabelmask=setSpanMask(newlcb,lenmap,labelmap,sequence2genome);
+ //Two check required
+ //Check if orientation of lcb1 and lcb2 are congruent
+ //Check if orientation of it2 and it2+1 are congruent
+ //TODO
+ //The checkLCBOrient(masks) will not consider the case where a single vertex
+ //can be flipped to match the orientation
+ //TODO
+ //Only checking overall lcb orientation currently
+ //checkPairOrient(vlabelmap,vorientmap,*it1,*it2])
+ if(checkLCBOrient(g,newlcb,longlabelmask,sequence2genome)
+ && checkLCBOrient(lcborientmap,ccvmap[*it2],ccvmap[*(it2+1)],longlabelmask)
+ && checkLCBGaps(g,newlcb,ccvmap,coordinates,distance,sequence2genome)){
+ //Look for an existing edge first, in the forward direction
+ tie(e1,found) = edge(*it2,*(it2+1), g);
+#ifdef DEBUG
+ std::cerr << "Adding new edge between " << get(vertex_name,g,*it2) << "-"
+ << get(vertex_name,g,*(it2+1)) << std::endl;
+ //Counted optimistically; decremented below when the genome bit was already set
+ numnewedges++;
+ if(found){
+ //TODO
+ //addEdgeLabel(g,e1,genomeidx);
+ if(!elabelmaskmap[e1].test(genomeidx)){
+#if defined(STORE_EDGE_LABELS)
+ labelmap[e1].insert(std::make_pair(genomeidx,dist));
+ assert(!elabelmaskmap[e1].test(genomeidx));
+ }
+ else{
+ numnewedges--;
+ }
+ elabelmaskmap[e1].set(genomeidx,1);
+ }
+ else{
+ //TODO
+ //addEdgeLabel(g,e1,genomeidx)
+ //No forward edge: try the reverse direction before creating a new edge
+ tie(e1,found) = edge(*(it2+1),*it2, g);
+ if(found){
+ if(!elabelmaskmap[e1].test(genomeidx)){
+#if defined(STORE_EDGE_LABELS)
+ labelmap[e1].insert(std::make_pair(genomeidx,dist));
+ assert(!elabelmaskmap[e1].test(genomeidx));
+ }
+ else{
+ numnewedges--;
+ }
+ elabelmaskmap[e1].set(genomeidx,1);
+ }
+ else{
+ //Neither direction exists: the new edge is added to baseg, not to g
+#if defined(STORE_EDGE_LABELS)
+ LabelMap plabels;
+ plabels[genomeidx] = dist;
+ tie(e1,found) = add_edge(*it2,*(it2+1),EdgeProperties(plabels),baseg);
+ tie(e1,found) = add_edge(*it2,*(it2+1),EdgeProperties(),baseg);
+ //TODO
+ //addEdgeLabel(g,e1,genomeidx)
+ elabelmaskmap[e1].set(genomeidx,1);
+ /*
+ //Remove any mask on this edges, only necessary if g is a filtered graph
+ if(maskedEdges.find(std::make_pair(*it2,*(it2+1)))!=maskedEdges.end()){
+ maskedEdges.erase(maskedEdges.find(std::make_pair(*it2,*(it2+1))));
+ }
+ if(maskedEdges.find(std::make_pair(*(it2+1),*it2))!=maskedEdges.end()){
+ maskedEdges.erase(maskedEdges.find(std::make_pair(*(it2+1),*it2)));
+ }
+ */
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ else{
+ //std::cerr << "Skipping merge on seq:" << seqidx
+ //<< " spanlen:" << spanlen << std::endl;
+ }
+ }
+ //std::cerr << "Added " << numnewedges << " edges" << std::endl;
+ //Step 4: recompute the edge and vertex masks now that labels/edges changed
+ setedgemasks(g,distance,coordinates,sequence2genome);
+ setvertexmasks(g,sequence2genome);
+ //std::cerr << "Finished setting edge and vertex masks" << std::endl;
+//Merge adjacent lcbs, visiting the LCBs in order of length
+//This sub will only merge entire LCBs that are congruent
+//
+//Every entry of componentMap is given a length with get_LCB_length() (empty
+//entries get 0), the LCB ids are sorted with lencmp(lcblenMap) and then
+//visited from the back of that order (largest first, per the original
+//comment). For each vertex of the visited LCB all out- and in-edges are
+//inspected; an edge whose endpoints are in two different, non-empty LCBs
+//triggers one merge test per LCB pair:
+//  checkLCBOrient on the combined vertex list,
+//  checkLCBOrient on the two stored LCB orientations,
+//  checkLCBGaps with maxgap.
+//On success the combined LCB is appended to componentMap, ccvmap is
+//repointed to the new id, the two old entries are emptied, the orientation
+//of the new LCB is stored in lcborientmap, and the mask on the linking edge
+//is removed from maskedEdges (the edge is recoloured ORANGERED).
+//
+//g               graph whose edges link the LCBs
+//ccvmap          vertex -> LCB id, updated in place
+//componentMap    LCB id -> vertices, grows by one entry per merge
+//lcborientmap    LCB id -> orientation info, gains one entry per merge
+//coordinates     forwarded to get_LCB_length and checkLCBGaps
+//maskedEdges     masked (source,target) pairs; the linking edge is unmasked
+//maxgap          forwarded to checkLCBGaps
+//sequence2genome sequence index -> genome index
+//Returns the number of merges performed.
+template<typename TGraph, typename TLCBMap>
+int mergeLCBsGreedy(TGraph & g,
+                    std::vector<int> & ccvmap,
+                    std::vector<LCB> & componentMap,
+                    TLCBMap & lcborientmap,
+                    VertexLabelIntervalMap & coordinates,
+                    EdgeSet & maskedEdges,
+                    unsigned int maxgap,
+                    SequenceGenomeMap & sequence2genome){
+#ifdef DEBUG
+  std::cerr << "Merging LCBs. Total count " << componentMap.size() << std::endl;
+#endif
+  typename property_map < TGraph, vertex_label_t >::type labelmap = get(vertex_label,g);
+  typename property_map < TGraph, vertex_len_t >::type lenmap = get(vertex_len,g);
+  typename property_map < TGraph, vertex_orient_t >::type omap = get(vertex_orient, g);
+  typename property_map < TGraph, vertex_len_t >::type lmap = get(vertex_len, g);
+  typename graph_traits<TGraph>::out_edge_iterator ei, edge_end;
+  typename graph_traits<TGraph>::in_edge_iterator ei2, edge_end2;
+  LCBLabelIntervalMap lcbcoords;
+  //LCB pairs that were already tested, so each pair is tried only once
+  std::set<std::pair<int,int> > searches;
+  int lcbcount=componentMap.size();
+  int nummerges=0;
+  //Capture LCB length
+  std::map<int,int> lcblenMap; //lcbid->max_seq_span
+  std::vector<int> lcbidx;
+  for(unsigned int k=0;k<componentMap.size();++k){
+#ifdef DEBUG
+    std::cerr << "Component " << k << std::endl;
+#endif
+    if(componentMap[k].size()>0){
+      int bplen=0;
+      unsigned int len = get_LCB_length(componentMap[k],omap,lmap,coordinates,lcbcoords,k,bplen,sequence2genome,0);
+#ifdef DEBUG
+      std::cerr << " len:" << len << std::endl;
+#endif
+      lcblenMap[k] = len;
+    }
+    else{
+      lcblenMap[k]=0;
+    }
+    lcbidx.push_back(k);
+  }
+  //Sort LCBs on length
+  sort(lcbidx.begin(),lcbidx.end(),lencmp(lcblenMap));
+  //Greedy merge adjacent LCBs from largest to smallest
+  for(std::vector<int>::reverse_iterator cit = lcbidx.rbegin();cit != lcbidx.rend();++cit){
+#ifdef DEBUG
+    std::cerr << "Greedy merge LCB:" << *cit << " len:" << lcblenMap[*cit] << std::endl;
+#endif
+    if(componentMap[*cit].size()>0){
+      //Work on a copy: componentMap is modified (push_back, clearing) while we iterate
+      std::vector<Vertex> lcbv = componentMap[*cit];
+      for(LCB::iterator vit = lcbv.begin();vit!=lcbv.end();++vit){
+        //Collect all edges touching this vertex, outgoing first, then incoming
+        std::vector<Edge> lcbedges;
+        tie(ei,edge_end) = out_edges(*vit,g);
+        for(;ei!=edge_end;++ei){
+          assert(source(*ei,g)==*vit);
+          lcbedges.push_back(*ei);
+        }
+        tie(ei2,edge_end2) = in_edges(*vit,g);
+        for(;ei2!=edge_end2;++ei2){
+          assert(target(*ei2,g)==*vit);
+          lcbedges.push_back(*ei2);
+        }
+#ifdef DEBUG
+        std::cerr << "Edges " << lcbedges.size() << std::endl;
+#endif
+        for(vector<Edge>::iterator eit=lcbedges.begin();eit!=lcbedges.end();++eit){
+          Vertex sv = source(*eit,g);
+          Vertex tv = target(*eit,g);
+#ifdef DEBUG
+          std::cerr << "Vertex " << sv << "-" << tv << std::endl;
+#endif
+          int sidx = ccvmap[sv];
+          int tidx = ccvmap[tv];
+          //
+          //If edge connects two components consider merging
+          //if compatible
+          if(sidx!=tidx
+             && searches.find(std::make_pair(sidx,tidx)) == searches.end()
+             && searches.find(std::make_pair(tidx,sidx)) == searches.end()
+             && componentMap[tidx].size()> 0
+             && componentMap[sidx].size()> 0
+             ){
+            //Make sure that there is no edge already connecting these LCBs
+            //This merge must be run after computing connected components
+            //
+            assert(maskedEdges.find(std::make_pair(sv,tv)) != maskedEdges.end()
+                   ||maskedEdges.find(std::make_pair(tv,sv)) != maskedEdges.end());
+            //
+            //Mark that CC pair has been searched
+            searches.insert(std::make_pair(sidx,tidx));
+            LCB newlcb;
+            newlcb.insert(newlcb.end(),componentMap[sidx].begin(),componentMap[sidx].end());
+            newlcb.insert(newlcb.end(),componentMap[tidx].begin(),componentMap[tidx].end());
+            BitMask longlabelmask=setSpanMask(newlcb,lenmap,labelmap,sequence2genome);
+            //TODO
+            //The funcs called in this loop are a bottleneck according to gprof
+            //Most time spent copying OrientedLabelSet
+            //First this is a large loop, all edges.
+            //checkLCBgaps/checkLCBOrient makes copies of vertex properties like orientedlabelset
+            //checkLCBGaps creates and sorts vectors
+            if(checkLCBOrient(g,newlcb,longlabelmask,sequence2genome) //pairwise check for all vertices
+               && checkLCBOrient(lcborientmap,sidx,tidx,longlabelmask) //check consistency with lcb orient
+               && checkLCBGaps(g,newlcb,ccvmap,coordinates,maxgap,sequence2genome)){
+              //Save LCB
+              //std::cerr << "New LCB: ";
+              for(LCB::iterator vit2=newlcb.begin();vit2!=newlcb.end();++vit2){
+                assert(*vit2<ccvmap.size());
+                ccvmap[*vit2]=lcbcount;
+              }
+              //std::cerr << std::endl;
+              componentMap.push_back(newlcb);
+              //Clear out old LCB
+              componentMap[sidx] = LCB();
+              componentMap[tidx] = LCB();
+              std::vector<Vertex> badV;
+              //If joined LCBs have same labels, orientation, no need to recalc
+              assert(lcborientmap.find(sidx)!=lcborientmap.end());
+              assert(lcborientmap.find(tidx)!=lcborientmap.end());
+              if(lcborientmap[sidx]==lcborientmap[tidx]){
+                lcborientmap[lcbcount]=lcborientmap[tidx];
+              }
+              else{
+                //Set the label and orientation for the new lcb
+#ifdef DEBUG
+                std::cerr << lcborientmap[sidx].first << std::endl;
+                std::cerr << lcborientmap[sidx].second << std::endl << std::endl;
+                std::cerr << lcborientmap[tidx].first << std::endl;
+                std::cerr << lcborientmap[tidx].second << std::endl << std::endl;
+#endif
+                lcborientmap[lcbcount]=setLCBOrient(g,newlcb,badV,sequence2genome);
+              }
+              //Remove mask on edge linking two lcbs
+              //The mask may be stored as (sv,tv) or as (tv,sv). The endpoints
+              //are kept in ms/mt because the iterator must not be
+              //dereferenced once it has been erased.
+              Vertex ms = sv;
+              Vertex mt = tv;
+              EdgeSet::iterator mit = maskedEdges.find(std::make_pair(ms,mt));
+              if(mit == maskedEdges.end()){
+                ms = tv;
+                mt = sv;
+                mit = maskedEdges.find(std::make_pair(ms,mt));
+              }
+              assert(mit != maskedEdges.end());
+              //assert() is compiled out when NDEBUG is defined, so guard explicitly
+              if(mit != maskedEdges.end()){
+                maskedEdges.erase(mit);
+                Edge e1;
+                bool found;
+                tie(e1,found) = edge(ms,mt,g);
+                assert(found);
+                if(found){
+                  put(edge_category,g,e1,ORANGERED);
+                }
+              }
+#ifdef DEBUG
+              std::cerr << "Merging LCB:"<<sidx<< " with LCB:"<<tidx<< " into LCB:"<<lcbcount << std::endl;
+#endif
+              nummerges++;
+              lcbcount++;
+              assert(lcbcount==(int)componentMap.size());
+            }
+            else{
+              //skip LCB
+#ifdef DEBUG
+              std::cerr << "Skipping merge of LCB:"<<sidx
+                        << " with LCB:"<<tidx
+                        << " from edge "
+                        << get(vertex_name,g,sv) << "-" << get(vertex_name,g,tv)
+                        << std::endl;
+#endif
+            }
+          }
+        }
+      }
+    }
+  }
+  return nummerges;
+}
+//Merge adjacent lcbs
+//This sub will only merge entire LCBs that are congruent
+//TODO, consider edges from longest LCBs first
+// Populate edgelcbmap with max(lcblenmap[source(e)],lcblenmap[target(e)])
+// lcblenmap[edgelcbmap[e1]] < lcblenmap[edgelcbmap[e2]]
+//
+//Walks all edges of g in iteration order. An edge whose endpoints are in
+//two different LCBs triggers one merge test per LCB pair:
+//  checkLCBOrient on the combined vertex list,
+//  checkLCBOrient on the two stored LCB orientations,
+//  checkLCBGaps with maxgap.
+//On success the combined LCB is appended to componentMap, ccvmap is
+//repointed to the new id, the orientation of the new LCB is stored in
+//lcborientmap, and the mask on the linking edge is removed from maskedEdges
+//(the edge is recoloured ORANGERED). Unlike mergeLCBsGreedy, the two old
+//componentMap entries are left as they are.
+//
+//g               graph whose edges link the LCBs
+//ccvmap          vertex -> LCB id, updated in place
+//componentMap    LCB id -> vertices, grows by one entry per merge
+//lcborientmap    LCB id -> orientation info, gains one entry per merge
+//coordinates     forwarded to checkLCBGaps
+//maskedEdges     masked (source,target) pairs; the linking edge is unmasked
+//maxgap          forwarded to checkLCBGaps
+//sequence2genome sequence index -> genome index
+//Returns the number of merges performed.
+template<typename TGraph, typename TLCBMap>
+int mergeLCBs(TGraph & g,
+              std::vector<int> & ccvmap,
+              std::vector<LCB> & componentMap,
+              TLCBMap & lcborientmap,
+              VertexLabelIntervalMap & coordinates,
+              EdgeSet & maskedEdges,
+              unsigned int maxgap,
+              SequenceGenomeMap & sequence2genome){
+#ifdef DEBUG
+  std::cerr << "Merging LCBs. Total count " << componentMap.size() << std::endl;
+#endif
+  typename property_map < TGraph, vertex_label_t >::type labelmap = get(vertex_label,g);
+  typename property_map < TGraph, vertex_len_t >::type lenmap = get(vertex_len,g);
+  //LCB pairs that were already tested, so each pair is tried only once
+  std::set<std::pair<int,int> > searches;
+  int lcbcount=componentMap.size();
+  int nummerges=0;
+  typename boost::graph_traits<TGraph>::edge_iterator eit, edge_end;
+  edge_end=edges(g).second;
+  for(eit=edges(g).first;eit!=edge_end;++eit){//all edges in g
+    Vertex sv = source(*eit,g);
+    Vertex tv = target(*eit,g);
+    int sidx = ccvmap[sv];
+    int tidx = ccvmap[tv];
+    //
+    //If edge connects two components consider merging
+    //if compatible
+    if(sidx!=tidx
+       && searches.find(std::make_pair(sidx,tidx)) == searches.end()
+       && searches.find(std::make_pair(tidx,sidx)) == searches.end()
+       ){
+      //Make sure that there is no edge already connecting these LCBs
+      //This merge must be run after computing connected components
+      //
+      assert(maskedEdges.find(std::make_pair(sv,tv)) != maskedEdges.end()
+             ||maskedEdges.find(std::make_pair(tv,sv)) != maskedEdges.end());
+      //
+      //Mark that CC pair has been searched
+      searches.insert(std::make_pair(sidx,tidx));
+      LCB newlcb;
+      newlcb.insert(newlcb.end(),componentMap[sidx].begin(),componentMap[sidx].end());
+      newlcb.insert(newlcb.end(),componentMap[tidx].begin(),componentMap[tidx].end());
+      BitMask longlabelmask=setSpanMask(newlcb,lenmap,labelmap,sequence2genome);
+      //TODO
+      //The funcs called in this loop are a bottleneck according to gprof
+      //Most time spent copying OrientedLabelSet
+      //First this is a large loop, all edges.
+      //checkLCBgaps/checkLCBOrient makes copies of vertex properties like orientedlabelset
+      //checkLCBGaps creates and sorts vectors
+      if(checkLCBOrient(g,newlcb,longlabelmask,sequence2genome) //pairwise check for all vertices
+         && checkLCBOrient(lcborientmap,sidx,tidx,longlabelmask) //check consistency with lcb orient
+         && checkLCBGaps(g,newlcb,ccvmap,coordinates,maxgap,sequence2genome)){
+        //Save LCB
+        //std::cerr << "New LCB: ";
+        for(LCB::iterator vit=newlcb.begin();vit!=newlcb.end();++vit){
+          assert(*vit<ccvmap.size());
+          ccvmap[*vit]=lcbcount;
+          //std::cerr << get(vertex_name,g,*vit) << " ";
+        }
+        //std::cerr << std::endl;
+        componentMap.push_back(newlcb);
+        std::vector<Vertex> badV;
+        //If joined LCBs have same labels, orientation, no need to recalc
+        assert(lcborientmap.find(sidx)!=lcborientmap.end());
+        assert(lcborientmap.find(tidx)!=lcborientmap.end());
+        if(lcborientmap[sidx]==lcborientmap[tidx]){
+          lcborientmap[lcbcount]=lcborientmap[tidx];
+        }
+        else{
+          //Set the label and orientation for the new lcb
+#ifdef DEBUG
+          std::cerr << lcborientmap[sidx].first << std::endl;
+          std::cerr << lcborientmap[sidx].second << std::endl << std::endl;
+          std::cerr << lcborientmap[tidx].first << std::endl;
+          std::cerr << lcborientmap[tidx].second << std::endl << std::endl;
+#endif
+          lcborientmap[lcbcount]=setLCBOrient(g,newlcb,badV,sequence2genome);
+        }
+        //Remove mask on edge linking two lcbs
+        //The mask may be stored as (sv,tv) or as (tv,sv). The endpoints
+        //are kept in ms/mt because the iterator must not be dereferenced
+        //once it has been erased.
+        Vertex ms = sv;
+        Vertex mt = tv;
+        EdgeSet::iterator mit = maskedEdges.find(std::make_pair(ms,mt));
+        if(mit == maskedEdges.end()){
+          ms = tv;
+          mt = sv;
+          mit = maskedEdges.find(std::make_pair(ms,mt));
+        }
+        assert(mit != maskedEdges.end());
+        //assert() is compiled out when NDEBUG is defined, so guard explicitly
+        if(mit != maskedEdges.end()){
+          maskedEdges.erase(mit);
+          Edge e1;
+          bool found;
+          tie(e1,found) = edge(ms,mt,g);
+          assert(found);
+          if(found){
+            put(edge_category,g,e1,ORANGERED);
+          }
+        }
+#ifdef DEBUG
+        std::cerr << "Merging LCB:"<<sidx<< " with LCB:"<<tidx<< " into LCB:"<<lcbcount << std::endl;
+#endif
+        nummerges++;
+        lcbcount++;
+        assert(lcbcount==(int)componentMap.size());
+      }
+      else{
+        //skip LCB
+#ifdef DEBUG
+        std::cerr << "Skipping merge of LCB:"<<sidx
+                  << " with LCB:"<<tidx
+                  << " from edge "
+                  << get(vertex_name,g,sv) << "-" << get(vertex_name,g,tv)
+                  << std::endl;
+#endif
+      }
+    }
+  }
+  return nummerges;
+}
+//Completely remove the LCB from the graph (by adding to maskedLCBs)
+//
+//lcb          vertices of the LCB to remove
+//breakpoints  kept in the signature for callers; not read or modified here
+//maskedLCBs   receives every vertex of lcb
+void removeLCB(LCB & lcb,
+               std::set<std::pair<Vertex,bool> > &breakpoints,
+               VertexSet &maskedLCBs){
+  for(LCB::iterator vit = lcb.begin();vit!=lcb.end();++vit){
+#ifdef DEBUG
+    std::cerr << "Removing vertex " << *vit << std::endl;
+#endif
+    maskedLCBs.insert(*vit);
+  }
+}
+//Mark possible syntenic breakpoints in graph g, storing in maskedEdges
+//Breakpoints can arise from
+//(1)Change in label
+//(2)Change in orientation
+//(3)Flux, wherever indegree!=1 or outdegree!=1
+//
+//Edges that are masked are also coloured through edge_category:
+//  GREEN   single in/out edge whose labels differ and that is not collinear
+//  PURPLE  edge that is not collinear (orientation change); for a single
+//          in/out edge this is applied after, and overrides, GREEN
+//  RED     collinear edge on a vertex with more than one in- or out-edge
+//
+//g                graph to scan; edge categories are updated in place
+//breakpoints      receives (v,false) for every vertex with a masked edge
+//maskedEdges      receives the masked edges as (source,target) pairs
+//vertexList       if non-empty, only vertices contained in it are examined
+//sequence2genome  forwarded to isLabelCollinear
+template<typename TGraph, typename BPMap1, typename BPMap2, typename VMap1>
+void markBreakpoints(TGraph &g,
+                     BPMap1 &breakpoints,
+                     BPMap2 &maskedEdges,
+                     VMap1 &vertexList,
+                     SequenceGenomeMap &sequence2genome){
+  typename graph_traits<TGraph>::out_edge_iterator ei, edge_end;
+  typename graph_traits<TGraph>::in_edge_iterator ei2, edge_end2;
+  typename property_map < TGraph, vertex_orientmask_t >::type vorientmap = get(vertex_orientmask, g);
+  typename property_map < TGraph, vertex_vlabelmask_t >::type vlabelmap = get(vertex_vlabelmask, g);
+  typename property_map < TGraph, vertex_orient_t >::type vmap = get(vertex_orient, g);
+  typename property_map < TGraph, edge_labelmask_t >::type elabelmap = get(edge_labelmask, g);
+  //Counters are only reported in the DEBUG message at the end
+  int bptype1=0;
+  int bptype2=0;
+  int bptype3=0;
+  //When true, edges already coloured ORANGERED (previous merges) would be
+  //left unmasked on flux vertices. Currently always off.
+  const bool keepmerge=false;
+  for(typename boost::graph_traits<TGraph>::vertex_iterator
+        vit = vertices(g).first;vit!=vertices(g).second;++vit){
+    Vertex v = *vit;
+    if(vertexList.size()>0 && vertexList.find(v)==vertexList.end())
+      continue;
+#ifdef DEBUG
+    std::cerr << "Checking for breakpoints on vertex v:" << get(vertex_name,g,v) << std::endl;
+#endif
+    //Set as soon as any edge of v is masked
+    bool fluxbp=false;
+    if(in_degree(v,g)==1){
+      tie(ei2,edge_end2) = in_edges(v,g);
+      assert(target(*ei2,g)==v);
+      //Labels differ and the edge is not collinear: some type of case (3) flux
+      if(!sameLabel(vlabelmap[v],vlabelmap[source(*ei2,g)],elabelmap[*ei2])
+         && !isLabelCollinear(vmap[v],vmap[source(*ei2,g)],sequence2genome)){
+        fluxbp=true;
+        maskedEdges.insert(std::make_pair(source(*ei2,g),v));
+        put(edge_category,g,*ei2,GREEN);
+        bptype1++;
+      }
+      if(isLabelCollinear(vmap[v],vmap[source(*ei2,g)],sequence2genome)){
+        assert(sameOrient(vorientmap[v]&elabelmap[*ei2],vorientmap[source(*ei2,g)]&elabelmap[*ei2],vlabelmap[v]&elabelmap[*ei2]));
+      }
+      else{
+        //Some type of case (2) orientation change
+        fluxbp=true;
+        maskedEdges.insert(std::make_pair(source(*ei2,g),v));
+        put(edge_category,g,*ei2,PURPLE);
+        bptype2++;
+      }
+    }
+    if(out_degree(v,g)==1){
+      tie(ei,edge_end) = out_edges(v,g);
+      assert(source(*ei,g)==v);
+      //Labels differ and the edge is not collinear: some type of case (3) flux
+      if(!sameLabel(vlabelmap[v],vlabelmap[target(*ei,g)],elabelmap[*ei])
+         && !isLabelCollinear(vmap[v],vmap[target(*ei,g)],sequence2genome)){
+        fluxbp=true;
+        maskedEdges.insert(std::make_pair(v,target(*ei,g)));
+        put(edge_category,g,*ei,GREEN);
+        bptype1++;
+      }
+      if(isLabelCollinear(vmap[v],vmap[target(*ei,g)],sequence2genome)){
+        assert(sameOrient(vorientmap[v]&elabelmap[*ei],
+                          vorientmap[target(*ei,g)]&elabelmap[*ei],
+                          vlabelmap[v]&elabelmap[*ei]));
+      }
+      else{
+        //Some type of case (2) orientation change
+        maskedEdges.insert(std::make_pair(v,target(*ei,g)));
+        put(edge_category,g,*ei,PURPLE);
+        fluxbp=true;
+        bptype2++;
+      }
+    }
+    if(in_degree(v,g)>1){
+      //Some type of case (3) flux
+      fluxbp=true;
+      tie(ei2,edge_end2) = in_edges(v,g);
+      for(;ei2!=edge_end2;++ei2){
+        assert(target(*ei2,g)==v);
+#ifdef DEBUG
+        std::cerr << "Adding bp " << get(vertex_name,g,source(*ei2,g)) << "-" << get(vertex_name,g,target(*ei2,g)) << std::endl;
+#endif
+        if(isLabelCollinear(vmap[source(*ei2,g)],vmap[target(*ei2,g)],sequence2genome)){
+          if(keepmerge && get(edge_category,g,*ei2)==ORANGERED){
+            //Previously merged, keep
+          }
+          else{
+            put(edge_category,g,*ei2,RED);
+            maskedEdges.insert(std::make_pair(source(*ei2,g),target(*ei2,g)));
+          }
+        }
+        else{
+          //Some type of case (2) orientation change
+          put(edge_category,g,*ei2,PURPLE);
+          maskedEdges.insert(std::make_pair(source(*ei2,g),target(*ei2,g)));
+        }
+      }
+      bptype3++;
+    }
+    if(out_degree(v,g)>1){
+      //Some type of case (3) flux
+      fluxbp=true;
+      tie(ei,edge_end) = out_edges(v,g);
+      for(;ei!=edge_end;++ei){
+        assert(source(*ei,g)==v);
+#ifdef DEBUG
+        std::cerr << "Adding bp " << get(vertex_name,g,source(*ei,g)) << "-" << get(vertex_name,g,target(*ei,g)) << std::endl;
+#endif
+        if(isLabelCollinear(vmap[source(*ei,g)],vmap[target(*ei,g)],sequence2genome)){
+          if(keepmerge && get(edge_category,g,*ei)==ORANGERED){
+            //Previously merged, keep
+          }
+          else{
+            put(edge_category,g,*ei,RED);
+            maskedEdges.insert(std::make_pair(source(*ei,g),target(*ei,g)));
+          }
+        }
+        else{
+          //Some type of case (2) orientation change
+          put(edge_category,g,*ei,PURPLE);
+          maskedEdges.insert(std::make_pair(source(*ei,g),target(*ei,g)));
+        }
+      }
+      bptype3++;
+    }
+    if(fluxbp){
+      breakpoints.insert(std::make_pair(v,false));
+    }
+  }
+#ifdef DEBUG
+  std::cerr << "Marked breakpoints. type1 " << bptype1 << " type2 " << bptype2 << " type3 " << bptype3 << std::endl;
+#endif
+}
+//Remove all breakpoints except
+//PURPLE orientation changing
+//BLUE mincuts
+//
+//For every masked (first,second) pair the edge first->second and, if it
+//exists, the edge second->first are looked up in g. The mask is dropped
+//when at least one of those edges has a category other than PURPLE or BLUE.
+//
+//g            graph used to look up the edges and their edge_category
+//maskedEdges  set of masked (source,target) pairs, pruned in place
+template<typename TGraph>
+void clearInlineBreakpoints(TGraph & g,
+                            EdgeSet &maskedEdges){
+  //Collect first and erase afterwards so the set is not modified while it
+  //is being iterated. Each iterator is recorded at most once: erasing the
+  //same iterator twice is undefined behaviour.
+  vector<EdgeSet::iterator > eraseMask;
+  for(EdgeSet::iterator mit = maskedEdges.begin();mit!=maskedEdges.end();++mit){
+    Edge e;
+    bool found;
+    bool clearMask=false;
+    tie(e,found) = edge(mit->first,mit->second,g);
+    assert(found);
+    //assert() is compiled out when NDEBUG is defined, so only read the
+    //category of an edge that was actually found
+    if(found && get(edge_category,g,e)!=PURPLE && get(edge_category,g,e)!=BLUE){
+      clearMask=true;
+    }
+    tie(e,found) = edge(mit->second,mit->first,g);
+    if(found && get(edge_category,g,e)!=PURPLE && get(edge_category,g,e)!=BLUE){
+      clearMask=true;
+    }
+    if(clearMask){
+      eraseMask.push_back(mit);
+    }
+  }
+  for(vector<EdgeSet::iterator >::iterator eit=eraseMask.begin();eit!=eraseMask.end();++eit){
+    maskedEdges.erase(*eit);
+  }
+}
+//Connected components
+//
+//Builds an undirected copy of the (filtered) graph fg, runs
+//connected_components on the copy and reports the result.
+//
+// fg              - graph whose vertices and edges are copied
+// g               - graph handed to setLCBOrient
+// componentMap    - out: cleared, then componentMap[k] lists the fg
+//                   vertices that belong to component k
+// c               - out: cleared and resized to the number of copied
+//                   vertices; c[i] is the component number of vertex i
+//                   of the undirected copy
+// lcborientmap    - passed through to setLCBOrient
+// sequence2genome - passed through to setLCBOrient
+//
+//Returns the number of connected components (0 when fg has no vertices).
+template<typename TGraph, typename TGraphBase, typename TComponentMap, typename TVertexMap, typename TLCBMap>
+int calc_components_undirected(TGraph & fg,
+                               TGraphBase & g,
+                               TComponentMap & componentMap,
+                               TVertexMap & c,
+                               TLCBMap & lcborientmap,
+                               SequenceGenomeMap & sequence2genome){
+  typedef adjacency_list<vecS,vecS,undirectedS,VertexProperties,EdgeProperties> TLGraph;
+  typedef typename TLGraph::vertex_descriptor TLVertex;
+  typedef typename TGraph::vertex_descriptor TVertex;
+  typedef typename boost::graph_traits<TGraph>::edge_iterator TEdgeIterator;
+  typedef typename boost::graph_traits<TGraph>::vertex_iterator TVertexIterator;
+  typedef typename boost::graph_traits<TLGraph>::vertex_iterator TLVertexIterator;
+  //Undirected graph(currlcbg) is required here for the CC algorithm
+  //TODO Performance enhancement refactor
+  //     Avoid building a second graph and run CC on directed graph(fg)
+  TLGraph currlcbg;
+  //Vertex name -> vertex of the undirected graph(currlcbg)
+  std::map<VertexName, TLVertex> currlcbv;
+  //Map between undirected graph(currlcbg) vertices and directed graph(fg) vertices
+  std::map<TLVertex,TVertex> vmap;
+  //Copy vertices; names are expected to be unique within fg
+  TVertexIterator vit,vend;
+  for(tie(vit,vend)=vertices(fg);vit!=vend;++vit){
+    VertexName sname = get(vertex_name,fg,*vit);
+    assert(currlcbv.find(sname)==currlcbv.end());
+    TLVertex news = add_vertex(sname,currlcbg);
+    currlcbv[sname]=news;
+    assert(vmap.find(news)==vmap.end());
+    vmap[news]=*vit;
+  }
+  //Copy edges, connecting the copies of each edge's endpoints
+  TEdgeIterator eit,eend;
+  for(tie(eit,eend)=edges(fg);eit!=eend;++eit){
+    VertexName sname = get(vertex_name,fg,source(*eit,fg));
+    VertexName tname = get(vertex_name,fg,target(*eit,fg));
+    assert(currlcbv.find(sname)!=currlcbv.end());
+    assert(currlcbv.find(tname)!=currlcbv.end());
+    add_edge(currlcbv[sname],currlcbv[tname],currlcbg);
+  }
+  assert(currlcbv.size()==num_vertices(currlcbg));
+  c.clear();
+  c.resize(num_vertices(currlcbg));
+  //&c[0] is only valid on a non-empty container; an empty graph has
+  //zero components by definition
+  int numComponents = 0;
+  if(num_vertices(currlcbg)>0){
+    numComponents = connected_components(currlcbg,&c[0]);
+  }
+  //Save mapping of componentNum->vector<Vertex>
+  componentMap.clear();
+  componentMap.resize(numComponents);
+  TLVertexIterator lvit,lvend;
+  for(tie(lvit,lvend)=vertices(currlcbg);lvit!=lvend;++lvit){
+    assert(vmap.find(*lvit)!=vmap.end());
+    //This ensures lcbidx=c[vertex]
+    componentMap[c[*lvit]].push_back(vmap[*lvit]);
+  }
+  //
+  //Save mask for the LCB
+  setLCBOrient(g,lcborientmap,componentMap,sequence2genome);
+  return numComponents;
+}
+//Calculate some summary statistics
+//
+//Walks every non-empty component in componentMap, measures it with
+//get_LCB_length and accumulates totals over the components whose
+//length is >0 and >=minlength.
+//
+// fglcbsyn        - graph providing the vertex_orient and vertex_len maps
+// componentMap    - components (LCBs) to summarize
+// coordinates     - passed through to get_LCB_length
+// minlength       - minimum length for a component to be counted
+// numc            - out: number of counted components
+// minlen          - out: smallest counted length; stays at
+//                   numeric_limits<unsigned int>::max() if none counted
+// totallen        - out: sum of the bplen values reported by
+//                   get_LCB_length for counted components
+// avglen          - out: SUM of counted lengths (not divided by numc;
+//                   callers compute avglen/numc themselves)
+// maxv            - out: largest vertex count of any non-empty
+//                   component, counted or not
+// sequence2genome - passed through to get_LCB_length
+template<typename TGraph>
+void summaryStats(TGraph & fglcbsyn,
+                  std::vector<LCB> &componentMap,
+                  VertexLabelIntervalMap &coordinates,
+                  unsigned int minlength,
+                  int & numc,
+                  unsigned int & minlen,
+                  int & totallen,
+                  unsigned int & avglen,
+                  int & maxv,
+                  SequenceGenomeMap & sequence2genome){
+  //Scratch map filled by get_LCB_length; discarded on return
+  LCBLabelIntervalMap lcbcoords;
+  avglen=0;
+  minlen=std::numeric_limits<unsigned int>::max();
+  totallen=0;
+  numc=0;
+  maxv=0;
+  //Property maps do not change between components, fetch them once
+  typename property_map < TGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+  typename property_map < TGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+  for(unsigned int k=0;k<componentMap.size();++k){
+    if(componentMap[k].empty()){
+      continue;
+    }
+    const int nvertices = (int)componentMap[k].size();
+    if(nvertices > maxv){
+      maxv = nvertices;
+    }
+    int bplen=0;
+    unsigned int len = get_LCB_length(componentMap[k],omap,lmap,coordinates,lcbcoords,k,bplen,sequence2genome,minlength);
+    if(len>0 && len>=minlength){
+      if(len < minlen){
+        minlen = len;
+      }
+      avglen+=len;
+      totallen+=bplen;
+      numc++;
+    }
+  }
+}
+int main(int argc, char* argv[])
+ //Number of iterations to run
+ unsigned int MAXITERS=5;
+ unsigned int MAXSTABLE=1;
+ //Input graph
+ Graph g;
+ //Key parameters
+ unsigned int distance=0; //maximum gap length between anchors
+ unsigned int shortlcblen=0; //maximum length of LCBs that are masked during chaining
+ unsigned int minlength=0; //for reporting stats only
+ unsigned int minanchor=0; //minimum anchor length
+ unsigned int minprintlength=0;
+ //Ensure chains do not overlap by removing overlapping regions
+ bool removeoverlaps=false;
+ //Lookups
+ NameVertexMap name2vertex,name2vertexcomp;
+ NameLabelMap sequence2index,genome2index;
+ LabelNameMap index2sequence;
+ SequenceGenomeMap sequence2genome;
+ //Map of coordinates for each anchor
+ VertexLabelIntervalMap coordinates;
+ VertexLabelIntervalMap::iterator cpos;
+#ifdef TIMING
+ time_t now;
+ time(&now);
+ time_t lasttime=now;
+ if(argc<=3){
+ cerr << "USAGE:mugsy-chaining max-distance min-lcbspan min-statslen < anchors.projection" << std::endl;
+ exit(1);
+ }
+ if(argc>1){
+ assert(atoi(argv[1])>=0);
+ distance = atoi(argv[1]);
+ }
+ if(argc>2){
+ assert(atoi(argv[2])>=0);
+ shortlcblen = atoi(argv[2]);
+ }
+ if(argc>3){
+ minlength = atoi(argv[3]);
+ }
+ assert(distance>0);
+ assert(minlength>=0);
+ cerr << "#Using custom distance " << distance << endl;
+ cerr << "#Using custom minlength " << minlength << endl;
+ cerr << "#Parsing graph from stdin" << endl;
+ if(0){
+ //TODO
+ //Check file format
+ //Allow for unprojected,projected list of blocks
+ //Read blocks and build alignment graph perform projection over
+ //each sequence and connect blocks that are adjacent on any given
+ //sequence at distance < d
+ read_blocks(std::cin,
+ g,
+ name2vertex,
+ genome2index,
+ sequence2index,
+ coordinates,
+ distance);
+ }
+ else{
+ //Read a projection of anchors and build anchor graph
+ //Only consider anchors that are adjacent < d
+ read_pairwiseprojection(std::cin,
+ g,
+ name2vertex,
+ genome2index,
+ sequence2index,
+ coordinates,
+ sequence2genome,
+ distance,
+ minanchor);
+ //Save coordinates for each anchor in coordinates map
+ updateCoordinates(coordinates,sequence2genome);
+ }
+ //Reverse sequence2index map
+ for(NameLabelMap::iterator i = sequence2index.begin();i!=sequence2index.end();++i){
+#ifdef DEBUG
+ std::cerr << "Seq idx:" << i->second << " " << i->first << std::endl;
+ index2sequence[i->second] = i->first;
+ }
+ //Restrict to a set of labels
+ LabelSet labels;
+ if(argc>4){
+ for(int i=4;i<argc;++i){
+ NameLabelMap::iterator it = sequence2index.find(argv[i]);
+ if(it != sequence2index.end()){
+ cerr << "#Restricting outputs to sequence label " << argv[i] << endl;
+ labels.insert(it->second);
+ }
+ else{
+ cerr << "#Invalid sequence label " << argv[i] << endl;
+ assert(false);
+ }
+ }
+ }
+ cerr << "#Num of vertices " << num_vertices(g) << endl;
+ cerr << "#Num of edges " << num_edges(g) << endl;
+ //Set edge and vertex masks for fast pattern matching
+ cerr << "#Setting edge and vertex masks" << endl;
+ setedgemasks(g,distance,coordinates,sequence2genome);
+ setvertexmasks(g,sequence2genome);
+ //Remove edges connected in only one label. This simplifies the
+ //graph by removing flux contributed by a single genome only.
+ std::vector<boost::graph_traits<Graph>::edge_descriptor> eraseEdges;
+ for(boost::graph_traits<Graph>::edge_iterator
+ eit = edges(g).first;eit!=edges(g).second;++eit){
+ Edge e = *eit;
+ BitMask emask = get(edge_labelmask,g,*eit);
+#if defined(STORE_EDGE_LABELS)
+ LabelMap inlabels = get(edge_label,g,*eit);
+ assert(inlabels.size()==emask.count());
+ if(emask.count()<=1){
+ eraseEdges.push_back(*eit);
+ }
+ }
+ for(std::vector<boost::graph_traits<Graph>::edge_descriptor>::iterator eit=eraseEdges.begin();eit!=eraseEdges.end();++eit){
+ //std::cerr << "Removing edge:" << get(vertex_name,g,source(*eit,g)) << "-" << get(vertex_name,g,target(*eit,g)) << std::endl;
+ remove_edge(*eit,g);
+ }
+ property_map < Graph, vertex_name_t >::type vertex_name_map = get(vertex_name, g);
+ //Variables
+ Edge e1;
+ int itercount=0;
+ //Extract all sequences
+ std::set<Label> seqidxSet;
+ for(NameLabelMap::iterator it = sequence2index.begin();it!=sequence2index.end();++it){
+ //it->first is the sequence name
+ //it->second is the index
+ seqidxSet.insert(it->second);
+ }
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_INIT:" << now-lasttime << std::endl;
+ lasttime=now;
+ //Initial clustering: build anchor graph, cut, merge, maskshort, recluster
+ //Building the anchor graph
+ //(1) Create filtered graph that supports breakpoints and maskedLCBs
+ std::cerr << "Building alignment graph and initial clutering" << std::endl;
+ std::set<std::pair<Vertex,bool> > breakpoints;
+ VertexSet maskedLCBs;
+ //Store vertex pair rather than edge_descriptor to avoid problems with
+ //stale edge descriptors and lack of < operator needed for std::set
+ EdgeSet maskedEdges;
+ LCBLabelIntervalMap lcbcoords;
+ //Filter graph has predicates for
+ //-Masked edges
+ //-Masked LCBs
+ //
+ //Edge filters
+ synbp_edge_filter<Graph> synefilter(&maskedEdges,&g);
+ LCB_edge_filter<Graph> lcbefilter(&maskedLCBs,&g);
+ compound_edge_filter<LCB_edge_filter<Graph>, synbp_edge_filter<Graph> >
+ cmpefilter(lcbefilter,synefilter);
+ //Vertex filters
+ LCB_vertex_filter<Graph> lcbvfilter(&maskedLCBs);
+ //The graph
+ LCBSynFilterGraph fglcbsyn(g,cmpefilter,lcbvfilter);
+ //
+ //(2.1) Find and mark all breakpoints in the graph
+ //Breakpoint types (stored in edge_category)
+ //RED - potential syntenic brkpt due to multiple incoming/outgoing edges
+ //PURPLE - change in orientation between sequences in adjacent blocks/vertices
+ //GREEN - other flux such loss of homology in a single genome
+ set<Vertex> dummySet;
+ markBreakpoints(g,breakpoints,maskedEdges,dummySet,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Marked " << breakpoints.size() << " breakpoints" << std::endl;
+ //
+ //(2.2) Calculate LCBs using connected components
+ //This initial clustering is expected to produce an over-segmented
+ //set of LCBs. Later steps in the clustering will collapse LCBs.
+ std::vector<LCB> componentMap;
+ std::vector<int> ccvmap(num_vertices(fglcbsyn));
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.input.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ //TODO
+ //Replace with non BitMask version or use boost::dynamic_bitset
+ //Lookup for lcbid->(labelmask,orientmask)
+ std::map<int,std::pair<BitMask,BitMask> > lcborientmap;
+ int numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+ itercount=numComponents;
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_CLUST1:" << now-lasttime << std::endl;
+ lasttime=now;
+ unsigned int avglen,minlen;
+ int totallen,numc,maxv;
+ int allbps;
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minprintlength
+ allbps=totallen;
+ if(numc>0){
+ std::cerr << "LCB summary orig " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minlength
+ if(numc>0){
+ std::cerr << "LCB summary orig " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.orig.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.orig.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ std::cerr << "Partitioning graph to maintain contraints" << std::endl;
+ //(2.3) Breaks LCBs based on gap lengths, mismatched orient, and mult seqs same genome
+ int cutattempts=0;
+ int origbreaks = breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Num orig breaks " << origbreaks << std::endl;
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MINCUT1:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.mincut1.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.mincut1.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minprintlength
+ if(numc>0){
+ std::cerr << "LCB summary post-cuts (" << origbreaks << " cuts) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minlength
+ if(numc>0){
+ std::cerr << "LCB summary post-cuts (" << origbreaks << " cuts) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ int lcbidx=0;
+#ifdef DEBUG
+ //Preceeding step breakLCBmincut and CC should not introduce
+ //bad edges so check predicates
+ cutattempts+=1000;
+ int morebreaks = breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome,cutattempts);
+ std::cerr << "Num orig breaks " << morebreaks << std::endl;
+ assert(morebreaks==0);
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ //checkSeqsPerLCB(g,*it)
+ std::map<Label,std::set<Label> > seqspergenomeMap; //tracks the number of seqs per genome in an LCB
+ std::map<Label,std::set<Label> >::iterator gpos;
+ bool inserted;
+ property_map < Graph, vertex_label_t >::type vlabelmap = get(vertex_label,g);
+ std::cerr << " LCB " << lcbidx << std::endl;
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ std::cerr << " V:" << *vit << std::endl;
+ printlabel(get(vertex_orient,g,*vit));
+ std::cerr << std::endl;
+ for(LabelSet::iterator sit = vlabelmap[*vit].begin();sit!=vlabelmap[*vit].end();++sit){
+ //std::cerr << " seqidx:" << *sit << " genomeidx:" << sequence2genome[*sit] << std::endl;
+ tie(gpos, inserted) = seqspergenomeMap.insert(std::make_pair(sequence2genome[*sit],std::set<Label>()));
+ gpos->second.insert(*sit);
+ assert(gpos->second.size()==1);
+ }
+ }
+ if(checkLCBGaps(g,*it,ccvmap,coordinates,distance,sequence2genome)){}
+ else{
+ std::cerr << "Bad gap" << std::endl;
+ assert(false);
+ }
+ if(checkLCBOrient(g,*it,sequence2genome)){}
+ else{
+ std::cerr << "Misoriented LCB" << std::endl;
+ //TODO, add orientation condition to mincut
+ //assert(false);
+ }
+ lcbidx++;
+ }
+ //
+ //(2.4)Attempt to merge lcbs that are adjacent on two or more genomes
+ // and do not introduce rearrangements, gaps
+ //
+ std::cerr << "Merging adjacent LCBs" << std::endl;
+ //Update lcbcoords
+ lcbcoords.clear();
+#ifdef DEBUG
+ for(unsigned int k=0;k<componentMap.size();++k){
+ if(componentMap[k].size()>0){
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ int bplen=0;
+ unsigned int len = get_LCB_length(componentMap[k],omap,lmap,coordinates,lcbcoords,k,bplen,sequence2genome);
+ assert(len>=0);
+ }
+ }
+ //Breakpoints are stored in maskedEdges. mergeLCBs clears breakpoints between connected,adjacent and congruent LCBs
+ //TODO, put this in loop
+ int nummerges=-1;
+ int totalnummerges=0;
+ while(nummerges!=0){
+ nummerges = mergeLCBsGreedy(g,ccvmap,componentMap,lcborientmap,coordinates,maskedEdges,distance,sequence2genome);
+ totalnummerges = totalnummerges+nummerges;
+ }
+ //int origmerges = mergeLCBs(g,ccvmap,componentMap,lcborientmap,coordinates,maskedEdges,distance,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Num orig merges " << totalnummerges << std::endl;
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MERGE1:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.merge1.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.merge1.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+#ifdef DEBUG
+ //Ensure merge didn't introduce large gaps
+ //origbreaks = breakLCBmincutconnect(componentMap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome);
+ //std::cerr << "Num orig breaks " << origbreaks << std::endl;
+ //assert(origbreaks==0);
+ //Preceeding step breakLCBmincut and CC should not introduce
+ //bad edges so check predicates
+ lcbidx=0;
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(checkLCBGaps(g,*it,ccvmap,coordinates,distance,sequence2genome)
+ && checkLCBOrient(g,*it,sequence2genome)){}
+ else{
+ //TODO, fix orient check in mincut
+ //property_map < Graph, vertex_label_t >::type labelmap = get(vertex_label,g);
+ //property_map < Graph, vertex_len_t >::type lenmap = get(vertex_len,g);
+ //BitMask longlabelmask=setSpanMask(*it,lenmap,labelmap,sequence2genome);
+ //assert(!checkLCBOrient(g,*it,longlabelmask,sequence2genome));
+ //assert(false);
+ }
+ lcbidx++;
+ }
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minlength==0
+ if(numc>0){
+ std::cerr << "LCB summary post-merge (" << totalnummerges << " merges) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome); //using minlength
+ if(numc>0){
+ std::cerr << "LCB summary post-merge (" << totalnummerges << " merges) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+#ifdef DEBUG
+ cutattempts+=1000;
+ int newbreaks = breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome,cutattempts);
+ std::cerr << "Breaks after merge " << newbreaks << std::endl;
+ assert(newbreaks==0);
+ //(7) Remove breakpoints caused by short LCBs
+ unsigned int threshold=shortlcblen;//bp
+ std::cerr << "Masking short lcbs <= length " << threshold << std::endl;
+ unsigned int numremoved=0;
+ std::vector<LCB> currRemovedLCB = componentMap;
+ for(unsigned int k=0;k<componentMap.size();++k){
+ if(componentMap[k].size()>0){
+ assert(componentMap[k].size()>0);
+ //checkLCB(componentMap[k],fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ unsigned int len = get_LCB_length(componentMap[k],omap,lmap,coordinates,lcbcoords,k,totallen,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "LCB " << k << " len:" << len << std::endl;
+ if(len >=0 && len < threshold){
+ //Remove LCB
+ removeLCB(componentMap[k],breakpoints,maskedLCBs);
+ currRemovedLCB[k].clear();
+ numremoved++;
+ for(LCB::iterator vit = componentMap[k].begin();vit!=componentMap[k].end();++vit){
+ put(vertex_relorder,g,*vit,len);
+ }
+ }
+ }
+ }
+#ifdef DEBUG
+ std::cerr << "Removed " << numremoved << " LCBs (len<" << threshold << ") containing "
+ << maskedLCBs.size() << " vertices" << std::endl;
+ std::cerr << "Remaining LCBs: " << numComponents-numremoved << std::endl;
+ //(8) Update synteny graph
+ //to connect vertices that are adjacent when ignoring short/masked LCBs
+ //be sure to only add good edges to avoid over-merging clusters
+ updateAdjacency(fglcbsyn,
+ g,
+ seqidxSet,
+ coordinates,
+ lcborientmap,
+ distance,
+ maskedEdges,
+ ccvmap,
+ componentMap,
+ sequence2genome);
+#ifdef DEBUG
+ //std::cerr << "Iteration 1 of CC. Num LCBs: " << numComponents << std::endl;
+ do_write_graphviz(g, std::string("gout.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ //
+ //(9) Recalculate breakpoints on updated graph
+ //breakpoints.clear();
+ EdgeSet keepmaskedEdges;
+ for(EdgeSet::iterator it = maskedEdges.begin();it!=maskedEdges.end();++it){
+ Edge e;
+ bool found;
+ tie(e,found) = edge(it->first,it->second,g);
+ assert(found);
+ //BLUE edges are previous cuts
+ if(get(edge_category,g,e)==BLUE){
+ keepmaskedEdges.insert(*it);
+ }
+ }
+#ifdef DEBUG
+ std::cerr << "Keeping " << keepmaskedEdges.size() << " breakpoints" << std::endl;
+ maskedEdges.clear();
+ maskedLCBs.clear();
+ //
+ //Mark breakpoints with short LCBs "masked"
+ markBreakpoints(fglcbsyn,breakpoints,maskedEdges,dummySet,sequence2genome);
+ //markBreakpoints(fglcbsyn,breakpoints,maskedEdges,maskedLCBs,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Recalc breakpoints. Num:" << breakpoints.size() << std::endl;
+ for(EdgeSet::iterator it = keepmaskedEdges.begin();it!=keepmaskedEdges.end();++it){
+ maskedEdges.insert(*it);
+ }
+ maskedLCBs.clear();
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MASKSHORT1:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.maskshort.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.maskshort.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ if(numComponents==0){
+ //No components
+ return 0;
+ }
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ assert(avglen>0);
+ assert(numc>0);
+ std::cerr << std::endl;
+ //std::cerr << "Iteration 2 of CC. Num LCBs: " << numComponents << std::endl;
+ std::cerr << "LCB summary post-maskshort+merge ("<< numremoved << " LCBs < " << threshold << ") " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary post-maskshort+merge ("<< numremoved << " LCBs < " << threshold << ") " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ itercount=MAXITERS;
+ int nobreaks=0; //number of iterations with no breaks
+ while(itercount>0){
+ /*
+ (10) Break apart components that violate invariants
+ For each component/LCB
+ -Order the component by projecting blocks onto each member
+ sequence in increasing order along the genome
+ -Iterator over the projection checking the distance invariant at each iteration
+ -If the "gap" between the current block previously seen block > distance (prev.max-curr.min>distance)
+ Break/mask all edges that connect the current block with the previous blocks in the ordering.
+ Save the broken edges in maskedEdges
+ */
+#ifdef DEBUG
+ std::cerr << "Breaking LCBs that violate contraints" << std::endl;
+ std::cerr << "Partitioning graph to maintain contraints" << std::endl;
+ cutattempts+=1000;
+ int breaks = breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome,cutattempts);
+#ifdef DEBUG
+ std::cerr << "Number breaks " << breaks << std::endl;
+ if(breaks==0){
+#ifdef DEBUG
+ std::cerr << "Summary mincut shows no breaks necessary, ending iteration at " << MAXITERS-itercount << std::endl;
+ nobreaks++;
+ if(nobreaks>MAXSTABLE){
+ break;
+ }
+ }
+ //
+ //(11) Recalc CC
+ //
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MINCUT2:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.mincut2.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.mincut2.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+#ifdef DEBUG
+ int lcbidx=0;
+ //Preceeding step breakLCBmincut and CC should not introduce
+ //bad edges so check predicates
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(checkLCBGaps(g,*it,ccvmap,coordinates,distance,sequence2genome)
+ && checkLCBOrient(g,*it,sequence2genome)){}
+ else{
+ std::cerr << "Bad gap or orient in LCB " << lcbidx << std::endl;
+ //Assert(false);
+ }
+ lcbidx++;
+ }
+ VertexSet shortLCBs;
+#ifdef DEBUG
+ //
+ //Update cc map to match output
+ std::vector<int> xxxc(num_vertices(fglcbsyn));
+ int gvlcbnum=0;
+ int maskedlcbnum=0;
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(it->size()>0){
+ LCB clcb = *it;
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ unsigned int len = get_LCB_length(clcb,omap,lmap,coordinates,lcbcoords,gvlcbnum,totallen,sequence2genome);
+ if(len>=minlength){
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ xxxc[*vit]=gvlcbnum;
+ }
+ gvlcbnum++;
+ }
+ else{
+ maskedlcbnum--;
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ xxxc[*vit]=maskedlcbnum;
+ shortLCBs.insert(*vit);
+ }
+ }
+ }
+ }
+ do_write_graphviz(fglcbsyn, std::string("gout.dot.postmincut2"),xxxc,coordinates,maskedEdges,shortLCBs);
+ do_write_graphviz(g, std::string("gout.dot.postmincut2.all"),xxxc,coordinates,maskedEdges,shortLCBs);
+#ifdef LCBSTATS
+ //Calculate stats
+ unsigned int avglen,minlen;
+ int totallen,numc,maxv;
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ assert(avglen>0);
+ assert(numc>0);
+ std::cerr << "LCB summary post-cuts (" << breaks << " cuts) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ std::cerr << "LCB summary post-cuts (" << breaks << " cuts) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ //(12) Mask short LCBs
+ //
+ //Mask short LCBs and fix misorients
+ unsigned int numremoved=0;
+ for(unsigned int k=0;k<componentMap.size();++k){
+ if(componentMap[k].size()>0){
+ assert(componentMap[k].size()>0);
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ int bplen=0;
+ unsigned int len = get_LCB_length(componentMap[k],omap,lmap,coordinates,lcbcoords,k,bplen,sequence2genome);
+ if(len>=0 && len>=shortlcblen){
+ //TODO, consider if misoriented vertices should be trimmed or kept till end
+ //fixMisOrientedLCBs(g,componentMap[k],maskedLCBs,maskedEdges);
+ }
+ else{
+ //
+ //Mask the LCB
+ removeLCB(componentMap[k],breakpoints,maskedLCBs);
+ numremoved++;
+ }
+ }
+ }
+ //(13) Connect adjacent LCBs after masking short LCBs
+ //
+ //Update synteny graph with short LCBs masked
+ //be sure to only add good edges to avoid over-merging clusters
+ updateAdjacency(fglcbsyn,
+ g,
+ seqidxSet,
+ coordinates,
+ lcborientmap,
+ distance,
+ maskedEdges,
+ ccvmap,
+ componentMap,
+ sequence2genome);
+ //(14) Recalculate breakpoints on updated graph
+ //breakpoints.clear();
+ EdgeSet keepmaskedEdges;
+ for(EdgeSet::iterator it = maskedEdges.begin();it!=maskedEdges.end();++it){
+ Edge e;
+ bool found;
+ tie(e,found) = edge(it->first,it->second,g);
+ assert(found);
+ if(get(edge_category,g,e)==BLUE){
+ keepmaskedEdges.insert(*it);
+ }
+ }
+#ifdef DEBUG
+ std::cerr << "Keeping " << keepmaskedEdges.size() << " breakpoints" << std::endl;
+ maskedEdges.clear();
+ maskedLCBs.clear();
+ //Mark breakpoints on original graph
+ markBreakpoints(fglcbsyn,breakpoints,maskedEdges,dummySet,sequence2genome);
+ //markBreakpoints(fglcbsyn,breakpoints,maskedEdges,maskedLCBs,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Recalc breakpoints. Num:" << breakpoints.size() << std::endl;
+ for(EdgeSet::iterator it = keepmaskedEdges.begin();it!=keepmaskedEdges.end();++it){
+ maskedEdges.insert(*it);
+ }
+ maskedLCBs.clear();
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MASK2:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.mask2.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.mask2.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary post-maskshort ("<< numremoved << " LCBs < " << threshold << ") " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary post-maskshort ("<< numremoved << " LCBs < " << threshold << ") " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ std::cerr << "Merging adjacent LCBs " << std::endl;
+ //
+ //(XX)Attempt to merge LCBs
+ int nummerges=-1;
+ int totalnummerges=0;
+ while(nummerges!=0){
+ nummerges = mergeLCBsGreedy(g,ccvmap,componentMap,lcborientmap,coordinates,maskedEdges,distance,sequence2genome);
+ totalnummerges = totalnummerges+nummerges;
+ //TODO, consider fixing misoriented vertices here
+ //for(unsigned int k=0;k<componentMap.size();++k){
+ //fixMisOrientedLCBs(g,componentMap[k],maskedLCBs,maskedEdges);
+ //}
+ //
+ //(XX)Recalculate connected components
+#ifdef DEBUG
+ std::cerr << "Num merges:" << nummerges << std::endl;
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+#ifdef DEBUG
+ std::cerr << "Recalc components: " << numComponents << std::endl;
+ }
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MERGE2:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.merge2.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.merge2.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+#ifdef DEBUG
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(it->size()>0){
+ LCB clcb = *it;
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ unsigned int len = get_LCB_length(clcb,omap,lmap,coordinates,lcbcoords,0,totallen,sequence2genome);
+ if(len>=minlength){
+ }
+ else{
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ //shortLCBs.insert(*vit);
+ }
+ }
+ }
+ }
+ do_write_graphviz(g, std::string("gout.dot.postimerge."+lexical_cast<std::string>(MAXITERS-itercount)+".all"),ccvmap,coordinates,maskedEdges,shortLCBs);
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary post-maskshort+merge ("<< totalnummerges << " merges) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary post-maskshort+merge ("<< totalnummerges << " merges) " << numc << " min:" << minlen << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ itercount--;
+ }// end numiters
+ //(9) Remove misoriented vertices
+ VertexSet shortLCBs;
+ shortLCBs.clear();
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(it->size()>0){
+ LCB clcb = *it;
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ unsigned int len = get_LCB_length(clcb,omap,lmap,coordinates,lcbcoords,0,totallen,sequence2genome);
+ if(len>=shortlcblen){
+ fixMisOrientedLCBs(g,clcb,maskedLCBs,maskedEdges,sequence2genome);
+ }
+ else{
+#ifdef DEBUG
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ shortLCBs.insert(*vit);
+ }
+ }
+ }
+ }
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+ //Sanity check to ensure no large gaps
+ //TODO
+ //Try and avoid this condition
+ //Currently removing LCBs in fixMisOriented may introduce gaps > threshold
+ //so we need to recut here.
+ int breaks=-1;
+ while(breaks!=0){
+ cutattempts += 1000;
+ breaks=breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome,cutattempts);
+#ifdef DEBUG
+ std::cerr << "Num final breaks " << breaks << std::endl;
+ numComponents = calc_components_undirected(fglcbsyn,g,componentMap,ccvmap,lcborientmap,sequence2genome);
+ }
+#ifdef DEBUG
+ cutattempts+=1000;
+ breaks=breakLCBmincutconnect(componentMap,ccvmap,maskedEdges,g,fglcbsyn,distance,coordinates,seqidxSet,name2vertex,sequence2genome,cutattempts);
+ assert(breaks==0);
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,componentMap,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary final " << numc << " min:" << minlen
+ << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,componentMap,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary final " << numc << " min:" << minlen
+ << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.dot.final.all"),ccvmap,coordinates,maskedEdges,shortLCBs);
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_MINCUT3:" << now-lasttime << std::endl;
+ lasttime=now;
+#ifdef DEBUG
+ do_write_graphviz(g, std::string("gout.final.dot"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ do_write_graphviz(fglcbsyn, std::string("gout.final.dot.filtered"),ccvmap,coordinates,maskedEdges,maskedLCBs);
+ //
+ //
+ //Optionally remove overlapping LCBs, keep longest LCBs spanning each region
+ int idx=0;
+ //Index of lcb in componentMap
+ lcbidx=0;
+ std::map<int,int> lcboverlapMap; //lcbid->longer_overlapping_lcbid
+ std::set<int> bestLCBs; //set of LCBs that are longest over at least one genomic segment
+ std::map<int,int> lcblenMap; //lcbid->max_seq_span
+ if(removeoverlaps){
+ std::cerr << "Sorting LCBs along each seq. Num seqs " << seqidxSet.size() << std::endl;
+ property_map < Graph, vertex_orient_t >::type orientmap = get(vertex_orient,g);
+ property_map < Graph, vertex_len_t >::type lenmap = get(vertex_len,g);
+ typedef iloc TLoc;
+ LCB::iterator it;
+ std::vector<std::vector<TLoc> > olaplcbs;
+ olaplcbs.resize(seqidxSet.size()+1);
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ std::cerr << "Looking at LCB " << lcbidx << " of size " << it->size() << std::endl;
+ std::set<Label> currseqs;
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ for(LCB::iterator lit = it->begin();lit!=it->end();++lit){
+ OrientedLabelSet::iterator it2_end = omap[*lit].end();
+ for(OrientedLabelSet::iterator it2 = omap[*lit].begin();it2!=it2_end;++it2){
+ Label seqidx = it2->first;
+ currseqs.insert(seqidx);
+ }
+ }
+ unsigned int len = get_LCB_length(*it,omap,lmap,coordinates,lcbcoords,lcbidx,totallen,sequence2genome);
+ lcblenMap[lcbidx] = len;
+ for(std::set<Label>::iterator it2 = currseqs.begin();it2!=currseqs.end();++it2){
+ Label seqidx = *it2;
+ assert(lcbcoords.find(std::make_pair(lcbidx,seqidx))!=lcbcoords.end());
+ //Label genomeidx = sequence2genome[seqidx];
+ TLoc t1,t2;
+ t1.first = lcbcoords[std::make_pair(lcbidx,seqidx)].second;
+ t1.second = 0;
+ t1.blocknum=lcbidx;
+ t2.first = lcbcoords[std::make_pair(lcbidx,seqidx)].first;
+ t2.second = 1;
+ t2.blocknum=lcbidx;
+ if(t1.first-t2.first > 0){//{(int)shortlcblen){
+ std::cerr << "lcbidx: " << lcbidx << " sidx: " << seqidx << " " << olaplcbs.size() << std::endl;
+ assert(seqidx<olaplcbs.size());
+ olaplcbs[seqidx].push_back(t1);
+ olaplcbs[seqidx].push_back(t2);
+ }
+ }
+ lcbidx++;
+ idx++;
+ }
+ std::cerr << "Iterating over LCBs to look for overlaps" << std::endl;
+ for(int i=0;i<(int)seqidxSet.size()+1;i++){
+ int open=0;
+ assert(i<(int)olaplcbs.size());
+ std::vector<TLoc> &ait=olaplcbs[i];
+ std::cerr << "Sorting on seq " << i << std::endl;
+ sort(ait.begin(),ait.end(),poscmp<TLoc>());
+ std::cerr << "sorted" << std::endl;
+ std::set<int> currlcbs;
+ for(std::vector<TLoc>::iterator pit = ait.begin();pit!=ait.end();pit++){
+ int currlen = lcblenMap[pit->blocknum];
+ if(pit->second>0){
+ int longestlen = lcblenMap[pit->blocknum];
+ int bestlcb = pit->blocknum;
+ std::cerr << "LCB open " << pit->blocknum << " len " << currlen << std::endl;
+ if(open>0){
+ //in overlap
+ //only remove short LCBs that are overlapped by longer lcbs
+ //if(currlen<shortlcblen){
+ std::cerr << "Number overlaps " << currlcbs.size() << std::endl;
+ for(std::set<int>::iterator cit=currlcbs.begin();cit!=currlcbs.end();++cit){
+ std::cerr << "Overlapping " << *cit << " len " << lcblenMap[*cit] << std::endl;
+ assert(*cit!=pit->blocknum);
+ if(lcblenMap[*cit]>longestlen){
+ longestlen = lcblenMap[*cit];
+ bestlcb = *cit;
+ }
+ //overlapping lcb > current lcb
+ if(lcblenMap[*cit]>currlen){
+ if(lcboverlapMap.find(pit->blocknum)!=lcboverlapMap.end()){
+ //
+ if(lcblenMap[*cit]>lcblenMap[lcboverlapMap[pit->blocknum]]){
+ lcboverlapMap[pit->blocknum] = *cit;
+ }
+ }
+ else{
+ lcboverlapMap[pit->blocknum] = *cit;
+ }
+ }
+ }
+ }
+ open++;
+ assert(currlcbs.find(pit->blocknum)==currlcbs.end());
+ currlcbs.insert(pit->blocknum);
+ bestLCBs.insert(bestlcb);
+ std::cerr << "opened " << pit->blocknum << std::endl;
+ }
+ else{
+ open--;
+ assert(currlcbs.find(pit->blocknum)!=currlcbs.end());
+ assert(currlcbs.size()>0);
+ currlcbs.erase(pit->blocknum);
+ std::cerr << "closed " << pit->blocknum << std::endl;
+ int longestlen = lcblenMap[pit->blocknum];
+ int bestlcb = pit->blocknum;
+ if(open){
+ assert(currlcbs.size()>0);
+ }
+ for(std::set<int>::iterator cit=currlcbs.begin();cit!=currlcbs.end();++cit){
+ if(lcblenMap[*cit]>longestlen){
+ longestlen = lcblenMap[*cit];
+ bestlcb = *cit;
+ }
+ if(lcblenMap[*cit]>currlen){
+ if(lcboverlapMap.find(pit->blocknum)!=lcboverlapMap.end()){
+ if(lcblenMap[*cit]>lcblenMap[lcboverlapMap[pit->blocknum]]){
+ lcboverlapMap[pit->blocknum] = *cit;
+ }
+ }
+ else{
+ lcboverlapMap[pit->blocknum] = *cit;
+ }
+ }
+ }
+ bestLCBs.insert(bestlcb);
+ }
+ }
+ }
+ }
+ lcbidx=0;
+ std::vector<LCB > validLCBs;
+ if(removeoverlaps){
+ for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ if(bestLCBs.find(lcbidx)==bestLCBs.end()){
+ std::cerr << "LCB idx: " << lcbidx;
+ if(lcboverlapMap.find(lcbidx)!=lcboverlapMap.end()){
+ std::cerr << " overlaps " << lcboverlapMap[lcbidx];
+ }
+ LCB newlcb;
+ newlcb.insert(newlcb.end(),it->begin(),it->end());
+ newlcb.insert(newlcb.end(),componentMap[lcboverlapMap[lcbidx]].begin(),componentMap[lcboverlapMap[lcbidx]].end());
+ validLCBs.push_back(newlcb);
+ }
+ else{
+ validLCBs.push_back(*it);
+ }
+ lcbidx++;
+ }
+ }
+ else{
+ validLCBs = componentMap;
+ }
+#ifdef LCBSTATS
+ //Calculate stats
+ summaryStats(fglcbsyn,validLCBs,coordinates,minprintlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary final post-processing " << numc << " min:" << minlen
+ << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ summaryStats(fglcbsyn,validLCBs,coordinates,minlength,
+ numc,minlen,totallen,avglen,maxv,sequence2genome);
+ if(numc>0){
+ std::cerr << "LCB summary final post-processing " << numc << " min:" << minlen
+ << " coverage:" << totallen << "(" << (float)totallen/allbps << ")" << " avg_bp:" << avglen/numc << " maxv:" << maxv<< std::endl;
+ }
+ //Write out LCBs
+ //Format is 2 lines per LCB
+ //I seq1 orient1 coords1 ... seqN orientN coordsN
+ //V feat1 feat2 .... featN
+ assert((unsigned int)numComponents==componentMap.size());
+ lcbidx=0;
+ idx=0;
+ unsigned int maxlcblen=0;
+ property_map < Graph, vertex_orient_t >::type vmap = get(vertex_orient,g);
+ //for(std::vector<LCB >::iterator it = componentMap.begin();it!=componentMap.end();++it){
+ for(std::vector<LCB >::iterator it = validLCBs.begin();it!=validLCBs.end();++it){
+ if(it->size()>0){
+ idx++;
+ LCB clcb = *it;
+ unsigned int len;
+ property_map < LCBSynFilterGraph, vertex_orient_t >::type omap = get(vertex_orient, fglcbsyn);
+ property_map < LCBSynFilterGraph, vertex_len_t >::type lmap = get(vertex_len, fglcbsyn);
+ unsigned int nlen = get_LCB_length(clcb,omap,lmap,coordinates,lcbcoords,idx,totallen,sequence2genome);
+ len=nlen;
+ if(len>=minprintlength){
+ maxlcblen = (len>maxlcblen ? len : maxlcblen);
+ unsigned int numcomps=0;
+ if(checkLCBOrient(fglcbsyn,*it,sequence2genome)){
+ if(checkLCBGaps(g,*it,ccvmap,coordinates,distance,sequence2genome)){
+ std::cout << "I ";
+ //
+ //Save mask for the LCB
+ BitMask labelsmask;
+ BitMask orientmask;
+ std::vector<Vertex> badV;
+ tie(labelsmask,orientmask) = setLCBOrient(g,*it,badV,sequence2genome);
+ SeqSet currlabelset;
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ currlabelset.insert(vmap[*vit].begin(),vmap[*vit].end());
+ }
+ for(OrientedLabelSet::iterator oit = currlabelset.begin();oit != currlabelset.end();++oit){
+ Label seqidx = oit->first;
+ Label genomeidx = sequence2genome[seqidx];
+ assert(labelsmask.test(genomeidx));
+ std::cout << index2sequence[seqidx] <<" " << (orientmask.test(genomeidx) ? '+' : '-') << " ";
+ std::cout << lcbcoords[std::make_pair(idx,seqidx)].first << "-" << lcbcoords[std::make_pair(idx,seqidx)].second << " ";
+ }
+ std::cout << " ;" << std::endl;
+ std::cout << "V ";
+ for(LCB::iterator vit = it->begin();vit!=it->end();++vit){
+ std::cout << get(vertex_name,fglcbsyn,*vit) << " ";
+ numcomps++;
+ }
+ std::cout << " ;" << std::endl;
+ }
+ else{
+ std::cerr << "SKIPPING LCB:" << idx << " Bad gap" << std::endl;
+ assert(false);
+ }
+ }
+ else{
+ std::cerr << "BAD LCB:" << idx << " Mis-matched lable orientation" << std::endl;
+ assert(false);
+ }
+ }
+ else{
+ std::cerr << "SKIPPING LCB:" << lcbidx << " len:" << nlen << " < " << minprintlength << std::endl;
+ }
+ }
+ lcbidx++;
+ }
+ std::cerr << "Max LCB length " << maxlcblen << std::endl;
+#ifdef TIMING
+ time(&now);
+ std::cerr << "TIME_POSTPROC:" << now-lasttime << std::endl;
+ lasttime=now;
+ return 0;
+//General utilities
+//Distance between the start coordinates of two intervals [s1,e1] and [s2,e2].
+//Returns 0 when the start of either interval lies strictly inside the other
+//interval; otherwise returns |s1-s2|. e1 and e2 are only used by the two
+//overlap tests.
+//NOTE(review): for disjoint intervals this is a start-to-start distance, not
+//the gap between the end of one interval and the start of the next - confirm
+//that this is what callers expect.
+unsigned int getIntervalDist(int s1, int e1, int s2, int e2){
+ //Contained: s1 falls strictly inside (s2,e2)
+ if(s1>s2 && s1<e2){
+ return 0;
+ }
+ else{
+ //Contained the other way round: s2 falls strictly inside (s1,e1)
+ if(s2>s1 && s2<e1){
+ return 0;
+ }
+ else{
+ //No start inside the other interval (or equal starts): difference of starts
+ if(s1<s2){
+ assert(s2-s1>=0);
+ return (unsigned int)s2-s1; //cast binds to s2 only; the subtraction is then done unsigned
+ }
+ else{
+ assert(s1-s2>=0);
+ return (unsigned int)s1-s2; //cast binds to s1 only; the subtraction is then done unsigned
+ }
+ }
+ }
diff --git a/delta-dups.sh b/delta-dups.sh
new file mode 100755
index 0000000..3ea9c3f
--- /dev/null
+++ b/delta-dups.sh
@@ -0,0 +1,21 @@
+#Identify duplicated regions in a pairwise delta file from NUCmer
+#Run delta-filter -b for duplications that are detected using LIS
+$mummerpath/delta-filter -b $deltafile > $deltafile.b
+#Capture additional dup/repeat regions by looking for overlapping alignments
+#Alignments that overlap by more than half their lengths are reports as dups
+$mummerpath/delta-filter -m $deltafile > $deltafile.m
+$mummerpath/delta-filter -v -u 50 $deltafile.m > $deltafile.u
+#Dump union of two sets to maf format
+$mummerpath/delta2maf $deltafile.b 2> /dev/null | $mugsypath/fixMAFnames.pl
+#Skip first line
+$mummerpath/delta2maf $deltafile.u 2> /dev/null | $mugsypath/fixMAFnames.pl | tail -n +1
+rm $deltafile.b &
+rm $deltafile.m &
+rm $deltafile.u &
diff --git a/fixMAFnames.pl b/fixMAFnames.pl
new file mode 100755
index 0000000..318b2a8
--- /dev/null
+++ b/fixMAFnames.pl
@@ -0,0 +1,28 @@
+use strict;
+#Filter: rewrite the source name of MAF 's' lines read from STDIN and print
+#every line (changed or not) to STDOUT. Only the matched prefix of the line is
+#replaced; the rest of the line is kept as is.
+# s name1:name2:start-end:n:[+-]:n ... becomes s name1.name2 ...
+# s name1:name2:n:[+-]:n ... becomes s name1.name2 ...
+# s name ... (any other 's' line) becomes s name.name ...
+while(my $line=<STDIN>){
+ #Form 1: name1:name2:start-end:n:orient:n
+ if($line =~ /^s\s+(\S+)\:(\S+):\d+-\d+:\d+:[+-]:\d+/){
+ #Both branches produce the same text when $1 eq $2
+ if($1 eq $2){
+ $line =~ s/^s\s+(\S+)\:(\S+):\d+-\d+:\d+:[+-]:\d+/s $1.$1/;
+ }
+ else{
+ $line =~ s/^s\s+(\S+)\:(\S+):\d+-\d+:\d+:[+-]:\d+/s $1.$2/;
+ }
+ }
+ #Form 2: name1:name2:n:orient:n (no start-end range)
+ elsif($line =~ /^s\s+(\S+)\:(\S+):\d+:[+-]:\d+/){
+ if($1 eq $2){
+ $line =~ s/^s\s+(\S+)\:(\S+):\d+:[+-]:\d+/s $1.$1/;
+ }
+ else{
+ $line =~ s/^s\s+(\S+)\:(\S+):\d+:[+-]:\d+/s $1.$2/;
+ }
+ }
+ #Form 3: any remaining 's' line; the name is duplicated as name.name and the
+ #original whitespace after the name ($2) is preserved
+ elsif($line =~ /^s\s+\S+\s+/){
+ $line =~ s/^s\s+(\S+)(\s+)/s $1.$1$2/;
+ }
+ print $line;
diff --git a/labelblocks.pl b/labelblocks.pl
new file mode 100755
index 0000000..bfadaf2
--- /dev/null
+++ b/labelblocks.pl
@@ -0,0 +1,59 @@
+use strict;
+#Read MAF from STDIN, buffer each alignment block (from one 'a score' line up
+#to the next) and hand it to labelblocks() with a running label number.
+#Lines seen before the first 'a score' line are printed unchanged.
+my $members=0; #number of 's' lines seen in the current block
+my $label=0; #last label number handed out
+my $blockopen=0; #1 once the first 'a score' line has been seen
+#Start with an empty buffer. The previous initial value (0) put the element 0
+#into the buffer, which was printed when the input contained no block at all.
+my @lines=();
+while(my $line=<STDIN>){
+ if($line =~ /^a score/){
+ #A new block starts: flush the previous one first
+ if($blockopen==1){
+ if($members>=1){
+ &labelblocks(\@lines,$members,++$label);
+ }
+ else{
+ print @lines;
+ }
+ }
+ $blockopen=1;
+ $members=0;
+ @lines=();
+ }
+ if($blockopen==1){
+ push @lines,$line;
+ }
+ else{
+ print $line;
+ }
+ if($line =~ /^s\s\S/){
+ $members++;
+ }
+ #Flush the last block at end of input
+ #NOTE(review): this test is $members>1 while the flush inside the loop uses
+ #$members>=1, so a final block with exactly one member is printed unlabelled
+ if($members>1){
+ &labelblocks(\@lines,$members,++$label);
+ }
+ else{
+ print @lines;
+ }
+#Append " label=N " (and " orient=+ " when the header has no orient field) to
+#the 'a score=' header of the buffered block and print the block.
+# $lines - reference to the block lines (accepted but not dereferenced here)
+# $nummembers - number of 's' lines in the block (not used here)
+# $label - label number written into the header
+#Dies when the first buffered line is not an 'a score=' header.
+sub labelblocks{
+ my($lines,$nummembers,$label) = @_;
+ #The body works on the file-scoped @lines array, not on the $lines argument;
+ #the two are the same data for the calls made in this script
+ die if($lines[0] !~ /^a score=/);
+ chomp $lines[0];
+ #Strand of each 's' line is collected but not used afterwards
+ my @orients;
+ for(my $i=1;$i<@lines;$i++){
+ my($orient) = ($lines[$i] =~ /^s\s+\S+\s+\d+\s+\d+\s+([+-])/);
+ push @orients,$orient if(defined $orient);
+ }
+ $lines[0] .= " label=$label ";
+ if($lines[0] !~ /orient/){
+ $lines[0] .= " orient=+ ";
+ }
+ $lines[0] .= "\n";
+ print @lines;
diff --git a/maf2fasta.pl b/maf2fasta.pl
new file mode 100755
index 0000000..b29e41a
--- /dev/null
+++ b/maf2fasta.pl
@@ -0,0 +1,73 @@
+#Convert MAF to FASTA
+#Optionally only convert blocks whose label=N value equals the label argument
+#./maf2fasta.pl [label] < maf > fasta
+use strict;
+my $currscore; #score= value of the block being read
+my $currlabel; #label= value of the block being read (undef if absent)
+my $currcoord; #never assigned in the code shown here
+my $currorient; #never assigned in the code shown here
+my $saveblock=0; #set to 1 once the first 's' line has been seen, never reset
+my @matches; #'s' lines of the current block
+my @blocks; #all blocks as [score,label,orient,coord,\@matches]
+while(my $line=<STDIN>){
+ if($line =~ /a\s+score=([\d\.\-]+)/){
+ #New block header: store the block read so far (copy of @matches)
+ if($saveblock>0){
+ my @nmatches = @matches;
+ push @blocks,[$currscore,$currlabel,$currorient,$currcoord,\@nmatches];
+ }
+ ($currscore) = ($line =~ /a\s+score=([\d\.\-]+)/);
+ ($currlabel) = ($line =~ /label=(\d+)/);
+ @matches=();
+ }
+ elsif($line =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/){
+ my $accession = $1;
+ my $start = $2;
+ my $len = $3;
+ my $orientation = $4;
+ my $seqlength = $5;
+ my $seq = $6;
+ #Accessions of the form x.y are accepted as is; the else branch can not
+ #die because it is only reached when the accession has no usable x.y form
+ #NOTE(review): an accession such as ".y" or "x." reaches the die
+ if($accession =~ /([^\.]+)\.(\S+)/){
+ }
+ else{
+ die if($accession =~ /\./);
+ #$accession = "$accession.$accession";
+ }
+ push @matches,[$accession,$start,$len,$orientation,$seqlength,$seq];
+ $saveblock=1;
+ }
+ else{
+ }
+ #Store the last block; this push is unconditional, so an entry is added even
+ #when no 's' line was read
+ my @nmatches = @matches;
+ push @blocks,[$currscore,$currlabel,$currorient,$currcoord,\@nmatches];
+#Sort key is field 3 (coord), which is always undef in the code shown here, so
+#all blocks compare equal
+foreach my $block (sort {$a->[3] <=> $b->[3]} @blocks){
+ if($ARGV[0]){
+ #Only print blocks whose label equals the requested label
+ if($block->[1] eq $ARGV[0]){
+ &printFASTA(@$block) ;
+ }
+ }
+ else{
+ &printFASTA(@$block) ;
+ }
+#Print one block as FASTA records followed by a line holding "=".
+#Each record header is ">accession start len orient seqlength"; the text field
+#of the 's' line is printed below it in lines of at most 60 characters.
+# $matches - array ref of [accession,start,len,orient,seqlength,text]
+# $score,$label,$orient,$coord - accepted but not used in the output
+sub printFASTA{
+ my($score,$label,$orient,$coord,$matches) = @_;
+ foreach my $m (@$matches){
+ #print ">$m->[0].$label score=$score $m->[1] $m->[2] $m->[3] $m->[4]\n";
+ print ">$m->[0] $m->[1] $m->[2] $m->[3] $m->[4]\n";
+ #Wrap the text at 60 characters per line
+ for(my $i=0;$i<length($m->[5]);$i+=60){
+ print substr($m->[5],$i,60),"\n";
+ }
+ }
+ #Block terminator
+ print "=\n";
diff --git a/maf2gp.pl b/maf2gp.pl
new file mode 100755
index 0000000..014f13d
--- /dev/null
+++ b/maf2gp.pl
@@ -0,0 +1,75 @@
+use strict;
+#Read MAF from STDIN and print, for every block that contains both sequence
+#$ARGV[1] (reference) and sequence $ARGV[2] (query), a pair of points via
+#printpair(). $ARGV[0] selects the strand ('+' or '-').
+#Output starts with two "0 0 0" lines and two empty lines.
+#NOTE(review): the output looks like a point data file for a plotting tool
+#(the name suggests gnuplot) - confirm
+print "0 0 0\n";
+print "0 0 0\n";
+print "\n\n";
+my @x; #'s' lines of the current block, without the trailing text field
+#Not used in the code shown here
+my $regex = ($ARGV[0] eq '+') ? '\-' : '\+';
+while(my $line=<STDIN>){
+ if($line =~ /^a/){
+ #Block header: emit the pair collected from the previous block
+ my @p;
+ foreach my $elt (@x){
+ #\w+ stops at the first non-word character, e.g. at a '.' in the name
+ my($acc) = ($elt =~ /s\s+(\w+)/);
+ if($acc eq $ARGV[1]){
+ $p[0] = $elt;
+ }
+ if($acc eq $ARGV[2]){
+ $p[1] = $elt;
+ }
+ }
+ #NOTE(review): when only $ARGV[2] is present, $p[1] is set while $p[0] is
+ #undef, so scalar(@p) is still 2 and printpair is called without a reference
+ &printpair(@p) if(scalar(@p) ==2);
+ @x = ();
+ }
+ else{
+ #Keep the 's' line up to (not including) its last whitespace-separated field
+ if($line =~ /^(s.+)\s+\S+/){
+ push @x,$1;
+ }
+ }
+#Emit the pair of the last block. Unlike the loop above, lines are pushed in
+#the order they occur, so reference/query order follows the input order here
+my @p;
+foreach my $elt (@x){
+ my($acc) = ($elt =~ /s\s+(\w+)/);
+ if($acc eq $ARGV[1] || $acc eq $ARGV[2]){
+ push @p,$elt;
+ }
+&printpair(@p) if(scalar(@p) ==2);
+#Print the two end points of one aligned reference/query pair.
+# $ref,$qry - MAF 's' lines (name start size strand srcsize) as strings
+#Both lines are echoed as '#' comment lines first. Points are printed as
+#"refcoord qrycoord 100", followed by two empty lines after the second point.
+#Which pairs are printed depends on the strands and on $ARGV[0]:
+# ref + / qry + and $ARGV[0] ne '-' : begin point, then end point
+# ref + / qry - and $ARGV[0] eq '-' : query coords flipped to srcsize-coord
+# ref - / qry + and $ARGV[0] eq '-' : reference coords flipped to srcsize-coord
+# anything else : nothing is printed besides the '#' lines
+sub printpair{
+ my($ref,$qry) = @_;
+ #$refa/$qrya (names) are captured but not used below
+ my($refa,$refb,$refe,$refo,$reflen) = ($ref =~ /s\s+(\w+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)/);
+ my($qrya,$qryb,$qrye,$qryo,$qrylen) = ($qry =~ /s\s+(\w+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)/);
+ #Third field is a size; turn it into an end coordinate (begin + size)
+ $refe = $refb + $refe;
+ $qrye = $qryb + $qrye;
+ print "#$ref\n";
+ print "#$qry\n";
+ if($refo eq '+' && $qryo eq '+' && $ARGV[0] ne '-'){
+ print "$refb $qryb 100\n";
+ print "$refe $qrye 100\n\n\n";
+ }
+ elsif($refo eq '+' && $qryo eq '-' && $ARGV[0] eq '-'){
+ #Flip query coordinates using the source sequence length
+ $qrye = $qrylen - $qrye;
+ $qryb = $qrylen - $qryb;
+ print "$refe $qrye 100\n";
+ print "$refb $qryb 100\n\n\n";
+ }
+ elsif($refo eq '-' && $qryo eq '+' && $ARGV[0] eq '-'){
+ #Flip reference coordinates using the source sequence length
+ $refe = $reflen - $refe;
+ $refb = $reflen - $refb;
+ print "$refe $qrye 100\n";
+ print "$refb $qryb 100\n\n\n";
+ }
+ else{
+# print STDERR "$ref\n$qry\n";
+ }
diff --git a/maf2synchain.pl b/maf2synchain.pl
new file mode 100644
index 0000000..684f855
--- /dev/null
+++ b/maf2synchain.pl
@@ -0,0 +1,115 @@
+#Convert MAF blocks (anchors) into a list of adjacent anchor pairs
+#For each sequence the anchors are sorted by start coordinate and one line is
+#printed for each pair of neighbouring anchors (see the print statement below)
+#Reads MAF from STDIN, writes to STDOUT; no command line arguments are read
+use strict;
+my $anchors = {}; #anchornum -> accession -> {gidx,sidx,start,end,orient}
+my $seq2anchors = {}; #accession -> anchornum -> number of 's' lines seen
+my $seq2index = {}; #accession -> index in order of first appearance
+my $genome2index = {}; #genome -> index in order of first appearance
+my $anchornum=-1; #incremented on every 'a score=' line, so the first block is 0
+while(my $line=<STDIN>){
+ if($line =~ /a\s+score=([\d\.\-]+)/){
+ $anchornum++;
+ }
+ elsif($line =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/){
+ my $accession = $1; #Must be formatted as Genome.Sequence
+ my $start = $2;
+ my $len = $3;
+ my $orientation = $4;
+ my $end;
+ #For '-' the interval is taken to extend backwards from start
+ #NOTE(review): confirm that start-len-1 matches the MAF minus-strand convention
+ if($orientation eq '-'){
+ $end = $start-$len-1;
+ }
+ else{
+ $end = $start+$len;
+ }
+ my $seqlength = $5;
+ my $sequence;
+ my $genome;
+ #Genome is everything before the first '.', sequence is the rest
+ if($accession =~ /([^\.]+)\.(\S+)/){
+ $genome=$1;
+ $sequence=$2;
+ }
+ else{
+ die "Accession not in Genome.Sequence format";
+ }
+ #Store index for this accession if first time we've seen it
+ if(!exists $seq2index->{$accession}){
+ $seq2index->{$accession} = scalar(keys %$seq2index);
+ }
+ if(!exists $genome2index->{$genome}){
+ $genome2index->{$genome} = scalar(keys %$genome2index);
+ }
+ #A second 's' line for the same accession in the same block overwrites the
+ #values stored by the first one
+ $anchors->{$anchornum}->{$accession}->{'gidx'} = $genome2index->{$genome};
+ $anchors->{$anchornum}->{$accession}->{'sidx'} = $seq2index->{$accession};
+ #start/end are stored ordered so that start <= end
+ $anchors->{$anchornum}->{$accession}->{'start'} = ($start<$end ? $start:$end);
+ $anchors->{$anchornum}->{$accession}->{'end'} = ($start>$end ? $start:$end);
+ $anchors->{$anchornum}->{$accession}->{'orient'} = $orientation;
+ die "Can't find in $anchornum,$accession" if(! exists $anchors->{$anchornum}->{$accession});
+ $seq2anchors->{$accession}->{$anchornum}++;
+ }
+ else{
+ }
+#Foreach sequence, sort anchors by coordinate and print distance between adjacent coords
+foreach my $accession (sort {$a cmp $b} (keys %$seq2index)){
+ my @sortedanchors = sort {$anchors->{$a}->{$accession}->{'start'} <=> $anchors->{$b}->{$accession}->{'start'}} (keys %{$seq2anchors->{$accession}});
+ my $genome;
+ if($accession =~ /([^\.]+)\.(\S+)/){
+ $genome=$1;
+ }
+ else{
+ die "Accession not in Genome.Sequence format";
+ }
+ #Walk over neighbouring anchors; a sequence with a single anchor prints nothing
+ for(my $i=0;$i<scalar(@sortedanchors)-1;$i++){
+ my $a1 = $sortedanchors[$i];
+ my $a2 = $sortedanchors[$i+1];
+ die "Can't find in $a1,$accession" if(! exists $anchors->{$a1}->{$accession});
+ die "Can't find in $a2,$accession" if(! exists $anchors->{$a2}->{$accession});
+ my $dist = &getDistance($anchors->{$a1}->{$accession},$anchors->{$a2}->{$accession});
+ #getDistance currently returns an absolute value, so the two $dist<0 checks
+ #below do not trigger
+ print STDERR "Bad coords Accession:$accession a1:$a1 $anchors->{$a1}->{$accession}->{'start'} - $anchors->{$a1}->{$accession}->{'end'} a2:$a2 $anchors->{$a2}->{$accession}->{'start'} - $anchors->{$a2}->{$accession}->{'end'}\n" if($dist < 0);
+ print STDERR "Genome $genome $genome2index->{$genome} missing" if(!exists $genome2index->{$genome});
+ $dist = 0 if($dist<0);
+ #One space-separated output line per adjacent pair
+ print $a1," ",$a2," ", #Anchors
+ $seq2index->{$accession}," ", #Seqindex
+ $dist," ", #Distance between anchors
+ $genome2index->{$genome}," ", #Genomeindex
+ $anchors->{$a1}->{$accession}->{'orient'}," ",$anchors->{$a2}->{$accession}->{'orient'}," ", #Orientation
+ $anchors->{$a1}->{$accession}->{'start'}," ",$anchors->{$a2}->{$accession}->{'start'}," ", #Start coords of anchor1, anchor2
+ $anchors->{$a1}->{$accession}->{'end'}," ",$anchors->{$a2}->{$accession}->{'end'}," ", #End coords of anchor1, anchor2
+ "\n";
+ }
+#Distance between two anchors on the same sequence.
+# $anchors1,$anchors2 - hash refs with keys 'start','end','orient'
+#Returns abs(start of anchor2 - end of anchor1).
+#The orientation-aware code after the first return is never executed; it is
+#kept as the intended (currently disabled) implementation.
+sub getDistance{
+ my($anchors1,$anchors2) = @_;
+ return (abs($anchors2->{'start'} - $anchors1->{'end'}));
+ #Short circuit for now: everything below this line is unreachable
+ if($anchors1->{'orient'} eq '-' && $anchors2->{'orient'} eq '-'){
+ # <e----s| <e----s|
+ return $anchors2->{'end'} - $anchors1->{'start'};
+ }
+ elsif($anchors1->{'orient'} eq '-' && $anchors2->{'orient'} eq '+'){
+ # <e---s| |s---e>
+ return $anchors2->{'start'} - $anchors1->{'start'};
+ }
+ elsif($anchors1->{'orient'} eq '+' && $anchors2->{'orient'} eq '-'){
+ # |s---e> <e---s|
+ return $anchors2->{'end'} - $anchors1->{'end'};
+ }
+ elsif($anchors1->{'orient'} eq '+' && $anchors2->{'orient'} eq '+'){
+ # |s---e> |s---e>
+ return $anchors2->{'start'} - $anchors1->{'end'};
+ }
+ else{
+ die "Bad orientations $anchors1->{'orient'} && $anchors2->{'orient'}";
+ return -1;
+ }
diff --git a/mapping/AlignmentTree.pm b/mapping/AlignmentTree.pm
new file mode 100644
index 0000000..47e5acf
--- /dev/null
+++ b/mapping/AlignmentTree.pm
@@ -0,0 +1,1476 @@
+package AlignmentTree;
+#AlignedIntervalTree is an interval tree with the additions that
+#stored intervals 1) may contain a correspondence map, such as an
+#alignment and 2) can be oriented for DNA sequences.
+#The data structure supports retrieval of corresponding,aligned intervals
+#The data structure also supports discontinuous intervals
+#Each interval in the structure is associated with a single coordinate
+#system or sequence and has an orientation '+','-'
+#The data structure used to represent an interval and an alignment is
+# [seqname2,start2,end2,orientation2,cigarstring2,tag2_0,...tag2_N],...,]
+#Represented in the code as $alignobj = [$alni_1,$alni_2];
+#insert(interval/alignment) - insert an interval or alignment (a series of mapped intervals)
+#find(seq,start,end) - retrieve intervals that overlap start,end on seq
+#intersect(seq,start,end) - retrieve corresponding,aligned intervals that overlap start,end on seq
+#map(seq,start,end) - retrieve intervals that overlap the range specified by any intersecting intervals
+# intersecting intervals are obtained if there exists an alignment that spans start,end on seq
+# Definitions
+# interval
+# alignment - a series of mapped intervals
+use strict;
+use Math::Random qw(random_uniform);
+use POSIX qw(ceil floor);
+use IntervalTree;
+use Bit::Vector;
+use Storable qw(store retrieve);
+#remove only using for translation machinery and revcom
+use Bio::Perl;
+use Bio::DB::Fasta;
+use Bio::Seq;
+use Bio::Tools::CodonTable;
+#Let Storable serialize code references (Deparse) and rebuild them with eval
+#when a stored tree is retrieved (Eval)
+#NOTE(review): with Eval enabled, retrieving a file runs the code stored in
+#it, so only load index files from a trusted source
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+#Default for the per-object _debug flag set in new()
+my $DEBUG=0;
+my $QCCHECKS=0;
+my $BITV_SIZE=10000000; #10MB largest single aligned region
+my $aligntoken="WGA";
+#Disallow more than one genetic segment per alignment
+my $nodups=0;
+#Constructor. Returns an empty AlignmentTree blessed into the class given as
+#first argument. Takes no further arguments.
+sub new{
+ my($classname) = @_;
+ #Width of the two tag bit masks
+ my $maxbits = 1000;
+ my $self = {
+ _itrees => {},
+ _alignments => {}, #Saved as [alignref,bitvector,align_width]
+ #Support for filtering output using a phylogenetic profile of genomes
+ #Implemented using bitmasks
+ _maxbits => $maxbits,
+ _doremoveoverlaps => 0,
+ _bits => 0, #next free bit index for a new tag
+ _bitlookup => {}, #tag -> bit index
+ _bitmask => Bit::Vector->new($maxbits),
+ _defaultmask => Bit::Vector->new($maxbits),
+ _debug => $DEBUG,
+ };
+ return bless($self,$classname);
+#Write the tree to $file with Storable::store and return its result.
+#The two tag masks are additionally saved in their string (Enum) form under
+#_bitmaskstr and _defaultmaskstr so that deserialize() can rebuild them.
+sub serialize{
+ my($self,$file) = @_;
+ $self->{_bitmaskstr} = $self->{_bitmask}->to_Enum();
+ #Save the default mask itself; this used to store a second copy of _bitmask,
+ #so the default mask could not be recovered from the file
+ $self->{_defaultmaskstr} = $self->{_defaultmask}->to_Enum();
+ return Storable::store($self,$file);
+#Load a tree written by serialize() from $file and return it.
+#Called as a plain function with the file name as only argument.
+#Both tag masks are rebuilt as new bit vectors of _maxbits bits from the
+#string forms stored in the file.
+sub deserialize{
+ my($file) = @_;
+ my $atree = Storable::retrieve($file);
+ $atree->{_bitmask} = new Bit::Vector($atree->{_maxbits});
+ $atree->{_bitmask}->from_Enum($atree->{_bitmaskstr});
+ $atree->{_defaultmask} = new Bit::Vector($atree->{_maxbits});
+ #Restore the default mask from its own string; this used to read _bitmaskstr.
+ #Fall back to _bitmaskstr for files that carry no _defaultmaskstr.
+ my $defaultmaskstr = defined($atree->{_defaultmaskstr}) ? $atree->{_defaultmaskstr} : $atree->{_bitmaskstr};
+ $atree->{_defaultmask}->from_Enum($defaultmaskstr);
+ return $atree;
+#Require output contains one or more tags
+#An example of a tag is a genome name
+sub filter{
+ #Arguments after $self are tags. Each tag's bit in the filter mask (_bitmask)
+ #is inverted; a tag seen for the first time is given the next free bit index.
+ #Afterwards _bitmask is replaced by the union of _defaultmask and _bitmask.
+ #NOTE(review): flip() inverts the bit, so passing the same tag a second time
+ #switches it off again unless that bit is also set in _defaultmask
+ my($self) = shift;
+ foreach my $tag (@_){
+ if(!exists $self->{_bitlookup}->{$tag}){
+ $self->{_bitlookup}->{$tag} = $self->{_bits}++;
+ }
+ $self->{_bitmask}->flip($self->{_bitlookup}->{$tag});
+ }
+ $self->{_bitmask}->Union($self->{_defaultmask},$self->{_bitmask});
+sub clear_filter{
+ #Clear every bit of the filter mask (_bitmask). _defaultmask and the
+ #tag-to-bit lookup are left untouched.
+ my($self) = shift;
+ $self->{_bitmask}->Empty();
+#Insert an interval or alignment
+# {[seqname,start,end,orientation,cigarstring,tag0,...tagN]},
+# uniquename,
+# tags
+# )
+#A unique identifier for the alignment, uniquename, must be provided
+#seqname - must be a uniquename for the coordinate system containing interval [start,end]
+#start - beginning of interval 0-based
+#end - end of interval 0-based
+#orientation - 2
+#cigarstring - in the UCSC format (#M#S#I#D#X) indicates the continuity of the alignment over the interval
+#tag0...tagN - zero or more tags that can be used by filtering functions. Tags can be specified on either the alignment or the interval
+#Intervals and alignments are stored in a consistent manner. An
+#alignment is a set of intervals with a correspondence map. Single
+#annotated intervals, like genes, are stored as an alignment
+#with only a single interval. The correspondence map is an identity
+#map in this case.
+#Insert an alignment (an array ref of one or more intervals) under $name.
+# $alignmentref - array ref; each element is an array ref
+# [seqname,start,end,orientation,cigarstring,tag0,...tagN]
+# $name - unique name of the alignment; an existing entry with the same
+# name is overwritten
+# @tags - zero or more tags that apply to the whole alignment
+#Dies when $alignmentref or one of its elements is not a reference, when the
+#cigar strings of the intervals disagree on the column count, or when an
+#orientation is neither '+' nor '-' after normalization.
+#Numeric orientations are rewritten in place in the caller's data:
+#values greater than 0 become '+', all others '-'.
+sub insert{
+ #The parameter list read ", at tags" (mangled "@tags"); the body below uses @tags
+ my($self,$alignmentref,$name,@tags) = @_;
+ my $genomelookup = {};
+ #Tag bits of this alignment
+ #NOTE(review): fixed at 1000 bits; there is no check that the tag indexes
+ #handed out through _bits stay below that size
+ my $alignment_bv = new Bit::Vector(1000);
+ #Column count shared by all intervals that carry a cigar string
+ my $align_width = 0;
+ die "Bad alignment passed to insert($alignmentref). Alignment needs to be a ref to an array" if(!ref($alignmentref));
+ foreach my $align (@$alignmentref){
+ die "Bad alignment passed to insert($align). Alignment needs to be a ref to an array" if(!ref($align));
+ #print "INSERTING ",join(',',@$align),"\n";
+ my $seqname = $align->[0];
+ my $start = $align->[1];
+ my $end = $align->[2];
+ my $orientation = $align->[3];
+ if($align->[4]){
+ #Check that column count is consistent
+ my ($cigs,$columncount) = &get_cigs($align->[4]);
+ #The first cigar string seen defines the expected width
+ $align_width = $columncount if(!$align_width);
+ if($columncount != $align_width){
+ &printAlignmentDebug($alignmentref,\*STDERR);
+ die "Bad input. Mismatched column count $columncount in $align->[4], expecting $align_width";
+ }
+ }
+ #Normalize a numeric orientation to '+' or '-'
+ if($orientation =~ /\d/){
+ if($orientation>0){
+ $orientation = '+';
+ }
+ else{
+ $orientation = '-';
+ }
+ }
+ $align->[3]=$orientation;
+ die "Bad orient $orientation ".join(',',@$align)."\n" if($orientation ne '-' && $orientation ne '+');
+ #Store tags in bit vector
+ #Interval-level tags start at index 5 of the interval
+ for(my $i=5;$i<@$align;$i++){
+ my $tag = $align->[$i];
+ if(!exists $self->{_bitlookup}->{$tag}){
+ $self->{_bitlookup}->{$tag} = $self->{_bits}++;
+ }
+ $alignment_bv->Bit_On($self->{_bitlookup}->{$tag});
+ }
+ #One interval tree per sequence name, created on first use
+ if(!exists $self->{_itrees}->{$seqname}){
+ $self->{_itrees}->{$seqname} = new IntervalTree($start,$end,$name,$orientation);
+ }
+ else{
+ $self->{_itrees}->{$seqname}->insert($start,$end,$name,$orientation);
+ }
+ }
+ #Store tags in bit vector
+ #Alignment-level tags are also switched on in the default mask
+ foreach my $tag (@tags){
+ if(!exists $self->{_bitlookup}->{$tag}){
+ $self->{_bitlookup}->{$tag} = $self->{_bits}++;
+ }
+ #print STDERR "Adding tag $tag on $self->{_bitlookup}->{$tag} $self->{_defaultmask}\n";
+ $alignment_bv->Bit_On($self->{_bitlookup}->{$tag});
+ $self->{_defaultmask}->Bit_On($self->{_bitlookup}->{$tag});
+ }
+# print "Masks ",$self->{_defaultmask}->Norm()," ",$self->{_bitmask}->Norm(),"\n";
+ #The duplicate check is switched off by the leading "0 &&", so the else
+ #branch always runs
+ if(0 && exists $self->{_alignments}->{$name}){
+ print STDERR "Duplicate feature $name already stored. Skipping this one\n";
+ }
+ else{
+ $self->{_alignments}->{$name} = [$alignmentref,$alignment_bv,$align_width];
+ }
+#Find all intersecting alignments in interval (query.start,query.end) from query.seqname
+#Returns (start,end) coordinates on seqname of all matching alignments
+#returns [alignname,seqname,start,end,coverage,pid,queryorient,matchorient]
+#4-coverage is number of corresponding characters between start,end
+#5-pid is number of identical characters between start,end
+#6-queryorient is orientation of the matching aligned query interval query.seqname:query.start-query.end
+#7-matchorient is orientation of the matching aligned interval seqname:start-end
+#Intersect the genomic interval $qstart-$qend on $qseqname with the indexed alignments.
+#Only alignments whose name matches the pattern in $qtags[0] are reported.
+#For each such alignment the query is cropped to the aligned interval, converted to
+#alignment columns, and every row of the alignment is mapped back to genomic coordinates.
+#Returns a list of [$align_name,$mseq,$s,$e,$coverage,$pid,$queryorient,$morient]
+#where $coverage is $e-$s and $pid counts the columns in the mapped range that are
+#aligned in both the query row and $mseq.
+#Rows whose mapped interval has zero length (mapped into a gap) produce no result.
+#Returns an empty list when there is no interval tree for $qseqname.
+#Dies when one of the coordinate consistency checks below fails.
+sub intersect{
+ my($self,$qseqname,$qstart,$qend,@qtags) = @_;
+ my @results;
+ #$self->filter(@qtags);
+ if(exists $self->{_itrees}->{$qseqname}){
+ print "Querying $qseqname:$qstart,$qend with qtags $qtags[0]\n" if($self->{_debug});
+ #(1) Find all intersecting features on [$qstart,$qend]
+ #returns IntervalTree::intersect returns an array of interval names
+ my @alignments = $self->{_itrees}->{$qseqname}->intersect($qstart,$qend);
+ #Optionally remove fully nested intervals
+ if($self->{_doremoveoverlaps}){
+ @alignments = $self->removeOverlaps(\@alignments,$qseqname);
+ }
+ foreach my $align_name (@alignments){
+ die "Overlapping interval $align_name not found" if(! exists $self->{_alignments}->{$align_name});
+ my($alignobj,$alignment_bv,$align_width) = @{$self->{_alignments}->{$align_name}};
+ if($align_name =~ /$qtags[0]/){
+ print "Overlapping feature $align_name\n" if($self->{_debug});
+ print "MATCH $align_name query:$qseqname $qstart-$qend . Number of seqs ",scalar(@$alignobj),"\n" if($self->{_debug});
+ #(2) Crop interval [$qstart,$qend] to the alignment
+ my ($qmstart,$qmend,$queryorient) = &matchinginterval($alignobj,$qseqname,$qstart,$qend);
+ if(!defined $qmstart || !defined $qmend){
+ #Error condition
+ print "WARNING. print unexpected overlapping alignments for query $qstart,$qend on $qseqname\n";
+ foreach my $align_name2 (@alignments){
+ my($alignobj2,$alignment_bv2,$align_width2) = @{$self->{_alignments}->{$align_name2}};
+ foreach my $alni2 (@$alignobj2){
+ print "$align_name2 $alignobj2 ",join(' ',@$alni2),"\n";
+ }
+ }
+ die "Bad overlapping alignments";
+ }
+ #The cropped interval can never extend beyond the query
+ die if($qmstart<$qstart);
+ die if($qmend>$qend);
+ if($qmstart==$qmend){
+ next;
+ }
+ die "Invalid matching interval coords:$qmend-$qmstart from query $qstart-$qend\n" if($qmend<=$qmstart);
+ #$queryspancheck is set for debugging only; it is not read again in this routine
+ my $queryspancheck=0;
+ if($qstart == $qmstart && $qend == $qmend){
+ print "Alignment fully spans query\n" if($self->{_debug});
+ }
+ else{
+ if($qstart != $qmstart && $qend != $qmend){
+ print "Query fully spans alignment\n" if($self->{_debug});
+ $queryspancheck=1;
+ }
+ }
+ print "ISECT: $align_name QUERY:$qseqname $qstart-$qend mapped:$qmstart-$qmend len:",$qmend-$qmstart,"\n" if($self->{_debug});
+ #(3) Convert from genomic coords to alignment column. 1->alignment_width
+ my ($qcolumnstart,$qcolumnend,$querybv) = &coordstocolumn($alignobj,$qseqname,$qmstart,$qmend);
+ #$querybv stores a bitmatrix from $qcolumnstart-$qcolumnend indicating if sequence $seqname is aligned in the interval
+ print "MAPPED $qseqname:$qmstart-$qmend len:",$qmend-$qmstart,
+ " to column coords $qseqname:$qcolumnstart-$qcolumnend len:",$qcolumnend-$qcolumnstart+1,"\n" if($self->{_debug});
+ print "Transform $qseqname:$qmstart-$qmend to column coords $qcolumnstart-$qcolumnend\n" if($self->{_debug});
+ foreach my $alni (@$alignobj){
+ die "Invalid zero length matching interval $alni->[2]-$alni->[1]\n" if($alni->[2] - $alni->[1]<=0);
+ print "Converting $align_name $alni->[0]:$alni->[1]-$alni->[2] from $qseqname:$qmstart-$qmend using column coords $qcolumnstart-$qcolumnend\n" if($self->{_debug});
+ my ($mseq,$malign_start,$malign_end,$morient) = @$alni;
+ print "ALNI: $mseq,$malign_start,$malign_end,$morient\n" if($self->{_debug});
+ die if(@$alni>5);
+ #(4) Crop aligned feature. Convert back from alignment column to genomic coords on $mseq
+ #$currbv stores a bitmatrix from $qcolumnstart-$qcolumnend indicating if sequence $mseq is aligned in the interval
+ my($s,$e,$currbv) = &columntocoords($alni,$qcolumnstart,$qcolumnend,$querybv);
+ #if($mseq eq $qseqname && $nodups){
+ #die if($s != $qmstart);
+ #die if($e != $qmend);
+ #die if($morient ne $queryorient);
+ #}
+ #Check the actual number of aligned columns
+ my $pid=0;
+ #if($mseq eq $qseqname && $nodups){
+ #my ($qs1,$qe1) = &coordstocolumn($alignobj,$qseqname,$s,$e);
+ #print "Checking for matching characters between col:$qcolumnstart-$qcolumnend $qs1-$qe1 coords:$s-$e\n" if($self->{_debug});
+ #}
+ #Count the columns of the mapped range that are aligned in both the query row and $mseq
+ my $intersectbv = new Bit::Vector($querybv->Size());
+ $intersectbv->Intersection($querybv,$currbv);
+ for(my $i=$qcolumnstart;$i<=$qcolumnend;$i++){
+ if($intersectbv->bit_test($i)==1){
+ $pid++;
+ }
+ }
+ print "Intersect matches: ",$intersectbv->Norm(),"\n" if($self->{_debug});
+ print "Query matches: ",$querybv->Norm(),"\n" if($self->{_debug});
+ print "Matches in the interval $qcolumnstart-$qcolumnend:$pid\n" if($self->{_debug});
+ if($e-$s>0){
+ die "Bad pid" if($pid==0);
+ die "Invalid zero length matching interval $e-$s\n" if($e-$s<=0);
+ print "($qcolumnend-$qcolumnstart) - ($qmend-$qmstart)\n" if($self->{_debug});
+ my $numgaps_query = ($qcolumnend-$qcolumnstart+1) - ($qmend-$qmstart);
+ my $querypid = ($qcolumnend-$qcolumnstart+1) - $numgaps_query;
+ my $coverage = $e-$s;
+ print "Num_query_gaps=$numgaps_query\nNum_qry_matches=$querypid\nNum_hit_matches=$pid\n" if($self->{_debug});
+ die "$pid<1" if($pid<1);
+ die "$pid>$coverage" if($pid>$coverage);
+ print "RESULT: $align_name,$mseq,$s,$e,$coverage,$pid\n" if($self->{_debug});
+ #Intersect result is a $alni,$coverage,$pid
+ push @results,[$align_name,$mseq,$s,$e,$coverage,$pid,$queryorient,$morient];
+ }
+ else{
+ #Entirely contained within a gap
+ die "Bad pid" if($pid !=0);
+ die "Bad coordinates $align_name,$mseq,$s,$e\n" if($e<$s);
+ print "NORES: Skipping $align_name,$mseq,$s,$e. Mapped in a gap\n" if($self->{_debug});
+ }
+ }
+ }
+ else{
+ print "Skipping $align_name does not match $qtags[0]\n" if($self->{_debug});;
+ }
+ }
+ }
+ else{
+ #Skip, nothing to find
+ if($self->{_debug}){
+ print "Interal trees for ",join(',',keys %{$self->{_itrees}}),"\n";
+ print "No interval tree for sequence [$qseqname] $self->{_itrees}->{$qseqname}\n";
+ }
+ }
+ #$self->clear_filter();
+ return @results;
+#Map a coordinate (query.start,query.end) from query.seqname to
+#intersecting alignments on match_i.seqname..match_j.seqname with
+#coordinates (match_i.start,match_i.end...match_j.start,match_j.end)
+#returns [match_name,$mseq,$mstart,$mend,$mcoverage,align_name,seq_name,$coveraged]
+#Map the interval $qstart-$qend on $qseqname through the overlapping alignments
+#and report the features found on the aligned sequences.
+#Alignments are selected by intersect() with the file-level $aligntoken (with
+#overlap removal switched on for that call); features are selected with the tag "gene".
+#@qtags is accepted but not used in this routine.
+#Returns a list of
+#[$fname,$fseq,$fstart,$fend,$qspan,$align_name,$seq,$fcoverage,$ipid,$isectn,$qaln_orient,$aln_orient,$forient1]
+#where $qspan is the length on the query of the feature's column range and
+#$ipid counts the feature columns that are aligned in both the query row and $seq.
+#Returns an empty list when there is no interval tree for $qseqname.
+sub map{
+ my($self,$qseqname,$qstart,$qend,@qtags) = @_;
+ my @results;
+ if(exists $self->{_itrees}->{$qseqname}){
+ print "Finding WGA alignments on $qseqname,$qstart,$qend\n" if($self->{_debug});
+ #(1)Retrieve all the alignments on genomic coords qstart-qend
+ $self->{_doremoveoverlaps}=1;
+ my @isects = $self->intersect($qseqname,$qstart,$qend,$aligntoken);
+ $self->{_doremoveoverlaps}=0;
+ print "FOUND ", scalar(@isects)," alignments\n" if($self->{_debug});
+ #Currently assuming non overlapping alignments and the total
+ #coverage,pid must be less than the query length $qend-$qstart
+ my $totalqcoverage=0;
+ my $totalqid=0;
+ my $qcoverage=undef;
+ my $qmstart=undef;
+ my $qmend=undef;
+ my $qmorient=undef;
+ #(2)Determine the min-max spanning interval over all matching alignments to the query
+ #intersect() already provides the query interval [$qstart,$qend] crop to the overlapping alignment(s)
+ foreach my $isectn (@isects){
+ my($align_name,$seq,$start,$end,$coverage,$pid,$qorient,$orient) = @$isectn;
+ print "Looking for $qseqname in $align_name,$seq,$start,$end,$coverage,$pid\n" if($self->{_debug});
+ if($seq eq $qseqname && $end<=$qend && $start>=$qstart){
+ #Report the two values that are actually compared ($qmorient may still be undefined here)
+ die "Mismatched orient $qorient != $orient" if($qorient ne $orient);
+ die "$end>$qend" if($end>$qend);
+ die "$start<$qstart" if($start<$qstart);
+ if(defined $qmstart || defined $qmend){
+ print "#Duplicate $seq already found in $align_name. Multiple alignments spanning query\n" if($self->{_debug});
+ $qcoverage=$coverage;
+ $totalqcoverage+=$coverage;
+ $totalqid+=$pid;
+ #Widen the spanning interval to include this alignment
+ $qmstart=$start<$qmstart ? $start : $qmstart;
+ $qmend=$end>$qmend ? $end : $qmend;
+ if(defined $qmorient && $orient ne $qmorient){
+ #print "WARNING multiple matching alignments to $qseqname,$qstart,$qend with inconsistent orientations. $align_name:$orient ne $qmorient\n";
+ $qmorient='?';
+ }
+ else{
+ $qmorient=$orient;
+ }
+ }
+ else{
+ die if(defined $qmorient);
+ $qcoverage=$coverage;
+ $totalqcoverage+=$coverage;
+ $totalqid+=$pid;
+ $qmstart=$start;
+ $qmend=$end;
+ $qmorient=$orient;
+ }
+ }
+ }
+ #
+ #(3)Map the spanning interval [$qmstart,$qmend] to the rest of the sequences in the alignment
+ foreach my $isectn (@isects){
+ die if(!defined $qcoverage);
+ die if(!defined $qmstart || !defined $qmend);
+ my($align_name,$seq,$start,$end,$coverage,$pid,$qaln_orient,$aln_orient) = @$isectn;
+ my($alignobj,$alignment_bv,$align_width) = $self->getAlignment($align_name);
+ my($qfstart,$qfend,$qforient) = &matchinginterval($alignobj,$qseqname,$qmstart,$qmend);
+ my ($qfscolumnstart,$qfscolumnend,$fsquerybv) = &coordstocolumn($alignobj,$qseqname,$qfstart,$qfend);
+ #print "#Mapping with alignment $align_name $seq $start-$end cov:$coverage,pid:$pid,qaln_orient:$qaln_orient,aln_orient:$aln_orient\n";
+ #Query coverage should correspond to interval start-end
+ die if($coverage != ($end-$start));
+ die "$qaln_orient ne $qmorient" if($qmorient ne '?' && $qaln_orient ne $qmorient);
+ print "$align_name:$seq $start,$end ",$end-$start," query_start:$qstart query_end:$qend query_coverage:$qcoverage ",$qcoverage/($qend-$qstart),"\n" if($self->{_debug});
+ #(4)Find features in the mapped interval
+ my @misects = $self->intersect($seq,$start,$end,"gene");
+ foreach my $fisectn (@misects){
+ my($fname,$fseq,$fstart,$fend,$fcoverage,$fpid,$forient1,$forient2) = @$fisectn;
+ #Need intersection of $start,$fend $qmstart,$qmend to get proper $pid and $cov
+ my ($fscolumnstart,$fscolumnend,$fsbv) = &coordstocolumn($alignobj,$seq,$fstart,$fend);
+ #die "$fscolumnstart != $qfscolumnstart" if($fscolumnstart != $qfscolumnstart);
+ #die "$fscolumnend != $qfscolumnend" if($fscolumnend != $qfscolumnend);
+ #Count the feature columns that are aligned in both the query row and $seq
+ my $ipid=0;
+ my $intersectbv = new Bit::Vector($fsquerybv->Size());
+ $intersectbv->Intersection($fsquerybv,$fsbv);
+ for(my $i=$fscolumnstart;$i<=$fscolumnend;$i++){
+ if($intersectbv->bit_test($i)==1){
+ $ipid++;
+ }
+ }
+ die "Bad number of matching columns $ipid>($fend-$fstart) $fscolumnstart-$fscolumnend" if($ipid>($fend-$fstart));
+ die if($forient1 ne $forient2);
+ die if($seq ne $fseq);
+ if($fseq eq $qseqname && $nodups){
+ die "$fstart<$qmstart query:$seq,$start,$end $fname,$fseq,$fstart,$fend,$fcoverage,$fpid" if($fstart<$qmstart);
+ die "$fend>$qmend query:$seq,$start,$end $fname,$fseq,$fstart,$fend,$fcoverage,$fpid" if($fend>$qmend);
+ }
+ print "Adding result $fname,$fseq,$fstart,$fend,$fcoverage,$align_name,$seq,$qcoverage,$fpid\n" if($self->{_debug});
+ #push @results,[$fname,$fseq,$fstart,$fend,$coverage,$align_name,$seq,$fcoverage,$ipid,$isectn,$qaln_orient,$aln_orient,$forient1];
+ #Determine span on query
+ my ($qfsstart,$qfsend) = &columntocoords($self->getAlignedInterval($align_name,$qseqname,$qfstart,$qfend),$fscolumnstart,$fscolumnend);
+ push @results,[$fname,$fseq,$fstart,$fend,$qfsend-$qfsstart,$align_name,$seq,$fcoverage,$ipid,$isectn,$qaln_orient,$aln_orient,$forient1];
+ }
+ print "Finished mapping alignment $align_name\n" if($self->{_debug});
+ }
+ #The totals are clamped to the query length; they are not part of the returned list
+ if($totalqcoverage>($qend-$qstart)){
+#TODO die on bad coverage die "Bad coverage $totalqcoverage>($qend-$qstart) ".($qend-$qstart) if($totalqcoverage>($qend-$qstart));
+ $totalqcoverage=($qend-$qstart);
+ }
+ if($totalqid>($qend-$qstart)){
+#TODO die "Bad identity ($totalqid>($qend-$qstart) " if($totalqid>($qend-$qstart));
+ $totalqid = ($qend-$qstart);
+ }
+ }
+ return @results;
+#TODO, optimize this retrieval
+sub matchinginterval{ #Crop the query interval $qstart-$qend to the interval stored for $qseqname in $alignobj
+ my($alignobj,$qseqname,$qstart,$qend) = @_; #$alignobj: reference to an array of rows read here as [seqname,start,end,orient]
+ my $start=undef;
+ my $end=undef;
+ my $orient=undef;
+ print "QUERYING $alignobj for $qseqname $qstart-$qend\n" if($DEBUG);
+ foreach my $alni (@$alignobj){
+ if($alni->[0] eq $qseqname){
+ print "HIT on $alni->[0] query=$qseqname:$qstart-$qend ; interval=$qseqname:$alni->[1]-$alni->[2] $alni->[3]\n" if($DEBUG);
+ if(($qstart < $alni->[1] && $qend < $alni->[1]) || ($qstart > $alni->[2] && $qend > $alni->[2])){ #Query lies entirely before or entirely after this row
+ if($nodups){
+ print "WARNING: Invalid matching interval. Alignment interval $alni->[0]:$alni->[1]-$alni->[2] not contained in interval $qseqname:$qstart-$qend\n";
+ &printAlignmentDebug($alignobj);
+ #return ($start,$end,$orient);
+ }
+ next; #Keep looking at later rows with the same sequence name
+ }
+ $start = $qstart < $alni->[1] ? $alni->[1] : $qstart; #Later of the two start coordinates
+ $end = $qend < $alni->[2] ? $qend : $alni->[2]; #Earlier of the two end coordinates
+ if(defined $orient && $orient ne $alni->[3]){ #NOTE(review): the `last` below leaves the loop after the first hit, so $orient appears to be always undefined here - confirm
+ print "WARNING multiple matching alignments to $qseqname,$qstart,$qend with inconsistent orientations. $orient ne $alni->[3]\n";
+ &printAlignmentDebug($alignobj);
+ die "Multiple copies of a sequence per alignment not supported";
+ }
+ else{
+ $orient = $alni->[3];
+ }
+ last; #Only the first overlapping row is used
+ }
+ else{
+ print "Checked $alni->[0] in obj size ",scalar(@$alignobj),"\n" if($DEBUG);
+ }
+ }
+ die "Cannot find $qseqname,$qstart,$qend in alignment. Returned ($start,$end,$orient)" if(!defined $start || !defined $end); #No row named $qseqname overlapped the query
+ return ($start,$end,$orient); #Cropped interval and the orient of the row it came from
+#Determine column indices in an alignment matrix
+#that correspond to interval $coord1-$coord2 on sequence $qseqname
+#The alignment matrix is assumed to have one sequence per row with no
+#sequence appearing more than once
+#The format is
+#@$alignobj = [seqname,start,end,orient,cigar]
+#start<end is specified in 0 start interbase coordinates
+#cigar specifies the continuity of the alignment of the interval
+#when orient=='-' the cigar string specifies the alignment end->start
+#otherwise the cigar string specifies the alignment start->end
+#Column coordinates are 1 start, numbering bases/columns in an alignment matrix
+#Genomic coordinates are 0 start, interbase
+sub coordstocolumn{ #Convert the genomic interval $coord1-$coord2 on $qseqname to alignment columns; returns ($columnstart,$columnend,$querybv)
+ my($alignobj,$qseqname,$coord1,$coord2,$skipbv) = @_; #A true $skipbv suppresses the bit vector; $querybv is then returned undefined
+ die "Expecting 0 start, interbase coordinates $coord1<=$coord2" if($coord1>=$coord2);
+ #Column position in the alignment
+ #Starting from position 1
+ my $columnstart;
+ my $columnend;
+ #Bit vector keeps track of aligned columns
+ #Starting at column 1, column index 0 is ignored
+ my $querybv;
+ if(! $skipbv){
+ $querybv = new Bit::Vector($BITV_SIZE); #setting max aligned interval at 10MB
+ }
+ my $alnwidth; #Set from the cigar column count below; not read again in this routine
+ foreach my $alni (@$alignobj){
+ if($alni->[0] eq $qseqname){
+ if($coord1<$alni->[1] || $coord1>$alni->[2]){
+ if($nodups){
+ &printAlignmentDebug($alignobj);
+ die "Start position $coord1 is not contained in interval $qseqname:$alni->[1]-$alni->[2]";
+ }
+ next; #Without $nodups, keep looking at later rows with the same name
+ }
+ if($coord2<$alni->[1] || $coord2>$alni->[2]){
+ if($nodups){
+ &printAlignmentDebug($alignobj);
+ die "End position $coord2 is not contained in interval $qseqname:$alni->[1]-$alni->[2]";
+ }
+ next; #Without $nodups, keep looking at later rows with the same name
+ }
+ my $offsetstart;
+ my $offsetend;
+ my $orient = $alni->[3];
+ print "COORDSTOCOLUMN for seq $qseqname and coordinates $coord1-$coord2 and orient $orient\n" if($DEBUG);
+ die "Bad orient: $orient\n" if($orient ne '-' && $orient ne '+');
+ #$alignobj is a collinear segment
+ #Determine offsets into the aligned segment
+ #$alni->[1] |-----------------| $alni->[2]
+ #coord1 |------| coord2
+ #+ strand
+ # ------| offsetstart
+ # -------------| offsetend
+ #Cigar string is relative to $alni->[1] --> $alni->[2]
+ #Columnstart,end are relative to $alni->[1]
+ #- strand
+ # |----- offsetstart
+ # |------------ offsetend
+ #Cigar string is relative to $alni->[2] --> $alni->[1]
+ #Columnstart,end are relative to $alni->[2]
+ if($orient eq '-'){
+ #offset is from end ($coord2) of segment match
+ # A T G C A T
+ #0 1 2 3 4 5 6
+ # |---------| i->[1] -> i[2] == 1-6
+ # |-----| coord1 -> coord2 == 2-5
+ # * offsetstart = 6-5+1 == 2; column 2 in the alignment matrix
+ # X X M M M X alignment matrix
+ # * offsetend = 6-2 == 4; column 4 in the alignment matrix
+ #0 1 2 3 4 5 6
+ # A T G C A T
+ $offsetstart = $alni->[2] - $coord2+1;
+ $offsetend = $alni->[2] - $coord1;
+ die "$offsetstart = $alni->[2] - $coord2+1; $offsetend = $alni->[2] - $coord1;" if($offsetend < $offsetstart);
+ }
+ else{
+ #offset is from beginning ($coord1) of segment match
+ $offsetstart = $coord1 - $alni->[1]+1;
+ $offsetend = $coord2 - $alni->[1];
+ die "$offsetstart < $offsetend " if($offsetend < $offsetstart);
+ }
+ die if($offsetstart<1); #Offsets are 1-based positions within the row
+ die if($offsetend>($alni->[2]-$alni->[1])); #and cannot exceed the length of the row
+ #If cigar string
+ if($alni->[4]){
+ print "coordstocolumn using cig $alni->[0] $alni->[4]\n" if($DEBUG);
+ print "Looking for offset $offsetstart-$offsetend:$orient in match $alni->[1]-$alni->[2] of length ",
+ $alni->[2]-$alni->[1]," orient:$orient\n" if($DEBUG);
+ my ($cigs,$columncount) = &get_cigs($alni->[4]);
+ $alnwidth=$columncount;
+ die "$offsetend>$columncount. Check cigar string $alni->[4], appears to be incorrect length for $coord1-$coord2" if($offsetend>$columncount);
+ my $currcount2=0; #First pass: mark every 'M' run of the whole cigar in $querybv
+ foreach my $c2 (@$cigs){
+ my($count2,$char2) = @$c2;
+ if(!$skipbv && $char2 eq 'M'){
+ #|* |$currcount2
+ #| * |$currcount2+1
+ #| * |$currcount2+$count2
+ #| MMMXX |3M,$count2==3
+ #| 11100 |bitvector
+ $querybv->Interval_Fill($currcount2+1,$currcount2+$count2);
+ }
+ $currcount2+=$count2;
+ }
+ my $matches=0; #Second pass: $matches counts aligned bases seen so far, $currcount counts columns seen so far
+ my $currcount=0;
+ my $foundstart=0;
+ my $foundend=0;
+ foreach my $c (@$cigs){
+ my($count,$char) = @$c;
+ if($char eq 'M'){
+ #0 1 2 3 4 5 6
+ # |---------| i->[1] -> i[2] == 1-6
+ # * offsetstart
+ # * offsetend
+ # X X M M M X alignment matrix
+ # * matches
+ #0 1 2 3 4 5 6
+ # A T G C A T
+ if($count+$matches>=$offsetstart){
+ if(!$foundstart){
+ print "FOUNDSTART $currcount $matches $count$char\n" if($DEBUG>1);
+ $columnstart = $currcount+($offsetstart-$matches);
+ print "columnstart=$columnstart\n" if($DEBUG>1);
+ $foundstart=1;
+ }
+ }
+ if($count+$matches>=$offsetend){
+ if(!$foundend){
+ print "FOUNDEND $currcount $matches $count$char\n" if($DEBUG>1);
+ #$columnend=$currcount+($offsetend-$matches)-1;
+ $columnend=$currcount+($offsetend-$matches);
+ print "columnend=$columnend\n" if($DEBUG>1);
+ $foundend=1;
+ last;
+ }
+ }
+ $matches+=$count;
+ $currcount+=$count;
+ }
+ elsif($char eq 'X'){ #NOTE(review): runs other than M and X (get_cigs also accepts I,D,G) do not advance $currcount here - confirm this is intended
+ $currcount+=$count;
+ }
+ }
+ die "Could not find start or end " if(!$foundstart || !$foundend);
+ die if($columnstart<1);
+ die if($columnend>$columncount);
+ }
+ else{
+ #No cigar string
+ #Assume interval aligns at its entire length
+ $columnstart=$offsetstart;
+ $columnend=$offsetend;
+ if(!$skipbv){
+ $querybv->Interval_Fill($columnstart,$columnend);
+ }
+ }
+ last; #Only the first row that contains both coordinates is used
+ }
+ }
+ if(!defined $columnstart){
+ #$columnstart=1;
+ }
+ if(!defined $columnend){
+ #$columnend=$alnwidth;
+ }
+ die "Can't map $alignobj,$qseqname,$coord1,$coord2 to $columnstart-$columnend" if(!defined $columnstart || !defined $columnend); #No row named $qseqname contained the interval
+ return ($columnstart,$columnend,$querybv); #1-based closed column range plus the aligned-column bit vector
+#Maps the alignment columns $columnstart-$columnend to genomic coordinates
+#In the case where the specified columns map to gaps, the coordinate corresponding to the
+#next matching column is returned
+#Column coordinates are 1 start, numbering bases,gaps/columns in an alignment matrix
+#Genomic coordinates are 0 start, interbase
+#Arguments:
+# $aln one alignment row [seqname,start,end,orient,cigar]
+# $columnstart,$columnend 1-based closed column range
+# $querybv optional Bit::Vector of the columns aligned in the query row;
+# defaults to a vector with exactly the requested columns set
+#Returns ($start,$end,$currbv). $start,$end are genomic coordinates on $aln,
+#swapped for '-' rows so that the smaller coordinate comes first.
+#$currbv marks the columns where $aln is aligned; it is only filled when $aln has a cigar.
+#A zero-length interval is returned when the start column lies in a gap and no later
+#column is aligned in both $aln and the query, or when that column lies past $columnend.
+#Dies on a bad orient, on columns outside the cigar, or when a sanity check fails.
+sub columntocoords{
+ my($aln,$columnstart,$columnend,$querybv) = @_;
+ die "Columnstart $columnstart must be >= 1. aln:$aln" if($columnstart<1);
+ #columnstart is start relative to beginning of alignment matrix
+ #columnend is end relative to beginning of alignment matrix
+ # -----| columnstart
+ # ------------| columnend
+ #Alignment $aln is in genomic coordinates
+ #start,end are mapped genomic coordinates for aln relative to offsets specified by $columnstart,$columnend
+ my $start;
+ my $end;
+ #Bit vector keeps track of aligned columns
+ #Starting at column 1
+ if(! defined $querybv){
+ $querybv = Bit::Vector->new($BITV_SIZE);
+ $querybv->Interval_Fill($columnstart,$columnend);
+ }
+ my $currbv = Bit::Vector->new($querybv->Size());
+ my $orient = $aln->[3];
+ die "Bad orient $orient\n" if($orient ne '-' && $orient ne '+');
+ #Check if there is a cigar string
+ print "Converting for alni with orient $orient\n" if($DEBUG);
+ if(length($aln->[4])>0){
+ print "columntocoords using cig $aln->[0] $aln->[4]\n" if($DEBUG > 1);
+ my ($cigs,$count) = &get_cigs($aln->[4]);
+ die "Columnstart $columnstart must be >= 1. aln:$aln" if($columnstart<1);
+ die "Columnend $columnend > cigar count $count. Check cigar string $aln->[4] for $aln->[1]-$aln->[2]" if($columnend>$count);
+ print "Looking for offset $columnstart-$columnend:$orient in match $aln->[1]-$aln->[2] of length ",$aln->[2]-$aln->[1]," orient:$orient\n" if($DEBUG > 1);
+ #$matches counts aligned bases seen so far, $currcount counts columns seen so far
+ my $matches=0;
+ my $currcount=0;
+ my $foundstart=0;
+ my $foundend=0;
+ my $startcount;
+ #First pass: mark every 'M' run of the cigar in $currbv
+ my $currcount2=0;
+ foreach my $c2 (@$cigs){
+ my($count2,$char2) = @$c2;
+ if($char2 eq 'M'){
+ $currbv->Interval_Fill($currcount2+1,$currcount2+$count2);
+ }
+ $currcount2+=$count2;
+ }
+ #Second pass: walk the cigar until the runs holding the start and end columns are reached
+ foreach my $c (@$cigs){
+ my($count,$char) = @$c;
+ print "Analyzing $count$char\n" if($DEBUG>1);
+ if($count+$currcount>=$columnstart){
+ if(!$foundstart){
+ print "FOUNDSTART $currcount $matches $count$char\n" if($DEBUG > 1);
+ if($char eq 'M'){
+ print "START IN MATCH $orient\n" if($DEBUG > 1);
+ #|----*------ columnstart
+ # --| currcount
+ # MMMM count columnstart-currcount=number of contributing matches in current cig
+ if($orient eq '-'){
+ $start = $aln->[2]-$matches-($columnstart-$currcount)+1;
+ }
+ else{
+ #Start is alignment start (s1) + matching columns + number of matches in current cigar
+ $start = $aln->[1]+$matches+($columnstart-$currcount)-1;
+ }
+ }
+ else{
+ #Report NEXT matching position
+ print "START IN GAP $orient\n" if($DEBUG > 1);
+ #No match to $->[0]; in gap
+ #Report next matching position
+ #in the alignment between query and current sequence
+ #(the next matching position past the gap)
+ #|----*------ columnstart
+ # --| currcount
+ # XXXX count
+ #Need to account for case where
+ #next matching position in current sequence is a gap in the query
+ #Use the bit vectors to find next matching position between current seq and query
+ my $intersectbv = new Bit::Vector($querybv->Size());
+ $intersectbv->Intersection($querybv,$currbv);
+ die if($QCCHECKS && $currbv->bit_test($currcount+1));
+ die if($QCCHECKS && $intersectbv->bit_test($currcount+1));
+ my($imin,$imax) = $intersectbv->Interval_Scan_inc($currcount+1);
+ my($cmin,$cmax) = $currbv->Interval_Scan_inc($currcount+1);
+ if(! defined $cmin || ! defined $imin){
+ print "INGAP Can't find matching position in $aln->[0] > columnstart:$columnstart. Returning no mapping\n" if($DEBUG > 1);
+ if($orient eq '-'){
+ $start = $aln->[2]-$matches;
+ }
+ else{
+ #Start is alignment start (s1) + matching columns
+ $start = $aln->[1]+$matches-1;
+ }
+ #return ($aln->[1],$aln->[1],$currbv);
+ return ($start,$start,$currbv);
+ }
+ if($imin>$columnend){
+ #|-------*MMMMMMM*-----------| Query columnstart -> columnend
+ # * * columnstart,columnend
+ # * cmin>columnend
+ #|----*XXXXXXXXXXXXXXMM------|
+ #
+ print "INGAP Alignment occurs entirely within a gap $cmin>$columnend\n" if($DEBUG > 1);
+ return ($aln->[1],$aln->[1],$currbv);
+ }
+ die "$imin,$imax $cmin,$cmax" if($imin<$cmin);
+ #num matching bits set between
+ #$imin-$cmin is number of matches in current seq until the
+ #next matching position in the query
+ my $nummatches=0;
+ if($cmin>0){
+ die "Bad match index after scan $cmin" if($QCCHECKS && $currbv->bit_test($cmin)!=1);
+ }
+ #TODO replace with Interval_Scan
+ for(my $i=$cmin;$i<=$imin;$i++){
+ if($currbv->bit_test($i)){
+ $nummatches++;
+ }
+ }
+ die if($cmin==0 && $nummatches!=0);
+ print "INGAP $nummatches matches until next match between query and $aln->[0] between columns $cmin-$imin\n" if($DEBUG > 1);
+ if($orient eq '-'){
+ $start = $aln->[2]-$matches-$nummatches+1;
+ }
+ else{
+ #Start is alignment start (s1) + matching columns
+ $start = $aln->[1]+$matches+$nummatches-1;
+ }
+ }
+ print "START genomic=$start\n" if($DEBUG > 1);
+ die if($start<$aln->[1]);
+ die if($start>$aln->[2]);
+ #$startcount is recorded here but not read again in this routine
+ $startcount = $currcount+($columnstart-$currcount);
+ $foundstart=1;
+ }
+ if($count+$currcount>=$columnend){
+ if(!$foundend){
+ print "FOUNDEND $currcount $matches $count$char\n" if($DEBUG > 1);
+ if($char eq 'M'){
+ print "END IN MATCH $orient\n" if($DEBUG > 1);
+ if($orient eq '-'){
+ $end = $aln->[2]-$matches-($columnend-$currcount);
+ }
+ else{
+ $end = $aln->[1]+$matches+($columnend-$currcount);
+ }
+ }
+ else{
+ #Report last matching position
+ print "END INGAP $orient\n" if($DEBUG > 1);
+ #Report last matching position
+ #Last matching position is defined by last overlapping M interval between
+ #query and current sequence
+ #The scan below only feeds the QC checks and debug output; $end does not depend on it
+ if($QCCHECKS){
+ my $intersectbv = new Bit::Vector($querybv->Size());
+ $intersectbv->Intersection($querybv,$currbv);
+ die if($currbv->bit_test($currcount+1));
+ die if($intersectbv->bit_test($currcount+1));
+ my($imin,$imax) = $intersectbv->Interval_Scan_dec($currcount+1);
+ my($cmin,$cmax) = $currbv->Interval_Scan_dec($currcount+1);
+ my $nummatches=0;
+ if(! defined $cmax || !defined $imax){
+ die "INGAP Can't find matching position in $aln->[0] < columnend:$columnend. No last matching position\n" if($DEBUG > 1);
+ }
+ else{
+ if($DEBUG>1){
+ for(my $i=$cmax;$i>=$imax;$i--){
+ if($currbv->bit_test($i)){
+ $nummatches++;
+ }
+ }
+ }
+ }
+ print "INGAP $nummatches until next match between query and $aln->[0] at column $cmax-$imax\n" if($DEBUG > 1);
+ }
+ if($orient eq '-'){
+ $end = $aln->[2]-$matches;
+ }
+ else{
+ $end = $aln->[1]+$matches;
+ }
+ }
+ print "END genomic=$end\n" if($DEBUG > 1);
+ die if($end>$aln->[2]);
+ die if($end<$aln->[1]);
+ $foundend=1;
+ last;
+ }
+ }
+ }
+ if($char eq 'M'){
+ $matches+=$count;
+ }
+ $currcount+=$count;
+ }
+ die "Could not find start or end " if(!$foundstart || !$foundend);
+ }
+ else{
+ #No cigar string: every column is assumed to be aligned
+ if($orient eq '-'){
+ #Columns count down from $aln->[2], matching the all-match case of the cigar branch above.
+ #The previous arithmetic ($aln->[2]-$columnend-1,$aln->[2]-$columnstart) came out with
+ #start>end after the swap below and one base off.
+ $start=$aln->[2]-$columnstart+1;
+ $end=$aln->[2]-$columnend;
+ }
+ else{
+ $start=$aln->[1]+$columnstart-1;
+ $end=$aln->[1]+$columnend;
+ }
+ }
+ #returning fmin<fmax
+ if($orient eq '-'){
+ ($start,$end) = ($end,$start);
+ }
+ return ($start,$end,$currbv);
+#Parse a run-length cigar string such as "3M2X" into its runs.
+#Returns (\@cigs,$columncount): @cigs holds one [count,char] pair per run in
+#string order and $columncount is the sum of all counts (0 for an empty string).
+#Dies when a run's operator is not exactly one of M,X,I,D,G or when its count
+#is not a plain unsigned integer.
+sub get_cigs {
+ my($cig) = @_;
+ #Splitting on the digits leaves an empty leading field, so the operator of run $i is $chars[$i+1]
+ my @chars = split /\d+/,$cig;
+ my @counts = split /[MXIDG]/,$cig;
+ my @cigs;
+ #Start at 0 so an empty cigar yields a numeric width rather than undef
+ my $columncount=0;
+ for(my $i=0;$i<@counts;$i++){
+ my $c = $chars[$i+1];
+ #A count with no operator after it leaves $c undefined
+ die "Invalid cigar str ".(defined $c ? $c : '')."\n" if(!defined $c || $c !~ /\A[MXIDG]\z/);
+ my $currcount = $counts[$i];
+ #Anchored so that stray characters around the digits are rejected
+ die "Invalid count $currcount\n" if($currcount !~ /\A\d+\z/);
+ push @cigs,[$currcount,$c];
+ $columncount+=$currcount;
+ }
+ return (\@cigs,$columncount);
+#return alnobj,bitvector,width
+#Look up the record stored for alignment $name.
+#Returns the elements of that record as a list; callers in this file unpack it as
+#(alignobj,bitvector,width). Prints a notice and returns undef when the name is unknown.
+#Dies when $name is a reference.
+sub getAlignment{
+    my($self,$name) = @_;
+    die "Bad alignment $name" if(ref $name);
+    #Guard clause: unknown names are reported and answered with undef
+    if(!exists $self->{_alignments}->{$name}){
+        print "Alignment $name not found\n";
+        return undef;
+    }
+    return @{$self->{_alignments}->{$name}};
+#If passed with no start/end, then returns first interval encountered
+#Assumes one interval per genome, per alignment
+#Return the row of alignment $align_name that belongs to $seqname.
+#Without $qstart and $qend the first row named $seqname is returned; otherwise
+#the first such row that is not entirely before or entirely after $qstart-$qend.
+#Prints a notice and returns undef when no row qualifies. Dies when $seqname is false.
+sub getAlignedInterval{
+    my($self,$align_name,$seqname,$qstart,$qend) = @_;
+    die if(!$seqname);
+    my $alignobj = $self->{_alignments}->{$align_name}->[0];
+    my $anyinterval = (!defined $qstart && !defined $qend);
+    foreach my $alni (@$alignobj){
+        next if($alni->[0] ne $seqname);
+        return $alni if($anyinterval);
+        #Disjoint means the query lies wholly to one side of the row
+        my $disjoint = (($qstart < $alni->[1] && $qend < $alni->[1]) || ($qstart > $alni->[2] && $qend > $alni->[2]));
+        return $alni if(!$disjoint);
+        print "#Non overlapping alignment on $seqname: $qstart < $alni->[1] && $qend < $alni->[1]) || ($qstart > $alni->[2] && $qend > $alni->[2]\n" if($self->{_debug});
+    }
+    print "#Can't find $seqname on alignment $align_name\n";
+    return undef;
+#Returns closed interval [$startcol,$endcol]
+#Build character matrices for columns $startcol-$endcol of alignment $align_name.
+#Arguments:
+# $startcol,$endcol column range; a false $startcol becomes 0 and a false $endcol the alignment width
+# $db hash reference from sequence name to an object that provides seq()
+# $ref optional sequence name; its row is swapped into first position of the STORED alignment
+# @seqs optional sequence names; when given, only these rows (plus $ref) are included
+#Returns ($retmatrix,$retseqmatrix,\@names,$mcount):
+# $retseqmatrix->[$i] the sequence characters of row $i, '-' where the row is not aligned
+# $retmatrix->[$i] row 0: '.' for aligned and '-' for unaligned columns;
+# other rows: '.' where the character equals row 0 (case-insensitive),
+# '-' where both are '-', otherwise the row's own character
+# @names row names in matrix order
+# $mcount->{$i} number of '.' positions of row $i; row 0 is fixed at 1000000000
+#Dies on an unknown alignment, an out-of-range column, or a sequence missing from $db.
+sub getAlignmentMatrix {
+ my($self,$align_name,$startcol,$endcol,$db,$ref,@seqs) = @_;
+ if(!$startcol){
+ $startcol=0;
+ }
+ die "Can't find alignment $align_name" if(!exists $self->{_alignments}->{$align_name});
+ my ($alignobj,$gv,$align_width) = @{$self->{_alignments}->{$align_name}};
+ die "Bad input columns $startcol-$endcol $startcol >$align_width || $endcol > $align_width" if($startcol >$align_width || $endcol > $align_width);
+ #populate alignment matrix
+ my $row=0;
+ my $matrix=[];
+ my $seqmatrix=[];
+ my @names;
+ my $alni;
+ my $namesfilter = {};
+ foreach my $s (@seqs){
+ $namesfilter->{$s}=1;
+ }
+ #Decided before $ref is added, so passing only $ref does not filter any rows
+ my $skipfilter = (scalar(keys %$namesfilter)>0) ? 0 : 1;
+ if($ref){
+ $namesfilter->{$ref}=1;
+ my $refi;
+ for(my $i=0;$i<(@$alignobj);$i++){
+ if($alignobj->[$i]->[0] eq $ref){
+ $refi=$i;
+ last;
+ }
+ }
+ #Swap the reference row to the top; leave the order alone when $ref is not in the alignment
+ if(defined $refi){
+ my $tmpi = $alignobj->[0];
+ $alignobj->[0] = $alignobj->[$refi];
+ $alignobj->[$refi] = $tmpi;
+ }
+ }
+ if(!$endcol){
+ $endcol=$align_width;
+ }
+ foreach my $alni (@$alignobj){
+ if($skipfilter || $namesfilter->{$alni->[0]}==1){
+ #my $seqobj = $db->get_Seq_by_id($alni->[0]);
+ my $seqobj = $db->{$alni->[0]};
+ #Check the lookup before calling a method on it
+ die "Can't find seq $alni->[0]\n" if(!$seqobj);
+ my $seq = $seqobj->seq();
+ my $matchcount=0;
+ my $column=1;
+ #my $mstr = '-'x ($align_width+1);
+ $matrix->[$row] = '-'x ($align_width+1);
+ #my $sstr = '-'x ($align_width+1);
+ $seqmatrix->[$row]='-'x ($align_width+1);
+ #push @names,$alni->[0];
+ $names[$row] = $alni->[0];
+ my ($cigs,$columncount) = &get_cigs($alni->[4]);
+ foreach my $c (@$cigs){
+ my($count,$char) = @$c;
+ if($char eq 'M'){
+ #Fill the run when it starts inside the requested range or the range starts inside the run
+ if(($column >= $startcol && $column <= $endcol)
+ ||
+ ($startcol >= $column && $startcol <= ($column+$count))){
+ #my $mmstr = '.' x $count;
+ #die if(length($mmstr)!=$count);
+ #substr($mstr,$column) = $mmstr;
+ substr($matrix->[$row],$column,$count,'.' x $count);# if($column >= $startcol && $column <= $endcol);
+ #die if($self->{_debug} && $alni->[1]>$alni->[2]);
+ #my $str;
+ #print "$alni->[1]+$matchcount,$alni->[1]+$matchcount+$count-1\n";
+ if($alni->[3] eq '+'){
+ #$str = $seqobj->subseq($alni->[1]+$matchcount+1,$alni->[1]+$matchcount+$count);
+ #$str = substr($seq,$alni->[1]+$matchcount,($alni->[1]+$matchcount+$count)-($alni->[1]+$matchcount)+1);
+ substr($seqmatrix->[$row],$column,$count,substr($seq,$alni->[1]+$matchcount,$count));# if($column >= $startcol && $column <= $endcol);
+ }
+ else{
+ #Note cigar always denotes offset from alignment start
+ #In '-' orient, cigar starts from $alni->[2]--->$alni->[1]
+ #$str = revcom($seqobj->subseq($alni->[2]-$matchcount-$count+1,$alni->[2]-$matchcount))->seq();
+ #$str = revcom(substr($seq,$alni->[2]-$matchcount-$count,($alni->[2]-$matchcount)-($alni->[2]-$matchcount-$count)+1));
+ #substr($sstr,$column,$count,revcom(substr($seq,$alni->[2]-$matchcount-$count,$count)));
+ substr($seqmatrix->[$row],$column,$count,revcom(substr($seq,$alni->[2]-$matchcount-$count,$count))->seq());# if($column >= $startcol && $column <= $endcol);
+ }
+ #die length($str)." != $count" if(length($str)!=$count);
+ #die if($self->{_debug} && length($str)!=length($mmstr));
+ #substr($sstr,$column,length($str)) = $str;
+ #die if($self->{_debug} && substr($sstr,$column,length($str)) ne $str);
+ }
+ $matchcount+=$count;
+ }
+ else{
+ #my $mmstr = '-' x $count;
+ #die if($self->{_debug} && length($mmstr)!=$count);
+ #substr($mstr,$column) = $mmstr;
+ if(($column >= $startcol && $column <= $endcol)
+ ||
+ ($startcol >= $column && $startcol <= ($column+$count))){
+ substr($matrix->[$row],$column,$count,'-' x $count);# if($column >= $startcol && $column <= $endcol);
+ }
+ }
+ $column+=$count;
+ }
+ #$matrix->[$row]=$mstr;
+ #$seqmatrix->[$row]=$sstr;
+ $row++;
+ }
+ }
+ die "Invalid range $startcol-$endcol" if($endcol < $startcol);
+ #Cut the requested column range out of every full-width row
+ my $retmatrix=[];
+ my $retseqmatrix=[];
+ for(my $i=0;$i<@$matrix;++$i){
+ $retmatrix->[$i] = substr($matrix->[$i],$startcol,$endcol-$startcol+1);
+ $retseqmatrix->[$i] = substr($seqmatrix->[$i],$startcol,$endcol-$startcol+1);
+ die "Bad sequence $retmatrix->[$i]" if(length ($retmatrix->[$i])<1);
+ die "Bad sequence $retseqmatrix->[$i]" if(length ($retseqmatrix->[$i])<1);
+ }
+ #remove same characters
+ #this is really going to really slow in perl this way
+ my $mcount;
+ for(my $i=1;$i<@$retmatrix;++$i){
+ my $m=0;
+ for(my $j=0;$j<length($retseqmatrix->[$i]);$j++){
+ my $topchar = uc(substr($retseqmatrix->[0],$j,1));
+ if(uc(substr($retseqmatrix->[$i],$j,1)) ne $topchar){
+ substr($retmatrix->[$i],$j,1) = substr($retseqmatrix->[$i],$j,1);
+ }
+ else{
+ if($topchar eq '-'){
+ substr($retmatrix->[$i],$j,1) = '-';
+ }
+ else{
+ $m++;
+ substr($retmatrix->[$i],$j,1) = '.';
+ }
+ }
+ }
+ $mcount->{$i} = $m;
+ }
+ #Large constant for row 0; printAlignment sorts rows by descending $mcount
+ $mcount->{0} = 1000000000;#$align_width+1;
+ return ($retmatrix,$retseqmatrix,\@names,$mcount);
+#Report whether alignment $align_name holds the interval $coord1-$coord2 on $qseqname.
+#Only the first row named $qseqname is examined. Returns 1 or 0.
+#Dies when $align_name is false or unknown.
+#$nodups is a file-level flag defined outside this block.
+#NOTE(review): when $nodups is true the first row named $qseqname yields 1 even if a
+#coordinate falls outside it, and when it is false later rows with the same name are
+#never examined - confirm this is the intended reading of the flag.
+sub contains{
+    my($self,$align_name,$qseqname,$coord1,$coord2) = @_;
+    die "Can't find alignment $align_name" if(!$align_name || !exists $self->{_alignments}->{$align_name});
+    my $alignobj = $self->{_alignments}->{$align_name}->[0];
+    foreach my $alni (@$alignobj){
+        next if($alni->[0] ne $qseqname);
+        #Either end of the query falling outside the row counts as outside
+        my $outside = ($coord1<$alni->[1] || $coord1>$alni->[2]
+                       || $coord2<$alni->[1] || $coord2>$alni->[2]);
+        return ($outside && !$nodups) ? 0 : 1;
+    }
+    return 0;
+#printAlignment: write columns $startcol..$endcol of alignment $aln to $fh in rows of $COL_WIDTH columns, each row followed by tracks for the intersecting 'gene' features. mappedfeats in the form [name,seq,start,end] -- NOTE(review): the loop below reads $feat->[0]->[0]->[1..2] and $feat->[1], which does not look like that form; confirm
+sub printAlignment{
+ my($self,$fh,$aln,$startcol,$endcol,$db,$ref,$mappedfeats,$htmlout) = @_;
+ die "Must specify Bioperl database $db that contains sequence data" if(!$db);
+ die "Must specify startcol, endcol $startcol-$endcol" if(!$startcol || !$endcol); # a column value of 0 is rejected too
+ my($alignobj,$alignment_bv,$align_width) = @{$self->{_alignments}->{$aln}};
+ #$mmatrix,$seqmatrix are relative to $startcol, index starting at 0
+ my ($mmatrix,$seqmatrix,$names,$mcount) = $self->getAlignmentMatrix($aln,$startcol,$endcol,$db,$ref);
+ my $COL_WIDTH=100;
+ my $atree = new AlignmentTree(); # private tree holding only the features passed in $mappedfeats
+ my $features = {}; # feature name => [lower bound, upper bound] taken from the feature's first row
+ foreach my $feat (@$mappedfeats){
+ $atree->insert(@$feat);
+ $features->{$feat->[1]} = [$feat->[0]->[0]->[1],$feat->[0]->[0]->[2]];
+ }
+ my @mcountsort = sort {$mcount->{$b}<=>$mcount->{$a}} (keys %{$mcount}); # row indices ordered by descending $mcount value
+ for(my $j=0;$j<=(($endcol-$startcol)/$COL_WIDTH);$j++){ # one iteration per printed row of up to $COL_WIDTH columns
+ my $anchors; #for html output
+ my $s=$j*$COL_WIDTH+1;
+ my $e=$s+$COL_WIDTH-1;
+ $e = ($e>($endcol-$startcol+1)) ? ($endcol-$startcol+1) : $e; # clamp the last row to the requested range
+ my @coords;
+ #offset into full alignment $aln
+ my $absstartcol = $s+$startcol-1;
+ my $absendcol = $e+$startcol-1;
+ foreach my $i (@mcountsort){
+ #for(my $i=0;$i<@$names;$i++){
+ die if(! $names->[$i]);
+ my($alni) = $self->getAlignedInterval($aln,$names->[$i]);
+ my($start,$end) = &columntocoords($alni,$absstartcol,$absendcol);
+ $coords[$i] = [$start,$end,$alni->[3]]; # kept in untouched order for the feature pass below
+ ($start,$end) = ($alni->[3] eq '-') ? ($end,$start) : ($start,$end); # display order is swapped for '-' rows
+ my $displaystr;
+ if($i==0){ # row 0 shows the sequence matrix, every other row shows $mmatrix
+ $displaystr = substr($seqmatrix->[$i],$s-1,$e-$s+1);
+ #Highlight Shine Delgarno
+ $displaystr =~ s/AGGAGG/<font color='red'>aggagg<\/font>/g; # markup is inserted whether or not $htmlout is set
+ }
+ else{
+ $displaystr = substr($mmatrix->[$i],$s-1,$e-$s+1);
+ $displaystr =~ s/AGGAGG/<font color='red'>aggagg<\/font>/g;
+ }
+ if($self->{debug}){
+ printf $fh ("%30.30s %7s %11s %-30s %7s %11s\n",
+ "$names->[$i]:$alni->[3]",
+ $start,
+ "col:$absstartcol",
+ $displaystr,
+ $end,
+ "col:$absendcol");
+ }
+ else{
+ if($htmlout){ # NOTE(review): both branches of this test print exactly the same thing
+ printf $fh ("%30.30s %7s %11s %-30s %7s %11s\n",
+ "$names->[$i]:$alni->[3]",
+ $start,
+ "",
+ $displaystr,
+ $end,
+ "");
+ }
+ else{
+ printf $fh ("%30.30s %7s %11s %-30s %7s %11s\n",
+ "$names->[$i]:$alni->[3]",
+ $start,
+ "",
+ $displaystr,
+ $end,
+ "");
+ }
+ }
+ }
+ printf $fh ("%".$COL_WIDTH.".".$COL_WIDTH."s","$aln col:$absstartcol-$absendcol\n");
+ foreach my $i (@mcountsort){
+ #for(my $i=0;$i<@$names;$i++){
+ #Show all matching features that intersect $coords[$i]->[0],$coords[$i]->[1]
+ if($coords[$i]->[1]-$coords[$i]->[0]>1){ # rows whose coordinate span is 1 or less get no feature tracks
+ my @res = $atree->intersect($names->[$i],$coords[$i]->[0],$coords[$i]->[1],'gene');
+ foreach my $r (@res){
+ #offset into full alignment $aln
+ my($cs,$ce) = &coordstocolumn($alignobj,$names->[$i],$r->[2],$r->[3],1);
+ #print "$r->[0] coords:$r->[2],$r->[3] $cs,$ce $cs-$absstartcol $absendcol-$ce\n";
+ my $leadinggap = 'X'x($cs-$absstartcol); # padding so the track lines up with the row's columns
+ my $trailinggap = 'X'x($absendcol-$ce);
+ my $displaystr;
+ my $startcodonstr;
+ my $stopcodonstr;
+ my $displaytoken;
+ #TODO, REFACTOR into a matrix. this impl doesn't support codons that span row bounds
+ #currently only viz start,stop codons at beginning/end of alignment
+ #print "$r->[2] <= $features->{$r->[0]}->[0] && $r->[3] >= $features->{$r->[0]}->[0]\n";
+ if($r->[2] <= $features->{$r->[0]}->[0] && $r->[3] >= $features->{$r->[0]}->[0]){ # this hit covers the feature's lower bound
+ $anchors->{"$aln:$cs"}++;
+ $anchors->{"$aln:$ce"}++;
+ my $showfirst = 3;
+ if($ce-$cs<3){
+ $showfirst = ($ce-$cs); # NOTE(review): the upper-bound branch below uses ($ce-$cs)+1 -- confirm which is intended
+ }
+ if($coords[$i]->[2] eq '-'){
+ $stopcodonstr = substr('***',0,$showfirst);
+ if($r->[6] eq '-'){
+ #$stopcodonstr = 'TAA';
+ #$stopcodonstr = substr($seqmatrix->[$i],$cs-$absstartcol-($COL_WIDTH-$showfirst)+($j*$COL_WIDTH),$showfirst);
+ $displaytoken .= 'STOP1<--';
+ }
+ else{
+ #$stopcodonstr = 'CAT';
+ #$stopcodonstr = substr($seqmatrix->[$i],$cs-$absstartcol-($COL_WIDTH-$showfirst),$showfirst);
+ $displaytoken .= '<--START1';
+ $anchors->{"$r->[0]"}++;
+ }
+ }
+ else{
+ $startcodonstr = substr('***',0,$showfirst);
+ if($r->[6] eq '-'){
+ #$startcodonstr = 'TTA';
+ #$startcodonstr = substr($seqmatrix->[$i],$cs-$absstartcol+($j*$COL_WIDTH),$showfirst);
+ $displaytoken .= 'STOP2<--';
+ }
+ else{
+ #$startcodonstr = 'ATG';
+ #$startcodonstr = substr($seqmatrix->[$i],$cs-$absstartcol+($j*$COL_WIDTH),$showfirst);
+ $displaytoken .= 'START2-->';
+ $anchors->{"$r->[0]"}++;
+ }
+ }
+ #TODO trim to row
+ }
+ if($r->[2] <= $features->{$r->[0]}->[1] && $r->[3] >= $features->{$r->[0]}->[1]){ # this hit covers the feature's upper bound
+ $anchors->{"$aln:$cs"}++;
+ $anchors->{"$aln:$ce"}++;
+ my $showfirst = 3;
+ if($ce-$cs<3){
+ $showfirst = ($ce-$cs)+1;
+ }
+ if($coords[$i]->[2] eq '-'){
+ $startcodonstr = substr('***',0,$showfirst);
+ if($r->[6] eq '-'){
+ #$startcodonstr = 'ATG';
+ #$startcodonstr = substr($seqmatrix->[$i],$cs-$absstartcol+($j*$COL_WIDTH),$showfirst);
+ $displaytoken .= 'START3-->';
+ $anchors->{"$r->[0]"}++;
+ }
+ else{
+ #$startcodonstr = 'TTA';
+ #$startcodonstr = substr($seqmatrix->[$i],$absendcol-$ce-($COL_WIDTH-$showfirst),$showfirst);
+ $displaytoken .= 'STOP3<--';
+ }
+ }
+ else{
+ $stopcodonstr = substr('***',0,$showfirst);
+ if($r->[6] eq '-'){
+ #$stopcodonstr = 'CAT';
+ #$stopcodonstr = substr($seqmatrix->[$i],$absendcol-$ce-$showfirst,$showfirst);
+ $displaytoken .= '<--START4';
+ $anchors->{"$r->[0]"}++;
+ }
+ else{
+ #$stopcodonstr = 'TAA';
+ #$stopcodonstr = substr($seqmatrix->[$i],$absendcol-$ce-$showfirst,$showfirst);
+ $displaytoken .= 'STOP4<--';
+ }
+ }
+ #TODO trim to row
+ }
+ my $spacer = '_'x($ce-$cs+1-length($startcodonstr)-length($stopcodonstr)); # fill between the '***' markers so the track is $ce-$cs+1 wide
+ $displaystr = $startcodonstr.$spacer.$stopcodonstr;#;substr($seqmatrix->[$i],$cs-1,$ce-$cs+1);
+ die if(length($displaystr) > $COL_WIDTH);
+ my ($feat_start,$feat_end) = ($r->[2],$r->[3]);
+ #TODO determine frame and print ~ only every 3 codons in frame if in frame
+ #otherwise
+ my $frame;
+ if($coords[$i]->[2] eq '-'){ # NOTE(review): within each row-strand case the two feature-strand branches compute the same value
+ if($r->[6] eq '-'){
+ die if($features->{$r->[0]}->[1] < $feat_end);
+ $frame = (($features->{$r->[0]}->[1] - $feat_end)%3);
+ }
+ else{
+ die if($features->{$r->[0]}->[1] < $feat_end);
+ $frame = (($features->{$r->[0]}->[1] - $feat_end)%3);
+ }
+ }
+ else{
+ if($r->[6] eq '-'){
+ die "$feat_start < $features->{$r->[0]}->[0] $r->[0]" if($feat_start < $features->{$r->[0]}->[0]);
+ $frame = (($feat_start - $features->{$r->[0]}->[0])%3);
+ }
+ else{
+ die if($feat_start < $features->{$r->[0]}->[0]);
+ $frame = (($feat_start - $features->{$r->[0]}->[0])%3);
+ }
+ }
+ ($feat_start,$feat_end) = ($coords[$i]->[2] eq '-') ? ($feat_end,$feat_start) : ($feat_start,$feat_end);
+ my $m=$frame; # running count of non-gap columns, seeded with the frame offset
+ my $fulldisplaystr = $leadinggap.$displaystr.$trailinggap;
+ for(my $k=length($leadinggap);$k<length($leadinggap)+length($displaystr);$k++){
+ my $idx=$k;
+ if(substr($mmatrix->[$i],$s-1+$k,1) ne '-'){
+ $m++;
+ if($m%3==0){ # mark every third non-gap column with '|'
+ #Don't overwrite start,stop codons
+ if($idx>length($startcodonstr)+length($leadinggap)
+ & $idx<(length($leadinggap)+length($displaystr)-length($stopcodonstr))){ # NOTE(review): bitwise & rather than &&; both operands are comparison results so the outcome is the same, but nothing short-circuits
+ substr($fulldisplaystr,$idx,1) = '|';
+ }
+ }
+ else{
+ #substr($fulldisplaystr,$idx,1) = substr($mmatrix->[$i],$s-1+$k,1);
+ }
+ }
+ else{
+ if(substr($mmatrix->[$i],$s-1+$k,1) eq '-'){ # gap columns of the row are copied into the track
+ substr($fulldisplaystr,$idx,1) = substr($mmatrix->[$i],$s-1+$k,1);
+ }
+ }
+ }
+ die if($r->[6] ne $r->[7]);
+ if($htmlout){ # NOTE(review): both branches of this test print exactly the same thing
+ printf $fh ("%30.30s %7s %11s %-30s %7s %11s\n",
+ $r->[0].":$r->[6]",
+ $feat_start,
+ $displaytoken,
+ $fulldisplaystr,
+ $feat_end,
+ $displaytoken);
+ }
+ else{
+ printf $fh ("%30.30s %7s %11s %-30s %7s %11s\n",
+ $r->[0].":$r->[6]",
+ $feat_start,
+ $displaytoken,
+ $fulldisplaystr,
+ $feat_end,
+ $displaytoken);
+ }
+ }
+ }
+ }
+ if($htmlout){
+ foreach my $a (keys %$anchors){ # emit one empty link per anchor key collected for this row
+ print $fh "<a href='#$a'></a>\n";
+ }
+ }
+ printf $fh ("%".$COL_WIDTH.".".$COL_WIDTH."s","ANNOTATIONS\n");
+ }
+# Debug helper: print every row of an alignment object, one "#ALIGNOBJ" line per row.
+#   $alignobj - array ref of rows; each row is an array ref whose fields are joined
+#               with single spaces
+#   $handle   - optional output filehandle; STDOUT is used when it is false or omitted
+# Every line also carries the stringified $alignobj reference, so rows that belong to
+# the same object can be grouped when reading a log.
+sub printAlignmentDebug{
+    my($alignobj,$handle) = @_;
+    #Resolve the default handle once rather than on every iteration
+    $handle = \*STDOUT if(!$handle);
+    foreach my $alni2 (@$alignobj){
+        print {$handle} "#ALIGNOBJ $alignobj ",join(' ',@$alni2),"\n";
+    }
+}
+# Flip the strand of every row of an alignment.
+#   $aln - array ref of alignment rows
+# Returns a new array ref holding the result of revcomp_alni() for each row, in the
+# same order; the input rows are not modified here.
+sub revcomp{
+    my($aln) = @_;
+    return [map { revcomp_alni($_) } @$aln];
+}
+# Build the opposite-strand version of one alignment row.
+#   $alni - row in the form [seqname, start, end, orient, cigar]
+# Returns a new row with the same name and coordinates, the orientation flipped
+# ('+' <-> '-') and a rewritten cigar in which I and D runs are exchanged; every
+# other run (M, X, ...) is copied unchanged. Dies if the run lengths do not add up
+# to the column count reported by get_cigs().
+# NOTE(review): the order of the cigar runs is kept as in the input -- confirm that
+# callers do not expect it to be reversed.
+sub revcomp_alni{
+    my($alni) = @_;
+    my $cigstr = '';
+    my $total = 0;
+    my $nalni = [$alni->[0],$alni->[1],$alni->[2]];
+    $nalni->[3] = ($alni->[3] eq '+') ? '-' : '+';
+    my ($cigs,$columncount) = get_cigs($alni->[4]);
+    foreach my $c (@$cigs){
+        #Each entry is a [count,operation] pair; test the operation character, not
+        #the pair reference itself
+        my($count,$char) = @$c;
+        $total += $count;
+        if($char eq 'I'){
+            $cigstr .= $count."D";
+        }
+        elsif($char eq 'D'){
+            $cigstr .= $count."I";
+        }
+        else{
+            $cigstr .= "$count$char";
+        }
+    }
+    #Compare summed run lengths, not the length of the encoded string
+    #assumes the second value returned by get_cigs() is the total column count -- TODO confirm
+    die "Cigar covers $total columns, expected $columncount" if($total != $columncount);
+    $nalni->[4] = $cigstr;
+    return $nalni;
+}
+sub removeOverlaps{ # From @$alignments, drop those whose $qseqname interval lies entirely inside a longer one; returns a list of alignment names
+ my($self,$alignments,$qseqname) = @_;
+ my @alns; # rows of [alignment name, lower bound, upper bound, span]
+ my @results;
+ my %contained; # indices into @sortedalns that lie fully inside an earlier (longer) entry
+ my %overlaps; # indices into @sortedalns with exactly one end inside an earlier entry
+ foreach my $align_name (@$alignments){
+ my $alni = $self->getAlignedInterval($align_name,$qseqname);
+ if($align_name =~ /$aligntoken/){ # only names matching $aligntoken (not declared in this sub) are considered
+ push @alns,[$align_name,$alni->[1],$alni->[2],$alni->[2]-$alni->[1]];
+ }
+ }
+ my @sortedalns = sort {$b->[3] <=> $a->[3]} @alns; # longest span first
+ for(my $i=0;$i<@sortedalns;$i++){
+ my $ifmin = $sortedalns[$i]->[1];
+ my $ifmax = $sortedalns[$i]->[2];
+ for(my $j=$i+1;$j<@sortedalns;$j++){ # compare against every shorter-or-equal entry
+ my $jfmin = $sortedalns[$j]->[1];
+ my $jfmax = $sortedalns[$j]->[2];
+ if($jfmin>=$ifmin && $jfmax <=$ifmax){
+ print "Marking $sortedalns[$j]->[0] contained $jfmin>=$ifmin && $jfmax <=$ifmax in $sortedalns[$i]->[0]\n" if($DEBUG);
+ $contained{$j}++;
+ }
+ else{
+ if($jfmin>=$ifmin && $jfmin <=$ifmax){
+ $overlaps{$j}++;
+ }
+ if($jfmax>=$ifmin && $jfmax <=$ifmax){
+ $overlaps{$j}++;
+ }
+ }
+ }
+ }
+ if(scalar(keys %overlaps)>0){ # partial overlaps are only reported; nothing is removed for them
+ print "#WARNING removing some alignments with overlaps\n" if($DEBUG);;
+ }
+ if(scalar(keys %contained)>0){
+ print "#WARNING removing some alignments that are fully contained\n" if($DEBUG);;
+ for(my $i=0;$i<@sortedalns;$i++){
+ if(!exists $contained{$i}){
+ push @results,$sortedalns[$i]->[0];
+ }
+ else{
+ print "#WARNING removing contained alignment $sortedalns[$i]->[0]\n" if($DEBUG);;
+ }
+ }
+ return @results; # names in longest-first order
+ }
+ else{
+ return @$alignments; # NOTE(review): this path returns the input unchanged, including names that do not match $aligntoken and in the caller's order, unlike the branch above
+ }
diff --git a/mapping/IntervalTree.pm b/mapping/IntervalTree.pm
new file mode 100644
index 0000000..71f0359
--- /dev/null
+++ b/mapping/IntervalTree.pm
@@ -0,0 +1,154 @@
+package IntervalTree;
+#Adapted from bx-python
+use strict;
+use Math::Random qw(random_uniform);
+use POSIX qw(ceil floor);
+use Data::Dumper;
+sub new{ # Constructor: builds a single childless node for the interval ($start,$end) with label $name and strand $orient
+ my $classname = shift;
+ my $self = {};
+ bless($self,$classname);
+ my($start,$end,$name,$orient) = @_;
+ $self->{'priority'} = ceil( (-1.0 / log(.5)) * log( -1.0 / (random_uniform(1,0,1) - 1))); # random priority; insert() compares it, but the rotation calls there are commented out
+ $self->{'start'} = $start;
+ $self->{'end'} = $end;
+ $self->{'orient'} = $orient;
+ $self->{'maxend'} = $self->{'end'}; # largest 'end' in this subtree; a lone node starts with its own
+ $self->{'minend'} = $self->{'end'}; # smallest 'end' in this subtree
+ $self->{'left'} = undef; # child keys always exist, holding undef until a child is attached
+ $self->{'right'} = undef;
+ $self->{'name'} = $name;
+ $self->{'default_func'} = sub { # result mapper intersect() uses when the caller passes none: yields the interval's name
+ my $interval = shift;
+ return $interval->{'name'};
+ };
+ return $self;
+sub insert{ # Add interval [$start,$end] named $name with strand $orient ('+' or '-') below this node; dies on $end<$start or a bad strand; returns the subtree root, which is always $self while the rotations are disabled
+ my($self,$start,$end,$name,$orient) = @_;
+ die "Bad start-end $start-$end" if($end<$start);
+ die "Bad orient $orient" if($orient ne '-' && $orient ne '+');
+ my $root = $self;
+ if($start > $self->{'start'}){ # strictly greater starts go right; equal or smaller starts go left
+ if(defined $self->{'right'}){
+ $self->{'right'} = $self->{'right'}->insert($start,$end,$name,$orient);
+ }
+ else{
+ $self->{'right'} = new IntervalTree($start,$end,$name,$orient);
+ }
+ # rebalance tree
+ if($self->{'priority'} < $self->{'right'}->{'priority'}){ # the rotation call is commented out, so this test currently does nothing
+ # $root = $self->rotateleft();
+ }
+ }
+ else{
+ if(defined $self->{'left'}){
+ $self->{'left'} = $self->{'left'}->insert($start,$end,$name,$orient);
+ }
+ else{
+ $self->{'left'} = new IntervalTree($start, $end, $name,$orient);
+ }
+ # rebalance tree
+ if($self->{'priority'} < $self->{'left'}->{'priority'}){ # the rotation call is commented out, so this test currently does nothing
+ # $root = $self->rotateright();
+ }
+ }
+ if(defined $root->{'right'} && defined $root->{'left'}){ # refresh the cached max/min 'end' from this node and whichever children exist
+ $root->{'maxend'} = ($root->{'end'}>$root->{'right'}->{'maxend'}) ? $root->{'end'} : $root->{'right'}->{'maxend'};
+ $root->{'maxend'} = ($root->{'maxend'}>$root->{'left'}->{'maxend'}) ? $root->{'maxend'} : $root->{'left'}->{'maxend'};
+ $root->{'minend'} = ($root->{'end'}<$root->{'right'}->{'minend'}) ? $root->{'end'} : $root->{'right'}->{'minend'};
+ $root->{'minend'} = ($root->{'minend'}<$root->{'left'}->{'minend'}) ? $root->{'minend'} : $root->{'left'}->{'minend'};
+ }
+ elsif(defined $root->{'right'}){
+ $root->{'maxend'} = ($root->{'end'}>$root->{'right'}->{'maxend'}) ? $root->{'end'} : $root->{'right'}->{'maxend'};
+ $root->{'minend'} = ($root->{'end'}<$root->{'right'}->{'minend'}) ? $root->{'end'} : $root->{'right'}->{'minend'};
+ }
+ elsif(defined $root->{'left'}){
+ $root->{'maxend'} = ($root->{'end'}>$root->{'left'}->{'maxend'}) ? $root->{'end'} : $root->{'left'}->{'maxend'};
+ $root->{'minend'} = ($root->{'end'}<$root->{'left'}->{'minend'}) ? $root->{'end'} : $root->{'left'}->{'minend'};
+ }
+ return $root;
+sub intersect{ # Collect $func->(node) for every node in this subtree whose interval overlaps the query ($start,$end); returns a list; dies when $end<$start
+ my($self,$start,$end,$func) = @_;
+ die "$self->{'name'}:$end<$start not valid query" if($end<$start);
+ my @results;
+ $func = $self->{'default_func'} if(!$func); # default mapper returns the node's name
+ #print "CHECKING $start $end\n";
+ if($start < $self->{'end'} && $end > $self->{'start'}){ # strict comparisons: intervals that only touch at an endpoint do not match
+ #print "Found $self->{'name'} $start <= $self->{'end'} && $end >= $self->{'start'}\n";
+ push @results,$func->( $self );
+ }
+ if(defined $self->{'left'} && $start <= $self->{'left'}->{'maxend'}){ # skip the left subtree when every interval in it ends before the query starts
+ push @results, $self->{'left'}->intersect( $start, $end, $func );
+ }
+ if(defined $self->{'right'} && $end >= $self->{'start'}){ # insert() puts only starts greater than this node's start on the right, so skip it when the query ends before this start
+ push @results, $self->{'right'}->intersect( $start, $end, $func );
+ }
+ return @results;
+# Rotate this node to the right: the left child becomes the root of the subtree and
+# this node becomes that child's right child, adopting the child's former right
+# subtree as its own left subtree.
+# The rotation is only performed when the left child has a right child (as in the
+# original implementation); otherwise the node is returned unchanged.
+# Dies when there is no left child.
+# Returns the root of the subtree after the (possible) rotation.
+sub rotateright{
+    my($self) = @_;
+    #new() always creates the 'left' key (holding undef), so test definedness; an
+    #exists test on $self->{'left'}->{'right'} would autovivify an empty hash as child
+    die if(!defined $self->{'left'});
+    my $root = $self;
+    if(defined $self->{'left'}->{'right'}){
+        $root = $self->{'left'};
+        $self->{'left'} = $root->{'right'};
+        $root->{'right'} = $self;
+        #Recompute the cached extremes bottom-up: the demoted node first, then the
+        #new root, whose subtree now also contains the demoted node
+        foreach my $node ($self,$root){
+            my($max,$min) = ($node->{'end'},$node->{'end'});
+            foreach my $child (grep {defined} ($node->{'left'},$node->{'right'})){
+                $max = $child->{'maxend'} if($child->{'maxend'} > $max);
+                $min = $child->{'minend'} if($child->{'minend'} < $min);
+            }
+            ($node->{'maxend'},$node->{'minend'}) = ($max,$min);
+        }
+    }
+    return $root;
+}
+# Rotate this node to the left: the right child becomes the root of the subtree and
+# this node becomes that child's left child, adopting the child's former left
+# subtree as its own right subtree.
+# The rotation is only performed when the right child has a left child (as in the
+# original implementation); otherwise the node is returned unchanged.
+# Dies when there is no right child.
+# Returns the root of the subtree after the (possible) rotation.
+sub rotateleft{
+    my($self) = @_;
+    #new() always creates the 'right' key (holding undef), so test definedness; an
+    #exists test on $self->{'right'}->{'left'} would autovivify an empty hash as child
+    die if(!defined $self->{'right'});
+    my $root = $self;
+    if(defined $self->{'right'}->{'left'}){
+        $root = $self->{'right'};
+        $self->{'right'} = $root->{'left'};
+        $root->{'left'} = $self;
+        #Recompute the cached extremes bottom-up: the demoted node first, then the
+        #new root, whose subtree now also contains the demoted node
+        foreach my $node ($self,$root){
+            my($max,$min) = ($node->{'end'},$node->{'end'});
+            foreach my $child (grep {defined} ($node->{'left'},$node->{'right'})){
+                $max = $child->{'maxend'} if($child->{'maxend'} > $max);
+                $min = $child->{'minend'} if($child->{'minend'} < $min);
+            }
+            ($node->{'maxend'},$node->{'minend'}) = ($max,$min);
+        }
+    }
+    return $root;
+}
diff --git a/mapping/Makefile b/mapping/Makefile
new file mode 100644
index 0000000..08417c6
--- /dev/null
+++ b/mapping/Makefile
@@ -0,0 +1,20 @@
+#Set release name or install directory
+all: mugsya_install
+install: mugsya_install
+ tar cvzf ${RELEASE_NAME}.tgz ${INSTALL_DIR}
+ mkdir -p ${INSTALL_DIR}
+ install mugsy-annotator ${INSTALL_DIR}
+ install mapfeatures.pl ${INSTALL_DIR}
+ install mafindex.pl ${INSTALL_DIR}
+ install featureindex.pl ${INSTALL_DIR}
+ install AlignmentTree.pm ${INSTALL_DIR}
+ install IntervalTree.pm ${INSTALL_DIR}
diff --git a/mapping/README b/mapping/README
new file mode 100644
index 0000000..d825085
--- /dev/null
+++ b/mapping/README
@@ -0,0 +1,39 @@
+Untar the mugsy-annotator download to an installation directory
+Edit PREFIX= in the mugsy-annotator script to the install directory
+Add the installation directory to the PERL5LIB environment variable
+export PERL5LIB=/path/to/mugsy-annotator:${PERL5LIB}
+(1) To run Mugsy-Annotator using the wrapper, you need genome FASTA
+files and annotations in either GFF3 or GenBank flat file format. First
+run mugsy to generate a whole genome alignment.
+Genbank .gbk inputs require bp_genbank2gff3.pl from Bioperl
+mugsy --prefix mygenomes genome1.fsa genome2.fsa genome3.fsa
+#default output is in /tmp/mygenomes.maf
+cat genome1.fsa genome2.fsa genome3.fsa > allgenomes.fsa
+mugsy-annotator allgenomes.fsa /tmp/mygenomes.maf genome1.gff genome2.gff genome3.gff
+Note, the input fasta file and MAF need to have matching sequence names in the format genome.seqname
+#(2) Alternatively, execute the individual steps outside the wrapper
+#Clean MAF if necessary
+cat /tmp/nmen_v16.maf | perl -ne 's/^s(\s+)[^\.]+\./s$1/;print' > nmen_v16.maf
+mafindex.pl n16.index < nmen_v16.maf > /dev/null
+featureindex.pl n16.index genbank < v16annotations.gbk > /dev/null
+#To generate orthologs only
+mapfeatures.pl --reportedits=0 ./n16.index ./v16.all.fsa < v16annotations.out > v16.features.mapped
+#For full report (slower)
+mapfeatures.pl ./n16.index ./v16.all.fsa < v16annotations.out > v16.features.mapped
diff --git a/mapping/README.example b/mapping/README.example
new file mode 100644
index 0000000..88d2613
--- /dev/null
+++ b/mapping/README.example
@@ -0,0 +1,40 @@
+Download 3 genomes from Genbank ftp site, align, and run mugsy annotator
+#Get annotations
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_APEC_O1_uid16718/CP000468.gbk
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid20079/CP000948.gbk
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_O157H7_EDL933_uid259/AE005174.gbk
+#Get genome FASTA
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_APEC_O1_uid16718/CP000468.fna
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid20079/CP000948.fna
+wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Escherichia_coli_O157H7_EDL933_uid259/AE005174.fna
+#Calculate alignment
+mugsy --directory `pwd` --prefix ecolitest *.fna
+#Run mugsy-annotator
+cat *.fna > ecolitest.fsa
+mugsy-annotator ecolitest.fsa ecolitest.maf *.gbk > ecolitest.mugsyannotator.out
+This will output matching genes but will also throw errors "Can't find
+seqname" that will prevent calculating agreement information. The
+error results from a reformatting of the sequence names in the MAF
+output by Mugsy. To fix this, update the original FASTA headers to
+match the sequence names used in the MAF, which are in the form
+perl -pi -e 's/^>.*\|gb\|(\w+).*/>$1/' AE005174.fna
+perl -pi -e 's/^>.*\|gb\|(\w+).*/>$1/' CP000948.fna
+perl -pi -e 's/^>.*\|gb\|(\w+).*/>$1/' CP000468.fna
+cat *.fna > ecolitest.fsa
+Once fixed, mugsy-annotator should produce output, including agreement information
+mugsy-annotator ecolitest.fsa ecolitest.maf *.gbk > ecolitest.mugsyannotator.out
+A legend of the codes used in the output file is at the end of the output file
diff --git a/mapping/bsmlindex.pl b/mapping/bsmlindex.pl
new file mode 100755
index 0000000..3781a30
--- /dev/null
+++ b/mapping/bsmlindex.pl
@@ -0,0 +1,67 @@
+use strict;
+use XML::Twig;
+use AlignmentTree;
+use AlignmentTree;
+use Storable qw(store retrieve);
+use Data::Dumper;
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+my $atree = new AlignmentTree();
+if(-e $ARGV[0]){ # extend the index file named by the first argument when it already exists
+ $atree = AlignmentTree::deserialize($ARGV[0]);
+my $mapping; # original sequence name => [target sequence name, offset-1]
+if(-e $ARGV[1]){ # optional second argument: a lookup file of sequence offsets
+ #parsing lookup file
+ open FILE, $ARGV[1] or die "Can't open mapping file $ARGV[1]";
+ while(my $line=<FILE>){
+ my($tseq,$oseq,$offset) = split(/\s+/,$line); # whitespace-separated columns: target name, original name, offset
+ $mapping->{$oseq} = [$tseq,$offset-1];
+ }
+ close FILE;
+my $twig = new XML::Twig(
+ twig_handlers =>
+ { 'Feature[@class = "polypeptide"]' => sub { # runs once for each Feature element whose class is polypeptide
+ my( $twig, $elt)= @_;
+ my $iloc = $elt->first_child('Interval-loc');
+ my $seqname = $elt->parent('Sequence')->{'att'}->{'id'};
+ my $featname = $elt->{'att'}->{'id'};
+ my $class = $elt->{'att'}->{'class'};
+ my $complement = $iloc->{'att'}->{'complement'};
+ if ($complement eq '1'){ # translate the complement attribute into a strand symbol: 1 => '-', 0 => '+'
+ $complement = '-';
+ }
+ if ($complement eq '0'){
+ $complement = '+';
+ }
+ my($fmin,$fmax) = ($iloc->{'att'}->{'startpos'},$iloc->{'att'}->{'endpos'});
+ if(exists $mapping->{$seqname}){ # shift coordinates and rename the sequence when the lookup file lists it
+ $fmin = $fmin+$mapping->{$seqname}->[1];
+ $fmax = $fmax+$mapping->{$seqname}->[1];
+ print "Using mapping for $seqname $mapping->{$seqname}->[0]\n";
+ $seqname = $mapping->{$seqname}->[0];
+ }
+ $atree->insert([[$seqname,$fmin,$fmax,$complement,$fmax-$fmin."M"]],$featname,$class); # single row with an all-match cigar of length $fmax-$fmin
+ #print "$seqname\t$featname\t$fmin\t$fmax\t$class\n";
+ },
+ },
+ );
+print STDERR "Writing index to $ARGV[0]\n";
+my $stdin_fh = \*STDIN;
diff --git a/mapping/chadoindex.pl b/mapping/chadoindex.pl
new file mode 100644
index 0000000..b78660e
--- /dev/null
+++ b/mapping/chadoindex.pl
@@ -0,0 +1,4 @@
+use strict;
diff --git a/mapping/featureindex.pl b/mapping/featureindex.pl
new file mode 100755
index 0000000..2284838
--- /dev/null
+++ b/mapping/featureindex.pl
@@ -0,0 +1,119 @@
+#./featureindex.pl mugsyindex < mugsy.out
+#Converts GFF or simple tab text files to entries in the index file named by the first argument
+#Supports Genbank files if Bioperl is also installed
+#Add more supported types from bioperl, remote download of accessions etc
+use strict;
+use lib '/usr/local/projects/angiuoli/mugsy_trunk/mapping';
+use lib './';
+use AlignmentTree;
+use Storable qw(store retrieve);
+use Data::Dumper;
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+my $atree = new AlignmentTree();
+if(-e $ARGV[0]){ # extend the index file named by the first argument when it already exists
+ $atree = AlignmentTree::deserialize($ARGV[0]);
+my $filetype = $ARGV[1]; # selects the parser below; matched case-insensitively against gff, genbank and ptt
+if(lc($filetype) =~ /gff/){
+ print STDERR "Reading filetype $filetype\n";
+ &parseGFF(\*STDIN,'gene','pseudogene'); # index only gene and pseudogene records
+elsif(lc($filetype) =~ /genbank/){
+ print STDERR "Reading filetype $filetype\n";
+ my $file;
+ print `bp_genbank2gff3.pl --filter misc_feature -in stdin -out - < | grep -v "# Input" >> /tmp/$$.gff`; # NOTE(review): the command contains "< |" with no input file and appends to a predictable path under /tmp -- verify
+ open FILE,"/tmp/$$.gff"; # NOTE(review): the result of open is not checked
+ &parseGFF(\*FILE,'gene','pseudogene');
+ close FILE;
+elsif(lc($filetype) =~ /ptt/){
+ my $seqname;
+ while(my $line=<STDIN>){
+ if($line =~ /^>/ || $line =~ /^Location/){ # header lines; a ">" line sets the current sequence name
+ if($line =~ /^>(\S+)/){
+ $seqname = $1;
+ }
+ }
+ else{
+ #36..1 - 35 XOCORF_0001 - hypothetical protein
+ my @elts = split(/\t/,$line);
+ my ($fmin,$fmax) = ($elts[0] =~ /(\d+)\.\.(\d+)/);
+ ($fmin,$fmax) = ($fmax < $fmin) ? ($fmax,$fmin) : ($fmin,$fmax); # order the pair so that $fmin <= $fmax
+ $fmin = $fmin-1; # presumably converts a 1-based start to 0-based -- confirm
+ my $strand = $elts[1];
+ my $featname = $elts[3];
+ print "Adding feature $featname on sequence:$seqname $fmin,$fmax,$strand to alignment tree\n";
+ $atree->insert([[$seqname,$fmin,$fmax,$strand,$fmax-$fmin."M"]],'gene:'.$featname,'gene');
+ }
+ }
+ while(my $line=<STDIN>){ # whitespace-delimited rows: feature name, sequence name, fmin, fmax, strand
+ my($featname,$seqname,$fmin,$fmax,$strand) = split(/\s+/,$line);
+ $atree->insert([[$seqname,$fmin,$fmax,$strand,$fmax-$fmin."M"]],'gene:'.$featname,'gene');
+ print "Adding feature $featname on sequence:$seqname $fmin,$fmax,$strand to alignment tree\n";
+ }
+print STDERR "Writing index to $ARGV[0]\n";
+# Read GFF records from a filehandle and add the selected feature types to the
+# file-level AlignmentTree $atree as 'gene' features.
+#   $file      - filehandle to read
+#   @feattypes - values of GFF column 3 to index, compared case-insensitively
+# Lines starting with '#', empty lines and lines without exactly 9 tab-separated
+# columns are ignored. A feature is named by its locus_tag attribute or, failing
+# that, by "<seqname>_<ID>"; records with neither are reported on STDERR and skipped.
+# A name that was already seen gets "_<n>" appended until it is unused.
+# Dies when end <= start or when the strand is neither '+' nor '-'.
+sub parseGFF{
+    my $file = shift;
+    my @feattypes = @_;
+    my %featlookup = map {lc($_) => 1} @feattypes;
+    my $features={};
+    while(my $line=<$file>){
+        next if($line =~ /^\#/);
+        chomp $line;
+        my @elts = split(/\t/,$line);
+        next if(length($line)==0 || scalar(@elts)!=9);
+        next if(!exists $featlookup{lc($elts[2])});
+        #Split each attribute on its first '=' only and ignore tokens without one, so
+        #that an empty value or a value containing '=' cannot shift the key/value pairing
+        my %attrs;
+        foreach my $attr (split(/;/,$elts[8])){
+            my($key,$value) = split(/=/,$attr,2);
+            next if(!defined $value);
+            $attrs{$key} = $value;
+        }
+        my $geneid;
+        if(exists $attrs{'locus_tag'}){
+            $geneid=$attrs{'locus_tag'};
+        }
+        elsif(exists $attrs{'ID'}){
+            #Can't expect that ID is unique across files, so append sequence name
+            $geneid=$elts[0].'_'.$attrs{'ID'};
+        }
+        else{
+            print STDERR "Skipping unrecognized GFF3 line $line\n";
+            next;
+        }
+        my $fmin = $elts[3];
+        my $fmax = $elts[4];
+        my $orient = $elts[6];
+        my $i=0;
+        while(exists $features->{$geneid}){
+            #Build the new name first so the message reports the name actually used
+            my $renamed = $geneid.'_'.++$i;
+            print "Duplicate named feature $geneid. Renaming to $renamed\n";
+            $geneid = $renamed;
+        }
+        $features->{$geneid}++;
+        die "Unsupported $fmax>=$fmin. Line: $line" if($fmax<=$fmin);
+        die "Bad orient $orient. Line: $line" if($orient ne '+' && $orient ne '-');
+        #Stored with the start lowered by one and an all-match cigar spanning the feature
+        $atree->insert([[$elts[0],$fmin-1,$fmax,$orient,($fmax-$fmin+1)."M"]],'gene:'.$geneid,'gene');
+    }
+}
diff --git a/mapping/intersect.pl b/mapping/intersect.pl
new file mode 100755
index 0000000..80fc5f2
--- /dev/null
+++ b/mapping/intersect.pl
@@ -0,0 +1,58 @@
+use strict;
+use AlignmentTree;
+use Data::Dumper;
+print STDERR "Reading $ARGV[0]\n";
+my $atree = AlignmentTree::deserialize($ARGV[0]); # first argument: index file to query
+print STDERR "Querying $ARGV[1],$ARGV[2],$ARGV[3]\n";
+my @results = $atree->intersect($ARGV[1],$ARGV[2],$ARGV[3]); # remaining arguments are passed straight through as the query
+my $outputtable = []; # table of cells: one row per distinct $r->[0], one column per distinct $r->[1]
+my $rowlookup;
+my $columnlookup;
+$columnlookup->{$ARGV[1]} = 1; # the queried name always gets column 1; column 0 holds the row label
+my $row=0;
+my $column=2;
+foreach my $r (@results){ # first pass: assign row and column numbers in order of first appearance
+ if(!exists $rowlookup->{$r->[0]}){
+ $rowlookup->{$r->[0]}=$row++;
+ }
+ if(!exists $columnlookup->{$r->[1]}){
+ $columnlookup->{$r->[1]}=$column++;
+ }
+foreach my $r (sort {$a->[2] <=> $b->[2]} @results){ # second pass: fill cells with "$r->[2] $r->[3]"; for a repeated cell the hit with the largest $r->[2] is written last
+ $outputtable->[$rowlookup->{$r->[0]}]->[$columnlookup->{$r->[1]}] = "$r->[2] $r->[3]";
+ $outputtable->[$rowlookup->{$r->[0]}]->[0] = $r->[0];
+my $columnwidth=20;
+my $printformat='%-'.$columnwidth.'.'.$columnwidth.'s'; # left-justified and truncated to $columnwidth characters
+foreach my $col (sort {$columnlookup->{$a} <=> $columnlookup->{$b}} keys %$columnlookup){ # header line: column names in column order
+ printf("$printformat\t","$col");
+print "\n";
+foreach my $row (sort {
+ if( $a->[1] eq $b->[1]){
+ $b->[1] cmp $a->[1];
+ }
+ else{
+ $a->[1] <=> $b->[1];
+ }
+ @$outputtable){
+ foreach my $col (@$row){
+ $col = '-' if(!$col); # empty cells are shown as '-'
+ printf("$printformat\t","$col");
+ }
+ print "\n";
diff --git a/mapping/mafindex.pl b/mapping/mafindex.pl
new file mode 100755
index 0000000..4a5a2a8
--- /dev/null
+++ b/mapping/mafindex.pl
@@ -0,0 +1,139 @@
+#./mafindex.pl mugsyindex < mugsy.out
+#Adds an MAF formatted file to a MUGSY formatted index
+#Each alignment is saved as type 'alignment'
+use strict;
+use lib '/usr/local/projects/angiuoli/mugsy_trunk/mapping';
+use lib './';
+use AlignmentTree;
+use Storable qw(store retrieve);
+use Data::Dumper;
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+my $atree = new AlignmentTree();
+if(-e $ARGV[0]){ # extend the index file named by the first argument when it already exists
+ $atree = AlignmentTree::deserialize($ARGV[0]);
+my $currscore;
+my $block = []; # rows of the alignment block currently being read
+my $k=0; # counter used to build "nolabel<n>" labels
+my $i=0; # number of "a" lines seen so far
+my $label;
+while(my $line=<STDIN>){
+ if($line =~ /^a\s+score=([\d\.\-]+)/){ # an "a" line opens a new block, so store the rows collected for the previous one
+ my $name = "WGA_$label"; # $label still holds the previous block's label at this point
+ if(exists $atree->{_alignments}->{"WGA_$label"}){
+ print "Creating new alignment name. $name taken\n";
+ $name = "WGA_".$$."_$i"; # fall back to a name built from the process id and the block counter
+ }
+ if(scalar(@$block)){
+ print "Saving alignments $name with ",scalar(@$block)," sequences\n";
+ $atree->insert($block,"$name","alignment") if(scalar(@$block));
+ }
+ ($label) = $line =~ /label=(\w+)/;
+ $label = "nolabel".++$k if !$label;
+ $currscore=$1;
+ $block=[];
+ $i++;
+ }
+ elsif($line =~ /^s\s+/){
+ my @elts = split(/\s+/,$line);
+ #$elts[1] =~ s/\./_/g;
+ #$elts[1] =~ s/\|/_/g;
+ #[1] - accession
+ #[2] - start
+ #[3] - length
+ #[4] - orient
+ #[5] - seqlen
+ #[6] - seq
+#From UCSC FAQ about MAF format
+# start -- The start of the aligning region in the source sequence. This is a zero-based number. If the strand field is '-' then this is the start relative to the reverse-complemented source sequence.
+# size -- The size of the aligning region in the source sequence. This number is equal to the number of non-dash characters in the alignment text field
+ my $start = $elts[2];
+ my $end = $start+$elts[3];
+ my $orient = $elts[4];
+ if($orient eq '-'){ # re-express a reverse-strand start as an interval measured from the other end of the sequence
+ $start = ($elts[5] - $start - $elts[3]);
+ $end = $start + $elts[3];
+ }
+ my ($cigar,$len) = &get_cigar($elts[6]);
+ my $seq = $elts[1];
+ #Check for species.accession formatted names, trim to accession if the same
+ my($species,$accession) = ($seq =~ /(\S+)\.(\S+)/);
+ if($species ne "" && $species eq $accession){
+ $seq = $accession;
+ }
+ die "Bad orient: $orient\n" if($orient ne '-' && $orient ne '+');
+ print "$seq $start,$end ", $end-$start,"\n";
+ push @$block,[$seq,$start,$end,$orient,$cigar]; # row format stored in the index: [name, start, end, orient, cigar]
+ }
+my $name = "WGA_$label"; # after the loop: store the last block, which no following "a" line flushed
+if(exists $atree->{_alignments}->{"WGA_$label"}){
+ $name = "WGA_".$$."_$i";
+print "Saving alignments $name with ",scalar(@$block)," sequences\n";
+$atree->insert($block,"$name","alignment") if(scalar(@$block));
+print STDERR "Writing index to $ARGV[0]\n";
+# Run-length encode one aligned sequence row as a cigar-like string.
+#   $seqs - alignment text; '-' is a gap column, any other character is a residue
+# Returns ($cigar,$len):
+#   $cigar - concatenation of "<count>M" (run of residues) and "<count>X" (run of
+#            gaps) tokens in row order; undef when the row is empty
+#   $len   - number of non-gap characters in the row
+sub get_cigar{
+    my($seqs) = @_;
+    my $cig;
+    my $len=0;
+    #Each match is one maximal run of gap characters or of non-gap characters
+    while($seqs =~ /(-+|[^-]+)/g){
+        my $run = $1;
+        my $count = length($run);
+        if(substr($run,0,1) eq '-'){
+            #gap run
+            $cig .= $count."X";
+        }
+        else{
+            #residue run; only these count towards the sequence length
+            $cig .= $count."M";
+            $len += $count;
+        }
+    }
+    return ($cig,$len);
+}
diff --git a/mapping/mapfeatures.pl b/mapping/mapfeatures.pl
new file mode 100755
index 0000000..5b73b62
--- /dev/null
+++ b/mapping/mapfeatures.pl
@@ -0,0 +1,3865 @@
+=head1 NAME
+mapfeatures - derives a set of mapped features according to a
+multiple sequence alignment. Reports on the consistency of
+annotated features in the mapping.
+=head1 USAGE
+mapfeatures.pl alignments.index seqs.fasta < features.txt
+Outputs are a series of text reports and an HTML report that can be
+loaded in a web browser
+(1) alignment.index - An index file containing a whole genome multiple
+alignment and genome annotations. This index can be generated with a
+combination of featureindex.pl,mafindex.pl,xmfaindex.pl. The whole
+genome multiple alignment can be produced by a whole genome aligner
+like Mugsy, TBA (indexed using mafindex.pl) or Mauve (indexed using
+xmfaindex.pl). The genome annotations in Genbank or GFF3 format can be
+indexed with featureindex.pl
+(2) seqs.fasta - Multi-FASTA file of the input genomes. These must be
+ the same genomes aligned.
+(3) features.txt - A space delimited file consisting of
+feature_id sequence_id fmin fmax strand
+=head1 SYNOPSIS
+#Example usage
+#Generate whole genome alignment
+mugsy --prefix nmen_v16 v16/*.fsa
+#Index output
+mafindex.pl nmen.index < nmen_v16.maf
+#Index annotations
+featureindex.pl n16.index genbank < nmen_v16.all.gbk > v16annotations.out
+cat v16/*.fsa > v16.all.fsa
+#Run mugsy-annotator
+mugsy-annotator ./n16.index ./v16.all.fsa < v16annotations.out > v16.features.mapping
+#For more detailed output (v16.html, v16.aln.report, v16.table, v16.clusters, v16.edits)
+mugsy-annotator --prefix v16 --print-alignments ./n16.index ./v16.all.fsa < v16annotations.out > v16.features.mapping
+1)Reporting orthologs using whole genome alignment
+The script can be used to produce a list of orthologous genes in the
+case where the input alignments correspond to orthologous regions
+2)Reporting annotation inconsistencies, such as frameshifts or
+varying start sites
+Aligned annotations are further classified and checked for
+consistency of start and stop codons. Inconsistencies may indicate
+annotation error, sequencing errors, or frameshifts. Alternatively,
+the inconsistencies can be due to poor or missing alignments. The
+summary information provided at the end of the output provides an
+indication of the overall consistency of the annotations in the set.
+The script has been used to evaluate consistency of annotations
+across numerous sequenced strains of bacteria and identify likely
+Meant to be used in conjunction with several utility scripts to
+identify orthologs and classify annotations in a set of aligned genomes
+Related scripts
+>mugsymapper aln.maf features.txt > clusters.out
+>gb2annottab genome.gbk1,...,genome.gbkN > orig.annot.tab
+>indextab alignments.index
+>updategb genome.gbk,....,genome.gbkN < annot.updates.tab
+-printAlignments displays wrong frame for gene fragments that have
+more than one start or end in a single display line
+-Will report coverage,identity>1 if there are overlapping alignments
+-Does not detect cases where gene fragments run off end of the contig
+-no command line usage,help
+-need to rename alignmenttree, AlignedIntervalTree
+#Input coordinates are zero start, interbase coordinates
+#0 1 2 3 4
+# A T A C
+#The feature TA above has coordinates 1-3
+#specified in the code as fmin=1 fmax=3. Length is fmax-fmin=2
+#Contact: S. Angiuoli (angiuoli at cs.umd.edu)
+#December 2010
+use strict;
+use lib '/usr/local/projects/angiuoli/mugsy_trunk/mapping';
+use lib './';
+use Pod::Usage;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+#Bioperl is used only for translation machinery
+use Bio::Perl;
+#use Bio::DB::Fasta;
+use Bio::Seq;
+use Bio::SeqIO;
+use Bio::Tools::CodonTable;
+use Bio::Seq::EncodedSeq;
+#use Bio::LiveSeq::Mutation; tried this but couldn't get to work properly
+#Default cutoffs
+use AlignmentTree;
+my %options;
+my $results = GetOptions (\%options,
+ 'prefix=s',
+ 'input_file=s',
+ 'map_file=s',
+ 'featlist=s', #Restrict mapping to list of features
+ 'duplications=s', #Report duplications, requires addl index file
+ 'coverage|c=s',
+ 'query_coverage|q=s',
+ 'identity|i=s',
+ 'sortkeys=s', #sort order of fields 'gfreq','len','afreq' when reporting edits
+ 'reportedits=s', #top number of edits to report, default all
+ 'maxchange=s', #max allowable %length changes
+ 'prefix=s', #Generate output reports with file prefix
+ 'cogformat=s', #Output cog format to stdout
+ 'printalignments',
+ 'printhtml',
+ 'skipframeshifts',
+ #Missing gene options
+ 'minorflen=s',
+ 'maxorflen=s',
+ 'verbose|v', #Verbose warnings
+ 'debug|d=s') || pod2usage(-verbose => 1);
+pod2usage(-verbose=>1) if($options{'help'});
+my $coverage_cutoff = (exists $options{'coverage'}) ? $options{'coverage'} : 0.5;
+my $query_coverage_cutoff = (exists $options{'query_coverage'}) ? $options{'query_coverage'} : 0;
+my $pid_cutoff= (exists $options{'identity'}) ? $options{'identity'} : 0.1;
+print "#Using coverage cutoff:$coverage_cutoff identity:$pid_cutoff query_coverage:$query_coverage_cutoff\n";
+my $MAXORFLEN = (exists $options{'maxorflen'}) ? $options{'maxorflen'} : 30000; #in bp
+my $MINORFLEN = $options{'minorflen'} || 30; #in aa residues
+my $ORFLEN_MAXDELTA = 0.5; #do not consider possible codons that are less than X the length of the maximum annotated ORF
+my $FS_THRESHOLD = 3;
+#'frameshift_consistency=s', #only report frameshifts that occur in < X fraction of aligned sequences. 1 show all possible frameshifts, default 0.5.
+#my $FS_FRACTIONGENOME = (exists $options{'frameshift_consistency'}) ? $options{'frameshift_consistency'} : 0.5;;
+#Used for detecting contig boundaries
+#Flag for checking consistent start,stop
+#Assumes input features are genes
+my $doconsistencychecks=1;
+#Report new ORFs using aligned start codons
+my $dofindneworfs = 0;
+my $autocorrect=0;
+#Only report alternative start codons that
+#results in a longer ORF
+my $longer_altstarts=1;
+my $moreconsistent_altstarts=1;
+#Only report alternative start codons that
+#appear more frequently in the aligned genomes
+my $freq_altstarts=1;
+my $freq_altstops=0;
+my $aligntoken="WGA";
+my $CODON_DELIM = '.';
+my $CODON_DELIM_REGEX = '\.';
+#Output flags
+my $COGoutputformat=(exists $options{'cogformat'}) ? $options{'cogformat'} : 0;
+my $cogfh; #cog format
+my $cfh; #cluster format
+my $ctfh; #table
+my $ctfh2; #table with coords
+my $htmlout=(exists $options{'printhtml'} ? 1 : 0);
+ open $cogfh, "+>$options{'cogformat'}" or die "Can't open COG file $options{'cogformat'}";#\*STDOUT;
+ open $cogfh, "+>$options{'prefix'}clusters.cog";
+if(! $options{'prefix'}){
+ $options{'prefix'} = "mugsyant/run$$";
+ print `mkdir -p $options{'prefix'}`;
+print STDERR "Writing output to $options{'prefix'}clusters.table, $options{'prefix'}clusters.coords.table, $options{'prefix'}clusters.out \n";
+print "#Writing output to $options{'prefix'}clusters.table, $options{'prefix'}clusters.coords.table, $options{'prefix'}clusters.out \n";
+print "#EDITTBL format cluster_id, codon_id, genome_freq, currentannotated_freq, avglen, num_orgswithoverlaps, comments\n";
+open $cfh, "+>$options{'prefix'}clusters.out";
+open $ctfh, "+>$options{'prefix'}clusters.table";
+open $ctfh2, "+>$options{'prefix'}clusters.coords.table";
+my $printalignments=(exists $options{'printalignments'}) ? $options{'printalignments'} : 0;
+my @sortkeys = (exists $options{'sortkeys'}) ? (split(/,/,$options{'sortkeys'})) : ('gfreq','len','afreq');
+if(scalar @sortkeys != 3){
+ print STDERR "Enter sort order using names gfreq,afreq,len for aligned frequency in the genome, annotated frequency, and ORF length. Sort is in descending order, largest value first.\n";
+ print STDERR "eg. --sortkeys gfreq,len,afreq\n";
+ exit 1;
+#Debugging flags
+my $checkbadlen=0;
+my $debug=$options{'debug'};
+my $printskipped=$debug;
+my $verbose=(exists $options{'verbose'} ? 1 : 0); #verbose warnings, mostly for debugging
+ $verbose=$debug;
+#Master list of features and attributes
+#7-startcodon pos
+#8-startcodon aln
+#9-stopcodon pos
+#10-stopcodon aln
+my $features = {};
+my $allseqs = {};
+my $seqindex = {};
+my $featlist;
+if(exists $options{'featlist'}){
+ foreach my $f (split(/\s+/,$options{'featlist'})){
+ $featlist->{$f}=1;
+ }
+if($featlist && scalar (keys %$featlist)){
+ print STDERR "Limiting results to ",scalar(keys %$featlist)," genes\n";
+my $codons = {};
+my $classes_sum = {};
+my $classes_all = {};
+my $newclasses_sum = {};
+#AlignmentTree is a interval tree that contains alignments between sequences
+#and features on those sequences
+my $atree = AlignmentTree::deserialize($ARGV[0]);
+#Read a white space delimited list of features to map from stdin
+my $datree;
+if(-e $options{'duplications'}){
+ $datree = AlignmentTree::deserialize($options{'duplications'});
+my %featlookup;
+my $filetype;
+my $fh;
+$options{'input_file'} = $options{'map_file'} if(exists $options{'map_file'});
+if($options{'input_file'}) {
+ open($fh, "<$options{'input_file'}") or die "Error in opening the file, $options{'input_file'}, $!\n";
+} else {
+ $fh = \*STDIN;
+my $seqref;
+while(my $line=<$fh>){
+ my($name,$seq,$fmin,$fmax,$orient,$polyid,$geneid,$annotations);
+ chomp $line;
+ if($line =~ /\#gff-version 3/){
+ $filetype = 'gff3';
+ $featlookup{'gene'}++;
+ $featlookup{'pseudogene'}++;
+ }
+ elsif($line =~ /^>/ || $line =~ /^Location/){
+ $filetype = 'ptt';
+ if($line =~ /^>(\S+)/){
+ $seqref = $1;
+ }
+ }
+ elsif($line !~ /^\#/){
+ if($filetype eq 'gff3'){
+ #GFF
+ my @elts = split(/\t/,$line);
+ if(scalar(@elts)==9){
+ if(exists $featlookup{lc($elts[2])}){
+ my %attrs = map {split(/=/)} split(/;/,$elts[8]);
+ if(exists $attrs{'locus_tag'}){
+ $name = $attrs{'locus_tag'};
+ }
+ elsif(exists $attrs{'ID'}){
+ #Can't expect that ID is unique across files, so append sequence name
+ $name=$elts[0].'_'.$attrs{'ID'};
+ }
+ if(exists $attrs{'product'}){
+ $annotations .= $attrs{'product'};
+ }
+ if(lc($elts[2]) eq 'pseudogene'){
+ $annotations .= "pseudogene ";
+ }
+ ($seq,$fmin,$fmax,$orient,$polyid,$geneid) = ($elts[0],$elts[3],$elts[4],$elts[6],$name,$name);
+ ($fmin,$fmax) = ($fmin<$fmax) ? ($fmin-1,$fmax) : ($fmax-1,$fmin);
+ }
+ elsif(lc($elts[2]) eq 'cds'){
+ #hack for names from genbank
+ my %attrs = map {split(/=/)} split(/;/,$elts[8]);
+ if(exists $attrs{'product'}){
+ $annotations .= $attrs{'product'};
+ my $cdsname;
+ if(exists $attrs{'locus_tag'}){
+ $cdsname = $attrs{'locus_tag'};
+ if(exists $features->{$cdsname}){
+ $features->{$cdsname}->[11] = $annotations;
+ }
+ }
+ if(exists $attrs{'ID'} && (length($cdsname)>0 && !exists $features->{$cdsname})){
+ #Can't expect that ID is unique across files, so append sequence name
+ $cdsname=$elts[0].'_'.$attrs{'ID'};
+ if(exists $features->{$cdsname}){
+ $features->{$cdsname}->[11] = $annotations;
+ }
+ }
+ }
+ }
+ }
+ else{
+ #print "Skipping $line\n" if($debug);
+ }
+ }
+ elsif($filetype eq 'ptt'){
+ #36..1 - 35 XOCORF_0001 - hypothetical protein
+ my @elts = split(/\t/,$line);
+ ($fmin,$fmax) = ($elts[0] =~ /(\d+)\.\.(\d+)/);
+ ($fmin,$fmax) = ($fmax < $fmin) ? ($fmax,$fmin) : ($fmin,$fmax);
+ $orient = $elts[1];
+ $fmin = $fmin-1;
+ $name = $elts[3];
+ $seq = $seqref;
+ $annotations .= $elts[5];
+ }
+ else{
+ #Custom simple space delim text, one feature per line:
+ #feature_id sequence_id fmin fmax strand [polyid geneid [annotation words...]]
+ #Any columns after geneid are collected in @annots and re-joined with
+ #single spaces into the free-text annotation of the feature
+ my @annots;
+ ($name,$seq,$fmin,$fmax,$orient,$polyid,$geneid,@annots) = split(/\s+/,$line);
+ $annotations .= join (' ',@annots);
+ #Allow for 0,1 orient: a value containing a digit is treated as numeric,
+ #where >0 means forward strand and anything else means reverse strand
+ if($orient =~ /\d/){
+ if($orient > 0){
+ $orient = '+';
+ }
+ else{
+ $orient = '-';
+ }
+ }
+ #After normalization only '+' and '-' are accepted
+ die "Bad orient $orient\n" if($orient ne '-' && $orient ne '+');
+ }
+ if(length($name)>0){
+ die "Unsupported $fmax>=$fmin. $line" if($fmax<=$fmin);
+ if($fmin<0){
+ print STDERR "Illegal fmin $fmin for $seq,$fmin,$fmax,$fmax-$fmin,$orient,$polyid,$geneid\n";
+ $fmin=0;
+ next;
+ }
+ die "Bad orient $orient. $line" if($orient ne '+' && $orient ne '-');
+ my $i=0;
+ while(exists $features->{$name}){
+ print "#Duplicate named feature $name. Renaming to ${name}_$i\n";
+ $name=$name.'_'.++$i;
+ }
+ if(!defined $featlist || exists $featlist->{$name}){
+ $features->{$name} = [$seq,$fmin,$fmax,$fmax-$fmin,$orient,$polyid,$geneid];
+ my($org) = ($seq =~ /([^\.]+)/);
+ $allseqs->{$org}++;
+ #[7]-[10] reserved for start,stop codon info
+ $features->{$name}->[11] = $annotations;
+ }
+ }
+ }
+my @sortedallseqs = sort {$a cmp $b} (keys %$allseqs);
+for(my $i=0;$i<scalar(@sortedallseqs);$i++){
+ $seqindex->{$sortedallseqs[$i]}=$i;
+#Save a list of clusters
+my $clusters = {};
+#Current cluster id, a unique identifier for a cluster
+my $cluster_id = 0;
+#Count of clusters that pass cutoffs
+my $validcluster = 0;
+#All genes are categorized into one of three categories
+#mapped - aligned to other genes in the set above cutoffs
+#unmapped - aligned to other genes in the set but none above cutoffs
+#nomatches- not aligned to any other genes in the input set
+#List of mapped,unmapped,nohit genes
+my $mapped = {};
+my $unmapped = {};
+my $deleted = {};
+my $nomatches = {};
+my $dups = {};
+my $neworfcount = 0;
+my $adjustedorfs = 0;
+#List of newly called ORFs
+my $neworfs = {};
+#and the annotated ORFs they replace
+my $subsumed = {};
+#Map of feature => organism
+my $feat2organism = {};
+my $db;
+#Load the multi-FASTA file given as the second command line argument.
+#Every record is kept in memory in $db, keyed by sequence id, so later
+#lookups do not have to go back to disk.
+if(-f "$ARGV[1]"){
+ print STDERR "Using FASTA file $ARGV[1]. Debug level: $debug\n";
+ #Faster to read everything into RAM
+ #$db = Bio::DB::Fasta->new($ARGV[1],'-reindex'=>1);
+ #ids in file order, used only for the summary line printed below
+ my @ids;# = $db->ids();
+ my $istream = Bio::SeqIO->new(-file => $ARGV[1],
+ -format => 'Fasta');
+ while ( my $seq = $istream->next_seq()){
+ push @ids, $seq->id();
+ #$db is autovivified as a hash ref: sequence id => Bio::Seq object
+ $db->{$seq->id()} = $seq;
+ print "#Storing ",$seq->id(),"\n" if($verbose);
+ }
+ print "#Parsed FASTA sequences for ",join(',',@ids),"\n";
+ print STDERR "No FASTA file provided. Reporting alternative start codons but not calling ORFs\n";
+#The mapping algorithm builds clusters of aligned genes in a greedy
+#fashion, starting with the longest feature in the input set and
+#mapping all aligned features that pass cutoffs. In the case where
+#features are genes and the alignments are orthologous regions, such
+#as those identified by whole genome alignments(WGA), the clusters
+#represent orthologous genes.
+#Sort query genes by length in decreasing order, longest to
+#shortest. In doing so, all aligned genes that cover the query gene
+#above cutoffs are considered putative orthologs to the query. And the
+#query gene is always the longest member of the cluster. The reported
+#%id and %cov are relative to the query
+foreach my $query (sort {$features->{$b}->[3] <=> $features->{$a}->[3]} #Sort on length, decreasing order
+ keys %$features){ #Over all features
+ print "#Processing $query ",`date` if($verbose);
+ #As the algorithm progresses, features are mapped and removed from consideration
+ #Consider genes that remain unmapped or
+ #remain covered by <= cutoff% of length in alignments already considered
+ if(!exists $mapped->{$query} && !exists $deleted->{$query}){
+ #Start a new cluster based on the query gene. Set a new
+ #cluster id; each cluster can also be identified by the query
+ #gene ($query)
+ $cluster_id++;
+ my($mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes) = &buildCluster($atree,$query);
+ print "#MAPPED Num_orgs:",scalar(keys %$mappedorgs)," Num_genes:",scalar(keys %$mappedgenes)," UNMAPPED Num_orgs:",scalar(keys %$unmappedorgs)," Num_genes:",scalar(keys %$unmappedgenes),"\n" if($verbose);
+ die "Less than 2 mapped sequences" if(scalar(keys %$mappedgenes)>1 && scalar(keys %$mappedorgs)<=0);
+ die "No mapped genes" if(scalar(keys%$mappedgenes)<1);
+ #Mark inconsistencies in the cluster and save start,stop codon
+ #positions of annotated genes only
+ #Codon aligned, annotated frequency is also saved as
+ #'start','stop',=>seqname
+ #'pairs'
+ #=>
+ # 'gfreq' -aligned genomic freq
+ # 'afreq' -annotated freq
+ # 'len' - average length
+ #
+ my($feat_attrs,$cluster_attrs,$codons) = &annotateCluster($atree,$mappedgenes,$mappedorgs);
+ my $seq_attrs = {};
+ my $new_orfs = {};
+ #Look for unannotated ORFs in remaining aligned seqs using other annotated start codons
+ #This can also recall orfs in the unmapped set
+ #if($dofindneworfs && !$options{'skipneworfs'}){
+ #$new_orfs = &findnewORFs($db,$atree,$mappedorgs,$mappedgenes,$codons);
+ #}
+ if((scalar(keys %$mappedgenes)>1 && scalar(keys %$mappedorgs)>1)){
+ print "#Cluster WGA$cluster_id codon_pairs:",scalar(keys %{$codons->{'pairs'}}),"\n" if($verbose);
+ #We have a good cluster, save it
+ #Save the cov,pid in master list of mapped genes
+ my $totallen=0;
+ my $maxlen=0;
+ foreach my $feat_name (keys %$mappedgenes){
+ die "Feature $feat_name already mapped" if(exists $mapped->{$feat_name});
+ $mapped->{$feat_name}->{'cov'}=$mappedgenes->{$feat_name}->{'cov'}/$features->{$feat_name}->[3];
+ $mapped->{$feat_name}->{'pid'}=$mappedgenes->{$feat_name}->{'pid'}/$mappedgenes->{$feat_name}->{'len'};
+ $mapped->{$feat_name}->{'cluster_id'}=$cluster_id;
+ $totallen += $features->{$feat_name}->[3];
+ $maxlen = ($features->{$feat_name}->[3] > $maxlen) ? $features->{$feat_name}->[3] : $maxlen;
+ delete $unmapped->{$feat_name};
+ die if(exists $unmappedgenes->{$feat_name});
+ }
+ my $avglen=$totallen/(scalar keys %$mappedgenes);
+ my $classesstr;
+ my $classesallstr;
+ #Save alternative ORFs
+ my $altcodons = {};
+ if(!defined $options{'reportedits'} || $options{'reportedits'} > 0){
+ #Save aligned and annotated codon frequency
+ foreach my $p (keys %{$codons->{'pairs'}}){
+ print "#Analyzing codon pair $p\n" if($verbose);
+ my($startcodon,$stopcodon) = split(/:/,$p);
+ foreach my $seqname (keys %$mappedorgs,keys %$unmappedorgs){
+ print "#Sequence $seqname\n" if($verbose);
+ #if this is the annotated pair
+ if(exists $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} && $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname}->[3]==1){
+ #Do nothing, already annotated
+ $codons->{'pairs'}->{$p}->{'features'}->{$seqname} = $mappedorgs->{$seqname}->{'features'};
+ my($fmin,$fmax,$orient) = &findCoords($atree,$seqname,$startcodon,$stopcodon);
+ my $isorf = &isORF($db,$seqname,$fmin,$fmax,$orient);
+ if($isorf<=0){
+ print "#BAD ORF $seqname,$fmin,$fmax ",join(',',keys %{$mappedorgs->{$seqname}->{'features'}}),"\n" if($verbose);
+ foreach my $feat (keys %{$mappedorgs->{$seqname}->{'features'}}){
+ $feat_attrs->{$feat}->{'CX'}++;
+ }
+ }
+ print "#annotated\n" if($verbose);
+ }
+ else{
+ print "#checking\n" if($verbose);
+ #check if this is an ORF in $seqname
+ my($fmin,$fmax,$orient) = &findCoords($atree,$seqname,$startcodon,$stopcodon);
+ if(defined $fmin && defined $fmax && defined $orient && $fmax>$fmin){
+ die "$atree,$seqname,$startcodon,$stopcodon" if(! defined $fmin || ! defined $fmax);
+ my $isorf = &isORF($db,$seqname,$fmin,$fmax,$orient);
+ if($isorf>0){
+ print "#isORF true\n" if($verbose);
+ #There is an ORF on $seqname over this interval
+ if(exists $unmappedorgs->{$seqname}){
+ die if(exists $mappedorgs->{$seqname});
+ #genome is aligned but no ORFs above cutoffs
+ if(exists $unmappedorgs->{$seqname}->{'features'}){
+ #the region is annotated with an ORF that matches below cutoffs
+ #requires a new ORF that is different than currently annotated
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,-1];
+ $codons->{'pairs'}->{$p}->{'features'}->{$seqname} = $unmappedorgs->{$seqname}->{'features'};
+ }
+ else{
+ #the region is not annotated
+ #Requires a new ORF in an unannotated region
+ my $olapgenes = &getFeaturesByInterval($atree,$seqname,$fmin,$fmax,$orient);
+ my $nummapped=0;
+ if(scalar(keys %$olapgenes)>0){
+ print "#Found ",scalar(keys %$olapgenes)," in region $seqname,$fmin,$fmax with no mapped,unmapped\n" if($debug);
+ foreach my $gene (keys %$olapgenes){
+ if(exists $mapped->{$gene}){
+ $nummapped++;
+ }
+ }
+ }
+ if(scalar (keys %$olapgenes)==0){
+ #the region is not annotated
+ #Requires a new ORF in an unannotated region
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,-2];
+ print "#neworf $p $seqname ",$fmax-$fmin,"\n" if($verbose);
+ }
+ elsif($nummapped==0){
+ #the region is annotated
+ #Requires a alt ORF in an unannotated region
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,-1];
+ $codons->{'pairs'}->{$p}->{'features'}->{$seqname} = $unmappedorgs->{$seqname}->{'features'};
+ }
+ }
+ }
+ else{
+ #the region is aligned with an annotated ORF above cutoffs
+ #requires a new ORF that is different than currently annotated
+ die if(! exists $mappedorgs->{$seqname});
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,0];
+ if(exists $mappedorgs->{$seqname}){
+ $codons->{'pairs'}->{$p}->{'features'}->{$seqname} = $mappedorgs->{$seqname}->{'features'};
+ print "#altorf\n" if($verbose);
+ }
+ else{
+ print "#altorf, prev did not pass cutoffs\n" if($verbose);
+ }
+ }
+ }
+ elsif($isorf==0){
+ print "#isORF false\n" if($verbose);
+ if(! defined $options{'skipframeshifts'}){
+ if(($fmax-$fmin)<$MAXORFLEN && ($fmax-$fmin)>$MINORFLEN){
+ #See if we can call an ORF over this region with frameshifts
+ my $feat_name;
+ my $annotatedstop;
+ my $annotatedstart;
+ if(exists $mappedorgs->{$seqname} && scalar(keys %{$mappedorgs->{$seqname}->{'features'}}) == 1){
+ $feat_name = [keys %{$mappedorgs->{$seqname}->{'features'}}]->[0];
+ $annotatedstop = $features->{$feat_name}->[9] . $CODON_DELIM . $features->{$feat_name}->[10];
+ $annotatedstart = $features->{$feat_name}->[7] . $CODON_DELIM . $features->{$feat_name}->[8];
+ }
+ #Look for possible frameshifts if there are either
+ if(exists $unmappedorgs->{$seqname} #a) No annotations aligned in this region above cutoffs
+ ||
+ (exists $mappedorgs->{$seqname}
+ &&
+ (scalar(keys %{$mappedorgs->{$seqname}->{'features'}}) > 1 #b) Multiple annotated ORFs in this region
+ ||
+ $stopcodon ne $annotatedstop #c) Single annotated ORF with a different stop codon
+ ||
+ $startcodon ne $annotatedstart
+ )
+ )){
+ die "$seqname found in both mapped and unmapped org lists" if(exists $unmappedorgs->{$seqname} && exists $mappedorgs->{$seqname});
+ print "#Considering FS on $seqname for pair $startcodon,$stopcodon annotated:$annotatedstart,$annotatedstop $fmin,$fmax,$orient\n" if($verbose);
+ print "#Considering FS $stopcodon ne $annotatedstop for $feat_name on $seqname\n" if($debug && exists $mappedorgs->{$seqname});
+ #Find most similar sequence that has this ORF
+ my($nearestseq) = &findNearestNeighbor($atree,$seqname,$mappedorgs,$fmin,$fmax);
+ print "#Using $nearestseq as nearest neighbor to $seqname\n" if($verbose);
+ #Look for frameshifting mutations in $seqname
+ my($fs,$netfs) = &reportFrameShifts($atree,$db,$seqname,$nearestseq,$startcodon,$stopcodon);
+ if(ref $fs){
+ my $isorf = &isORF($db,$seqname,$fmin,$fmax,$orient,$fs);
+ print "#Possible ORF with frameshift indels:",scalar(@$fs)," net:$netfs isorf:$isorf\n" if($verbose);
+ if($isorf>0){
+ if($isorf==2 || abs($netfs) < $FS_THRESHOLD){
+ foreach my $fs (sort {$a->[0] <=> $b->[0]} @$fs){
+ if($verbose){
+ print "#FS ",join(',',@$fs)," $netfs $isorf\n";
+ }
+ }
+ print "#Adding frameshift net:",scalar(@$fs)," $netfs\n" if($debug);
+ if(exists $mappedorgs->{$seqname}){
+ $codons->{'pairs'}->{$p}->{'features'}->{$seqname} = $mappedorgs->{$seqname}->{'features'};
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,0,$fs];
+ }
+ else{
+ my $olapgenes = &getFeaturesByInterval($atree,$seqname,$fmin,$fmax,$orient);
+ if(scalar(keys %$olapgenes)>0){
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,0,$fs];
+ }
+ else{
+ print "#Neworf in frameshifted region $seqname,$fmin,$fmax,$orient\n" if($debug);
+ $codons->{'pairs'}->{$p}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,-2,$fs];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ else{
+ print "#Skipping frameshifts check on $seqname range too big or small $fmin-$fmax\n" if($verbose);
+ }
+ }
+ }
+ elsif($isorf == -1){
+ #TODO, check point mutation of start, stop codon
+ }
+ }
+ }
+ }
+ }
+ foreach my $p (keys %{$codons->{'pairs'}}){
+ my $fscount=0;
+ my $orgcount = scalar(keys %{$codons->{'pairs'}->{$p}->{'orgs'}});
+ foreach my $org (keys %{$codons->{'pairs'}->{$p}->{'orgs'}}){
+ if(ref $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[4]){
+ $fscount++;
+ }
+ }
+ foreach my $org (keys %{$codons->{'pairs'}->{$p}->{'orgs'}}){
+ #Skip frameshift if more occurs in more than $FS_FRACTIONGENOME
+ if(ref $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[4]){
+ if(1){#$fscount/$orgcount <= $FS_FRACTIONGENOME){
+ $codons->{'pairs'}->{$p}->{'fsvars'}->{$org}=$codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[4];
+ print "#FS ",join(',',@{$codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[4]}),"\n" if($debug);
+ }
+ else{
+ next;
+ }
+ }
+ print "#CODONPAIR ",join(',',@{$codons->{'pairs'}->{$p}->{'orgs'}->{$org}}),"\n" if($debug);
+ $codons->{'pairs'}->{$p}->{'gfreq'}++;
+ $codons->{'pairs'}->{$p}->{'afreq'}++ if($codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[3]==1); #inc only if annotated
+ $codons->{'pairs'}->{$p}->{'length'}+=($codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[1] - $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[0]);
+ #-2 encodes a new orf, no prior annotation on this $org
+ if($codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[3] == -2){
+ $codons->{'pairs'}->{$p}->{'neworfs'}->{$org}->{'fmin'} = $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[0];
+ $codons->{'pairs'}->{$p}->{'neworfs'}->{$org}->{'fmax'} = $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[1];
+ $codons->{'pairs'}->{$p}->{'neworfs'}->{$org}->{'orient'} = $codons->{'pairs'}->{$p}->{'orgs'}->{$org}->[2];
+ }
+ }
+ $codons->{'pairs'}->{$p}->{'len'} = $codons->{'pairs'}->{$p}->{'length'}/$codons->{'pairs'}->{$p}->{'gfreq'} if($codons->{'pairs'}->{$p}->{'gfreq'} > 0);
+ }
+ $classesstr = join(';',sort {$a cmp $b} keys %{$cluster_attrs});
+ #Suggest edits for inconsistently annotated clusters
+ if($doconsistencychecks){
+ #Choose N best start,stop pairs according to sortkeys
+ my @bestcodonpair = sort {
+ if($codons->{'pairs'}->{$a}->{$sortkeys[0]} eq $codons->{'pairs'}->{$b}->{$sortkeys[0]}){
+ if($codons->{'pairs'}->{$a}->{$sortkeys[1]} eq $codons->{'pairs'}->{$b}->{$sortkeys[1]}){
+ #sort on tertiary sortkey, eg length
+ $codons->{'pairs'}->{$b}->{$sortkeys[2]} <=> $codons->{'pairs'}->{$a}->{$sortkeys[2]};
+ }
+ else{
+ #sort on secondary sortkey, eg annotated frequency
+ $codons->{'pairs'}->{$b}->{$sortkeys[1]} <=> $codons->{'pairs'}->{$a}->{$sortkeys[1]};
+ }
+ }
+ else{
+ #sort on primary sortkey, eg. aligned frequency of start codon in the genome
+ $codons->{'pairs'}->{$b}->{'gfreq'} <=> $codons->{'pairs'}->{$a}->{'gfreq'};
+ }
+ } (keys %{$codons->{'pairs'}});
+ if(scalar(@bestcodonpair)>0){
+ open EFILE, ">$options{'prefix'}cluster$cluster_id.edits.out";
+ for(my $i=0;$i<scalar(@bestcodonpair);$i++){
+ my $bestcodon = $bestcodonpair[$i];
+ if($codons->{'pairs'}->{$bestcodon}->{'gfreq'} > 1){
+ my $codonlength = ($codons->{'pairs'}->{$bestcodon}->{'length'}/$codons->{'pairs'}->{$bestcodon}->{'gfreq'});
+ my $deltafracmax = ($maxlen-$codonlength)/$maxlen;
+ if($deltafracmax < $ORFLEN_MAXDELTA && $deltafracmax > (-1)*$ORFLEN_MAXDELTA){
+ print EFILE ">CLUSTER_$cluster_id $bestcodon\n";
+ foreach my $org (keys %{$codons->{'pairs'}->{$bestcodon}->{'orgs'}}){
+ #Check if there are existing annotations
+ if(scalar(keys %{$codons->{'pairs'}->{$bestcodon}->{'features'}->{$org}})>0){
+ foreach my $feat_name (keys %{$codons->{'pairs'}->{$bestcodon}->{'features'}->{$org}}){
+ my $pred_feat = $codons->{'pairs'}->{$bestcodon}->{'orgs'}->{$org};
+ #Check if codon pair results in an alternative ORF
+ if($pred_feat->[0] ne $features->{$feat_name}->[1] ||
+ $pred_feat->[1] ne $features->{$feat_name}->[2]){
+ my $fs = (defined $codons->{'pairs'}->{$bestcodon}->{'orgs'}->{$org}->[4]) ? "F" : "";
+ #if($fs eq 'F'){
+ #die if(! exists $codons->{'pairs'}->{$bestcodon}->{'fsvars'}->{$org});
+ #my @fsruns = @{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}->{$org}};
+ #foreach my $r (@fsruns){
+ # $fs .= print "[$r->[0]-$r->[1] $r->[2]:$r->[3]] $r->[4]";
+ #}
+ #}
+ my $olapgenes = &getFeaturesByInterval($atree,$org,$pred_feat->[0],$pred_feat->[1],$pred_feat->[2]);
+ my $olaps;
+ foreach my $feat (keys %$olapgenes){
+ if($feat ne $feat_name && ! exists $codons->{'pairs'}->{$bestcodon}->{'features'}->{$org}->{$feat}){
+ print "#Overlapping gene found on $org ",join(",",@{$olapgenes->{$feat}}),"\n" if($verbose);
+ $olaps->{$feat} = $olapgenes->{$feat};
+ $codons->{'pairs'}->{$bestcodon}->{'olaps'}->{$org}++
+ }
+ }
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fmin'}=$pred_feat->[0];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fmax'}=$pred_feat->[1];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'orient'}=$pred_feat->[2];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fs'}=$fs;
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'olaps'}=$olaps;
+ my $sdist;
+ my $sameframe=1;
+ if($pred_feat->[2] eq $features->{$feat_name}->[4]){
+ $sameframe=1;
+ }
+ else{
+ $sameframe=0;
+ }
+ if($pred_feat->[2] eq '+'){
+ $sdist = $pred_feat->[0]-$features->{$feat_name}->[1];
+ }
+ elsif($pred_feat->[2] eq '-'){
+ $sdist = $pred_feat->[1]-$features->{$feat_name}->[2];
+ }
+ print EFILE "$feat_name\t$org\t$pred_feat->[0]\t$pred_feat->[1]\t",($pred_feat->[1] - $pred_feat->[0]),"\t$pred_feat->[2]\t$sameframe\t$sdist\t$fs\t";
+ my @olaplist = keys %$olaps;
+ for(my $i=0;$i<scalar(@olaplist);$i++){
+ print EFILE "$olaplist[$i]($olaps->{$olaplist[$i]}->[3];$olaps->{$olaplist[$i]}->[6] bp;";
+ die "Bad feat $olaplist[$i]" if(! exists $features->{$olaplist[$i]} && scalar(keys %$featlist)==0);
+ printf EFILE "%.1f)",($olaps->{$olaplist[$i]}->[6]/$features->{$olaplist[$i]}->[3]) if(exists $features->{$olaplist[$i]});
+ print EFILE "," if($i<scalar(@olaplist)-1);
+ }
+ print EFILE "\n";
+ }
+ }
+ }
+ else{
+ #Report an annotation in a region with annotations below cutoffs or neworfs
+ my $pred_feat = $codons->{'pairs'}->{$bestcodon}->{'orgs'}->{$org};
+ my $fs = (defined $codons->{'pairs'}->{$bestcodon}->{'orgs'}->{$org}->[4]) ? "F" : "";
+ my $olapgenes = &getFeaturesByInterval($atree,$org,$pred_feat->[0],$pred_feat->[1],$pred_feat->[2]);
+ my $feat_name;
+ if(scalar(keys %$olapgenes)>0){
+ $feat_name = "ALTORF_C$cluster_id";
+ }
+ else{
+ #NEWORF due to frame
+ if(! exists $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}){
+ print STDERR "Unexpected neworf NEWORF_C$cluster_id: ",join(',',@$pred_feat),"\n";
+ $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'fmin'} = $pred_feat->[0];
+ $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'fmax'} = $pred_feat->[1];
+ $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'orient'} = $pred_feat->[2];
+ }
+ $feat_name = "NEWORF_C$cluster_id";
+ }
+ my $sdist;
+ my $sameframe=1;
+ if(exists $features->{[keys %$olapgenes]->[0]}){
+ if($pred_feat->[2] eq $features->{[keys %$olapgenes]->[0]}->[4]){
+ $sameframe=1;
+ }
+ else{
+ $sameframe=0;
+ }
+ if($pred_feat->[2] eq '+'){
+ $sdist = $pred_feat->[0]-$features->{[keys %$olapgenes]->[0]}->[1];
+ }
+ elsif($pred_feat->[2] eq '-'){
+ $sdist = $pred_feat->[1]-$features->{[keys %$olapgenes]->[0]}->[2];
+ }
+ }
+ #name,org,fmin,fmax,len,orient,sameframe,startdist,fs,overlaps
+ print EFILE "$feat_name\t$org\t$pred_feat->[0]\t$pred_feat->[1]\t",($pred_feat->[1] - $pred_feat->[0]),"\t$pred_feat->[2]\t$sameframe\t$sdist\t$fs\t";
+ my @olaplist = keys %$olapgenes;
+ for(my $i=0;$i<scalar(@olaplist);$i++){
+ print EFILE "$olaplist[$i]($olapgenes->{$olaplist[$i]}->[3];$olapgenes->{$olaplist[$i]}->[6] bp;";
+ die if(! exists $features->{$olaplist[$i]} && scalar(keys %$featlist)==0);
+ printf EFILE "%.1f)",($olapgenes->{$olaplist[$i]}->[6]/$features->{$olaplist[$i]}->[3]) if(exists $features->{$olaplist[$i]});
+ print EFILE "," if($i<scalar(@olaplist)-1);
+ $codons->{'pairs'}->{$bestcodon}->{'olapgenes'}->{$org}++;
+ }
+ print EFILE "\n";
+ if(exists $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org} && scalar(keys %$olapgenes)>0){
+ foreach my $gene (keys %$olapgenes){
+ if(exists $mapped->{$gene}){
+ print STDERR "#Neworfs marked in region on $org with genes already mapped into clusters ",join(',',@{$olapgenes->{$gene}}),"\n";
+ }
+ else{
+ print STDERR "#Neworfs marked in region on $org with other annotations ",join(',',@{$olapgenes->{$gene}}),"\n";
+ }
+ }
+ }
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fmin'}=$pred_feat->[0];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fmax'}=$pred_feat->[1];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'orient'}=$pred_feat->[2];
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'fs'}=$fs;
+ $altcodons->{$bestcodon}->{'orgs'}->{$org}->{'olaps'}=$olapgenes;
+ }
+ }
+ $altcodons->{$bestcodon}->{'name'}="ALT$i";
+ $altcodons->{$bestcodon}->{'gfreq'}=$codons->{'pairs'}->{$bestcodon}->{'gfreq'};
+ $altcodons->{$bestcodon}->{'afreq'}=$codons->{'pairs'}->{$bestcodon}->{'afreq'};
+ $altcodons->{$bestcodon}->{'len'}=$codons->{'pairs'}->{$bestcodon}->{'len'};
+ $altcodons->{$bestcodon}->{'neworfs'}=$codons->{'pairs'}->{$bestcodon}->{'neworfs'};
+ $altcodons->{$bestcodon}->{'fs'}=$codons->{'pairs'}->{$bestcodon}->{'fsvars'};
+ my $newclassesstr = $codons->{'pairs'}->{$bestcodon}->{'cluster_attrs'};
+ print "#CODON $bestcodon $codons->{'pairs'}->{$bestcodon}->{'gfreq'} max_annotated_len:$maxlen ";
+ if($debug){
+ print "delta_len_max:$deltafracmax\n";
+ }
+ else{
+ print "\n";
+ }
+ #EDITTBL cluster_id, codon, genome_freq, annotated_freq, len, neworfs, overlaps, comments
+ print "#EDITTBL\tC$cluster_id\t$bestcodon\t$codons->{'pairs'}->{$bestcodon}->{'gfreq'}";
+ if(scalar(keys%{$codons->{'pairs'}->{$bestcodon}->{'neworfs'}}) > 0){
+ print "(N:",scalar(keys%{$codons->{'pairs'}->{$bestcodon}->{'neworfs'}}),")";
+ }
+ if(scalar(keys %{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}}) > 0){
+ print "(F:",scalar(keys%{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}}),")";
+ }
+ print "\t$codons->{'pairs'}->{$bestcodon}->{'afreq'}\t$codons->{'pairs'}->{$bestcodon}->{'len'}\t";
+ print scalar(keys %{$codons->{'pairs'}->{$bestcodon}->{'olaps'}}),"\t";
+ if(scalar(keys %{$codons->{'pairs'}->{$bestcodon}->{'olaps'}})){
+ print "#OVERLAP ";
+ $altcodons->{$bestcodon}->{'isoverlap'}=1;
+ }
+ if($codonlength eq $maxlen){
+ print "#MAXLENEDIT ";
+ $altcodons->{$bestcodon}->{'maxlen'}=1;
+ }
+ if($codons->{'pairs'}->{$bestcodon}->{'gfreq'} eq (scalar(keys %$mappedorgs)+scalar(keys %$unmappedorgs))){
+ print "#FCONSISTENT ";
+ $altcodons->{$bestcodon}->{'fcon'}=1;
+ }
+ print "\n";
+ #Collapse all indels into runs
+ foreach my $org (keys %{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}}){
+ # my @coords = sort {$a->[0] <=> $b->[0]} (@{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}->{$org}});
+# my @runs;
+# my $indelstr1;
+# my $indelstr2;
+# my $last;
+# my $start;
+# my $end;
+# for(my $i=0;$i<@coords;$i++){
+# #print join(',',@{$coords[$i]}),"\n";
+# if($i==0){
+# $start=$coords[$i]->[0];
+# }
+# elsif(abs($last+1 - $coords[$i]->[0]) > 1){
+# push @runs,[$start,$last,$indelstr1,$indelstr2];
+# $indelstr1="";
+# $indelstr2="";
+# $start=$coords[$i]->[0];
+# }
+# $last=$coords[$i]->[0];
+# $indelstr1.=$coords[$i]->[1];
+# $indelstr2.=$coords[$i]->[2];
+# }
+# push @runs,[$start,$last,$indelstr1,$indelstr2];
+# #Remove runs that are multiple of 3
+# my @fsruns;
+# foreach my $r (@runs){
+# die if(length($r->[2]) != length($r->[3]));
+# if(length($r->[2])%3!=0){
+# push @fsruns,$r;
+# }
+# }
+ my @fsruns = @{$codons->{'pairs'}->{$bestcodon}->{'fsvars'}->{$org}};
+ print "#FS $org ";
+ foreach my $r (@fsruns){
+ print "[$r->[0]-$r->[1] $r->[2]:$r->[3]] $r->[4]";
+ }
+ print "\n";
+ }
+ foreach my $org (keys %{$codons->{'pairs'}->{$bestcodon}->{'neworfs'}}){
+ my $fmin = $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'fmin'};
+ my $fmax = $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'fmax'};
+ my $orient = $codons->{'pairs'}->{$bestcodon}->{'neworfs'}->{$org}->{'orient'};
+ my $olapgenes = &getFeaturesByInterval($atree,$org,$fmin,$fmax,$orient);
+ if(scalar (keys %$olapgenes)>0){
+ foreach my $feat (keys %$olapgenes){
+ print STDERR "Unexpected genes found ",join(",",@{$olapgenes->{$feat}}),"\n";
+ }
+ #die;
+ }
+ print "#NEWORF $org $fmin,$fmax,",($fmax-$fmin),",$orient\n";
+ $new_orfs->{$org}++;
+ }
+ }
+ }
+ else{
+ #print STDERR "#WARNING Codon $bestcodon has 0 frequency\n";
+ }
+ }
+ close EFILE;
+ }
+ }
+ }
+ #Print cluster
+ $classesallstr = join(';',sort {$a cmp $b} keys %{$cluster_attrs});
+ $classes_all->{$classesallstr}->{'ngenes'} += scalar(keys %$mappedgenes);
+ $classes_all->{$classesallstr}->{'nclusters'}++;
+ $classes_all->{$classesallstr}->{'new_orfs'}+= scalar(keys %$new_orfs);
+ &reportCluster($query,$mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes,$feat_attrs,$cluster_attrs,$seq_attrs,$new_orfs);
+ $classesstr = join(';',sort {$a cmp $b} keys %{$cluster_attrs});
+ $clusters->{$cluster_id}->{'alts'} = $altcodons;
+ $clusters->{$cluster_id}->{'codons'} = $codons->{'pairs'};
+ $classes_sum->{$classesstr}->{'ngenes'} +=scalar(keys %$mappedgenes);
+ $classes_sum->{$classesstr}->{'nclusters'}++;
+ $classes_sum->{$classesstr}->{'new_orfs'}+= scalar(keys %$new_orfs);
+ $neworfcount+=scalar(keys %$new_orfs);
+ $validcluster++;
+ print "#VALID\tCLUSTER_$cluster_id\tNum_organisms=",scalar(keys %$mappedorgs)+1,
+ "\tNum_genes=",scalar(keys %$mappedgenes),"\n" if($debug);;
+ #For unmapped genes, save the best overlapping alignment.
+ #$unmapped->{gene}->{'cov'} is stored as a fraction of the gene length, so
+ #each candidate is normalized the same way before it is compared.
+ foreach my $feat_name (keys %$unmappedgenes){
+     #Genes already placed in a cluster are not tracked as unmapped
+     next if(exists $mapped->{$feat_name});
+     my $hit = $unmappedgenes->{$feat_name};
+     my $covfrac = $hit->{'cov'}/$features->{$feat_name}->[3]; #%coverage over gene length
+     #Testing !exists first also keeps the comparison from autovivifying an
+     #empty $unmapped entry for a gene that has never been seen
+     if(!exists $unmapped->{$feat_name} #first alignment encountered
+        || $covfrac > $unmapped->{$feat_name}->{'cov'}){ #better coverage
+         $unmapped->{$feat_name}->{'cov'} = $covfrac;
+         #%id over aligned length; nothing aligned means no identity to report
+         $unmapped->{$feat_name}->{'pid'} = $hit->{'len'} ? $hit->{'pid'}/$hit->{'len'} : 0;
+         $unmapped->{$feat_name}->{'len'} = $hit->{'len'};
+         $unmapped->{$feat_name}->{'WGA_cluster'} = $cluster_id;
+     }
+ }
+ }
+ else{
+ die "Feature $query not mapped but marked so" if(exists $mapped->{$query});
+ #Cluster is a singleton, skip it or print for debugging
+ print "#SKIPPED\t$query\tWGA$cluster_id\tNum_organisms=",scalar(keys %$mappedorgs),
+ "\tNum_genes=",scalar(keys %$unmappedgenes),"\n" if($debug);
+ #This cluster was skipped because it does not pass coverage cutoffs
+ #Optionally print
+ if($printskipped){
+     #One line per organism whose genes overlapped the query but failed the
+     #cutoffs: gene names, fractional coverage, identity and aligned span.
+     #The line is only emitted when $debug is also set.
+     #print "#$query\tWGA$cluster_id\t$currorg\tcov:",$qcov/($fmax-$fmin),"\tid:1\tspan:$fmin-$fmax\tlen:",$fmax-$fmin,"\n";
+     foreach my $organism (sort {$a cmp $b} keys %$unmappedorgs){
+         if(ref $unmappedorgs->{$organism} && exists $unmappedorgs->{$organism}->{'features'}){
+             my($start,$end) = &getspan($unmappedgenes,keys %{$unmappedorgs->{$organism}->{'features'}});
+             #Genes ordered by their fmin coordinate
+             my @ogenes = sort {$features->{$a}->[1] <=> $features->{$b}->[1]} (keys %{$unmappedorgs->{$organism}->{'features'}});
+             my @ocovs = map {sprintf("%.2f",$unmappedgenes->{$_}->{'cov'}/$features->{$_}->[3])} (@ogenes);
+             #%id over aligned length; report 0 when nothing was aligned
+             my @oids = map {sprintf("%.2f",$unmappedgenes->{$_}->{'len'} ? $unmappedgenes->{$_}->{'pid'}/$unmappedgenes->{$_}->{'len'} : 0)} (@ogenes);
+             print "#",join(',',@ogenes),
+                 "\tWGA$cluster_id",
+                 "\t$organism",
+                 "\tcov:",join(',',@ocovs),
+                 "\tid:",join(',',@oids),
+                 "\tspan:$start-$end len:",$end-$start,
+                 "\n" if($debug);
+         }
+     }
+ }
+ }
+ #foreach my $organism (keys %$new_orfs){
+ #my $orfidx=0;
+ #foreach my $alt (@{$new_orfs->{$organism}}){
+ #$neworfcount++;
+ #}
+ #}
+ if($autocorrect){
+     #Auto-correct cluster: for each organism, replace the annotated genes
+     #with the longest qualifying alternative ORF (an alt_start or alt_fs
+     #entry of $seq_attrs), re-classify the edited cluster, then remove the
+     #temporary ORFs again.
+     my @neworfs;
+     foreach my $organism (sort {$a cmp $b} keys %$mappedorgs){
+         #Gene names are stored under the 'features' key of each organism
+         #entry (the entry also carries 'qcov'), so iterate that sub-hash
+         my @ogenes = sort {$features->{$a}->[1] <=> $features->{$b}->[1]} (keys %{$mappedorgs->{$organism}->{'features'}});
+         my $longestorf=0;
+         my $longestpairc=0;
+         foreach my $gene (@ogenes){
+             $longestorf = ($features->{$gene}->[3] > $longestorf) ? $features->{$gene}->[3] : $longestorf;
+             $longestpairc = ($feat_attrs->{$gene}->{'pairfreq'} > $longestpairc) ? $feat_attrs->{$gene}->{'pairfreq'} : $longestpairc;
+         }
+         if(exists $seq_attrs->{$organism}){
+             #Candidate replacements: [name,start,end,len,orient]
+             my @alts;
+             my $orfidx=0;
+             #Report alternative start sites if they result in a longer ORF
+             foreach my $alt (@{$seq_attrs->{$organism}}){
+                 if($alt =~ /alt_start/){
+                     print "#$alt\n" if($debug);
+                     my($astart,$aend,$aorient,$alen) = ($alt =~ /alt_start=(\d+)-(\d+),orient\:([^,]+),len\:(\d+)/);
+                     my($apairfreq) = ($alt =~ /pairfreq:(\d+)/);
+                     print "#alt $astart,$aend,$aorient,$alen\n" if($debug);
+                     print STDERR "BAD $alt" if(!$astart || !$aend || !$aorient || !$alen);
+                     #Only keep if longer than the annotated ORFs (when
+                     #$longer_altstarts) and used by more genomes (when
+                     #$moreconsistent_altstarts)
+                     if(!$longer_altstarts || $alen>$longestorf){
+                         if(!$moreconsistent_altstarts || $apairfreq>$longestpairc){
+                             push @alts,["ALTSTARTgene$organism$orfidx",$astart,$aend,$aend-$astart,$aorient];
+                             $orfidx++;
+                         }
+                     }
+                     else{
+                         print "#Skipping $alt $alen<$longestorf\n" if($debug);
+                     }
+                 }
+             }
+             #Report frameshifts if they result in a longer ORF
+             #This should also include alt start, frameshift pairs if they result in a longer ORF
+             foreach my $alt (@{$seq_attrs->{$organism}}){
+                 if($alt =~ /alt_fs/){
+                     print "#$alt\n" if($debug);
+                     my($astart,$aend,$aorient,$alen) = ($alt =~ /alt_fs=(\d+)-(\d+),orient\:([^,]+),len\:(\d+)/);
+                     print "#alt $astart,$aend,$aorient,$alen\n" if($debug);
+                     die "$alt" if(!$astart || !$aend || !$aorient || !$alen);
+                     if(!$longer_altstarts || $alen>$longestorf){
+                         push @alts,["ALTFSgene$organism$orfidx",$astart,$aend,$aend-$astart,$aorient];
+                         $orfidx++;
+                     }
+                     else{
+                         print "#Skipping $alt $alen<$longestorf\n" if($debug);
+                     }
+                 }
+             }
+             #Replace $ogenes with the longest candidate
+             if(scalar(@alts)>0){
+                 my @sortedalts = sort {$b->[3] <=> $a->[3]} @alts;
+                 my $neworf = $sortedalts[0];
+                 print "#Num genes ",scalar(keys %$mappedgenes),"\n" if($debug);
+                 foreach my $gene (@ogenes){
+                     $deleted->{$gene}++;
+                     delete $mappedgenes->{$gene};
+                     delete $mappedorgs->{$organism}->{'features'}->{$gene};
+                     print "#Possible deleting $gene\n" if($debug);
+                 }
+                 #if($neworfcov/$featlen >= $coverage_cutoff && #%coverage over matching gene length
+                 # $neworfpid/$alnlen >= $pid_cutoff){ #%id over aligned length onl
+                 $features->{$neworf->[0]} = [$organism,$neworf->[1],$neworf->[2],$neworf->[3],$neworf->[4]];
+                 $mappedgenes->{$neworf->[0]}->{'fmin'} = $neworf->[1];
+                 $mappedgenes->{$neworf->[0]}->{'fmax'} = $neworf->[2];
+                 $mappedgenes->{$neworf->[0]}->{'len'} = $neworf->[3];
+                 $mappedgenes->{$neworf->[0]}->{'relorient'} = $neworf->[4];
+                 $mappedorgs->{$organism}->{'features'}->{$neworf->[0]}++;
+                 $mappedorgs->{$organism}->{'qcov'} = '?';
+                 #Add new gene
+                 print "#Adding ",join(',',@$neworf),"\n" if($debug);
+                 push @neworfs,$neworf->[0];
+                 $adjustedorfs++;
+                 print "#Num genes ",scalar(keys %$mappedgenes),"\n" if($debug);
+             }
+         }
+     }
+     #Re-classify the edited cluster and tally its class string
+     if(scalar(keys %$mappedgenes)>1 && scalar(keys %$mappedorgs)>1){
+         my($feat_attrs,$cluster_attrs,$codons) = &annotateCluster($atree,$mappedgenes,$mappedorgs);
+         my $classesstr = join(';',sort {$a cmp $b} keys %{$cluster_attrs});
+         $newclasses_sum->{$classesstr}->{'ngenes'} +=scalar(keys %$mappedgenes);
+         $newclasses_sum->{$classesstr}->{'nclusters'}++;
+     }
+     #The temporary ORFs must not leak into the global feature list
+     foreach my $neworf (@neworfs){
+         delete $mappedgenes->{$neworf};
+         delete $features->{$neworf};
+     }
+ }
+ }
+print "#NUM CLUSTERS $validcluster\n";
+#Mark the remaining features as singletons categorized as
+#1)not found in any alignments !exists mapped && !exists unmapped
+#2)aligned but below cutoffs !exists mapped && exists unmapped
+#If duplications file provided, mark accordingly
+($nomatches,$dups) = &findSingletons($atree,$mapped,$unmapped,$subsumed,$datree);
+#Calculate summary stats
+my $avgcov=0;
+my $avgid=0;
+my $mappedgenescount=0;
+my $unmappedgenescount=0;
+my $avgunmappedcov=0;
+my $avgunmappedid=0;
+my $unmappeddups=0;
+my $nohit=0;
+my $nohitdupcount=0;
+my $neworfscount=0;
+#Every input feature falls into exactly one bucket, tested in this order:
+#mapped, unmapped, duplication without a hit, no hit, or discarded
+foreach my $feat_name (keys %$features){
+    if(exists $mapped->{$feat_name}){
+        die "Feature $feat_name is both mapped and unmapped" if(exists $unmapped->{$feat_name});
+        die "Feature $feat_name is both mapped and marked as having no match" if(exists $nomatches->{$feat_name});
+        #cov and pid are fractions; values above 1 are only reported, not clamped
+        if($mapped->{$feat_name}->{'cov'}>1){
+            print STDERR "Bad cov ",$mapped->{$feat_name}->{'cov'},"\n" if($verbose);
+            #$mapped->{$feat_name}->{'cov'}=1;
+        }
+        if($mapped->{$feat_name}->{'pid'}>1){
+            print STDERR "Bad id ",$mapped->{$feat_name}->{'pid'},"\n" if($verbose);
+            #$mapped->{$feat_name}->{'pid'}=1;
+        }
+        $avgcov+=$mapped->{$feat_name}->{'cov'};
+        $avgid+=$mapped->{$feat_name}->{'pid'};
+        $mappedgenescount++;
+    }
+    elsif(exists $unmapped->{$feat_name}){
+        die "Feature $feat_name is both unmapped and marked as having no match" if(exists $nomatches->{$feat_name});
+        if($unmapped->{$feat_name}->{'cov'}>1){
+            print STDERR "Bad cov ",$unmapped->{$feat_name}->{'cov'},"\n" if($verbose);
+            #$unmapped->{$feat_name}->{'cov'}=1;
+        }
+        if($unmapped->{$feat_name}->{'pid'}>1){
+            print STDERR "Bad id ",$unmapped->{$feat_name}->{'pid'},"\n" if($verbose);
+            #$unmapped->{$feat_name}->{'pid'}=1;
+        }
+        $avgunmappedcov+=$unmapped->{$feat_name}->{'cov'};
+        $avgunmappedid+=$unmapped->{$feat_name}->{'pid'};
+        $unmappedgenescount++;
+        if(exists $dups->{$feat_name}){
+            $unmappeddups++;
+        }
+    }
+    elsif(exists $dups->{$feat_name}){
+        $nohitdupcount++;
+    }
+    elsif(exists $nomatches->{$feat_name}){
+        $nohit++;
+    }
+    else{
+        #Genes should be categorized in mapped,unmapped,singletons
+        #The rest are either deleted or newly called ORFs that are discarded
+        die "Feature $feat_name is not categorized" if(!exists $subsumed->{$feat_name} && !exists $neworfs->{$feat_name});
+    }
+}
+print STDERR "#Mismatch between NOHIT=$nohit and nomatches lookup=",scalar(keys %$nomatches),"\n" if($nohit != scalar(keys %$nomatches));
+#Print summary stats
+print "\n\n\n";
+print "Class legend\n";
+print "C{S,E}1 - consistent start,stop\n";
+print "C{S,E}2 - inconsistent start,stop. More than one annotated in a group\n";
+print "C{S,E}3 - unaligned start,stop\n";
+print "C{S,E}4 - invalid start,stop according to translation table\n";
+print "C{S,E}0 - start,stop at/off contig boundary\n";
+print "CM1 - multiple gene fragments. possible interrupted genes\n";
+print "CX - invalid translation\n";
+#print "CS/E2.1 - there is only one annotated gene for each genome, but not all genomes use the same start/stop\n";
+#print "CS/E2.2 - the start/stop of some genes fall in a gapped region of the alignment\n";
+#Class strings are listed by decreasing number of genes
+print "Summary classes\n";
+foreach my $cstr (sort {$classes_sum->{$b}->{'ngenes'} <=> $classes_sum->{$a}->{'ngenes'}} (keys %$classes_sum)){
+    print "$cstr: num_genes:$classes_sum->{$cstr}->{'ngenes'} num_clusters:$classes_sum->{$cstr}->{'nclusters'}\n";
+}
+print "Complete classes\n";
+foreach my $cstr (sort {$classes_all->{$b}->{'ngenes'} <=> $classes_all->{$a}->{'ngenes'}} (keys %$classes_all)){
+    print "$cstr: num_genes:$classes_all->{$cstr}->{'ngenes'} num_clusters:$classes_all->{$cstr}->{'nclusters'}\n";
+}
+print "Number of clusters containing aligned features\n";
+print "CLUSTERS:$validcluster\n";
+print "Number aligned features mapped into clusters\n";
+#Averages are skipped when the count is zero to avoid dividing by zero
+print "MAPPED:$mappedgenescount AVGCOV:",$avgcov/$mappedgenescount," AVGID:",$avgid/$mappedgenescount,"\n" if($mappedgenescount);
+print "Number features with an overlapping alignment but are not mapped into clusters\n";
+#Both averages are over the unmapped features; NUMDUPS is the number of
+#unmapped features that are also listed as duplications
+print "UNMAPPED:$unmappedgenescount AVGCOV:",$avgunmappedcov/$unmappedgenescount," AVGID:",$avgunmappedid/$unmappedgenescount," NUMDUPS:$unmappeddups\n" if($unmappedgenescount);
+if(exists $options{'duplications'}){
+    print "Number of features with no mapping and marked as duplications\n";
+    print "DUPS:$nohitdupcount\n";
+}
+print "Number of features with no overlapping alignment\n";
+print "NOHIT:$nohit\n";
+print "Number of missing annotations\n";
+print "MISSORF:$neworfcount\n";
+close $cfh;
+close $ctfh;
+close $ctfh2;
+# Subroutines
+#Primary method of obtaining mapped annotation from an alignment
+#Build a cluster of aligned features/genes based on a single query
+#gene, $query
+#Arguments:
+# $atree - alignment tree object; map(), intersect() and getAlignment() are called on it
+# $query - name of the query feature, a key of $features
+#Returns ($mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes)
+# $mappedorgs,$unmappedorgs - seqname => {'features'=>{feature name=>count},'qcov'=>coverage of the query}
+# $mappedgenes,$unmappedgenes - feature name => {fmin,fmax,cov,pid,len,relorient,frame,frameinqry,relqrysdist}
+# "mapped" entries pass the coverage and identity cutoffs; the query itself
+# is always placed in the mapped lists
+#Reads $features,$mapped,$deleted,$cluster_id and the cutoff settings from
+#the enclosing file scope and records feature => seqname in $feat2organism
+#TODO: Confirm qcov,qpid,cov,pid are calculated correctly. Correct for overlapping alignments
+sub buildCluster{
+    my ($atree,$query) = @_;
+    #Attributes of the query
+    my $qseqname = $features->{$query}->[0];
+    $feat2organism->{$query} = $qseqname;
+    my $qcurrorg = '?';
+    my $qcov = 0;
+    my $qpid = 0;
+    my $qalnfmin = undef;
+    my $qalnfmax = undef;
+    my $qfmin = $features->{$query}->[1];
+    my $qfmax = $features->{$query}->[2];
+    my $qfeatlen = $qfmax-$qfmin;
+    my $qrelorient = 0;
+    my $qorient = $features->{$query}->[4];
+    print "#MAPFEATURE Mapping $query $qseqname:$qfmin-$qfmax len:",$qfmax-$qfmin,"\n" if($debug);
+    #AlignmentTree::map()
+    #returns [0=alignment_name,1=seqid,2=align_start,3=align_stop,4=align_cov,5=feature_name,6=seqid,7=feature_cov,8=feature_pid]
+    #NOTE(review): this result is never read; @isect is overwritten by the
+    #intersect() calls below. Kept in case map() has side effects - confirm and remove
+    my @isect = $atree->map($qseqname,$qfmin,$qfmax);
+    #List of alignments that comprise the current cluster
+    my $goodalignments = {};
+    #List of seqs that overlap query in the cluster
+    my $allseqs = {};
+    #List of organism_ids in the current cluster
+    my $mappedorgs = {}; #passes cutoffs
+    my $unmappedorgs = {}; #do not pass cutoffs
+    #List of genes in the current cluster
+    my $mappedgenes = {}; #passes cutoffs
+    my $unmappedgenes = {}; #do not pass cutoffs
+    #Contains list of annotations that are overlapping in an alignment
+    my $alnfeats = {};
+    my $alnorgs = {};
+    my $valid=0;
+    my $nisect;
+    ($nisect,$allseqs,$goodalignments) = &getAlignedFeatures($atree,$qseqname,$query,$qfmin,$qfmax,'gene');
+    if($verbose){
+        print "#QUERY=$query len=$features->{$query}->[3]";
+        print " coords=$qfmin-$qfmax len=$features->{$query}->[3] strand=$features->{$query}->[4]";
+        print " Num_alignments=",scalar(keys %$goodalignments);
+        print "\n";
+    }
+    #Report annotated frame relative to query
+    #For every sequence aligned to the query start codon, record where that
+    #codon falls on the sequence
+    my $seqalnpos;
+    if($qorient eq '+'){
+        @isect = $atree->intersect($qseqname,$qfmin,$qfmin+3);
+    }
+    else{
+        @isect = $atree->intersect($qseqname,$qfmax-3,$qfmax);
+    }
+    foreach my $r (@isect){
+        my $feat_name = $r->[0];
+        my $seqname = $r->[1];
+        #print "#Setting $seqname query frame: qorient $qorient, alnframe: $r->[7] $r->[2]-$r->[3]\n";
+        if(exists $goodalignments->{$feat_name} && $feat_name =~ /^WGA/){
+            #The position depends only on the orientation in $r->[7], not
+            #on the query strand
+            $seqalnpos->{$seqname} = ($r->[7] eq '-') ? $r->[3] : $r->[2];
+        }
+    }
+    foreach my $r ( sort { $features->{$b->[0]}->[3] <=> $features->{$a->[0]}->[3] } #sort on feature length
+                    @$nisect){
+        my $feat_name = $r->[0];
+        my $seqname = $r->[1];
+        my $align_name = $r->[5];
+        #Check if we want to consider this alignment
+        if(exists $goodalignments->{$align_name}){
+            my($alnobj,$bv,$width) = $atree->getAlignment($align_name);
+            $feat_name =~ s/gene\://;
+            if(!exists $features->{$feat_name}){
+                print "#Bad feature found $feat_name. Not in input file. Skipping\n" if($debug);
+                next;
+            }
+            #Capture some stats on the matching genes
+            #TODO the cov,pid stats assume non-overlapping alignments
+            if($query ne $feat_name){
+                #Only report genes that have not been mapped
+                if(!exists $mapped->{$feat_name} && !exists $deleted->{$feat_name} && exists $features->{$feat_name}){
+                    print "#MAP:",join("\t",$cluster_id,@$r),"\n" if($debug);
+                    die "Mismatching orientation for $feat_name. Mapping showing $r->[12]. Input reporting $features->{$feat_name}->[4]" if($r->[12] ne $features->{$feat_name}->[4]);
+                    die "fmax < fmin" if($r->[3]<$r->[2]);
+                    #Widen the aligned span of this gene over all of its alignments
+                    if(exists $alnfeats->{$feat_name}->{'fmin'}){
+                        $alnfeats->{$feat_name}->{'fmin'}=($r->[2]<$alnfeats->{$feat_name}->{'fmin'}) ? $r->[2]: $alnfeats->{$feat_name}->{'fmin'};
+                    }
+                    else{
+                        $alnfeats->{$feat_name}->{'fmin'}=$r->[2];
+                    }
+                    if(exists $alnfeats->{$feat_name}->{'fmax'}){
+                        $alnfeats->{$feat_name}->{'fmax'}=($r->[3]>$alnfeats->{$feat_name}->{'fmax'}) ? $r->[3]: $alnfeats->{$feat_name}->{'fmax'};
+                    }
+                    else{
+                        $alnfeats->{$feat_name}->{'fmax'}=$r->[3];
+                    }
+                    #Sum the coverage for each gene versus the query
+                    $alnfeats->{$feat_name}->{'cov'}+=$r->[7];
+                    $alnfeats->{$feat_name}->{'pid'}+=$r->[8];
+                    $alnfeats->{$feat_name}->{'len'}+=($r->[3]-$r->[2]);
+                    die "Bad pid $alnfeats->{$feat_name}->{'pid'} > $alnfeats->{$feat_name}->{'len'} from pid:$r->[8] len:($r->[3]-$r->[2]) ".($r->[3]-$r->[2]) if($alnfeats->{$feat_name}->{'pid'} > $alnfeats->{$feat_name}->{'len'});
+                    $alnfeats->{$feat_name}->{'relorient'} = $r->[11];
+                    $feat2organism->{$feat_name} = $r->[1];
+                    #num aligned residues $r->[8] indicates matches on query seq
+                    # |NNNNNN---NNNNNNNNN| query 15 residues
+                    # |NNNNNNNNN---NNNNNN| hit 15 residues - 9 matching , qcov=9/15, cov=9/15
+                    $alnfeats->{$feat_name}->{'qcov'}+=$r->[4];
+                    print "#$feat_name $align_name $r->[2]-$r->[3] len:",$r->[3]-$r->[2],
+                        ",$alnfeats->{$feat_name}->{'len'} cov:$r->[7],$alnfeats->{$feat_name}->{'cov'} id:$r->[8],$alnfeats->{$feat_name}->{'pid'} alnorient:$r->[10] featorient:$r->[11]\n" if($debug);
+                }
+                else{
+                    #This feature has already been mapped
+                    print "#Alternative mapping for $feat_name cov:$r->[4] pid:$r->[8] len:",$r->[3]-$r->[2]," matchingorient:$r->[10],$r->[11]\n" if($debug);
+                }
+            }
+            else{
+                die if($feat_name ne $query);
+                die if($r->[10] ne $r->[11]);
+                #Capture some stats on the query
+                if(defined $qalnfmin){
+                    $qalnfmin = ($r->[2] < $qalnfmin) ? $r->[2] : $qalnfmin;
+                }
+                else{
+                    $qalnfmin = $r->[2];
+                }
+                if(defined $qalnfmax){
+                    $qalnfmax = ($r->[3] > $qalnfmax) ? $r->[3] : $qalnfmax;
+                }
+                else{
+                    $qalnfmax = $r->[3];
+                }
+                $qcov += $r->[7];
+                $qcurrorg = $r->[6];
+                $qpid += $r->[8];
+                $qrelorient = $r->[10];
+            }
+        }
+    }
+    #The query is always a member of its own cluster
+    $mappedgenes->{$query}->{'fmin'} = $qalnfmin;
+    $mappedgenes->{$query}->{'fmax'} = $qalnfmax;
+    $mappedgenes->{$query}->{'cov'} = $qalnfmax-$qalnfmin;#$qcov;
+    $mappedgenes->{$query}->{'pid'} = $qpid;#TODO, not pid, rather %aln: allows mismatches but no gaps
+    $mappedgenes->{$query}->{'len'} = $qfeatlen;
+    $mappedgenes->{$query}->{'relorient'} = $qrelorient;
+    $mappedgenes->{$query}->{'alignments'} = [keys %$goodalignments];
+    $mappedorgs->{$qcurrorg}->{'features'}->{$query}++;
+    $mappedorgs->{$qcurrorg}->{'qcov'} = $qalnfmax-$qalnfmin;
+    #Set query coverage
+    foreach my $feat_name (keys %$alnfeats){
+        my $seqname = $features->{$feat_name}->[0];
+        my $fmin = $features->{$feat_name}->[1];
+        my $fmax = $features->{$feat_name}->[2];
+        my $orient = $features->{$feat_name}->[4];
+        my @isect;
+        my $qmatchstart;
+        #Look up what is aligned to this gene's start codon
+        if($orient eq '+'){
+            @isect = $atree->intersect($seqname,$fmin,$fmin+3);
+        }
+        else{
+            @isect = $atree->intersect($seqname,$fmax-3,$fmax);
+        }
+        my $alignedstart=0;
+        foreach my $r (@isect){
+            #Inside this loop $feat_name is the name of the intersected alignment
+            my $feat_name = $r->[0];
+            if(exists $goodalignments->{$feat_name} && $feat_name =~ /^WGA/){
+                if($r->[1] eq $qseqname){
+                    #Position on the query sequence aligned to the gene start
+                    if($r->[7] eq '-'){
+                        $qmatchstart=$r->[3];
+                    }
+                    else{
+                        $qmatchstart=$r->[2];
+                    }
+                }
+                elsif($r->[1] eq $seqname){
+                    #The start codon counts as aligned only if all 3 bases are covered
+                    if(($orient eq '+' && $fmin == $r->[2] && $fmin+3 == $r->[3]) ||
+                       ($orient eq '-' && $fmax-3 == $r->[2] && $fmax == $r->[3])){
+                        $alignedstart=1;
+                        print "#Aligned start $feat_name $seqname $orient $fmin-$fmax $r->[2]-$r->[3]\n" if($debug);
+                    }
+                    else{
+                        print "#Unaligned start for $feat_name $seqname $fmin == $r->[2] && $fmax == $r->[3]\n" if($debug);
+                    }
+                }
+            }
+        }
+        my $featlen = $fmax-$fmin;
+        #Only genes that pass the cutoffs contribute to their organism's query coverage
+        if($alnfeats->{$feat_name}->{'cov'}/$featlen >= $coverage_cutoff && #%coverage over matching gene length
+           $alnfeats->{$feat_name}->{'pid'}/$alnfeats->{$feat_name}->{'len'} >= $pid_cutoff){ #%id over aligned length only
+            print "Summing query coverage feat_name $feat_name $feat2organism->{$feat_name} = $alnfeats->{$feat_name}->{'qcov'}. Current total $alnorgs->{$feat2organism->{$feat_name}}->{'qcov'}\n" if($debug);
+            $alnorgs->{$feat2organism->{$feat_name}}->{'qcov'} += $alnfeats->{$feat_name}->{'qcov'};
+        }
+        #Frame of the gene relative to the query start; only set when the
+        #gene start is aligned and the query start maps onto this sequence
+        if(exists $seqalnpos->{$seqname} && $alignedstart){
+            if($features->{$feat_name}->[4] eq '+' ){
+                #print "#Feat $feat_name $features->{$feat_name}->[4] $seqalnpos->{$seqname}-$fmin ",$seqalnpos->{$seqname}-$fmin," ",($seqalnpos->{$seqname}-$fmax)%3,"\n";
+                $alnfeats->{$feat_name}->{'relqrysdist'}=($seqalnpos->{$seqname}-$fmin);
+                $alnfeats->{$feat_name}->{'frame'}=($seqalnpos->{$seqname}-$fmin)%3;
+                $alnfeats->{$feat_name}->{'frameinqry'}=($qfmin-$qmatchstart)%3;
+                #print "#qframe $feat_name $alnfeats->{$feat_name}->{'frameinqry'}\n";
+            }
+            else{
+                #print "#Feat $feat_name $features->{$feat_name}->[4] $seqalnpos->{$seqname}-$fmax ",$seqalnpos->{$seqname}-$fmax," ",($seqalnpos->{$seqname}-$fmax)%3,"\n";
+                $alnfeats->{$feat_name}->{'relqrysdist'}=($seqalnpos->{$seqname}-$fmax);
+                $alnfeats->{$feat_name}->{'frame'}=($seqalnpos->{$seqname}-$fmax)%3;
+                $alnfeats->{$feat_name}->{'frameinqry'}=($qfmax-$qmatchstart)%3;
+                #print "#qframe $feat_name $alnfeats->{$feat_name}->{'frameinqry'}\n";
+            }
+        }
+    }
+    #Sort every aligned gene into the mapped or unmapped list
+    foreach my $feat_name (keys %$alnfeats){
+        #Check gene is part of input feature list [optional]
+        die "Bad gene $feat_name" if(! exists $features->{$feat_name});
+        die "Query gene should not map to itself" if($feat_name eq $query);
+        #die "Can't find $feat_name in organism lookup" if(!exists $feat2organism->{$feat_name});
+        #die "Bad organism $feat2organism->{$feat_name" if(!exists $alnorgs->{$feat2organism->{$feat_name}});
+        #Check gene has not already been mapped
+        die if(exists $mapped->{$feat_name});
+        #coverage cutoff and percent identity cutoff
+        my $fmin = $features->{$feat_name}->[1];
+        my $fmax = $features->{$feat_name}->[2];
+        my $featlen = $fmax-$fmin;
+        die if($featlen<1);
+        if($verbose){
+            print STDERR "Bad query coverage $qcov > $qfeatlen for $feat_name $fmin-$fmax\n" if($qcov > $qfeatlen);
+            print STDERR "Bad match coverage $alnfeats->{$feat_name}->{'cov'} > $featlen==$fmax-$fmin for $feat_name\n" if($alnfeats->{$feat_name}->{'cov'} > $featlen);
+            print STDERR "Bad match pid $alnfeats->{$feat_name}->{'pid'} > $alnfeats->{$feat_name}->{'len'} for $feat_name\n" if($alnfeats->{$feat_name}->{'pid'} > $alnfeats->{$feat_name}->{'len'});
+            print STDERR "#WARNING Bad len $alnfeats->{$feat_name}->{'len'} > ($features->{$feat_name}->[2]-$features->{$feat_name}->[1]) ".($features->{$feat_name}->[2]-$features->{$feat_name}->[1])." for $feat_name\n"
+                if($alnfeats->{$feat_name}->{'len'} > ($features->{$feat_name}->[2]-$features->{$feat_name}->[1]));
+        }
+        #
+        #Coverage and percent_id cutoffs are checked here in the following order
+        #Check that coverage over shorter of query and hit
+        #query_coverage > coverage_cutoff || hit_coverage > coverage_cutoff && hit_pid > pid_cutoff
+        #query_coverage > coverage_cutoff && hit_coverage > coverage_cutoff && hit_pid > pid_cutoff
+        #
+        print "Cutoff check $feat_name $feat2organism->{$feat_name} $alnorgs->{$feat2organism->{$feat_name}}->{'qcov'},$qfeatlen qcov=",($alnorgs->{$feat2organism->{$feat_name}}->{'qcov'}/$qfeatlen)," >= $query_coverage_cutoff ",
+            $alnfeats->{$feat_name}->{'cov'}/$featlen ," >= $coverage_cutoff ",
+            $alnfeats->{$feat_name}->{'pid'}/$alnfeats->{$feat_name}->{'len'},">= $pid_cutoff\n" if($debug);
+        if(($query_coverage_cutoff==0 || ($alnorgs->{$feat2organism->{$feat_name}}->{'qcov'}/$qfeatlen >= $query_coverage_cutoff)) && #%coverage over query(longer feature in the comparison)
+           $alnfeats->{$feat_name}->{'cov'}/$featlen >= $coverage_cutoff && #%coverage over matching gene length (shorter feature in the comparison)
+           $alnfeats->{$feat_name}->{'pid'}/$alnfeats->{$feat_name}->{'len'} >= $pid_cutoff){ #%id over aligned length only
+            print "PASSED $feat_name\n" if($debug);
+            #Check matching len is <= length of gene
+            $mappedorgs->{$feat2organism->{$feat_name}}->{'features'}->{$feat_name}++;
+            #print "WARNING query coverage > query length: $alnorgs->{$feat2organism->{$feat_name}}->{'qcov'} > $qfeatlen\n"
+            #if($alnorgs->{$feat2organism->{$feat_name}}->{'qcov'} > $qfeatlen);
+            $mappedorgs->{$feat2organism->{$feat_name}}->{'qcov'} = $alnorgs->{$feat2organism->{$feat_name}}->{'qcov'};
+            $mappedgenes->{$feat_name}->{'fmin'} = $alnfeats->{$feat_name}->{'fmin'};
+            $mappedgenes->{$feat_name}->{'fmax'} = $alnfeats->{$feat_name}->{'fmax'};
+            $mappedgenes->{$feat_name}->{'cov'} = $alnfeats->{$feat_name}->{'cov'};
+            $mappedgenes->{$feat_name}->{'pid'} = $alnfeats->{$feat_name}->{'pid'};
+            $mappedgenes->{$feat_name}->{'len'} = $alnfeats->{$feat_name}->{'len'};
+            $mappedgenes->{$feat_name}->{'relorient'} = $alnfeats->{$feat_name}->{'relorient'};
+            #print "FRAME $alnfeats->{$feat_name}->{'frame'}\n";
+            #print "RELQRYSTARTDIST $alnfeats->{$feat_name}->{'relqrysdist'}\n";
+            $mappedgenes->{$feat_name}->{'frame'} = $alnfeats->{$feat_name}->{'frame'};
+            $mappedgenes->{$feat_name}->{'frameinqry'} = $alnfeats->{$feat_name}->{'frameinqry'};
+            $mappedgenes->{$feat_name}->{'relqrysdist'} = $alnfeats->{$feat_name}->{'relqrysdist'};
+        }
+        else{
+            print "BELOW $feat_name\n" if($debug);
+            #Does not pass cutoffs
+            $unmappedgenes->{$feat_name}->{'cov'} = $alnfeats->{$feat_name}->{'cov'};
+            $unmappedgenes->{$feat_name}->{'fmin'} = $alnfeats->{$feat_name}->{'fmin'};
+            $unmappedgenes->{$feat_name}->{'fmax'} = $alnfeats->{$feat_name}->{'fmax'};
+            $unmappedgenes->{$feat_name}->{'pid'} = $alnfeats->{$feat_name}->{'pid'};
+            $unmappedgenes->{$feat_name}->{'len'} = $alnfeats->{$feat_name}->{'len'};
+            $unmappedgenes->{$feat_name}->{'relorient'} = $alnfeats->{$feat_name}->{'relorient'};
+            $unmappedgenes->{$feat_name}->{'frame'} = $alnfeats->{$feat_name}->{'frame'};
+            $unmappedgenes->{$feat_name}->{'frameinqry'} = $alnfeats->{$feat_name}->{'frameinqry'};
+            $unmappedgenes->{$feat_name}->{'relqrysdist'} = $alnfeats->{$feat_name}->{'relqrysdist'};
+        }
+    }
+    #Every overlapping sequence without a mapped gene is an unmapped organism
+    foreach my $seq (keys %$allseqs){
+        if(!exists $mappedorgs->{$seq}){
+            #Does not pass cutoffs
+            $unmappedorgs->{$seq} = {};
+        }
+    }
+    #Attach the unmapped genes to organisms that have no mapped gene
+    foreach my $feat_name (keys %{$unmappedgenes}){
+        if(!exists $mappedorgs->{$feat2organism->{$feat_name}}){
+            $unmappedorgs->{$feat2organism->{$feat_name}}->{'features'}->{$feat_name}++;
+            $unmappedorgs->{$feat2organism->{$feat_name}}->{'qcov'} += $alnfeats->{$feat_name}->{'qcov'};
+        }
+    }
+    return($mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes);
+}
+#Classify consistency of annotations within a cluster
+#Returns start,stop codon positions of annotated genes only
+#Clusters are assigned one or more classes based on consistent gene structures
+#Class CS1: All start codons in the cluster are aligned
+#Class CS2: There are multiple, inconsistent start codons in the cluster
+#Class CS3: One or more of the start codons are not aligned in the cluster
+#Class CS4: Invalid annotated start codon
+#Class CE1-3. Same as CS1-3 but for stop codons
+#Class CM1 : Multiple spanned features in the cluster
+sub annotateCluster{
+    #Classify how consistently start and stop codons are annotated across the genes of a cluster.
+    #Parameters:
+    # $atree - alignment tree passed to findCodons and AlignmentTree::coordstocolumn
+    # $genes - hashref keyed by feature name; only the keys are used here
+    # $orgs  - hashref keyed by sequence/organism name, each with a 'features' hash of gene names
+    #Returns a 3-element list:
+    # $feat_attrs    - hashref feature name -> attribute/class counters
+    # $cluster_attrs - hashref of cluster level classes (CS*, CE*, CM1)
+    # hashref with keys 'starts','stops' (per sequence codon tokens), 'pairs' (start:stop
+    # token pairs), 'featstops' and 'featstarts' (per feature codon tokens)
+    #Side effect: stores the aligned start/stop codon column and alignment name in
+    #slots 7-10 of the global $features record of every gene with an aligned codon.
+    #A codon token is "<alignment column><$CODON_DELIM><alignment name>".
+    my($atree,$genes,$orgs) = @_;
+    my $cluster_attrs = {};
+    my $feat_attrs = {};
+    my $starts = {};
+    my $stops = {};
+    my $codonpairs = {};
+    my $alignedstartcount=0;
+    my $alignedstopcount=0;
+    my $seqstarts = {};
+    my $seqstops = {};
+    my $featstarts = {};
+    my $featstops = {};
+    foreach my $org (keys %$orgs){
+        if(scalar(keys %{$orgs->{$org}->{'features'}})>1){
+            print "#Class CM1. Multiple genes spanning query. Count ",scalar(keys %{$orgs->{$org}->{'features'}}),"\n" if($debug);
+            $cluster_attrs->{'CM1'} = [$org,scalar(keys %{$orgs->{$org}->{'features'}})];
+        }
+    }
+    print "#Annotating cluster\n" if($debug);
+    foreach my $feat_name (keys %$genes){
+        #Save relative position of start and stop codons in the
+        #alignment $align_name
+        die "Feature $feat_name is not in the feature table" if(!exists $features->{$feat_name});
+        my ($seqname,$fmin,$fmax,$len,$orient) = @{$features->{$feat_name}};
+        my($startcodon,$stopcodon,$partial_start,$partial_stop,$bad_start,$bad_stop) = &findCodons($atree,
+                                                                                                   $seqname,
+                                                                                                   $fmin,
+                                                                                                   $fmax,
+                                                                                                   $orient,$feat_name);
+        if($verbose && !$bad_stop && !$bad_start){
+            print "BAD ORF $seqname,$fmin,$fmax\n" if(&isORF($db,$seqname,$fmin,$fmax,$orient)<=0);
+        }
+        #findCodons hands back an array reference when the codon could be placed in an alignment
+        if(ref $startcodon){
+            my($mcol,$align_name) = (@$startcodon);
+            my $token = $mcol.$CODON_DELIM.$align_name;
+            if($debug){
+                #Sanity check: the codon column must agree with a direct coordinate lookup
+                if($orient eq '+'){
+                    my @res= AlignmentTree::coordstocolumn($atree->{_alignments}->{$align_name}->[0],$seqname,$fmin,$fmin+3);
+                    die "$res[0] ne $mcol $seqname,$fmin,$fmin+3" if($res[0] ne $mcol);
+                }
+                else{
+                    my @res= AlignmentTree::coordstocolumn($atree->{_alignments}->{$align_name}->[0],$seqname,$fmax-3,$fmax);
+                    #Report the coordinates that were actually checked
+                    die "$res[0] ne $mcol $seqname,$fmax-3,$fmax" if($res[0] ne $mcol);
+                }
+            }
+            $starts->{$token}++;
+            print "#Start codon $feat_name $startcodon->[0] $startcodon->[1] $startcodon->[2] $startcodon->[3] $orient\n" if($debug);
+            $alignedstartcount++;
+            $features->{$feat_name}->[7] = $startcodon->[0];
+            $features->{$feat_name}->[8] = $startcodon->[1];
+            $seqstarts->{$seqname}->{$token}++;
+            $featstarts->{$feat_name} = $token;
+            if($partial_start){
+                $feat_attrs->{$feat_name}->{'CS0'}++; #start codon in PMARK spacer adjacent to contig boundary
+                $cluster_attrs->{'CS0'}++;
+            }
+            if($debug){
+                $feat_attrs->{$feat_name}->{'startcol:'.$mcol}++;
+            }
+            if($bad_start){
+                $feat_attrs->{$feat_name}->{'CS4'}++; #invalid start
+                $cluster_attrs->{'CS4'}++;
+            }
+        }
+        else{
+            #start codon is not aligned
+            $feat_attrs->{$feat_name}->{'CS3'}++;
+            $cluster_attrs->{'CS3'}++;
+        }
+        if(ref $stopcodon){
+            my($mcol,$align_name) = (@$stopcodon);
+            my $token = $mcol.$CODON_DELIM.$align_name;
+            if($debug){
+                if($orient eq '+'){
+                    my @res= AlignmentTree::coordstocolumn($atree->{_alignments}->{$align_name}->[0],$seqname,$fmax-3,$fmax);
+                    die "$res[0] ne $mcol" if($res[0] ne $mcol);
+                }
+                else{
+                    my @res= AlignmentTree::coordstocolumn($atree->{_alignments}->{$align_name}->[0],$seqname,$fmin,$fmin+3);
+                    die "$res[0] ne $mcol" if($res[0] ne $mcol);
+                }
+            }
+            $stops->{$token}++;
+            print "#Stop codon $feat_name $stopcodon->[0] $stopcodon->[1] $stopcodon->[2] $stopcodon->[3] $orient\n" if($debug);
+            $alignedstopcount++;
+            $features->{$feat_name}->[9] = $stopcodon->[0];
+            $features->{$feat_name}->[10] = $stopcodon->[1];
+            $seqstops->{$seqname}->{$token}++;
+            $featstops->{$feat_name} = $token;
+            if($partial_stop){
+                $feat_attrs->{$feat_name}->{'CE0'}++; #stop codon in PMARK spacer adjacent to contig boundary
+                $cluster_attrs->{'CE0'}++;
+            }
+            if($debug){
+                $feat_attrs->{$feat_name}->{'stopcol:'.$mcol}++;
+            }
+            if($bad_stop){
+                $feat_attrs->{$feat_name}->{'CE4'}++; #invalid stop
+                $cluster_attrs->{'CE4'}++;
+            }
+        }
+        else{
+            #stop codon is not aligned
+            $feat_attrs->{$feat_name}->{'CE3'}++;
+            $cluster_attrs->{'CE3'}++;
+        }
+        #Record the start:stop pair only when this is the sole gene listed for its sequence
+        if(scalar(keys %{$orgs->{$seqname}->{'features'}})==1 && exists $featstarts->{$feat_name} && $featstops->{$feat_name}){
+            $codonpairs->{$featstarts->{$feat_name}.':'.$featstops->{$feat_name}}->{'orgs'}->{$seqname} = [$fmin,$fmax,$orient,1]; #[fmin,fmax,orient,is_annotated,fs_type]
+        }
+    }
+    if(scalar(keys %$starts)==1){
+        #There is only one annotated start
+        my @start = keys %$starts;
+        if($starts->{$start[0]}==scalar(keys %$genes)){
+            #and every gene has this annotated start
+            print "#Class CS1. Consistent starts\n" if($debug);
+            $cluster_attrs->{'CS1'}++;
+        }
+        else{
+            #some genes are missing this start codon but there are no others
+            print "#Class CS3. Unaligned starts ",$starts->{$start[0]}, "==",scalar(keys %$genes),"\n" if($debug);
+            $cluster_attrs->{'CS2'}++;
+            $cluster_attrs->{'CS3'}++;
+        }
+    }
+    else{
+        if($alignedstartcount == scalar(keys %$genes)){
+            #there is one annotated start codon for each genome, but not all genomes use the same start
+            print "#Class CS2. Inconsistent starts\n" if($debug);
+            $cluster_attrs->{'CS2'}++;
+        }
+        else{
+            #some genes have no aligned start; CS3 was already recorded per feature above
+            print "#Class CS3. Unaligned starts ",$alignedstartcount," == ",scalar(keys %$genes),"\n" if($debug);
+            $cluster_attrs->{'CS2'}++;
+        }
+    }
+    if(scalar(keys %$stops)==1){
+        #There is only one annotated stop
+        my @stop = keys %$stops;
+        if($stops->{$stop[0]}==scalar(keys %$genes)){
+            #and every gene is annotated with this stop
+            print "#Class CE1. Consistent stops\n" if($debug);
+            $cluster_attrs->{'CE1'}++;
+        }
+        else{
+            #some genes are missing this stop codon but there are no others
+            print "#Class CE3. Unaligned stops\n" if($debug);
+            $cluster_attrs->{'CE2'}++;
+            $cluster_attrs->{'CE3'}++;
+        }
+    }
+    else{
+        if($alignedstopcount == scalar(keys %$genes)){
+            #there is one annotated stop codon for each genome, but not all genomes use the same stop
+            print "#Class CE2. Inconsistent stops\n" if($debug);
+            $cluster_attrs->{'CE2'}++;
+        }
+        else{
+            #some genes have no aligned stop; CE3 was already recorded per feature above
+            print "#Class CE3. Unaligned stops\n" if($debug);
+            $cluster_attrs->{'CE2'}++;
+        }
+    }
+    #Save frequency of annotated starts, stops
+    foreach my $feat_name (keys %$genes){
+        if(exists $featstarts->{$feat_name}){
+            $feat_attrs->{$feat_name}->{'startfreq='.$starts->{$featstarts->{$feat_name}}}++;
+            $feat_attrs->{$feat_name}->{'startcodon='.$featstarts->{$feat_name}}++;
+        }
+        if(exists $featstops->{$feat_name}){
+            $feat_attrs->{$feat_name}->{'stopfreq='.$stops->{$featstops->{$feat_name}}}++;
+            $feat_attrs->{$feat_name}->{'stopcodon='.$featstops->{$feat_name}}++;
+        }
+    }
+    return ($feat_attrs,$cluster_attrs,{'starts'=>$seqstarts,'stops'=>$seqstops,'pairs'=>$codonpairs,'featstops'=>$featstops,'featstarts'=>$featstarts});
+}
+#Classify singletons and unannotated regions
+#Singletons consist of all annotated ORFs that do not map into an existing cluster above cutoffs
+#Singletons are classified into the following classes
+#Class SLTN1: there are no alignments that overlap the singleton. apparently true singleton
+#Class SLTN2: there are overlapping alignments, annotated ORF start can be modified to pass cutoffs into an existing cluster
+#Class SLTN3: there are overlapping alignments, annotated ORF stop can be modified to pass cutoffs into an existing cluster
+#Class SLTN4: there are overlapping alignments and unannotated ORFs can be mapped above cutoffs
+#Class SLTN5: there are overlapping alignments, but no overlapping ORFs above cutoffs
+sub annotateSingletons{
+    #Classify a singleton feature by whether any alignment overlaps it.
+    #Parameters:
+    # $atree            - alignment tree queried with intersect()
+    # $seqname          - sequence the feature is located on
+    # $feat_name        - name of the singleton feature (without the 'gene:' prefix)
+    # $fmin,$fmax       - feature coordinates on $seqname
+    #Returns an array reference of class strings: "classSLTN1" when nothing overlaps,
+    #otherwise "classSLTN5 Num_alns:<n>" where <n> counts the distinct alignments whose
+    #intersect record is named 'gene:<feat_name>'.
+    my($atree,$seqname,$feat_name,$fmin,$fmax) = @_;
+    my @classes;
+    my @isect = $atree->intersect($seqname,$fmin,$fmax,$aligntoken);
+    my $goodalignments = {};
+    foreach my $r (@isect){
+        #Use distinct names for the record fields: re-declaring $feat_name here hid the
+        #query feature name, so the comparison below could never succeed.
+        my $hit_name = $r->[0];
+        my $align_name = $r->[5];
+        #Only consider WGA alignments (alignment name in $align_name) that span query (gene name in $feat_name)
+        if($hit_name eq 'gene:'.$feat_name){
+            $goodalignments->{$align_name}++;
+        }
+    }
+    if(scalar (@isect)==0){
+        push @classes,"classSLTN1";
+    }
+    else{
+        push @classes,"classSLTN5 Num_alns:".scalar(keys %$goodalignments);
+    }
+    return \@classes;
+}
+#Check if fmin-fmax,orient on seqname is a valid ORF
+sub isORF{
+    #Check whether $fmin-$fmax on strand $orient of $seqname translates as a complete ORF.
+    #Parameters:
+    # $db      - hashref mapping sequence name to a sequence object supporting trunc()
+    # $seqname - key into $db
+    # $fmin,$fmax - interval; trunc() is called with ($fmin+1,$fmax)
+    # $orient  - '+' or '-'; any other value dies once the sequence has been found
+    # $fs      - optional arrayref of edit records. Records are processed in descending
+    #            order of element [0]; element [5] is an arrayref of positions and element
+    #            [4] selects the edit at each position inside the interval:
+    #            1 removes one base, -1 inserts an 'N' after it, 0 leaves the sequence alone.
+    #Returns 0 when the interval is rejected (bad length, unknown sequence, translation
+    #failure or empty translation), 1 when it translates, and 2 when it translates and
+    #the $PMARK_SPACER pattern was seen (the spacer is only looked for when $fs is given).
+    #The former empty prototype "()" was dropped: it declared a sub without arguments
+    #although six are expected. Callers using the &isORF(...) form are unaffected.
+    my($db,$seqname,$fmin,$fmax,$orient,$fs) = @_;
+    #hack to avoid some bioperl warnings that i cannot turn off
+    Bio::Root::Root::verbose(0);
+    my $seqlen = ($fmax-$fmin);
+    #Without edit records the length has to be a multiple of three
+    if((! defined $fs && $seqlen%3!=0) || $seqlen > $MAXORFLEN || $seqlen < $MINORFLEN){
+        print "#Bad ORF length $seqname $fmin-$fmax ",$seqlen," ",$seqlen%3,"\n" if($verbose);
+        return 0;
+    }
+    my $seqobj = $db->{$seqname};
+    die "Bad coordinates $fmin-$fmax @_" if($fmin >= $fmax);
+    return 0 if(!$seqobj);
+    die "Unexpected orientation for $seqname $fmin-$fmax" if($orient ne '+' && $orient ne '-');
+    my $newobj;
+    my $pmark=0;
+    if($fs){
+        my $newobjs = $seqobj->trunc($fmin+1,$fmax);
+        my $gseq = $newobjs->seq();
+        print "GSEQPRE:$gseq\n" if($debug);
+        #Apply the edits from the highest position downwards so that earlier offsets stay valid
+        foreach my $f (sort {$b->[0] <=> $a->[0]} @$fs){
+            foreach my $start (sort {$b <=> $a} @{$f->[5]}){
+                next if($start<$fmin || $start>$fmax);
+                if($f->[4] == 1){
+                    substr($gseq,$start-$fmin,1) = '';
+                }
+                elsif($f->[4] == -1){
+                    substr($gseq,$start-$fmin+1,0) = 'N';
+                }
+            }
+        }
+        #The spacer is searched in the unedited sequence, reverse complemented on the minus strand
+        if($orient eq '-'){
+            eval{
+                $newobjs = $newobjs->revcom();
+            };
+        }
+        if($newobjs->seq() =~ /$PMARK_SPACER/){
+            my $sloc = $-[0];
+            my $showlen = ($orient eq '+') ? 36 : length($PMARK_SPACER);
+            print "FOUND PMARK$orient $sloc ",substr($newobjs->seq(),$sloc,$showlen),"\n" if($verbose);
+            $pmark=1;
+        }
+        $newobj = Bio::Seq->new(-seq=>$gseq);
+        $newobj = $newobj->revcom() if($orient eq '-');
+        print "GSEQPOST:$gseq\n" if($debug);
+    }
+    else{
+        $newobj = $seqobj->trunc($fmin+1,$fmax);
+        if($orient eq '-'){
+            eval{
+                $newobj = $newobj->revcom();
+            };
+        }
+    }
+    die "ORF $seqname $fmin-$fmax longer than $MAXORFLEN" if($verbose && $newobj->length() > $MAXORFLEN);
+    die "ORF $seqname $fmin-$fmax shorter than $MINORFLEN" if($verbose && $newobj->length() < $MINORFLEN);
+    #-complete/-throw make translate() raise an error for anything but a full coding sequence
+    my $protein_seq_obj;
+    eval{
+        $protein_seq_obj = $newobj->translate(-codontable_id =>11,
+                                              -complete => 1,
+                                              -throw => 1
+                                              );
+    }
+    or do {
+        print "ERROR translate: ",$@,"\n" if($verbose);
+        print "ERROR translate $seqname $fmin,$fmax,$orient ",$newobj->seq(),"\n" if($verbose);
+        return 0;
+    };
+    return 0 if(!$protein_seq_obj);
+    if($protein_seq_obj->length()>0){
+        die "Translation of $seqname $fmin-$fmax $orient contains a stop codon" if($protein_seq_obj->seq() =~ /\*/);
+        return 1+$pmark;
+    }
+    my $expected = ($orient eq '+') ? $seqlen/3 : $newobj->length()/3;
+    print "#Unexpected sequence length ",$protein_seq_obj->length()," expecting ",$expected," from ORF $seqname $fmin-$fmax $orient ",$protein_seq_obj->seq(),"\n" if($verbose);
+    return 0;
+}
+#Print members and attributes for a cluster
+#$query is the longest member of a cluster
+#Supported attributes
+sub reportCluster{
+    #Print the members and attributes of one cluster to STDOUT and to the cluster output
+    #handles ($cogfh, $cfh, $ctfh, $ctfh2), and record a summary in the global $clusters.
+    #Parameters:
+    # $query         - feature name of the cluster query
+    # $mappedorgs    - hashref organism -> {'features' => {gene => ..}, 'qcov' => ..}
+    # $mappedgenes   - hashref gene -> per gene values ('cov','frame','frameinqry','relqrysdist','alignments')
+    # $unmappedorgs, $unmappedgenes - same layout for members that were not mapped
+    # $feat_attrs    - hashref gene -> attribute hash; the attribute names are printed
+    # $cluster_attrs - hashref of cluster classes. The keys CS3,CE3,CS4,CE4,CS0,CE0 are
+    #                  DELETED from the caller's hash and reported as "causes" instead
+    # $seq_attrs     - not used
+    # $new_orfs      - hashref organism -> unannotated ORFs, only consulted under $verbose
+    #Dies when $mappedgenes is empty. Does not increment $cluster_id.
+    my($query,$mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes,$feat_attrs,$cluster_attrs,$seq_attrs,$new_orfs) = @_;
+    #No genes in cluster
+    die "Cannot report cluster $cluster_id without mapped genes" if(scalar(keys %$mappedgenes)==0);
+    print $cogfh "COG = $cluster_id, size ",scalar(keys %$mappedgenes), ", connections = 0, perfect = 0;\n";
+    print $cogfh "\t$features->{$query}->[5]\n";
+    foreach my $organism (sort {$a cmp $b} keys %$mappedorgs){
+        foreach my $gene (sort {$features->{$a}->[1] <=> $features->{$b}->[1]} (keys %{$mappedorgs->{$organism}->{'features'}})){
+            if($gene ne $query){
+                print $cogfh "\t$features->{$gene}->[5]\n";
+            }
+        }
+    }
+    #Classes that explain an inconsistency are split off and reported separately
+    my @posscauses = ('CS3','CE3','CS4','CE4','CS0','CE0');
+    my $causesstr = '';
+    foreach my $p (@posscauses){
+        if(exists $cluster_attrs->{$p}){
+            $causesstr .= "$p;";
+            delete $cluster_attrs->{$p};
+        }
+    }
+    my $classesstr = join(';',sort {$a cmp $b} keys %{$cluster_attrs});
+    print ">CLUSTER_$cluster_id num_seqs=",scalar(keys %$mappedorgs)," num_genes=",scalar(keys %$mappedgenes);
+    if(exists $mappedgenes->{$query}->{'alignments'}){
+        print " classes=$classesstr query=$query ";
+        if(length($causesstr)>0){
+            print " causes=$causesstr ";
+        }
+        print " num_alignments=",scalar(@{$mappedgenes->{$query}->{'alignments'}})," alignments=",join(',',@{$mappedgenes->{$query}->{'alignments'}}) if($debug);
+    }
+    print "\n";
+    print $cfh "CLUSTER_$cluster_id (",scalar(keys %$mappedgenes)," features,",scalar(keys %$mappedorgs)," genomes, classes=$classesstr, query=$query): ";
+    print $ctfh "C_$cluster_id\t";
+    print $ctfh2 "C_$cluster_id\t";
+    $clusters->{$cluster_id}->{'num_feats'} = scalar(keys %$mappedgenes);
+    $clusters->{$cluster_id}->{'num_genomes'} = scalar(keys %$mappedorgs);
+    $clusters->{$cluster_id}->{'num_alignments'} = scalar(@{$mappedgenes->{$query}->{'alignments'}});
+    $clusters->{$cluster_id}->{'classes'} = $classesstr;
+    my $qfmin = $features->{$query}->[1];
+    my $qfmax = $features->{$query}->[2];
+    my $qseqname = $features->{$query}->[0];
+    my @mappedfeats;
+    my $outtable = [];
+    foreach my $organism (sort {$a cmp $b} keys %$mappedorgs){
+        my($start,$end) = &getspan($mappedgenes,keys %{$mappedorgs->{$organism}->{'features'}});
+        my @ogenes = sort {$features->{$a}->[1] <=> $features->{$b}->[1]} (keys %{$mappedorgs->{$organism}->{'features'}});
+        my @ocovs;
+        my @oids;
+        my @orients;
+        my @names;
+        my @frames;
+        my @qframes;
+        my @sdist;
+        my $classes;
+        foreach my $gene (@ogenes){
+            if(exists $feat_attrs->{$gene}){
+                foreach my $c (sort {$a cmp $b} keys %{$feat_attrs->{$gene}}){
+                    $classes->{$c}++;
+                }
+            }
+            push @ocovs,(sprintf("%.2f",$mappedgenes->{$gene}->{'cov'}/$features->{$gene}->[3])) if($features->{$gene}->[3]>0);
+            #NOTE(review): the pid column is filled from 'cov', exactly like @ocovs. This
+            #looks like a copy/paste slip; confirm the intended identity formula before changing.
+            push @oids,(sprintf("%.2f",$mappedgenes->{$gene}->{'cov'}/$features->{$gene}->[3])) if($features->{$gene}->[3]>0);
+            push @orients,"$features->{$gene}->[4]";
+            #Print frame and sdist for inconsistent clusters only
+            if(!(exists $cluster_attrs->{'CS1'} && exists $cluster_attrs->{'CE1'})){
+                if($mappedgenes->{$gene}->{'relqrysdist'}>0 || $verbose==1){
+                    push @frames,"altframeqry=$mappedgenes->{$gene}->{'frame'}";
+                    push @qframes,"frameinqry=$mappedgenes->{$gene}->{'frameinqry'}";
+                    push @sdist,"sdist=$mappedgenes->{$gene}->{'relqrysdist'}" if(scalar(@ogenes)==1);
+                }
+            }
+            if(defined $features->{$gene}->[11]){
+                push @names,"product=$features->{$gene}->[11]";
+            }
+        }
+        #Print attributes
+        my @attrs = sort {$a cmp $b} keys %$classes;
+        push @attrs,@frames,@qframes,@sdist;
+        #Brief cluster output
+        print $cfh join(',',@ogenes),"($organism) ";
+        #The table column is looked up by the organism name up to the first '.'
+        my($realorg) = ($organism =~ /([^\.]+)/);
+        $outtable->[$seqindex->{$realorg}]->[0] = join(',',@ogenes);
+        $outtable->[$seqindex->{$realorg}]->[1] = $start;
+        $outtable->[$seqindex->{$realorg}]->[2] = $end;
+        $outtable->[$seqindex->{$realorg}]->[3] = join(',',@orients);
+        #Detailed output
+        print join(',',@ogenes),
+        "\tC$cluster_id",
+        "\t$organism",
+        "\tcov=",join(',',@ocovs),
+        "\tpid=",join(',',@oids),
+        "\tqcov=",sprintf("%.2f",$mappedorgs->{$organism}->{'qcov'}/($qfmax-$qfmin)),
+        "\t$start-$end",
+        "\t",join(',',@orients),
+        "\t",$end-$start,
+        "\t",join(';',@attrs,@names),
+        "\n";
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'genes'} = \@ogenes;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'cov'} = \@ocovs;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'pid'} = \@oids;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'frame'} = \@frames;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'frameinqry'} = \@qframes;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'sdist'} = \@sdist;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'fmin'} = $start;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'fmax'} = $end;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'len'} = $end-$start;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'orient'} = \@orients;
+        $clusters->{$cluster_id}->{'orgs'}->{$organism}->{'desc'} = join(';',@names);
+    }
+    if($verbose){
+        foreach my $organism (sort {$a cmp $b} keys %$unmappedorgs){
+            if(! exists $unmappedorgs->{$organism}->{'features'}){
+                if(exists $new_orfs->{$organism}){
+                    print "#ALIGNED_NEWORFS $organism aligned with unannotated matching ORFs\n";
+                }
+                else{
+                    #No annotated features on this org
+                    print "#ALIGNED_NOORFS $organism aligned with no matching ORFs, possibly in a gapped region of the alignment\n";
+                }
+            }
+            else{
+                #print annotated features
+                my($start,$end) = &getspan($unmappedgenes,keys %{$unmappedorgs->{$organism}->{'features'}});
+                my @ogenes = sort {$features->{$a}->[1] <=> $features->{$b}->[1]} (keys %{$unmappedorgs->{$organism}->{'features'}});
+                my @ocovs;
+                my @oids;
+                my @orients;
+                my @names;
+                my $classes;
+                foreach my $gene (@ogenes){
+                    if(exists $feat_attrs->{$gene}){
+                        foreach my $c (sort {$a cmp $b} keys %{$feat_attrs->{$gene}}){
+                            $classes->{$c}++;
+                        }
+                    }
+                    push @ocovs,(sprintf("%.2f",$unmappedgenes->{$gene}->{'cov'}/$features->{$gene}->[3])) if($features->{$gene}->[3]>0);
+                    #NOTE(review): as for mapped genes, pid is computed from 'cov' - confirm
+                    push @oids,(sprintf("%.2f",$unmappedgenes->{$gene}->{'cov'}/$features->{$gene}->[3])) if($features->{$gene}->[3]>0);
+                    push @orients,"$features->{$gene}->[4]";
+                    if(defined $features->{$gene}->[11]){
+                        push @names,"product=$features->{$gene}->[11]";
+                    }
+                }
+                #Print attributes
+                my @attrs = sort {$a cmp $b} keys %$classes;
+                print join(',',@ogenes),
+                "\tC$cluster_id",
+                "\t$organism",
+                "\tcov=",join(',',@ocovs),
+                "\tpid=",join(',',@oids),
+                "\tqcov=",sprintf("%.2f",$unmappedorgs->{$organism}->{'qcov'}/($qfmax-$qfmin)),
+                "\t$start-$end",
+                "\t",join(',',@orients),
+                "\t",$end-$start,
+                "\t",join(';',@attrs,@names),
+                "\n";
+            }
+        }
+    }
+    #Reporting of ORFs that are conserved and aligned but not annotated is disabled here
+    #One table column per entry of @sortedallseqs; organisms without a member leave an empty cell
+    for(my $i=0;$i<scalar(@sortedallseqs);$i++){
+        print $ctfh $outtable->[$i]->[0];
+        print $ctfh2 "$outtable->[$i]->[0] $outtable->[$i]->[1] $outtable->[$i]->[2] $outtable->[$i]->[3]";
+        if($i!=$#sortedallseqs){
+            print $ctfh "\t";
+            print $ctfh2 "\t";
+        }
+        else{
+            print $ctfh "\n";
+            print $ctfh2 "\n";
+        }
+    }
+    if($printalignments){
+        print "#Printing query $query $qseqname,$qfmin,$qfmax\n" if($debug);
+        my $alnfile = "$options{'prefix'}cluster_${cluster_id}.aln.out";
+        open(my $outfh,'+>',$alnfile) or die "Cannot open $alnfile: $!";
+        my @isect = $atree->map($qseqname,$qfmin,$qfmax);
+        #Print all features overlapping the alignment window.
+        #This may include addl features than those in the cluster
+        my $printedfeats = {};
+        foreach my $feat (@isect){
+            my $feat_name = $feat->[0];
+            $feat_name =~ s/gene\://;
+            $printedfeats->{$feat_name}++;
+        }
+        foreach my $feat_name (keys %$printedfeats){
+            my $fmin = $features->{$feat_name}->[1];
+            my $fmax = $features->{$feat_name}->[2];
+            my $seqname = $features->{$feat_name}->[0];
+            my $orient = $features->{$feat_name}->[4];
+            if(exists $mappedgenes->{$feat_name}){
+                push @mappedfeats,[[[$seqname,$fmin,$fmax,$orient,($fmax-$fmin).'M']],'gene:'.$feat_name,'gene'];
+            }
+            else{
+                #Features outside the cluster are labelled with their coverage and identity
+                if(exists $features->{$feat_name} && $features->{$feat_name}->[3]){
+                    my $cov = sprintf("c%.1f,i%.1f ",$unmappedgenes->{$feat_name}->{'cov'}/$features->{$feat_name}->[3],
+                                      $unmappedgenes->{$feat_name}->{'pid'}/$features->{$feat_name}->[3]);
+                    push @mappedfeats,[[[$seqname,$fmin,$fmax,$orient,($fmax-$fmin).'M']]," $cov *gene:".$feat_name.":$orient",'gene'];
+                }
+            }
+        }
+        #Sort all alignments that span query gene
+        my @qryalns;
+        #TODO fix for reverse strand
+        foreach my $align_name (@{$mappedgenes->{$query}->{'alignments'}}){
+            die "No organism known for query $query" if(!exists $feat2organism->{$query} || length($feat2organism->{$query})==0);
+            my $alni = $atree->getAlignedInterval($align_name,$feat2organism->{$query});
+            if($alni){
+                print "#QRYALN $align_name $feat2organism->{$query} $alni->[0] $alni->[1] $alni->[2]\n" if($debug);
+                push @qryalns,[$align_name,$alni->[1]];
+            }
+        }
+        my $aidx = 0;
+        foreach my $al (sort {$a->[1] <=> $b->[1]} @qryalns){
+            my($align_name) = @$al;
+            #Check that new range is still within $alignment
+            print "#Checking the $qseqname,$qfmin,$qfmax,$align_name is within range\n" if($debug);
+            my @alnisect = $atree->intersect($qseqname,$qfmin,$qfmax,$align_name);
+            my $printfmin;
+            my $printfmax;
+            foreach my $aln (@alnisect){
+                if($aln->[1] eq $qseqname && $aln->[0] eq $align_name){
+                    $printfmin = $aln->[2];
+                    #Add 20 positions of flank before the first and after the last alignment when available
+                    if($aidx==0){
+                        if($atree->contains($align_name,$qseqname,$printfmin-20,$printfmin)){
+                            $printfmin -= 20;
+                        }
+                    }
+                    $printfmax = $aln->[3];
+                    if($aidx==(scalar(@qryalns)-1)){
+                        if($atree->contains($align_name,$qseqname,$printfmax,$printfmax+20)){
+                            $printfmax += 20;
+                        }
+                    }
+                    print "#Resetting print range to $printfmin-$printfmax from $qfmin-$qfmax\n" if($debug);
+                }
+            }
+            if(defined $printfmin && defined $printfmax){
+                print $outfh "CLUSTER_$cluster_id ALIGNMENT:$align_name\n";
+                my($colstart,$colend) = AlignmentTree::coordstocolumn($atree->{_alignments}->{$align_name}->[0],$qseqname,$printfmin,$printfmax,1);
+                $atree->printAlignment($outfh,$align_name,$colstart,$colend,$db,$qseqname,\@mappedfeats,$htmlout);
+            }
+            else{
+                die "Alignment $align_name does not overlap query $query on $qseqname";
+            }
+            $aidx++;
+        }
+        close $outfh;
+        #Echo the alignment file to STDOUT without going through a shell
+        if(open(my $infh,'<',$alnfile)){
+            while(my $line = <$infh>){
+                print $line;
+            }
+            close $infh;
+        }
+        else{
+            print STDERR "Cannot read $alnfile: $!\n";
+        }
+    }
+    print "\n";
+    print $cfh "\n";
+}
+sub getFeaturesByInterval{
+    #Collect the 'gene' records that intersect $fmin-$fmax on $org.
+    #Returns a hashref keyed by feature name (leading 'gene:' removed) whose values are
+    #arrayrefs built from the intersect record: [r1,r2,r3,r7,name,name,r4,r5].
+    #Returns undef when nothing intersects. $orient is accepted but not used.
+    my($atree,$org,$fmin,$fmax,$orient) = @_;
+    my $feats;
+    for my $hit ($atree->intersect($org,$fmin,$fmax,"gene")){
+        (my $name = $hit->[0]) =~ s/^gene://;
+        my @record = (@{$hit}[1,2,3,7],$name,$name,@{$hit}[4,5]);
+        $feats->{$name} = \@record;
+    }
+    return $feats;
+}
+#Returns @features,@seqs,@alignments that overlap query
+sub getAlignedFeatures{
+    #Find the features, sequences and alignments that overlap a query feature.
+    #Parameters:
+    # $atree      - alignment tree queried with map()
+    # $seqname    - sequence the query is located on
+    # $query      - query feature name without its type prefix
+    # $fmin,$fmax - query coordinates on $seqname
+    # $type       - feature type prefix, records are named "<type>:<name>"
+    #Returns a 3-element list:
+    # arrayref of the map() records whose name (type prefix removed IN PLACE) is a key of
+    #   the global $features
+    # hashref sequence name -> count over the alignments that contain the query
+    # hashref alignment name -> count of query records; now {} instead of undef when the
+    #   query is not aligned (the initialised hash used to be hidden by a second "my")
+    my($atree,$seqname,$query,$fmin,$fmax,$type) = @_;
+    #Aligned seqs
+    my $seqs = {};
+    #Alignments
+    my $alignments = {};
+    #Parse overlapping genes
+    my @isect = $atree->map($seqname,$fmin,$fmax);
+    #First screen all overlapping alignments to ensure that they
+    #include the query gene
+    foreach my $r (@isect){
+        my $feat_name = $r->[0];
+        my $align_name = $r->[5];
+        #Only consider WGA alignments (alignment name in $align_name) that span query (gene name in $query)
+        if($feat_name eq $type.':'.$query){
+            print "#Mapped $query $align_name\n" if($debug);
+            $alignments->{$align_name}++;
+        }
+    }
+    #Capture all seqs in this alignment
+    foreach my $align_name (keys %$alignments){
+        my $alignedseqs = $atree->{_alignments}->{$align_name}->[0];
+        foreach my $seq (@$alignedseqs){
+            die "Unexpected reference as sequence name in alignment $align_name" if(ref $seq->[0]);
+            $seqs->{$seq->[0]}++;
+            #TODO capture stats on alignment
+        }
+    }
+    #Transform feat_name, stripping leading type:
+    my @nisect;
+    foreach my $r (@isect) {
+        #Anchored and quoted so only a leading "<type>:" is removed
+        $r->[0] =~ s/^\Q$type\E://;
+        if(! exists $features->{$r->[0]} && !defined $featlist){
+            print STDERR "Unknown feature $r->[0]\n";
+        }
+        print "#SAM$r->[0] ",(exists $features->{$r->[0]}),"\n" if($debug);
+        push @nisect,$r if(exists $features->{$r->[0]});
+    }
+    return (\@nisect,$seqs,$alignments);
+}
+#Reports every feature of the file-level $features table that is not in $mapped.
+#One line is printed per such feature:
+# #SINGLETON ... best_cluster/cov/pid  feature is listed in $unmapped
+# #DELETED ...                         feature is listed in $subsumed
+# #SINGLETON ...                       everything else
+#When $datree is defined it is queried with getAlignedFeatures and every other
+#feature aligned to this one is appended to the line as "#DUP matches:".
+#For features listed in $unmapped the cluster is also rebuilt and passed to
+#findnewORFs; reportCluster is called when that returns any entries.
+#Returns ($singletons,$dups), hash refs keyed on feature name. Only features
+#that are in neither $unmapped nor $subsumed are counted in them.
+sub findSingletons{
+    my($atree,$mapped,$unmapped,$subsumed,$datree) = @_;
+    my $singletons = {};
+    my $dups = {};
+    foreach my $feat_name (keys %$features){
+        #Mapped ORF, not a singleton
+        next if(exists $mapped->{$feat_name});
+        my($seqname,$fmin,$fmax,$flen) = @{$features->{$feat_name}}[0..3];
+        my $classes = &annotateSingletons($atree,$seqname,$feat_name,$fmin,$fmax);
+        #Other features aligned to this one: name => {cov,pid,len} totals
+        my $dupfeats = {};
+        if(defined $datree){
+            my($nisect,$allseqs,$goodalignments) = &getAlignedFeatures($datree,$seqname,$feat_name,$fmin,$fmax,'gene');
+            #sort on feature length
+            my @hits = sort { $features->{$b->[0]}->[3] <=> $features->{$a->[0]}->[3] } @$nisect;
+            foreach my $r (@hits){
+                my $dfeat_name = $r->[0];
+                my $align_name = $r->[5];
+                #Check if we want to consider this alignment
+                next if(!exists $goodalignments->{$align_name});
+                $dfeat_name =~ s/gene\://;
+                if(!exists $features->{$dfeat_name}){
+                    print STDERR "#Bad feature found $dfeat_name. Not in input file. Skipping\n";
+                    next;
+                }
+                next if($dfeat_name eq $feat_name);
+                $dupfeats->{$dfeat_name}->{'cov'} += $r->[7];
+                $dupfeats->{$dfeat_name}->{'pid'} += $r->[8];
+                $dupfeats->{$dfeat_name}->{'len'} += ($r->[3]-$r->[2]);
+            }
+        }
+        #Prints the "#DUP matches" entries, best pid first (name breaks ties so
+        #the output order is reproducible)
+        my $print_dups = sub {
+            my @names = sort {
+                $dupfeats->{$b}->{'pid'} <=> $dupfeats->{$a}->{'pid'} || $a cmp $b
+            } keys %$dupfeats;
+            foreach my $dfeat_name (@names){
+                print " #DUP matches:$dfeat_name(pid:";
+                printf("%.2f",($dupfeats->{$dfeat_name}->{'pid'}/$flen));
+                print ",cov:";
+                printf("%.2f",($dupfeats->{$dfeat_name}->{'cov'}/$flen));
+                print ") ";
+            }
+        };
+        my $has_dups = scalar(keys %$dupfeats) > 0;
+        if(exists $unmapped->{$feat_name}){
+            my $query=$feat_name;
+            my($mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes) = &buildCluster($atree,$query);
+            my($feat_attrs,$cluster_attrs,$codons) = &annotateCluster($atree,$mappedgenes,$mappedorgs);
+            my $new_orfs = &findnewORFs($db,$atree,$mappedorgs,$mappedgenes,$codons);
+            if(scalar(keys %$new_orfs)){
+                my $seq_attrs = {};
+                &reportCluster($query,$mappedorgs,$mappedgenes,$unmappedorgs,$unmappedgenes,$feat_attrs,$cluster_attrs,$seq_attrs,$new_orfs);
+                $cluster_id++;
+            }
+            my $mappedlen = $unmapped->{$feat_name}->{'len'};
+            if($fmax-$fmin <= 0){
+                print STDERR "#Bad featlen for feature $feat_name $fmax-$fmin\n";
+            }
+            if($mappedlen <= 0){
+                print STDERR "#Bad coverage for feature $feat_name Coverage:$unmapped->{$feat_name}->{'len'}\n";
+                $mappedlen=1;
+            }
+            print "#SINGLETON $feat_name len:$flen\tbest_cluster:C$unmapped->{$feat_name}->{'WGA_cluster'}\tcov:";
+            printf("%.2f",$unmapped->{$feat_name}->{'cov'});
+            print " pid:";
+            printf("%.2f",$unmapped->{$feat_name}->{'pid'});
+            printf(" lenbp:%f ",$mappedlen);
+            #The joined classes were previously computed and thrown away here
+            print join(' ',@$classes);
+            if(defined $features->{$feat_name}->[11]){
+                print " product=$features->{$feat_name}->[11]";
+            }
+            $print_dups->() if($has_dups);
+            print "\n";
+        }
+        elsif(exists $subsumed->{$feat_name}){
+            print "#DELETED $feat_name\n";
+        }
+        else{
+            print "#SINGLETON $feat_name len:$flen ",join(' ',@$classes);
+            if(defined $features->{$feat_name}->[11]){
+                print " product=$features->{$feat_name}->[11]";
+            }
+            if($has_dups){
+                $print_dups->();
+                $dups->{$feat_name}++;
+            }
+            else{
+                $singletons->{$feat_name}++;
+            }
+            print "\n";
+        }
+    }
+    return ($singletons,$dups);
+#General utility funcs
+#Returns (lowest,highest) over the 'fmin' and 'fmax' values of the named genes.
+#The first argument is a hash ref of gene name => {fmin=>..,fmax=>..}; the
+#remaining arguments are the gene names. Dies when a name is not in the hash.
+sub getspan{
+    my $features = shift;
+    my @coords = map {
+        die if(! exists $features->{$_});
+        ($features->{$_}->{'fmin'},$features->{$_}->{'fmax'});
+    } @_;
+    my @ordered = sort {$a <=> $b} @coords;
+    return ($ordered[0],$ordered[$#ordered]);
+#Converts an aligned start/stop codon pair into coordinates on $seqname.
+#$startcodon and $stopcodon are "column<delim>alignment" keys; they are split
+#on the file-level $CODON_DELIM_REGEX.
+#Returns ($fmin,$fmax,$orient):
+# '+' when the start codon lies before the stop codon  ($start_s,$stop_e)
+# '-' otherwise                                        ($stop_s,$start_e)
+#Returns undef when $seqname is not part of either alignment, or when either
+#codon is aligned to a gap on $seqname.
+sub findCoords{
+    my($atree,$seqname,$startcodon,$stopcodon) = @_;
+    #$codon is a tuple of alignment,aligned_column
+    my($startcol,$aln_s) = split(/$CODON_DELIM_REGEX/,$startcodon);
+    #find corresponding stop
+    my($stopcol,$aln_e) = split(/$CODON_DELIM_REGEX/,$stopcodon);
+    my $si = &getAlignment($atree,$aln_s,$seqname);
+    my $ei = &getAlignment($atree,$aln_e,$seqname);
+    if(!$si){
+        print "Can't find alignment $aln_s on $seqname from $startcodon\n" if($debug);
+        return undef;
+    }
+    if(!$ei){
+        #Report the stop alignment here, not the start one
+        print "Can't find alignment $aln_e on $seqname from $stopcodon\n" if($debug);
+        return undef;
+    }
+    my($start_s,$start_e) = AlignmentTree::columntocoords($si,$startcol,$startcol+2);
+    my($stop_s,$stop_e) = AlignmentTree::columntocoords($ei,$stopcol,$stopcol+2);
+    #Equal start and end coordinates mean the codon is aligned to a gap.
+    #Stop here instead of comparing and returning undefined coordinates.
+    if($start_s == $start_e || $stop_s == $stop_e){
+        print "Codon aligned to a gap on $seqname from $startcodon,$stopcodon\n" if($debug);
+        return undef;
+    }
+    if($start_s<$stop_s){
+        #forward strand 5'start -----> 3'stop
+        return ($start_s,$stop_e,'+');
+    }
+    #reverse strand
+    #3'stop <-- 5'start
+    return ($stop_s,$start_e,'-');
+#Returns the aligned location of the start and stop codon of a feature.
+#$fmin,$fmax are 0-based coordinates on $seqname, $orient is '+' or '-'.
+#The sequence is read from the file-level $db hash.
+#Returns ($startcodon,$stopcodon,$is_partial_start,$is_partial_stop,
+#         $is_bad_start,$is_bad_stop) where
+# $startcodon/$stopcodon  result of getAlignedCols for the codon; undef if the
+#                         codon is not aligned
+# $is_partial_*           1 if $PMARK_SPACER occurs around that end of the feature
+# $is_bad_*               1 if the annotated codon is not a valid start/stop
+#                         codon in translation table 11
+#Returns an empty list when the sequence is unknown or, on the + strand, the
+#coordinates fall outside of it. Returns 0 when reading the codons on the
+#- strand fails.
+sub findCodons{
+    my($atree,$seqname,$fmin,$fmax,$orient,$fname) = @_;
+    my $codon_table = Bio::Tools::CodonTable->new(-id=>11);
+    my $seqobj = $db->{$seqname};
+    if(!$seqobj){
+        print "Can't find seqname: $seqname\n";
+        return;
+    }
+    my $seqlen = $seqobj->length();
+    my $startcodon=undef;
+    my $stopcodon=undef;
+    my $is_partial_start=0;
+    my $is_partial_stop=0;
+    my $is_bad_start=0;
+    my $is_bad_stop=0;
+    #Never set; only kept so the debug lines below keep their format
+    my $aln_orient=undef;
+    #Check if in pmark spacer adjacent to contig boundary
+    my $near_spacer = sub {
+        my($pos) = @_;
+        my $pad = length($PMARK_SPACER);
+        return 0 if(!($pos-$pad>0 && $pos+$pad <= $seqlen));
+        return ($seqobj->subseq($pos-$pad,$pos+$pad) =~ /$PMARK_SPACER/) ? 1 : 0;
+    };
+    if($orient eq '+'){
+        if($fmin+1<=0){print STDERR "Bad start parameter $fmin+1<=0 $seqname,$fmin,$fmax,$orient,$fname\n";return}
+        if($fmax-3+1<=0){print STDERR "Bad end parameter $fmax-3+1<=0 $seqname,$fmin,$fmax,$orient,$fname\n";return}
+        if($fmin+3>$seqlen){print STDERR "Bad start parameter $fmin+3>$seqlen $seqname,$fmin,$fmax,$orient,$fname\n";return}
+        if($fmax>$seqlen){print STDERR "Bad end parameter $fmax>$seqlen $seqname,$fmin,$fmax,$orient,$fname\n";return}
+        if(!$codon_table->is_start_codon($seqobj->subseq($fmin+1,$fmin+3))){ #bioperl is 1-base coordinates
+            print "#Bad start codon $fname,$seqname,$fmin,$fmax,$orient codon $fmin+1,$fmin+2+1 ",$seqobj->subseq($fmin+1,$fmin+3)," aln_orient:$aln_orient\n" if($verbose || $debug);
+            $is_bad_start=1;
+        }
+        #Find start codon + strand
+        $startcodon = &getAlignedCols($atree,$seqname,$fmin,$fmin+3);
+        if(!$codon_table->is_ter_codon($seqobj->subseq($fmax-3+1,$fmax))){
+            print "#Bad stop $fname,$seqname,$fmin,$fmax,$orient codon $fmax-3+1,$fmax ",$seqobj->subseq($fmax-3+1,$fmax)," aln_orient:$aln_orient\n" if($verbose || $debug);
+            $is_bad_stop=1;
+        }
+        #Find stop codon + strand
+        $stopcodon = &getAlignedCols($atree,$seqname,$fmax-3,$fmax);
+        $is_partial_start = $near_spacer->($fmin);
+        $is_partial_stop = $near_spacer->($fmax);
+    }
+    else{
+        die "Bad orient $orient" if($orient ne '-');
+        #Only warn here: an out of range subseq below fails inside the eval
+        print STDERR "Bad start parameter $fmax-3+1<=0 $seqname,$fmin,$fmax,$orient,$fname\n" if($fmax-3+1<=0);
+        print STDERR "Bad end parameter $fmin+1<=0 $seqname,$fmin,$fmax,$orient,$fname\n" if($fmin+1<=0);
+        print STDERR "Bad start parameter $fmax>$seqlen $seqname,$fmin,$fmax,$orient,$fname\n" if($fmax>$seqlen);
+        print STDERR "Bad end parameter $fmin+3>$seqlen $seqname,$fmin,$fmax,$orient,$fname\n" if($fmin+3>$seqlen);
+        my $ok = eval{
+            my $start_triplet = revcom($seqobj->subseq($fmax-3+1,$fmax))->seq();
+            if(!$codon_table->is_start_codon($start_triplet)){
+                print "#Bad start codon $fname,$seqname,$fmin,$fmax,$orient codon $fmax-3+1,$fmax ",$start_triplet," aln_orient:$aln_orient\n" if($verbose || $debug);
+                $is_bad_start=1;
+            }
+            #Find start codon on - strand
+            $startcodon = &getAlignedCols($atree,$seqname,$fmax-3,$fmax);
+            my $stop_triplet = revcom($seqobj->subseq($fmin+1,$fmin+3))->seq();
+            if(!$codon_table->is_ter_codon($stop_triplet)){
+                print "#Bad stop codon $fname,$seqname,$fmin,$fmax,$orient codon $fmin+1,$fmin+3 ",$stop_triplet," aln_orient:$aln_orient\n" if($verbose || $debug);
+                $is_bad_stop=1;
+            }
+            #Find stop codon on - strand
+            $stopcodon = &getAlignedCols($atree,$seqname,$fmin,$fmin+3);
+            #Explicit true value: an unaligned stop codon (undef) is not a failure
+            1;
+        };
+        if(!$ok){
+            warn $@ if($verbose);
+            print STDERR "ERROR invalid start,stop codons or invalid translation. $seqname,$fmin,$fmax,$orient,$fname\n";
+            return 0;
+        }
+        $is_partial_start = $near_spacer->($fmax);
+        $is_partial_stop = $near_spacer->($fmin);
+    }
+    return ($startcodon,$stopcodon,$is_partial_start,$is_partial_stop,$is_bad_start,$is_bad_stop);
+#Looks up the row for $seqname in alignment $align_name of $atree.
+#Rows are read from $atree->{_alignments}->{$align_name}->[0]; the first
+#element of a row is its sequence name.
+#Returns the row (array ref), or undef when the sequence is not in the alignment.
+sub getAlignment{
+    my($atree,$align_name,$seqname) = @_;
+    my $rows = $atree->{_alignments}->{$align_name}->[0];
+    for my $row (@$rows){
+        return $row if($row->[0] eq $seqname);
+    }
+    print "#Can't find $seqname on alignment $align_name\n" if($debug);
+    return undef;
+#Looks for indels in alignment columns [$startcol,$endcol] of alignment $aln.
+#$seq is the query sequence and $refseq the sequence it is compared against.
+#Every column where query and reference differ and one of the two has a gap
+#('-') is reported; point mutations are ignored for now.
+#Returns an array ref of edits, ordered by column:
+# [coordinate on $seq, reference char, query char, alignment column, fstype]
+# fstype is 1 when the reference has the gap and -1 when the query has it.
+#Returns an empty list when $refseq is given but query or reference is not a
+#row of the alignment matrix.
+#Calling without $refseq (use the most frequent allele as reference) is
+#disabled: the sub dies as soon as a candidate column is found.
+sub reportVariants{
+    my($atree,$db,$aln,$seq,$startcol,$endcol,$refseq) = @_;
+    my $skipgapcheck=0;
+    my $GAPWINDOW=10;
+    die if($endcol<$startcol);
+    print "#Analyzing codon position $startcol in alignment $aln seq $seq \n" if($debug);
+    print "#Retrieving alignment matrix for $startcol-$endcol for alignment $aln \n" if($debug);
+    my ($mmatrix,$seqmatrix,$names) = $atree->getAlignmentMatrix($aln,$startcol,$endcol,$db,$refseq,$seq);
+    print "#Expecting width ",($endcol-$startcol+1)," row count ",scalar(@$mmatrix)," ",scalar(@$names),"\n" if($debug);
+    #List of columns with variants
+    my $results = {};
+    my @edits;
+    #Matrix rows of the query and of the optional reference seq
+    my $qryidx;
+    my $refidx=-1;
+    for(my $i=0;$i<@$mmatrix;$i++){
+        $qryidx = $i if($names->[$i] eq $seq);
+        $refidx = $i if(defined $refseq && $names->[$i] eq $refseq);
+    }
+    #A missing row must not be looked up: index -1 would silently compare
+    #against the last row of the matrix
+    if(defined $refseq && (!defined $qryidx || $refidx < 0)){
+        print "#Can't find $seq and $refseq together in alignment $aln\n" if($debug);
+        return \@edits;
+    }
+    #Matrix cols start at 0
+    for(my $j=0;$j<($endcol-$startcol+1);$j++){
+        if(defined $refseq){
+            if(uc(substr($mmatrix->[$refidx],$j,1)) ne uc(substr($mmatrix->[$qryidx],$j,1))){
+                $results->{$j}++;
+            }
+        }
+        else{
+            for(my $i=0;$i<@$mmatrix;$i++){
+                if(substr($mmatrix->[$i],$j,1) ne '.'){
+                    if($skipgapcheck || substr($mmatrix->[$i],$j,$GAPWINDOW) =~ /\./ ){ #gap < GAPWINDOW
+                        #column $i has multiple characters, gaps or mutations
+                        print "#MUT $i $j ",substr($mmatrix->[$i],$j,1)," $names->[$i] $seq\n" if($debug);
+                        $results->{$j}++;
+                    }
+                }
+            }
+        }
+    }
+    my $alni = &getAlignment($atree,$aln,$seq);
+    #Allele counts were only gathered by the disabled majority-allele mode;
+    #kept empty so the #ALT debug line keeps its format
+    my $freqchar = {};
+    #Column offsets are numbers, sort them numerically
+    foreach my $r (sort {$a <=> $b} keys %$results){
+        die "reportVariants: no reference sequence given for alignment $aln" if(!defined $refseq);
+        my $reloffset = $startcol+$r;
+        my $qrychar = substr($mmatrix->[$qryidx],$r,1);
+        my $refchar = substr($mmatrix->[$refidx],$r,1);
+        next if(uc($refchar) eq uc($qrychar));
+        my $fstype = 0;
+        if($refchar eq '-'){
+            $fstype=1;
+        }
+        elsif($qrychar eq '-'){
+            $fstype=-1;
+        }
+        #Ignore point mutations for now
+        next if($fstype==0);
+        my($fsstart,$fsend) = AlignmentTree::columntocoords($alni,$reloffset,$reloffset);
+        print "#ALT col:$reloffset coord:$fsstart-$fsend base:$refchar freq:$freqchar->{$refchar} $seq:$qrychar $freqchar->{$qrychar} fstype:$fstype\n" if($debug);
+        push @edits,[$fsstart,$refchar,$qrychar,$reloffset,$fstype];
+    }
+    return \@edits;
+#Finds the alignment whose interval on $seqname is exactly [$fmin,$fmax] and
+#converts that range to alignment columns.
+#Candidates come from $atree->intersect with the file-level $aligntoken.
+#Returns [start_colnum,alignment_name,end_colnum,matching_bits], i.e. the three
+#values of AlignmentTree::coordstocolumn with the alignment name inserted.
+#Returns undef when no alignment matches. The last match wins if there are several.
+#Dies on an unknown alignment name or when elements 6 and 7 of a hit differ.
+sub getAlignedCols{
+    my($atree,$seqname,$fmin,$fmax) = @_;
+    my $hit;
+    my $seen=0;
+    my @candidates = $atree->intersect($seqname,$fmin,$fmax,$aligntoken);
+    for my $cand (@candidates){
+        next if($seqname ne $cand->[1]);
+        my($align_name,$align_start,$align_end) = @{$cand}[0,2,3];
+        die "Bad alignment name $align_name" if(!exists $atree->{_alignments}->{$align_name});
+        die "Mis-mathed orient $cand->[6] ne $cand->[7]" if($cand->[7] ne $cand->[6]);
+        next if(!($align_start == $fmin && $fmax == $align_end));
+        if($seen){
+            print "#WARNING Overlapping aligned region found for $seqname,$fmin,$fmax. $align_name and $hit->[1]\n" if($debug);
+        }
+        my $rows = $atree->{_alignments}->{$align_name}->[0];
+        my @cols = AlignmentTree::coordstocolumn($rows,$seqname,$fmin,$fmax,1);
+        $hit = [$cols[0],$align_name,$cols[1],$cols[2]];
+        $seen=1;
+    }
+    return $hit;
+#Attempts to call an ORF using start codon specified by [start-end]
+#Start,end should be codon coordinates relative to the + strand. start<end
+#The requested strand is tried first, then the opposite one.
+#Leading strand 5'->3' increasing coordinates [start-firstStop]
+#Lagging strand 5'->3' decreasing coordinates [end-firstStop]
+#Will only call ORF if start,end,orient corresponds to an actual start
+#codon in translation table 11
+#At most $MAXORFLEN bases from the codon are translated.
+#fs is an array reference of signed locations of the frameshift relative to the sequence start
+#eg. +10 is a forward frameshift 10 bp downstream from translation start
+#    -9 is a backward frameshift 9 bp downstream from translation start
+#Frameshifts are only applied when the ORF is called on the requested + strand.
+#Returns (protein sequence,strand the ORF was called on), or undef when
+#neither strand starts with a valid start codon or $seqobj is false.
+#Dies when the codon is not exactly 3 wide or $orient is neither '+' nor '-'.
+sub callORF{
+    my($seqobj,$codon_start,$codon_end,$orient,$fs) = @_;
+    die "Bad start codon $seqobj:$codon_start-$codon_end $orient" if($codon_end < $codon_start || $codon_end - $codon_start != 3);
+    my $codon_table = Bio::Tools::CodonTable->new(-id=>11);
+    if(!$seqobj){
+        print "#ERROR invalid seq obj $seqobj\n" if($debug);
+        return undef;
+    }
+    die if($orient ne '+' && $orient ne '-');
+    #Bases downstream of the codon on the + strand. The end is clamped to the
+    #sequence length; trunc rejects coordinates past the end of the sequence
+    my $plus_window = sub {
+        my $last = $codon_start+$MAXORFLEN;
+        $last = $seqobj->length() if($last > $seqobj->length());
+        return $seqobj->trunc($codon_start+1,$last);
+    };
+    #Bases upstream of the codon end, reverse complemented
+    my $minus_window = sub {
+        my $first = ($codon_end>$MAXORFLEN) ? $codon_end-$MAXORFLEN : 1;
+        my $win = $seqobj->trunc($first,$codon_end);
+        #A failing revcom is ignored and leaves the forward sequence in place
+        eval{
+            $win = $win->revcom();
+        };
+        return $win;
+    };
+    my $is_start = sub { return $codon_table->is_start_codon($_[0]->subseq(1,3)); };
+    my $translate = sub {
+        return $_[0]->translate(-orf => 1,
+                                -codontable_id =>11)->seq();
+    };
+    if($orient eq '+'){
+        my $window = $plus_window->();
+        #One encoding char per base: C coding, F forward and B backward frameshift
+        my $encoding = 'C'x$window->length();
+        foreach my $fs_loc (@$fs){
+            next if(!defined $fs_loc);
+            if($fs_loc>0){
+                #a forward frameshift
+                print "Encoding a forward frameshift at $fs_loc in ORF of length ",$window->length(),"\n";
+                substr($encoding,$fs_loc,1,'F');
+            }
+            else{
+                #a backward frameshift
+                print "Encoding a reverse frameshift at $fs_loc in ORF of length ",$window->length(),"\n";
+                substr($encoding,($fs_loc*-1),1,'B');
+            }
+        }
+        die if(length($encoding)!=$window->length());
+        my $encoded = Bio::Seq::EncodedSeq->new(-seq=>$window->seq(),
+                                                -encoding=>$encoding);
+        #Check if valid start codon
+        return ($translate->($encoded),$orient) if($is_start->($encoded));
+        print "#callORF trying '-' $seqobj,$codon_start,$codon_end,$orient Bad start codon ",$encoded->subseq(1,3) if($debug);
+        my $revseq = $minus_window->();
+        return ($translate->($revseq),'-') if($is_start->($revseq));
+        print "#WARNING: Skipping callORF $seqobj,$codon_start,$codon_end,$orient. '",$revseq->subseq(1,3),"' is not a valid start codon\n" if($debug);
+    }
+    else{
+        my $revseq = $minus_window->();
+        #Check if valid start codon
+        return ($translate->($revseq),$orient) if($is_start->($revseq));
+        print "#callORF trying '+' $seqobj,$codon_start,$codon_end,$orient Bad start codon ",$revseq->subseq(1,3) if($debug);
+        my $fwdseq = $plus_window->();
+        return ($translate->($fwdseq),'+') if($is_start->($fwdseq));
+        print "WARNING: Skipping callORF $seqobj,$codon_start,$codon_end,$orient. '",$fwdseq->subseq(1,3),"' is not a valid start codon\n";
+    }
+    return undef;
+#Print alternative start sites
+#Report aligned but un-annotated start codons
+#(1) alternative start location, frequency annotated in the alignment
+#(2) resulting ORF, len
+#
+#$db     hash of sequence name => sequence object; no ORFs are called without it
+#$codons {'starts'|'stops'}->{seqname}->{codon} counts, codon being
+#        "column<delim>alignment" (split on $CODON_DELIM_REGEX)
+#$seqs   array ref of sequence names to examine
+#$seq_attrs is accepted but not used; every statement that stored results in
+#        $codons->{'alt_starts'} / {'pairs'} is commented out below
+#The alignment tree is NOT a parameter: $atree refers to a variable declared
+#outside of this sub.
+#Returns an array ref of [seqname,fmin,fmax,orient,startcodon,stopcodon], one
+#per ORF that could be called from a start codon annotated on another
+#sequence. Returns undef when no ORF was found.
+sub checkStarts{
+ my ($db,$codons,$seqs,$seq_attrs) = @_;
+ my $altorfs;
+ #Save list of all start codons $codon->$freq
+ my $starts = {};
+ #Summed below, but only read by commented-out code
+ my $stops = {};
+ die if(!exists $codons->{'starts'});
+ foreach my $seqname (keys %{$codons->{'starts'}}){
+ foreach my $codon (keys %{$codons->{'starts'}->{$seqname}}){
+ $starts->{$codon} += $codons->{'starts'}->{$seqname}->{$codon};
+ }
+ }
+ die if(!exists $codons->{'stops'});
+ foreach my $seqname (keys %{$codons->{'stops'}}){
+ foreach my $codon (keys %{$codons->{'stops'}->{$seqname}}){
+ $stops->{$codon} += $codons->{'stops'}->{$seqname}->{$codon};
+ }
+ }
+ foreach my $seqname (@{$seqs}){
+ #Consider all codons that are not currently annotated on this sequence
+ foreach my $codon (keys %$starts){
+ if(! exists $codons->{'starts'}->{$seqname}->{$codon}){ #start codon is not annotated on $seqname
+ print "#CODON $codon not annotated on $seqname\n" if($debug);;
+ #check if $codon is aligned
+ #$codon is a tuple of alignment,aligned_column
+ my($col,$aln) = split(/$CODON_DELIM_REGEX/,$codon);
+ my $gapped=1; #isgapped
+ #check is $col,$col+3 is gapped, return start coordinate on the genome
+ my $i = &getAlignment($atree,$aln,$seqname);
+ if($i){
+ #Cannot trigger: we are inside if($i)
+ die "Cannot find alignment $aln that contains $seqname" if(!$i);
+ #Obtain coordinates of the putative start codon
+ my($start,$end) = AlignmentTree::columntocoords($i,$col,$col+2);
+ #The codon counts as ungapped only if its three columns span exactly 3 bases
+ $gapped = (abs($end-$start) == 3) ? 0 : 1;
+ if(!$gapped){
+ #codon is aligned, attempt to call ORF
+ #save and report it. save frequency
+ if($db){
+ my $orient = $i->[3];
+ print "#Looking for ORF $start,$end,$orient on $seqname\n" if($debug);
+ #my $seqobj = $db->get_Seq_by_id($seqname);
+ my $seqobj = $db->{$seqname};
+ if($seqobj){
+ #Cannot trigger: we are inside if($seqobj)
+ die "Can't find sequence $seqname obj:$seqobj" if(!defined $seqobj);
+ #callORF may call the ORF on the opposite strand; $callorient is the strand used
+ my ($neworf,$callorient) = &callORF($seqobj,$start,$end,$orient);
+ if(length($neworf)>$MINORFLEN){
+ print "#Calling ORF on strand $callorient start coord = $start\n" if($debug);;
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'freq'} = $starts->{$codon};
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'neworf'} = $neworf;
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'orient'} = $callorient;
+ #ORF extent: three bases per residue of the translation, anchored at the codon
+ my $fmin;
+ my $fmax;
+ if($callorient eq '+'){
+ $fmin=$start;
+ $fmax=$start+(length($neworf)*3);
+ }
+ else{
+ $fmin=$end-(length($neworf)*3);
+ $fmax=$end;
+ }
+ #NOTE(review): !$fmin also rejects an ORF that starts at coordinate 0 - confirm this is intended
+ if(!$fmin || $fmin<0){
+ print STDERR "Bad ORF call on $seqname $start,$end converted to $fmin,$fmax\n";
+ next;
+ }
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'start'} = $fmin;
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'end'} = $fmax;
+ #Only the first two return values (aligned start and stop codon) are used
+ my($strc,$stpc) = &findCodons($atree,
+ $seqname,
+ $fmin,
+ $fmax,
+ $callorient);
+ #if($callorient eq '-'){
+ # ($strc,$stpc) = ($stpc,$strc);
+ # }
+ my $startcodon;
+ my $stopcodon;
+ #findCodons gives an array ref per aligned codon; anything else means not aligned
+ if(ref $strc){
+ my($mcol,$align_name) = (@$strc);
+ $startcodon = $mcol.$CODON_DELIM.$align_name;
+ #die "Can't find start $mcol,$align_name $callorient,$orient from $seqname $codon" if(!exists $starts->{$startcodon});
+ #if(!exists $starts->{$startcodon}){
+ # $codons->{'alt_starts'}->{$seqname}->{$codon}->{'startfreq'} = 0;
+ #}
+ #else{
+ # $codons->{'alt_starts'}->{$seqname}->{$codon}->{'startfreq'} = $starts->{$startcodon};
+ #}
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'startcol'} = $mcol;
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'startcodon'} = $startcodon;
+ }
+ if(ref $stpc){
+ my($mcol,$align_name) = (@$stpc);
+ $stopcodon = $mcol.$CODON_DELIM.$align_name;
+ #die "Can't find stop $mcol,$align_name $callorient,$orient from $seqname $codon" if(!exists $stops->{$stopcodon});
+ #if(!exists $stops->{$stopcodon}){
+ # $codons->{'alt_starts'}->{$seqname}->{$codon}->{'stopfreq'} = 0;
+ #}
+ #else{
+ # $codons->{'alt_starts'}->{$seqname}->{$codon}->{'stopfreq'} = $stops->{$stopcodon};
+ #}
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'stopcol'} = $mcol;
+ #$codons->{'alt_starts'}->{$seqname}->{$codon}->{'stopcodon'} = $stopcodon;
+ }
+ #Save start,stop pair
+ if($startcodon && $stopcodon){
+ #$codons->{'pairs'}->{$startcodon.':'.$stopcodon}->{'gfreq'}++;
+ #$codons->{'pairs'}->{$startcodon.':'.$stopcodon}->{'length'} += ($fmax-$fmin);
+ #$codons->{'pairs'}->{$startcodon.':'.$stopcodon}->{'orgs'}->{$seqname} = [$fmin,$fmax,0];
+ push @$altorfs,[$seqname,$fmin,$fmax,$callorient,$startcodon,$stopcodon];
+ }
+ }
+ else{
+ print "Skipping short ORF ",length($neworf)," <$MINORFLEN $start,$end,$orient\n" if($debug);
+ }
+ }
+ else{
+ print "#WARNING. Sequence $seqname not found in FASTA file. Skipping calling new ORFs.\n";
+ }
+ }
+ else{
+ print "#WARNING. No FASTA file, cannot call new ORFs\n";
+ }
+ }
+ else{
+ print "#alignment to codon contains gaps $start,$end\n" if($debug);;
+ }
+ }
+ else{
+ print "#can't find alignment $aln $seqname\n" if($debug);;
+ }
+ }
+ }
+ }
+ return $altorfs;
+#Picks the sequence to compare $seqname against over [$start,$end].
+#The hits of $atree->map are ranked by their element 8, highest first; the
+#first hit on a sequence other than $seqname that is a key of $mappedseqs wins.
+#Returns that sequence name. There is no explicit return when nothing qualifies.
+sub findNearestNeighbor{
+    my($atree,$seqname,$mappedseqs,$start,$end) = @_;
+    my @hits = $atree->map($seqname,$start,$end);
+    my @ranked = sort {$b->[8] <=> $a->[8]} @hits;
+    for my $hit (@ranked){
+        my $hitseq = $hit->[1];
+        next if($hitseq eq $seqname);
+        next if(!exists $mappedseqs->{$hitseq});
+        return $hitseq;
+    }
+#Collects the indels between $seqname and $nearestseq inside the region
+#delimited by an aligned start and stop codon, and groups them into runs.
+#$startcodon,$stopcodon are "column<delim>alignment" keys (split on $CODON_DELIM_REGEX).
+#Returns (\@fsruns,$netfs):
+# @fsruns one [start,end,refstring,querystring,fstype,\@positions] per run of indels
+# $netfs  sum of the fstype values (+1/-1) of all indels found
+#Nothing is returned explicitly when start and stop are $MAXORFLEN or more apart.
+sub reportFrameShifts{
+ my($atree,$db,$seqname,$nearestseq,$startcodon,$stopcodon) = @_;
+ #$codon is a tuple of alignment,aligned_column
+ my($startcol,$aln_s) = split(/$CODON_DELIM_REGEX/,$startcodon);
+ #find corresponding stop
+ my($stopcol,$aln_e) = split(/$CODON_DELIM_REGEX/,$stopcodon);
+ #NOTE(review): getAlignment returns undef when $seqname is not in the alignment; $si/$ei are used unchecked
+ my $si = &getAlignment($atree,$aln_s,$seqname);
+ my $ei = &getAlignment($atree,$aln_e,$seqname);
+ my($startcoord) = AlignmentTree::columntocoords($si,$startcol,$startcol);
+ my($stopcoord) = AlignmentTree::columntocoords($ei,$stopcol,$stopcol);
+ #Make sure we have not traversed a rearrangement
+ if(abs($stopcoord-$startcoord)<$MAXORFLEN){
+ my $fsvars = [];
+ my $netfs = 0;
+ #TODO, relax to allow multiple spanning alignments
+ print "#Looking for frameshifts in $startcodon,$stopcodon $aln_s $aln_e $startcoord $stopcoord\n" if($debug);
+ #Alignments overlapping the region, in the direction start -> stop
+ my @sortedproj;
+ if($startcoord < $stopcoord){
+ my @proj = $atree->intersect($seqname,$startcoord,$stopcoord,'WGA');
+ @sortedproj = sort {$a->[2] <=> $b->[2]} @proj;
+ }
+ else{
+ my @proj = $atree->intersect($seqname,$stopcoord,$startcoord,'WGA');
+ @sortedproj = sort {$b->[3] <=> $a->[3]} @proj;
+ }
+ print "#Found ",scalar(@sortedproj)," alignments\n" if($debug);
+ foreach my $aln (@sortedproj){
+ if($aln->[1] eq $seqname){
+ #Convert the overlapping interval to columns and scan them for indels against $nearestseq
+ my ($startcol,$stopcol) = AlignmentTree::coordstocolumn($atree->{_alignments}->{$aln->[0]}->[0],$seqname,$aln->[2],$aln->[3],1);
+ my $sv = &reportVariants($atree,$db,$aln->[0],$seqname,$startcol,$stopcol,$nearestseq);
+ foreach my $v (@$sv){
+ #$v is [coord,refchar,qrychar,column,fstype]
+ if($v->[4] != 0){
+ print "#FSVAR $seqname ",join(',',@$v),"\n" if($debug);
+ push @$fsvars,$v;
+ $netfs += $v->[4];
+ #if(abs($netfs) > $FS_THRESHOLD){
+ #Short circuit
+ #return undef;
+ #}
+ }
+ else{
+ #An edit with fstype 0 (point mutation) is not expected here
+ die;
+ }
+ }
+ }
+ }
+ #Group the indels, ordered by coordinate, into runs of neighbouring positions of the same type
+ my @coords = sort {$a->[0] <=> $b->[0]} (@$fsvars);
+ my $pos = [];
+ my @runs;
+ my $indelstr1;
+ my $indelstr2;
+ my $last;
+ my $start;
+ my $lasttype;
+ my $end;
+ for(my $i=0;$i<@coords;$i++){
+ if($i==0){
+ $start=$coords[$i]->[0];
+ $lasttype=$coords[$i]->[4];
+ $pos= [];
+ }
+ #Close the current run when the coordinate is further than one from $last+1, or the indel type changes
+ elsif(abs($last+1 - $coords[$i]->[0]) > 1 || $coords[$i]->[4] != $lasttype){
+ #print "Adding $start,$last,$indelstr1,$indelstr2,$lasttype ",scalar(@$pos),"\n";
+ push @runs,[$start,$last,$indelstr1,$indelstr2,$lasttype,$pos];
+ $indelstr1="";
+ $indelstr2="";
+ $start=$coords[$i]->[0];
+ $pos=[];
+ }
+ $last=$coords[$i]->[0];
+ $lasttype=$coords[$i]->[4];
+ $indelstr1.=$coords[$i]->[1];
+ $indelstr2.=$coords[$i]->[2];
+ push @$pos,$coords[$i]->[0];
+ }
+ #NOTE(review): a final run whose last coordinate is 0 is not pushed because of this truth test
+ if($last){
+ #print "Adding_post $start,$last,$indelstr1,$indelstr2,$lasttype ",scalar(@$pos),"\n";
+ push @runs,[$start,$last,$indelstr1,$indelstr2,$lasttype,$pos];
+ }
+ #Set if any run contains $PMARK_SPACER in either of its strings
+ my $ispmark=0;
+ foreach my $r (@runs){
+ $ispmark = ($r->[2] =~ /$PMARK_SPACER/) ? 1 : 0;
+ $ispmark = ($r->[3] =~ /$PMARK_SPACER/) ? 1 : 0 if(!$ispmark);
+ last if($ispmark);
+ }
+ #Remove runs that are multiple of 3
+ #The length filter is commented out: both branches below keep the run
+ my @fsruns;
+ foreach my $r (@runs){
+ die if(length($r->[2]) != length($r->[3]));
+ if($ispmark==1){
+ push @fsruns,$r;
+ }else{#if(length($r->[2])%3!=0 || (length($r->[2]) < $FSLEN_THRESHOLD)){
+ push @fsruns,$r;
+ }
+ }
+ if($verbose){
+ print "#FS num_runs ",scalar(@fsruns),"\n";
+ foreach my $r (@fsruns){
+ print "[$r->[0]-$r->[1] $r->[2]:$r->[3]] $r->[4] ",scalar(@{$r->[5]}),"\n";
+ }
+ print "\n";
+ }
+ return (\@fsruns,$netfs);
+ }
+#Report ORFs on aligned sequences that are unannotated
+#For all aligned segments that do not contain any annotated ORF
+#Attempt to use annotated and aligned start codons from other genomes in the cluster
+#to call new ORFs
+#$codons->{'starts'}->{seqname}->{codon} lists the annotated start codons;
+#a codon is a "column<delim>alignment" key (split on $CODON_DELIM_REGEX).
+#Every sequence of those alignments that is not a key of $mappedorgs is
+#handed to checkStarts. $mappedgenes is not used.
+#Returns a hash ref of sequence attributes.
+#NOTE(review): the list returned by checkStarts is discarded and checkStarts
+#does not fill $seq_attrs, so the returned hash is always empty - confirm
+#what callers expect before relying on it.
+sub findnewORFs{
+    my($db,$atree,$mappedorgs,$mappedgenes,$codons) = @_;
+    #Consider all possible aligned starts in the cluster
+    my $allcodons = {};
+    foreach my $seq (keys %{$codons->{'starts'}}){
+        foreach my $codon (keys %{$codons->{'starts'}->{$seq}}){
+            $allcodons->{$codon}++;
+        }
+    }
+    my $noorfseqs = {};
+    #Foreach codon, attempt to find ORFs if none annotated above cutoffs
+    print "#Total number of possible codons ",scalar(keys %$allcodons),"\n" if($debug);
+    foreach my $codon (keys %$allcodons){
+        #The sigil must not be escaped here: /\$CODON_DELIM_REGEX/ matches the
+        #literal variable name and never splits the key
+        my($col,$aln) = split(/$CODON_DELIM_REGEX/,$codon);
+        #Skip malformed keys and do not create entries for unknown alignments
+        next if(!defined $aln || !exists $atree->{_alignments}->{$aln});
+        my $alignedseqs = $atree->{_alignments}->{$aln}->[0]; #get seqs for $aln
+        foreach my $alnseq (@$alignedseqs){
+            my $seq = $alnseq->[0];
+            #Check if sequence already has a mapped gene
+            if(! exists $mappedorgs->{$seq}){
+                print "#No ORFs on seq $seq\n" if($debug);
+                $noorfseqs->{$seq}++;
+            }
+        }
+    }
+    my $seq_attrs = {};
+    print "#Looking for new starts in new orfs\n" if($debug);
+    &checkStarts($db,$codons,[keys %$noorfseqs],$seq_attrs,1);
+    return $seq_attrs;
+#Stub: no executable statement of this sub is visible here, only the notes
+#below on how rows are meant to be highlighted.
+sub printExtAlts{
+#Longest row in Green
+#Frameshifts in Red
+#Start fully consistent
+#Stub: holds design notes for an Ext JS alignment view. No Perl statement is
+#visible in this sub.
+#NOTE(review): the "var fDataTpl = ..." lines below are JavaScript, not Perl;
+#presumably they belong inside a heredoc or comment - confirm against the full file.
+sub printExtAlignments{
+#Use CSS classes for each codon, highlight in color
+#Use CSS classes for each gene
+#grid.getView().getRowClass = function(record, index){
+#return (record.data.change<0.7 ? (record.data.change<0.5 ? (record.data.change<0.2 ? 'red-row' : 'green-row') : 'blue-row') : '');
+ var fDataTpl = new Ext.XTemplate(
+ '<tpl for=".">',
+ '<div>',
+ '<pre class="x-fixed">{element}</pre>',
+ '</div>',
+ '</tpl>'
+ );
+#From http://www.sencha.com/blog/2010/07/13/a-side-by-side-diff-viewer-built-with-ext-js/
+# // Obtain reference to HTML templates
+# lineTpl = Ext.ux.CodeViewer.lineTpl,
+# emptyLineTpl = Ext.ux.CodeViewer.emptyLineTpl,
+# // Create a "pre" tag to hold the code
+# pre = this.el.createChild({tag: 'pre'}),
+# var el = lineTpl.append(pre, [i+1, this.highlightLine(lines[i])]);
+# Ext.fly(el).addClass('ux-codeViewer-modified');
+sub printExtJSCluster{
+ my($cluster_id,$clusterref) = @_;
+ #List cluster members
+ my @clustergrid;
+#Longest row in Green
+#Frameshifts in Red
+#Start fully consistent
+ my @altgrid;
+ foreach my $alt (keys %{$clusterref->{'alts'}}){
+ my $isfcon = (exists $clusterref->{'alts'}->{$alt}->{'fcon'}) ? 1 : 0;
+ my $ismax = (exists $clusterref->{'alts'}->{$alt}->{'maxlen'}) ? 1 : 0;
+ push @altgrid,["'".$clusterref->{'alts'}->{$alt}->{'name'}."'",
+ $clusterref->{'alts'}->{$alt}->{'gfreq'},
+ $clusterref->{'alts'}->{$alt}->{'afreq'},
+ $clusterref->{'alts'}->{$alt}->{'len'},
+ scalar(keys %{$clusterref->{'alts'}->{$alt}->{'neworfs'}}),
+ scalar(keys %{$clusterref->{'alts'}->{$alt}->{'fs'}}),
+ $isfcon,
+ $ismax];
+ }
+ foreach my $g (keys %{$clusterref->{'orgs'}}){
+ my @codoninfo;
+ foreach my $alt (keys %{$clusterref->{'alts'}}){
+ if(exists $clusterref->{'codons'}->{$alt}->{'features'}){
+ push @codoninfo,$clusterref->{'alts'}->{$alt}->{'name'};
+ }
+ }
+ my $gref = $clusterref->{'orgs'}->{$g};
+ push @clustergrid,["'CLUSTER_".$cluster_id."'",
+ "'".$g."'",
+ "'".join(',',@{$gref->{'genes'}})."'",
+ "'".join(',',@{$gref->{'cov'}})."'",
+ "'".join(',',@{$gref->{'pid'}})."'",
+ $gref->{'fmin'},
+ $gref->{'fmax'},
+ $gref->{'len'},
+ "'".join(',',@{$gref->{'orient'}})."'",
+ "'".join(',', at codoninfo)."'",
+ "'".$gref->{'desc'}."'"
+ ];
+ }
+ #List edits
+ #Show alignment
+ if($htmlout){
+ #Link to prev and next cluster
+ my $jsfh;
+ my $htmlfh;
+ my $htmlrelpath = basename("$options{'prefix'}cluster_${cluster_id}.html");
+ my $jsrelpath = basename("$options{'prefix'}cluster_${cluster_id}.js");
+ open $jsfh,"+>$options{'prefix'}cluster_${cluster_id}.js";
+ open $htmlfh,"+>$options{'prefix'}cluster_${cluster_id}.html";
+ print $htmlfh <<_CLUSTERHTMLHEADER;
+ <html>
+ <head>
+ <title>Cluster $cluster_id</title>
+ <link rel="stylesheet" type="text/css" href="http://dev.sencha.com/deploy/dev/resources/css/ext-all.css" />
+ <script type="text/javascript" src="http://dev.sencha.com/deploy/dev/adapter/ext/ext-base.js"></script>
+ <script type="text/javascript" src="http://dev.sencha.com/deploy/dev/ext-all-debug.js"></script>
+ </head>
+ <body>
+ <script type="text/javascript" src="$jsrelpath"></script>
+ <div id="my-div" class="x-hidden">
+ <pre>
+ print $htmlfh `cat $options{'prefix'}cluster_${cluster_id}.aln.out`;
+ print $htmlfh <<_CLUSTERHTMLFOOTER;
+ </pre>
+ print $jsfh <<_CLUSTERJSHEADER;
+ function renderGeneURL(val){
+ return '<a href="javascript:document.getElementById(\\''+val+'\\').scrollIntoView(true);">'+val+'</a>';
+ }
+ Ext.onReady(function(){
+ Ext.QuickTips.init();
+ var xg = Ext.grid;
+ var featstore = new Ext.data.ArrayStore({
+ fields: [
+ {name: 'cluster'},
+ {name: 'genome'},
+ {name: 'name'},
+ {name: 'coverage', type: 'float'},
+ {name: 'identity', type: 'float'},
+ {name: 'fmin', type: 'float'},
+ {name: 'fmax', type: 'float'},
+ {name: 'length', type: 'float'},
+ {name: 'strand'},
+ {name: 'codon_pairs'},
+ {name: 'desc'}
+ ]
+ });
+ var altstore = new Ext.data.ArrayStore({
+ fields: [
+ {name: 'name'},
+ {name: 'gfreq'},
+ {name: 'afreq'},
+ {name: 'len'},
+ {name: 'neworfs'},
+ {name: 'fs'},
+ {name: 'isfcon'},
+ {name: 'ismax'},
+ ]
+ });
+ featstore.loadData(xg.clusterData);
+ altstore.loadData(xg.altData);
+ var alntext = new Ext.Panel({
+ 'id':'alntext',
+ 'title':'Alignment detail',
+ 'region':'south',
+ split:true,
+ height:300,
+ collapsible: true,
+ autoScroll:true,
+ contentEl:'my-div'
+ });
+ var featgrid = new xg.GridPanel({
+ store: featstore,
+ columns: [
+ {id:'cluster',header: "Cluster", width: 70, sortable: true, dataIndex: 'cluster'},
+ {header: "Feature", width: 100, sortable: true, dataIndex: 'name'},
+ {header: "Genome", width: 100, sortable: true, dataIndex: 'genome'},
+ {header: "fmin", width: 50, sortable: true, dataIndex: 'fmin'},
+ {header: "fmax", width: 50, sortable: true, dataIndex: 'fmax'},
+ {header: "strand", width: 30, sortable: true, dataIndex: 'strand'},
+ {header: "Len", width: 50, sortable: true, dataIndex: 'length'},
+ {header: "Coverage", width: 50, sortable: true, dataIndex: 'coverage'},
+ {header: "Identity", width: 50, sortable: true, dataIndex: 'identity'},
+ {header: "Alt ORFs", width: 200, sortable: true, dataIndex: 'codon_pairs'},
+ {header: "Description", width: 500, sortable: true, dataIndex: 'desc'},
+ ],
+ viewConfig: {
+ forceFit: true},
+ frame: true,
+ animCollapse: false,
+ title: 'Cluster ${cluster_id} annotation summary',
+ iconCls: 'icon-grid',
+ fbar : ['->', {
+ text:'Save as text',
+ handler : null
+ }],
+ columnWidth: .6,
+ flex:1
+ });
+ var editgrid = new xg.GridPanel({
+ store: altstore,
+ columns: [
+ {id:'name',header: "ORF", width: 70, sortable: true, dataIndex: 'name'},
+ {header: "Aligned Freq", width: 50, sortable: true, dataIndex: 'gfreq'},
+ {header: "Annotated Freq", width: 50, sortable: true, dataIndex: 'afreq'},
+ {header: "Len", width: 50, sortable: true, dataIndex: 'len'},
+ {header: "# Missing", width: 50, sortable: true, dataIndex: 'neworfs'},
+ {header: "# FS", width: 50, sortable: true, dataIndex: 'fs'},
+ {header: "isfcon", width: 50, sortable: true, dataIndex: 'isfcon'},
+ {header: "ismax", width: 50, sortable: true, dataIndex: 'ismax'},
+ ],
+ split:true,
+ frame: true,
+ collapsible: true,
+ animCollapse: false,
+ title: 'Cluster 1 edit summary',
+ iconCls: 'icon-grid',
+ columnWidth: .4,
+ flex:1
+ });
+ var viewport = new Ext.Viewport({
+ layout:'border',
+ //defaults: {autoScroll:true,height:500},
+ items: [
+ new Ext.Panel({
+ layout:'fit',
+ region:'center',
+ items: [
+ new Ext.Panel({
+ layout:'hbox',
+ layoutConfig: {
+ align : 'stretch',
+ pack : 'start',
+ },
+ region:'center',
+ items: [ featgrid,editgrid]
+ })
+ ]
+ }),
+ alntext
+ ]
+ });
+ viewport.doLayout();
+ });
+ ;
+ print $jsfh "Ext.grid.clusterData = [";
+ foreach my $c (@clustergrid){
+ print $jsfh "[",join(',',@$c),"],\n";
+ }
+ print $jsfh "];\n";
+ print $jsfh "Ext.grid.altData = [";
+ foreach my $c (@altgrid){
+ print $jsfh "[",join(',',@$c),"],\n";
+ }
+ print $jsfh "];\n";
+ close $jsfh;
+ close $htmlfh;
+ }
+#Writes the top-level report: $options{'prefix'}index.html and the
+#$options{'prefix'}main.js script that the HTML page loads.
+#Input: $clusters - hash ref keyed by cluster id; for each value the keys
+#'num_feats', 'num_genomes' and 'classes' are read here.
+#Side effects: calls printExtJSCluster() once per cluster (per-cluster output),
+#then creates/truncates the two files ("+>" mode). After the static JavaScript
+#header, main.js receives an Ext.grid.summaryData array literal with one row
+#per cluster. Neither open() result is checked.
+sub printExtJS{
+ my($clusters) = @_;
+ my @clustergrid;
+ #Each summary row is cluster_id,#features,#genomes,class - the same order
+ #as the ArrayReader fields declared in main.js below. Only the class value
+ #is wrapped in single quotes; the other values are emitted as-is.
+ foreach my $cluster_id (keys %$clusters){
+ push @clustergrid,[$cluster_id,$clusters->{$cluster_id}->{'num_feats'},$clusters->{$cluster_id}->{'num_genomes'},"'".$clusters->{$cluster_id}->{'classes'}."'"];
+ &printExtJSCluster($cluster_id,$clusters->{$cluster_id});
+ }
+ my $jsfh;
+ my $htmlfh;
+ #Links inside the generated files use only the last path component of the
+ #prefix, so they are relative to the directory holding index.html
+ my $jsrelpath = basename("$options{'prefix'}main.js");
+ my $relpath = basename("$options{'prefix'}");
+ open $jsfh,"+>$options{'prefix'}main.js";
+ open $htmlfh,"+>$options{'prefix'}index.html";
+ print $htmlfh <<_HTMLHEADER;
+<title>Mugsy-Annotator Report</title>
+<link rel="stylesheet" type="text/css" href="http://dev.sencha.com/deploy/dev/resources/css/ext-all.css" />
+<script type="text/javascript" src="http://dev.sencha.com/deploy/dev/adapter/ext/ext-base.js"></script>
+<script type="text/javascript" src="http://dev.sencha.com/deploy/dev/ext-all-debug.js"></script>
+<script type="text/javascript" src="$jsrelpath"></script>
+ print $jsfh <<_MAINJSHEADER;
+ function renderClusterURL(val){
+ return '<a href="${relpath}cluster_'+val+'.html">CLUSTER_'+val+'</a>';
+ }
+ Ext.onReady(function(){
+ Ext.QuickTips.init();
+ var xg = Ext.grid;
+ // shared reader
+ var reader = new Ext.data.ArrayReader({}, [
+ {name: 'cluster_id'},
+ {name: 'num_feats'},
+ {name: 'num_genomes'},
+ {name: 'quality_class'}
+ ]);
+ var store = new Ext.data.GroupingStore({
+ reader: reader,
+ data: xg.summaryData,
+ sortInfo:{field: 'cluster_id', direction: "ASC"},
+ groupField:'quality_class'
+ });
+ var grid = new xg.GridPanel({
+ store: store,
+ columns: [
+ {id:'Cluster',header: "Cluster", width: 10, sortable: true, dataIndex: 'cluster_id', renderer:renderClusterURL},
+ {header: "Features", width: 10, sortable: true, dataIndex: 'num_feats'},
+ {header: "Genomes", width: 10, sortable: true, dataIndex: 'num_genomes'},
+ {header: "Class", width: 20, sortable: true, dataIndex: 'quality_class'},
+ ],
+ view: new Ext.grid.GroupingView({
+ forceFit:true,
+ groupTextTpl: '{text} ({[values.rs.length]} {[values.rs.length > 1 ? "Items" : "Item"]})'
+ }),
+ frame:true,
+ width: 700,
+ height: 450,
+ collapsible: true,
+ animCollapse: false,
+ title: 'Annotation summary',
+ iconCls: 'icon-grid',
+ fbar : ['->', {
+ text:'Clear Grouping',
+ iconCls: 'icon-clear-group',
+ handler : function(){
+ store.clearGrouping();
+ }
+ }],
+ renderTo: document.body
+ });
+ ;
+ print $jsfh "Ext.grid.summaryData = [";
+ foreach my $c (@clustergrid){
+ print $jsfh "[",join(',',@$c),"],\n";
+ }
+ print $jsfh "];\n";
+ close $jsfh;
+ close $htmlfh;
diff --git a/mapping/mugsy-annotator b/mapping/mugsy-annotator
new file mode 100644
index 0000000..3cbab63
--- /dev/null
+++ b/mapping/mugsy-annotator
@@ -0,0 +1,48 @@
+#USAGE: mugsy-annotator allgenomes.fsa aln.maf *.gbk
+#Features can be either a GFF3 file, a GBK genbank flat file, or 5 column text files ($featname $seqname $fmin $fmax $strand )
+#Generate a multi-FASTA file with all your genome sequences
+#cat genome1 ...genomeN > allgenomes.fsa
+#
+#Pipeline: index the alignment ($2) with mafindex.pl, collect all feature
+#files into one GFF file, index it with featureindex.pl, then run mapfeatures.pl.
+#Intermediate files are written under /tmp with the shell PID ($$) as a name
+#prefix; nothing in the lines shown here removes them.
+#NOTE(review): PREFIX, IDXFILE and FSAFILE are used below but are not assigned
+#in the lines visible here - confirm they are set before this point.
+if [ ! -d "$PREFIX" ]
+ echo "Cannot find installation directory $PREFIX. Edit the script to configure a valid directory"
+ exit 1
+if [ $# -lt 3 ]
+ echo "USAGE: mugsy-annotator allgenomes.fsa aln.maf *.gbk"
+ exit 1
+echo "Building index for alignment $2" >&2
+$PREFIX/mafindex.pl $IDXFILE < $2 > /tmp/$$.mafidx
+#Append every positional argument to /tmp/$$.gff. A file whose first line
+#starts with LOCUS is treated as GenBank and converted with bp_genbank2gff3.pl;
+#anything else is appended unchanged.
+#NOTE(review): $@ is unquoted and, unless the first two arguments are shifted
+#off somewhere not shown, this loop also visits the FASTA and MAF files.
+for gff in $@
+ do
+ isgb=`head -1 $gff | grep "^LOCUS"`
+ if [ "$isgb" != "" ]
+ then
+ echo "Converting file $gff to GFF" >&2
+ `bp_genbank2gff3.pl --filter misc_feature -in stdin -out - < $gff | grep -v "# Input" >> /tmp/$$.gff`;
+ else
+ cat $gff >> /tmp/$$.gff
+ fi
+echo "Building index for features" >&2
+$PREFIX/featureindex.pl $IDXFILE gff < /tmp/$$.gff > /tmp/$$.featidx
+echo "Mapping features" >&2
+#The two hints below go to stdout, i.e. into the same stream as the
+#mapfeatures.pl output that follows
+echo "To print with aligment detail. Run $PREFIX/mapfeatures.pl --printalignments $IDXFILE $FSAFILE < /tmp/$$.gff"
+echo "To print with html reports. Run $PREFIX/mapfeatures.pl --printhtml $IDXFILE $FSAFILE < /tmp/$$.gff"
+$PREFIX/mapfeatures.pl $IDXFILE $FSAFILE < /tmp/$$.gff
diff --git a/mapping/mugsyindex.pl b/mapping/mugsyindex.pl
new file mode 100755
index 0000000..09eb557
--- /dev/null
+++ b/mapping/mugsyindex.pl
@@ -0,0 +1,38 @@
+#./mugsyindex.pl index.file < mugsy.out
+#Adds MUGSY output to a MUGSY formatted index
+#Each block is inserted with the type tag 'synteny' (third argument of insert)
+use strict;
+use lib '/usr/local/projects/angiuoli/developer/sangiuoli/mugsy/trunk';
+use AlignmentTree;
+use Data::Dumper;
+#Start from an empty tree; an existing index file is loaded and extended
+my $atree = new AlignmentTree();
+if(-e $ARGV[0]){
+ $atree = AlignmentTree::deserialize($ARGV[0]);
+my $currscore;
+my $block = [];
+my $k=0;
+my $name;
+#Consecutive input lines sharing the same first column form one block.
+#Lines starting with whitespace or '#' are skipped.
+while(my $line=<STDIN>){
+ chomp $line;
+ if($line !~ /^[\s\#]/){
+ my @elts = split(/\s+/,$line);
+ #First column changed: store the block collected so far and start a new one
+ if($name ne $elts[0]){
+ $atree->insert($block,$name,"synteny") if(scalar @$block>0 && $name);
+ $name = "$elts[0]";
+ $block = [];
+ }
+ #Columns are reordered to 1,3,4,2 - presumably seqname,start,end,orient
+ #as in the rows built by the other index scripts; TODO confirm against
+ #the mugsy.out column layout
+ push @$block,[$elts[1],$elts[3],$elts[4],$elts[2]];
+ }
+#Flush the last block, which has no following line to trigger the insert
+$atree->insert($block,$name,"synteny") if(scalar @$block>0 && $name);
+#NOTE(review): no call that actually writes the index is visible after this
+#message in the lines shown here
+print STDERR "Writing index to $ARGV[0]\n";
diff --git a/mapping/mugsymapper b/mapping/mugsymapper
new file mode 100755
index 0000000..24cdd0b
--- /dev/null
+++ b/mapping/mugsymapper
@@ -0,0 +1,34 @@
+#USAGE: mugsymapper allgenomes.fsa aln.maf *.gbk
+#Features can be either a GFF3 file, a GBK genbank flat file, or 5 column text files ($featname $seqname $fmin $fmax $strand )
+#Generate a multi-FASTA file with all your genome sequences
+#cat genome1 ...genomeN > allgenomes.fsa
+#
+#Same steps as mugsy-annotator: mafindex.pl on the alignment ($2), all feature
+#files collected into /tmp/$$.gff, featureindex.pl, then mapfeatures.pl.
+#NOTE(review): PREFIX, IDXFILE and FSAFILE are not assigned in the lines
+#visible here, and the /tmp/$$.* files are not removed by anything shown.
+echo "Building index for alignment $2" >&2
+$PREFIX/mafindex.pl $IDXFILE < $2 > /tmp/$$.mafidx
+#GenBank inputs (first line starts with LOCUS) are converted with
+#bp_genbank2gff3.pl; every other argument is appended unchanged
+for gff in $@
+ do
+ isgb=`head -1 $gff | grep "^LOCUS"`
+ if [ "$isgb" != "" ]
+ then
+ echo "Converting file $gff to GFF" >&2
+ `bp_genbank2gff3.pl --filter misc_feature -in stdin -out - < $gff | grep -v "# Input" >> /tmp/$$.gff`;
+ else
+ cat $gff >> /tmp/$$.gff
+ fi
+$PREFIX/featureindex.pl $IDXFILE gff < /tmp/$$.gff > /tmp/$$.featidx
+echo "Mapping features" >&2
+$PREFIX/mapfeatures.pl $IDXFILE $FSAFILE < /tmp/$$.gff
diff --git a/mapping/query.pl b/mapping/query.pl
new file mode 100644
index 0000000..4e6412c
--- /dev/null
+++ b/mapping/query.pl
@@ -0,0 +1,19 @@
+#Loads a stored AlignmentTree from $ARGV[0] and prints every result of
+#intersect($ARGV[1],$ARGV[2],$ARGV[3]) - presumably seqname, start, end,
+#as in the intersect calls of testitree.pl; TODO confirm
+use strict;
+use AlignmentTree;
+use Storable qw(store retrieve);
+use Data::Dumper;
+#Let Storable serialize and re-evaluate code references stored in the index.
+#NOTE(review): with Eval enabled, retrieve() runs code contained in the file,
+#so only trusted index files should be passed in
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+my $atree;
+#NOTE(review): $atree stays undefined when the file does not exist, in which
+#case the intersect call below fails
+if(-e $ARGV[0]){
+ $atree = retrieve($ARGV[0]);
+my @results = $atree->intersect($ARGV[1],$ARGV[2],$ARGV[3]);
+foreach my $r (@results){
+ print "INTERSECT RESULT ",join(' ',@$r),"\n";
diff --git a/mapping/reportvariants.pl b/mapping/reportvariants.pl
new file mode 100755
index 0000000..4d662b9
--- /dev/null
+++ b/mapping/reportvariants.pl
@@ -0,0 +1,118 @@
+#./reportvariants.pl index fasta
+#Remaining arguments (read after the two above are shifted off): the reference
+#sequence name, followed by the names of sequences for which pairwise variant
+#records are written.
+#Output: a text view of each reported variant column on STDOUT, plus the files
+#<pid>.pwvariants.out and <pid>.snpvariants.out in the current directory.
+use strict;
+use Bio::Perl;
+use Bio::DB::Fasta;
+use Bio::Seq;
+use lib '/usr/local/projects/angiuoli/developer/sangiuoli/mugsy/trunk/mapping/';
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use AlignmentTree;
+my %options;
+#NOTE(review): pod2usage is called below but Pod::Usage is not loaded in the
+#lines shown, and 'help' is not among the option specs, so $options{'help'}
+#cannot be set by GetOptions
+my $results = GetOptions (\%options,
+ 'gap_window|g=s',
+ 'display_window|d=s',
+ 'gaps_allowed|a=s') || pod2usage(-verbose => 1);
+pod2usage(-verbose=>1) if($options{'help'});
+my $atree = AlignmentTree::deserialize($ARGV[0]);
+my $db = Bio::DB::Fasta->new($ARGV[1],'-reindex'=>1);
+#Maximum number of gap characters (summed over all rows) tolerated in the
+#window around a variant column; default 0
+my $gapthreshold=0;
+if(exists $options{'gaps_allowed'}){
+ $gapthreshold = $options{'gaps_allowed'};
+#Number of columns on each side of a variant inspected for gaps; default 5
+my $gap_window=5;
+if(exists $options{'gap_window'}){
+ $gap_window = $options{'gap_window'};
+#Number of columns on each side of a variant that are printed; default 5
+my $display_window=5;
+if(exists $options{'display_window'}){
+ $display_window = $options{'display_window'};
+shift @ARGV;
+shift @ARGV;
+#Set of sequence names for which SFILE/VFILE records are written
+my $pwseqs = {};
+my $refname = shift @ARGV;
+foreach my $seq (@ARGV){
+ $pwseqs->{$seq}++;
+open VFILE,"+>$$.pwvariants.out" or die "Can't open file pwvariants.out";
+open SFILE,"+>$$.snpvariants.out" or die "Can't open file snpvariants.out";
+foreach my $alnname (sort {$a cmp $b} keys %{$atree->{_alignments}}){
+ my($alnobj,$aln_bv,$align_width) = @{$atree->{_alignments}->{$alnname}};
+ my ($mmatrix,$seqmatrix,$names) = $atree->getAlignmentMatrix($alnname,1,$align_width,$db);
+ #Only alignments with at least two rows can contain variants
+ if(@$seqmatrix > 1){
+ #print STDERR "Checking alignment $alnname $align_width ",scalar(@$seqmatrix),"\n";
+ my $ngaps;
+ my $nmismatches;
+ #$variants: column => number of rows differing from the reference
+ #$seqvariants: row index => column => count
+ my $variants = {};
+ my $seqvariants = {};
+ #Row index of the reference sequence in this alignment.
+ #NOTE(review): stays undef when $refname is not a member of the alignment;
+ #the array subscript below then evaluates it as 0
+ my $refidx;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ if($names->[$i] eq $refname){
+ $refidx=$i;
+ }
+ }
+#Matrix cols start at 0
+ for(my $j=0;$j<$align_width;$j++){
+ my $b;
+ my $refbp = lc(substr($seqmatrix->[$refidx],$j,1));
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ #NOTE(review): string comparison (ne) is used on numeric row indexes
+ if($i ne $refidx){
+ my $currbp = lc(substr($seqmatrix->[$i],$j,1));
+ #A column counts as variant when the base differs from the reference
+ #and is not one of the letters y,s,k,r,m,w,n (IUPAC ambiguity codes;
+ #'w' is listed twice in the class). A gap '-' also counts as different.
+ if($currbp ne $refbp && $currbp !~ /[yskrmwnw]/){
+ $variants->{$j}++;
+ $seqvariants->{$i}->{$j}++;
+ }
+ }
+ #print "$b=$currbp " if($b ne '-' && $currbp ne '-');
+ }
+ }
+ #print STDERR "variants ",scalar(keys %$variants),"\n";
+ foreach my $col (sort {$a <=> $b} keys %$variants){
+ #Count '-' characters in the gap window over all rows (tr returns the
+ #number of characters matched)
+ my $gaps=0;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ my $start = $col - $gap_window;
+ $start = 0 if($start < 0);
+ my $end = $col + $gap_window;
+ $end = $align_width if($end > $align_width);
+ $gaps+= (substr($seqmatrix->[$i],$start,$end-$start+1) =~ tr/\-/\-/);
+ }
+ #Report the column only when its neighbourhood is sufficiently gap free
+ if($gaps<=$gapthreshold){
+ my $refc;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ my $start = $col - $display_window;
+ $start = 0 if($start < 0);
+ my $end = $col + $display_window;
+ $end = $align_width if($end > $align_width);
+ my($alni) = $atree->getAlignedInterval($alnname,$names->[$i]);
+ my $colstart = 1+$start;
+ my $colend = $colstart;
+ #Columns are passed 1-based ($col is 0-based)
+ my($startc,$endc) = AlignmentTree::columntocoords($alni,$col+1,$col+1);
+ $refc = $startc if($names->[$i] eq "$refname");
+ #AlignmentTree::printAlignmentDebug($alnobj);
+ printf("%10s %s\tcoords:%d-%d\n",$names->[$i],lc(substr($seqmatrix->[$i],$start,$end-$start+1)),$startc,$endc);
+#, substr($seqmatrix->[$i],$start,$end-$start),"\n";
+ #File records are written only when the reference is the first row of
+ #the alignment and the current row is a requested sequence that differs
+ #from the reference at this column
+ if($names->[0] eq "$refname" && exists $pwseqs->{$names->[$i]} && $seqvariants->{$i}->{$col}){
+ print SFILE "$names->[$i]\t$refname\t$refc\t",$refc+1,"\t",uc(substr($seqmatrix->[0],$col,1)),"\n";
+ print VFILE "$names->[$i]\t$refc\t",$refc+1,"\t",substr($seqmatrix->[0],$col,1),"/",substr($seqmatrix->[$i],$col,1),"\t$names->[$i]\t$startc-$endc\n";
+ }
+ }
+ #NOTE(review): the format has a %10s conversion but no argument is supplied
+ printf("%10s ^ \n");
+ print "\n";
+ }
+ }
+ }
+close VFILE;
+close SFILE;
diff --git a/mapping/testitree.pl b/mapping/testitree.pl
new file mode 100755
index 0000000..58e2239
--- /dev/null
+++ b/mapping/testitree.pl
@@ -0,0 +1,327 @@
+#Ad-hoc test driver for IntervalTree and AlignmentTree: builds small trees
+#from hard-coded data and prints the query results for manual inspection
+use strict;
+use IntervalTree;
+use AlignmentTree;
+use Data::Dumper qw(Dumper);
+#remove only using for revcom
+use Bio::Perl;
+use Bio::DB::Fasta;
+use Bio::Seq;
+use Bio::Tools::CodonTable;
+#Assumptions fmin<fmax,colstart<colend
+#Genome coordinate system is 0 start, interbase coordinates. Feature length = fmax-fmin
+#Alignment coordinate system is 1 start counting bases. Feature length is fmax-fmin+1.
+#Test cases
+#Sequence 1 ...AATTGGCCAA...
+#Sequence 2 ...AATTGGCCAA...
+#Sequence 3 ...AATTGGCCAA...
+#Alignment 1 S1,S2,S3 +,+,+
+#Alignment 2 +,-,+
+#Alignment 3 -,-,+
+#Test feature1 orient='+' fmin=102 fmax=107 'TTGGC'
+#Test feature2 orient='-' fmin=103 fmax=108 'GCCAA'
+#+ Alignment, + annotation end5<end3 colorient '+' fmin -> coords increasing -> fmax
+#Eg. feature1
+#100 1 AATTGGCCAA 10 110
+#100 1 AATTGGCCAA 10 110
+#col 123456789
+#query:fmin=102,fmax=107 strand +
+#- Alignment, - annotation end3<end5 colorient '+' fmax -> coords decreasing -> fmin
+#Eg. feature2
+#110 1 TTGGCCAATT 10 100
+#110 1 TTGGCCAATT 10 100
+#col 123456789
+#query:fmin=102,fmax=107 strand -
+#+ Alignment, - annotation end3<end5 colorient '-' fmin -> coords increasing -> fmax. revcom matching interval
+#Eg. feature2
+#100 1 AATTGGCCAA 10 110
+#100 1 AATTGGCCAA 10 110
+# AACCG - reversed 107-102
+#col 123456789
+#query:fmin=102,fmax=107 strand -
+#- Alignment, + annotation end5<end3 colorient '-' fmax -> coords decreasing -> fmin. revcom matching interval
+#120 1 TTGGCCAATT 10 100
+#110 1 TTGGCCAATT 10 100
+# CGGTT - reversed 107-102
+#col 123456789
+#query:fmin=102,fmax=107 strand +
+#Each alignment is a list of rows [seqname,start,end,orient,cigar,label];
+#the cigar strings use M for aligned runs and X for gap runs
+my @alignments = ([
+ ['genome1',10,1000,'+','900M100X','g1'],
+ ['genome2',100,900,'+','100X800M100X','g2'],
+ ['genome3',350,1350,'+','1000M','g3'],
+ ],
+ [
+ ['genome1',20,2000,'+','1820M180X','g1'],
+ ['genome2',200,900,'+','180X700M1120X','g2'],
+ ['genome3',450,2350,'+','100X1900M','g3'],
+ ['genome4',450,2350,'+','100X1900M','g4']
+ ]
+ );
+#Queries against the alignment tree: [seqname,start,end]
+my @alignqueries = (["genome1",1010,1020],
+ ["genome2",500,720]
+ );
+#Rows passed as-is to IntervalTree::insert - presumably [start,end,id,orient];
+#TODO confirm against IntervalTree.pm
+my @intervals = ([10,1000,1,'+'],
+ [100,900,2,'+'],
+ [350,10000,3,'+']);
+#Queries against the interval tree: [start,end]
+my @intqueries = ([1010,1020],
+ [500,720]
+ );
+my @filter = ('g1','g4');
+#Test intervaltree
+my $tree = new IntervalTree(1,1000000);
+foreach my $i (@intervals){
+ $tree->insert(@$i);
+#for(my $i=10000;$i>=0;$i--){
+# print "$i\n" if($i%1000==0);
+# $tree->insert($i,$i+1,$i);
+ #print Dumper($tree),"\n";
+#for(my $i=0;$i<10000;$i++){
+# print "$i\n" if($i%1000==0);
+# $tree->insert($i,$i+1,$i);
+foreach my $q (@intqueries){
+ print "QUERY ",join(' ',@$q),"\n";
+ my @results = $tree->intersect(@$q);
+ foreach my $r (@results){
+ print "RESULT $r\n";
+ }
+#Test alignment tree
+my $atree = new AlignmentTree();
+my $k=0;
+#Alignments are registered under the names MAUVE0, MAUVE1, ... with type MAUVE
+foreach my $a (@alignments){
+ $atree->insert($a,"MAUVE$k","MAUVE");
+ $k++;
+print "Alignmenttree intersect queries\n";
+foreach my $q (@alignqueries){
+ print "QUERY ",join(' ',@$q),"\n";
+ my @results = $atree->intersect(@$q);
+ foreach my $r (@results){
+ print "INTERSECT RESULT ",join(' ',@$r),"\n";
+ }
+ print "DONE\n";
+print "Alignmenttree map()\n";
+foreach my $q (@alignqueries){
+ print "QUERY ",join(' ',@$q),"\n";
+ my @results = $atree->map(@$q);
+ foreach my $r (@results){
+ print "MAP RESULT ",join(' ',@$r),"\n";
+ }
+print "DONE\n";
+#Announce the filter labels and repeat the intersect queries.
+#NOTE(review): @filter is only printed here; nothing visible in this script
+#hands it to $atree, so confirm whether a filter call is missing.
+print "Adding filter ",join(',',@filter),"\n";
+foreach my $q (@alignqueries){
+ print "QUERY ",join(' ',@$q),"\n";
+ my @results = $atree->intersect(@$q);
+ foreach my $r (@results){
+ print "INTERSECT RESULT ",join(' ',@$r),"\n";
+ }
+}
+#TEST 1 +,+ alignment mapped features on opposing strands
+#Test alignment tree
+#A scratch FASTA file is written to /tmp (named with the PID) and indexed
+#with Bio::DB::Fasta so sequences can be fetched by id below
+open FILE,">/tmp/$$.testing" or die "Can't open file /tmp/$$.testing";
+print FILE <<_FASTAEND;
+ ;
+close FILE;
+my $db = Bio::DB::Fasta->new("/tmp/$$.testing",'-reindex'=>1);
+my $atree = new AlignmentTree();
+#0 1 2 3 4 5 6 7 8 9 10
+# A A T T G G C C A A
+my @alignments2 = ([
+ ['genome1',100,110,'+','10M','g1'], #AATTGGCCAA
+ ['genome2',100,110,'+','10M','g2'] #AAATTGGCCA
+ ],
+ [
+ ['genome1',100,110,'+','5M1X5M','g1'], #AATTGGCCAA
+ ['genome2',100,110,'+','5M1X5M','g2'] #AAATTGGCCA
+ ]);
+my @features = ([['genome1',102,107,'+','5M']], #ATTGG
+ [['genome2',102,107,'-','5M']] #CCAAT
+ );
+my $expectedfeats = ['TTGGC','GCCAA','TTGGC','GCCAA'];
+my $k=0;
+my $alnidx=0;
+#Check that every aligned row can be fetched from the FASTA index, then
+#register the alignment.
+#NOTE(review): $expectedalns is compared against below but is not declared
+#in the lines visible here
+foreach my $a (@alignments2){
+ foreach my $f (@$a){
+ #convert from 0 base to 1 base
+ my $queryseq = $db->get_Seq_by_id($f->[0]);
+ my $queryseqsubstr = $queryseq->subseq($f->[1]+1,$f->[2]);
+ if($f->[3] eq '-'){
+ $queryseqsubstr = revcom($queryseqsubstr)->seq();
+ }
+ else{
+ $queryseqsubstr = $queryseq->subseq($f->[1]+1,$f->[2]);
+ }
+ if($queryseqsubstr =~ /N/){
+ die "ERROR unexpected alignment sequence $k found $queryseqsubstr =~ /N/\n";
+ }
+ if($queryseqsubstr ne $expectedalns->[$k]){
+ die "ERROR unexpected alignment sequence $k found $queryseqsubstr ne $expectedalns->[$k]\n";
+ }
+ $k++;
+ }
+ $atree->insert($a,"MAUVE$alnidx","MAUVE");
+ $alnidx++;
+#Register each feature as type 'gene' and verify its sequence (reverse
+#complemented for '-' strand features) against $expectedfeats
+foreach my $f (@features){
+ $atree->insert($f,'gene:'.$k,'gene');
+ #convert from 0 base to 1 base
+ my $queryseq = $db->get_Seq_by_id($f->[0]->[0]);
+ my $queryseqsubstr = $queryseq->subseq($f->[0]->[1]+1,$f->[0]->[2]);
+ if($f->[0]->[3] eq '-'){
+ print "REVCOM $queryseqsubstr\n";
+ $queryseqsubstr = revcom($queryseqsubstr)->seq();
+ }
+ if($queryseqsubstr =~ /N/){
+ die "ERROR unexpected sequence found $queryseqsubstr =~ /N/\n";
+ }
+ if($queryseqsubstr ne $expectedfeats->[$k]){
+ die "ERROR unexpected sequence found $queryseqsubstr ne $expectedfeats->[$k]\n";
+ }
+ print "QUERYSEQ $f->[0]->[1]-$f->[0]->[2] $f->[0]->[3] ",$queryseqsubstr,"\n";
+ $k++;
+print "INTERSECT TEST1\n";
+#Each feature interval must intersect at most one 'gene' entry. The result
+#row is read as: [1] sequence id, [2] start, [3] end, [6] orientation
+foreach my $f (@features){
+ my @results = $atree->intersect($f->[0]->[0],$f->[0]->[1],$f->[0]->[2],'gene');
+ die "More results than expected" if(scalar(@results)>1);
+ my $r = $results[0];
+ print "genome1 INTERSECT RESULT ",join(' ',@$r),"\n";
+ my $queryseq = $db->get_Seq_by_id($r->[1]);
+ my $queryseqsubstr = $queryseq->subseq($r->[2]+1,$r->[3]);
+ if($r->[6] eq '-'){
+ print "REVCOM $queryseqsubstr\n";
+ $queryseqsubstr = revcom($queryseqsubstr)->seq();
+ }
+ if($queryseqsubstr =~ /N/){
+ die "ERROR unexpected sequence found $queryseqsubstr =~ /N/ $r->[2]+1,$r->[3]\n";
+ }
+ if($queryseqsubstr ne $expectedfeats->[$k]){
+ die "ERROR unexpected sequence found $queryseqsubstr ne $expectedfeats->[$k] $r->[2]+1,$r->[3]\n";
+ }
+ $k++;
+ my @results1 = $atree->map($f->[0]->[0],$f->[0]->[1],$f->[0]->[2],'MAUVE');
+ foreach my $r (@results1){
+ print "MAP RESULT ",join(' ',@$r),"\n";
+ }
+my $alnidx=0;
+#For all alignments
+foreach my $a (@alignments2){
+ my($alnobj,$bv,$width) = $atree->getAlignment("MAUVE$alnidx");
+ my ($mmatrix,$seqmatrix,$names) = $atree->getAlignmentMatrix("MAUVE$alnidx",1,$width,$db);
+ #See if we can map features into alignment matrix
+ for(my $i=0;$i<@features;$i++){
+ my $f = $features[$i];
+ print "MATRIX MAUVE$alnidx ",join(',',@{$f->[0]}),"\n";
+ my $flen = $f->[0]->[2]-$f->[0]->[1];
+ die "Bad sequence $seqmatrix->[$i]" if(length ($seqmatrix->[$i])<1);
+ #Returned seq length == input length + 1
+ my($cs,$ce) = AlignmentTree::coordstocolumn($alnobj,$f->[0]->[0],$f->[0]->[1],$f->[0]->[2]);
+ #Columns are 1-based, substr offsets 0-based; gap characters are removed
+ #before the comparison
+ my $queryseqsubstr = substr($seqmatrix->[$i],$cs-1,$ce-$cs+1);
+ $queryseqsubstr =~ s/\-//g;
+ if($features[$i]->[0]->[3] eq '-'){
+ print "REVCOM $queryseqsubstr\n";
+ $queryseqsubstr = revcom($queryseqsubstr)->seq();
+ }
+ if($queryseqsubstr =~ /N/){
+ die "ERROR unexpected sequence found $queryseqsubstr =~ /N/\n";
+ }
+ if($queryseqsubstr ne $expectedfeats->[$k]){
+ die "ERROR unexpected sequence $k found $queryseqsubstr ne $expectedfeats->[$k]\n";
+ }
+ else{
+ print "Sequence $k $queryseqsubstr eq $expectedfeats->[$k] OK\n";
+ }
+ $k++;
+ }
+ $atree->printAlignment("MAUVE$alnidx",1,$width,$db);
+ $alnidx++;
+#NOTE(review): $alnidx is not reset before this loop in the lines shown, so it
+#continues from the value left by the loop above - confirm this is intended
+foreach my $a (@alignments2){
+ my($alnobj,$bv,$width) = $atree->getAlignment("MAUVE$alnidx");
+ foreach my $f (@features){
+ my @results1 = $atree->map($f->[0]->[0],$f->[0]->[1],$f->[0]->[2],'MAUVE');
+ foreach my $r (@results1){
+ print "MAP RESULT ",join(' ',@$r),"\n";
+ }
+ $atree->printAlignment("MAUVE$alnidx",1,$width,$db,\@results1);
+ }
+ $alnidx++;
diff --git a/mapping/xmfaindex.pl b/mapping/xmfaindex.pl
new file mode 100755
index 0000000..beab1f4
--- /dev/null
+++ b/mapping/xmfaindex.pl
@@ -0,0 +1,145 @@
+#./xmfaindex.pl index.file [seqnames.file] < alignment.xmfa
+#Adds an XMFA formatted alignment (read from STDIN) to a MUGSY formatted index
+#Each alignment is saved as type 'alignment'
+use strict;
+use lib '/usr/local/projects/angiuoli/developer/sangiuoli/mugsy/trunk';
+use AlignmentTree;
+use Storable qw(store retrieve);
+use Data::Dumper;
+#Allow Storable to serialize and re-evaluate code references
+$Storable::Deparse = 1;
+$Storable::Eval = 1;
+#Start from an empty tree; an existing index file is loaded and extended
+my $atree = new AlignmentTree();
+if(-e $ARGV[0]){
+ $atree = AlignmentTree::deserialize($ARGV[0]);
+#Optional second argument: a file with one sequence name per line (a leading
+#'>' is ignored). Line N supplies the name for numeric sequence id N in the
+#XMFA headers.
+my $index=0;
+my $seqlookup = {};
+if(-e $ARGV[1]){
+ open FILE,"$ARGV[1]" or die "Can't open file $ARGV[1]";
+ while(my $line=<FILE>){
+ my($seq) = ($line =~ /\>?(\S+)/);
+ $seqlookup->{++$index} = $seq;
+ }
+ close FILE;
+#Parser state for the sequence entry currently being read
+my $currscore;
+my $block = [];
+my $k=0;
+my $label=0;
+my $seqname;
+my $start;
+my $end;
+my $orient;
+my @seqinfo;
+#Parse the XMFA stream on STDIN.
+# "> id:start-end strand file" starts a new sequence entry,
+# "="                          ends the current alignment block,
+# any other line               is aligned sequence text for the current entry.
+#An entry is added to the block when the next header or the block terminator
+#is seen. Entries with start==0 are never added.
+#Blocks are inserted under the names MAUVE_0, MAUVE_1, ... with type 'alignment'.
+while(my $line=<STDIN>){
+ if($line =~ /^=/){
+ if(defined $seqname && $start>0){
+ my ($cigar,$len) = &get_cigar(join('',@seqinfo));
+ #The number of non-gap characters must equal the length of the interval
+ die "Bad match length $len in cigar $cigar" if ($end-$start+1 != $len);
+ #Convert alignment to zero start, interbase coordinates
+ push @$block,[$seqname,$start-1,$end,$orient,$cigar];
+ print "Adding aligned sequence $seqname $start-1,$end,$orient to alignment MAUVE_$label\n";
+ }
+ $atree->insert($block,"MAUVE_$label","alignment") if(scalar(@$block));
+ #The label advances for every '=' line, even when the block was empty
+ $label++;
+ $block=[];
+ $seqname=undef;
+ @seqinfo=();
+ }
+ elsif($line =~ /^>\s+(\d+)\:(\d+)-(\d+)\s+([\+\-])\s+(\S+)/){
+ #Flush the previous entry of this block before starting the next one
+ if(defined $seqname && $start>0){
+ my ($cigar,$len) = &get_cigar(join('',@seqinfo));
+ die "Bad match length $len in cigar $cigar" if ($end-$start+1 != $len);
+ push @$block,[$seqname,$start-1,$end,$orient,$cigar];
+ print "Adding aligned sequence $seqname $start-1,$end,$orient to alignment MAUVE_$label\n";
+ }
+ my $seqid = $1;
+ $start = $2;
+ if($start>0){
+ $end = $3;
+ #XMFA format start always < end
+ die "Invalid coordinates $start-$end" if($start>$end);
+ #Relative orientation of the alignment
+ $orient = $4;
+ my $file = $5;
+ #Prefer the name from the lookup file; fall back to the file name
+ $seqname = $file;
+ if(exists $seqlookup->{$seqid}){
+ $seqname = $seqlookup->{$seqid};
+ }
+ else{
+ #Hack for strep pneumo xmfa files
+ $seqname =~ s/\.fsa//g;
+ }
+ }
+ @seqinfo=();
+ }
+ else{
+ if(defined $seqname){
+ chomp $line;
+ push @seqinfo,$line;
+ }
+ }
+}
+#Insert a trailing block that was not terminated by '='
+$atree->insert($block,"MAUVE_$label","alignment") if(scalar(@$block));
+print STDERR "Writing index to $ARGV[0]\n";
+#Run-length encodes one aligned sequence row.
+#Input:  $seqs - the row as a single string; '-' is a gap, every other
+#        character counts as an aligned position.
+#Returns a two element list:
+#  cigar string - runs written as <n>M (non-gap) or <n>X (gap), in order
+#  length       - total number of non-gap characters
+#An empty row yields ('',0).
+sub get_cigar{
+ my($seqs) = @_;
+ my $cig = '';
+ my $len = 0;
+ #Each match is one maximal run of gaps or one maximal run of non-gaps
+ while($seqs =~ /(-+|[^-]+)/g){
+  my $run = $1;
+  my $count = length($run);
+  if(substr($run,0,1) eq '-'){
+   $cig .= $count."X";
+  }
+  else{
+   $cig .= $count."M";
+   $len += $count;
+  }
+ }
+ return ($cig,$len);
+}
diff --git a/mugsy b/mugsy
new file mode 100755
index 0000000..b6dfd8c
--- /dev/null
+++ b/mugsy
@@ -0,0 +1,1013 @@
+if(! -d $ENV{'MUGSY_INSTALL'}){
+ my $default_install = "/usr/local/projects/angiuoli/mugsy_trunk/";
+ if( -d $default_install){
+ $ENV{'MUGSY_INSTALL'} = $default_install;
+ print STDERR "MUGSY_INSTALL environment variable not set. Using $default_install\n";
+ }
+ else{
+ print STDERR "ERROR: MUGSY_INSTALL environment variable not set. Set using export MUGSY_INSTALL=/somepath/to/mugsy\n";
+ }
+=head1 NAME
+mugsy - a multiple whole genome aligner
+=head1 USAGE
+mugsy [-p output prefix] multifasta_genome1.fsa multifasta_genome2.fsa ... multifasta_genomeN.fsa
+=head1 SYNOPSIS
+Mugsy is a multiple whole genome aligner. Mugsy uses Nucmer for pairwise
+alignment, a custom graph based segmentation procedure for identifying
+LCBs (synchain-mugsy), and a segment-based progressive multiple
+alignment strategy from Seqan::TCoffee. Mugsy accepts draft genomes in
+the form of multi-FASTA files. Mugsy does not require a reference
+genome and is robust in the presence of large scale genome flux and
+genome rearrangements. Mugsy performs best on closely related genomes
+and has been used to align several dozen bacterial genomes.
+Mugsy outputs a series of alignments in MAF format.
+See http://mugsy.sf.net for more information
+=head1 INPUT
+Input is one or more (multi)FASTA files, one per genome. Each file
+should contain all the sequences for a single organism/species. The
+filename is used as the genome name.
+Limitations on FASTA input:
+ input FASTA headers must not contain ':' or '-'
+ ambiguity characters are converted to N in output
+Common options:
+ -p|prefix prefix for output files
+ --directory directory used to store output and temporary
+ files. Must be an absolute path
+ -d|--distance maximum distance along a single sequence (bp) for
+ chaining anchors into locally colinear blocks (LCBs). This is
+ used by the segmentation step synchain-mugsy. Default is 1000bp.
+ -c|--minlength minimum span of an aligned region in a colinear
+ block (bp). This is used by the segmentation step
+ synchain-mugsy. Default is 30bp.
+ -duplications 1 - Detect and report duplications. 0 - Skip. Default is 0.
+Other options:
+ -nucmeropts options passed through to the Nucmer
+ package. Eg. -nucmeropts "-l 15" sets the minimum MUM length in
+ NUCmer to 15. See the Nucmer documentation at
+ http://mummer.sf.net for more information. Default is -l 15.
+ -allownestedlcbs. Default=false. Places each multi-genome anchor
+ in exactly one LCB; the longest spanning LCB
+ -plot output genome dot plots in GNUplot format. Overlays LCBS
+ onto pairwise plots from mummerplot. Display of draft genomes in
+ these plots is not supported.
+ -fullsearch Run a complete all pairs Nucmer search with each
+ sequence as a reference and query (n*(n-1) total searches). Default
+ is one direction only (n*(n-1)/2 searches).
+ -refine run a second iteration of Mugsy on each LCB to refine the
+ alignment using either Mugsy (--refine mugsy), FSA (--refine
+ fsa), Pecan (--refine pecan), or MLAGAN (--refine mlagan). Requires
+ that the necessary tools are in your path:
+ fsa: fsa
+ pecan: muscle,exonerate, in the path. classpath set for bp.pecan.Pecan.
+ mlagan: mlagan.sh
+ -debug debug level. > 2 verbose
+=head1 OUTPUT
+Primary output is MAF format.
+Utilities for parsing MAF are available at the UCSC genome browser and
+in the multiz,TBA toolkit. GMAJ is a popular visualization tool for MAF.
+=head1 MORE INFO
+This script is a wrapper that invokes an all-against-all Nucmer search
+and the mugsy aligner. The two primary components of the aligner
+can also be run independently
+1) mugsyWGA
+Generates a whole genome alignment (WGA) from a library of pairwise
+alignments in XMFA format. Implemented with the refined segment graph
+and progressive consistency-based alignment procedure described in
+Seqan::TCoffee (Rausch et al 2008). Invokes synchain-mugsy to segment
+the input genomes into alignable regions.
+2) synchain-mugsy
+Derives a segmentation of genome anchors that fulfill --distance and
+--minlength criteria. Anchors can be any oriented features that span
+two or more of the input genomes. The output is a set of locally
+colinear blocks (LCBs)
+=head1 Using Mugsy with other aligners
+Mugsy supports realignment of LCBs using FSA, Pecan, or MLAGAN. For FSA, make sure FSA is in your PATH and run with --refine fsa
+=head1 For more information
+Sam Angiuoli
+angiuoli at cs.umd.edu
+use strict;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+use Pod::Usage;
+use POSIX;
+#Only needed for TBA evaluation
+# require TreeParse;
+#if (! $@){
+# TreeParse->import();
+my %options;
+my $results = GetOptions (\%options,
+ 'prefix|p=s',
+ 'directory=s',
+ 'distance|d=s',
+ 'minlength|c=s',
+ 'fullsearch',
+ 'tree|t=s',
+ 'treefile|f=s',
+ 'skipsearch',
+ 'skiprefine',
+ 'allownestedlcbs',
+ 'refine:s',
+ 'colinear',
+ 'skipunique',
+ 'duplications=s',
+ 'keeptmpfiles',
+ 'keepsearchfiles',
+ 'tba|s',
+ 'mugsywga|s',
+ 'nucmeropts|o=s',
+ 'plot',
+ 'nofilter|n',
+ 'translated|s',
+ 'debug=s',
+ 'log=s',
+ 'help|h',
+ 'fasta_file_list=s'
+ ) || pod2usage(-verbose => 3);
+pod2usage(-verbose=>3) if($options{'help'});
+$options{'debug'} = 0 if(!defined $options{'debug'});
+my $mugsyinstall = $ENV{'MUGSY_INSTALL'};
+##Customized version of Nucmer with maf conversion utilities and
+#delta-filter -b for reporting duplications
+my $nucmerinstall = "$ENV{'MUGSY_INSTALL'}/MUMmer3.20";
+#Mugsy aligner
+my $mugsywgacmd = "$mugsyinstall/mugsyWGA";
+#Nucmer package
+my $nucmercmd = "$nucmerinstall/nucmer";
+my $promercmd = "$nucmerinstall/promer";
+my $searchcmd = $options{'translated'} ? $promercmd : $nucmercmd;
+my $deltafiltcmd = "$nucmerinstall/delta-filter";
+my $deltadupscmd = "$mugsyinstall/delta-dups.sh";
+my $mummerplotcmd = "$nucmerinstall/mummerplot";
+my $delta2mafcmd = "$nucmerinstall/delta2maf";
+#Mugsy utils
+my $maf2fastacmd = "$mugsyinstall/maf2fasta.pl";
+my $labelblockscmd = "$mugsyinstall/labelblocks.pl";
+my $fixnamescmd = "$mugsyinstall/fixMAFnames.pl";
+#MAF utils and TBA
+#This wrapper includes support to use TBA with Nucmer for evaluation purposes
+my $muscleinstall = "/usr/local/projects/angiuoli/developer/sangiuoli/muscle/trunk";
+my $multizinstall = "/usr/local/projects/angiuoli/developer/sangiuoli/multiz-tba/trunk";
+if(-d $multizinstall){
+ $ENV{'PATH'} = "$ENV{'PATH'}:$multizinstall/";
+my $tbacmd = "$multizinstall/tba";
+my $singlecovcmd = "$multizinstall/single_cov2";
+my $mafsortcmd = "$multizinstall/maf_sort";
+#Customized version MUSCLE v3.7 or later was required to quickly build guide tree
+my $musclecmd = "$muscleinstall/muscle";
+#characters like . cannot be included in the output prefix -prefix
+#because of assumptions made by some downstream parsers.
+#Check for problem characters in -prefix and report errors
+my $problemchars = "\.\?\-";
+#FASTA headers must not contain
+my $fastaproblemchars = "\:\-";
+if(defined $options{'directory'}){
+ if(! -d "$options{'directory'}"){
+ die "-directory must be a directory";
+ }
+ elsif($options{'directory'} !~ /\/$/){
+ $options{'directory'} .= "/";
+ }
+ $options{'directory'} = "/tmp/";
+my $absprefix = $options{'directory'};
+if(!defined $options{'prefix'}){
+ $absprefix .= "tmp";
+ if($options{'prefix'} =~ /([$problemchars])/){
+ die "Character '$1' found in --prefix=$options{'prefix'}. Please choose another --prefix that excludes '$1'.\n";
+ }
+ $absprefix .= $options{'prefix'};
+my $logfh;
+if(lc($options{'log'}) eq 'stderr'){
+ $logfh=*STDERR;
+ open $logfh,"+>$options{'log'}";
+ open $logfh,"+>$options{'prefix'}.mugsy.log";
+## We need to append the filenames present in the tag
+## so modifying the code so that it would take input
+## fasta files either from the command line or from
+## the tagged dataset.
+## Modified by: Mahesh Vangala
+my @inputseqfiles = ();
+unless($options{'fasta_file_list'}) {
+ @inputseqfiles = @ARGV;
+} else {
+ getFastaFilesPath($options{'fasta_file_list'}, \@inputseqfiles);
+#Attempt to detect and convert genbank files
+for(my $i=0;$i<@inputseqfiles;$i++){
+ if(`head -1 $inputseqfiles[$i]` =~ /^LOCUS\s+/){
+ print STDERR "Attempting to convert $inputseqfiles[$i] to FASTA\n";
+ my $bname = basename($inputseqfiles[$i]);
+ print `bp_seqconvert.pl --from genbank --to fasta < $inputseqfiles[$i] > $options{'directory'}/$bname.fsa`;
+ $inputseqfiles[$i]="$options{'directory'}/$bname.fsa";
+ }
+#Debug trace of the resolved input list; goes to the log handle opened above
+#($logfh) rather than the bareword LOG, which is never opened in this script
+print $logfh "Processing FASTA files ",join(',',@inputseqfiles),"\n" if($options{'debug'});
+pod2usage(-verbose=>3, -message => "Need to specify valid input fasta file") if(! scalar(@inputseqfiles));
+#Skipsearch automatically sets keepsearchfiles so
+#that search output is preserved
+ $options{'keepsearchfiles'}=1;
+#Set defaults for --distance and --minlength
+$options{'distance'} = (defined $options{'distance'}) ? $options{'distance'} : 1000;
+die "--distance must be an integer. Passed $options{'distance'}" if($options{'distance'} =~ /\D/);
+$options{'minlength'} = (defined $options{'minlength'}) ? $options{'minlength'} : 30;
+#Default search options when the caller passed no -nucmeropts
+if(!defined $options{'nucmeropts'}){
+    #Added -l 15 to make defaults comparable with Mauve defaults
+    $options{'nucmeropts'} = "-l 15";
+}
+#The option registered with GetOptions is spelled 'colinear'. Append the
+#-maxmatch flag once instead of re-appending the whole option string.
+if(defined $options{'colinear'}){
+    $options{'nucmeropts'} .= " -maxmatch";
+}
+die "Cannot pass both --refine and --skiprefine" if(exists $options{'refine'} && exists $options{'skiprefine'});
+if(exists $options{'refine'}){
+ $options{'keeptmpfiles'}=1;
+if(!exists $options{'refine'} && !exists $options{'skiprefine'}){
+ $options{'skiprefine'}='true';
+die "Cannot pass both --tba and --mugsywga" if(exists $options{'mugsywga'} && $options{'tba'});
+#Multiple alignment method
+my $method;
+if(exists $options{'tba'}){
+ $method="tba";
+ $method="mugsywga";
+ $method="mugsywga";
+#TODO remove this
+if(defined $options{'plot'}){
+ $options{'keeptmpfiles'}=1;
+#Set default options for reporting duplications
+#if(! exists $options{'duplications'}){
+# $options{'duplications'}=1;
+my $seqfiles = {};
+my $genome2seqslookup = {};
+my $seqlengthlookup = {};
+#my $allfastafiles = {};
+my $cleanregex = '[\-]';
+#Cleanup directory
+foreach my $seqfile (@inputseqfiles){
+ if(! -e $seqfile){
+ die "Invalid input file. Can't find $seqfile\n";
+ }
+ #default species name will be basename of the file
+ #upto the first dot
+ my $fname = basename($seqfile);
+ $fname =~ s/\.[^.]+//g;
+ $fname =~ s/$cleanregex/_/g;
+ my $speciesname = $fname;
+ unlink "$options{'directory'}/$speciesname" if(-e "$options{'directory'}/$speciesname");
+#Aggregate all sequences for a lineage and concatenate together
+foreach my $seqfile (@inputseqfiles){
+ my $fname = basename($seqfile);
+ $fname =~ s/\.[^.]+//g;
+ $fname =~ s/$cleanregex/_/g;
+ my $speciesname = $fname;
+ print STDERR "Parsing sequences for $speciesname ";
+ my $header;
+ my $seqlen=0;
+ my @seqs;
+ open FILE,"$seqfile" or die "Can't open file $seqfile";
+ while(my $line=<FILE>){
+ if($line =~ /^>/){
+ if($seqlen>0){
+ &printFASTA("$options{'directory'}/$speciesname","$speciesname:$header:1:+:$seqlen",\@seqs);
+ $genome2seqslookup->{$speciesname} = [] if (!exists $genome2seqslookup->{$speciesname});
+ my $tbaheader;
+ if($speciesname eq $header){
+ $tbaheader = "$speciesname";
+ }
+ else{
+ $tbaheader = "$speciesname.$header";
+ }
+ push @{$genome2seqslookup->{$speciesname}},["$speciesname:$header:1:+:$seqlen",$tbaheader,$seqlen,"$options{'directory'}/$speciesname"];
+ $seqfiles->{"$options{'directory'}/$speciesname"}++;
+ }
+ $seqlen=0;
+ @seqs=();
+ $header='';
+ chomp $line;
+ if($line =~ /^>([^:]+):([^:]+):/){
+ #multiz,tba formatted headers
+ #species name specified, override filename
+ $speciesname = $1;
+ $header = $2;
+ print $logfh "Parsing FASTA entry header:$header speciesname:$speciesname\n" if($options{'debug'});
+ }
+ elsif($line =~ /gi\|\d+\|\w+\|([^.]+)\|\S+/){
+ #special handling of ncbi formatted headers
+ #just pull accession
+ $header = $1;
+ print $logfh "Parsing FASTA entry header:$header speciesname:$speciesname\n" if($options{'debug'});
+ }
+ elsif($line =~ /^>(\S+)/){
+ #plain ole header
+ $header = $1;
+ $header =~ s/$cleanregex/_/g;
+ print $logfh "Parsing FASTA entry header:$header speciesname:$speciesname\n" if($options{'debug'});
+ }
+ else{
+ die "Can't parse FASTA header for $seqfile";
+ }
+ }
+ else{
+ $line =~ s/\s//g;
+ $seqlen += length($line);
+ push @seqs,$line;
+ }
+ }
+ #
+ if($seqlen){
+ $seqlengthlookup->{$speciesname} = $seqlen;
+ &printFASTA("$options{'directory'}/$speciesname","$speciesname:$header:1:+:$seqlen",\@seqs);
+ $genome2seqslookup->{$speciesname} = [] if (!exists $genome2seqslookup->{$speciesname});
+ my $tbaheader;
+ if($speciesname eq $header){
+ $tbaheader = "$speciesname";
+ }
+ else{
+ $tbaheader = "$speciesname.$header";
+ }
+ die "Cannot file FASTA file $options{'directory'}/$speciesname" if(! -e "$options{'directory'}/$speciesname");
+ push @{$genome2seqslookup->{$speciesname}},
+ ["$speciesname:$header:1:+:$seqlen",$tbaheader,$seqlen,"$options{'directory'}/$speciesname"];
+ $seqfiles->{"$options{'directory'}/$speciesname"}++;
+ }
+ close FILE;
+ print STDERR " num_seqs:",scalar(@{$genome2seqslookup->{$speciesname}}),"\n";
+my @genomenodes;
+my $treestring;
+#This wrapper supports running TBA for evaluation purposes using the same
+#Nucmer input as is passed to Mugsy
+if(exists $options{'tba'}){
+#TODO Muscle is used to build a guide tree from kmer counts.
+#The tree was used to test Nucmer+TBA but is not needed by Mugsy.
+#Removing this tree code will also remove need for muscle
+ if(defined $options{'tree'} || defined $options{'treefile'}){
+ $treestring = $options{'tree'};
+ if(! -e $options{'treefile'}){
+ print $logfh "Writing tree file $absprefix.tree\n" if($options{'debug'});
+ open FILE,"+>$absprefix.tree" or die "Can't open file $absprefix.tree";
+ print FILE "$options{'tree'}\n";
+ close FILE;
+ $options{'treefile'} = "$absprefix.tree";
+ }
+ else{
+ open FILE, "$options{'treefile'}" or die "Can't open treefile $options{'treefile'}";
+ my @treein=<FILE>;
+ close FILE;
+ chomp @treein;
+ #Concatenate the (already chomped) lines of the tree file into one Newick string
+ $treestring = join('',@treein);
+ chomp $treestring;
+ }
+ }
+ else{
+ print $logfh "Estimating phylogenetic tree from sequence using muscle. Shared k-mer distance method and UPGMA\n" if($options{'debug'});
+ print $logfh "Starting tree estimation: ",`date`;
+ my @seqs = keys %$seqfiles;
+ $treestring = &getkmerdisttree(\@seqs,"$absprefix.tree");
+ die "Unable to generate tree using MUSCLE. Check input FASTA files for correctness" if(! -e "$absprefix.tree");
+ unlink "$absprefix.tree" if(! defined $options{'keeptmpfiles'});
+ print $logfh "Ending tree estimation with MUSCLE: ",`date`;
+ }
+ print $logfh "Processing tree $treestring\n" if($options{'debug'});
+ print $logfh "Using guide tree $treestring\n";
+ my ($treeio) = new TreeParse();
+ my ($status) = $treeio->parseNHTree($treestring,1);
+ if($status != 0){
+ die "Failed to parse tree \"$treestring\", expecting Newick format\n";
+ }
+ my $tree = $treeio->getTree();
+ my @genomenodest= $tree->leaves_under($tree);
+ #Returns leaves of tree left->right.
+ foreach my $i (@genomenodest){
+ push @genomenodes,$i->attributes->{'nh_label'};
+ }
+ foreach my $seqfile (sort {$seqlengthlookup->{basename($b)} <=> $seqlengthlookup->{basename($a)}} (keys %$seqfiles)){
+ my $speciesname = basename($seqfile);
+ die "Can't find species $speciesname" if($seqlengthlookup->{$speciesname}<=0);
+ $speciesname =~ s/\.[^.]+//g;
+ $speciesname =~ s/$cleanregex/_/g;
+ push @genomenodes,$speciesname;
+ }
+print $logfh "Processing ",scalar(@genomenodes)," genomes\n" if($options{'debug'});
+# Pairwise alignment steps
+# Generate pairwise alignments
+# using the Nucmer packages
+# Input: FASTA files of input sequences and guide tree
+# Output: Pairwise alignments in MAF format
+my $currdir = `pwd`;
+chomp $currdir;
+print $logfh "Current dir:'$currdir'\n" if($options{'debug'});
+print STDERR scalar(@genomenodes), " genomes\n";
+print STDERR "Starting Nucmer: ",`date`;
+my @maffiles;
+my @dupmaffiles; #maf files of duplicated regions
+chdir($options{'directory'}) or die;
+for(my $i=0;$i<@genomenodes;$i++){
+ my $genomename1 = $genomenodes[$i];#$genomenodes[$i]->attributes->{'nh_label'};
+ print $logfh `date`;
+ unlink "$absprefix.$genomename1.queries.fsa" if(-e "$absprefix.$genomename1.queries.fsa");
+ my @queryfiles;
+ #Searches are performed uni-directional by default
+ my $start = (defined $options{'fullsearch'}) ? 0 : $i+1;
+ #for(my $j=0;$j<@nodes;$j++){
+ for(my $j=$start;$j<@genomenodes;$j++){
+ if($j!=$i){
+ my $genomename2 = $genomenodes[$j];#$genomenodes[$j]->attributes->{'nh_label'};
+ die "Unable to find sequences for genome $genomename2. Check FASTA file names or headers" if(! exists $genome2seqslookup->{$genomename2});
+ #print STDERR "$genome2seqslookup->{$genomename2}->[0]->[3]\n";
+ push @queryfiles,"$options{'directory'}/$genomename2";
+ }
+ }
+ if(@queryfiles>0){
+ #Concatenate every query genome into a single multi-FASTA for this reference
+ my $catcmd = "cat ".join(' ',@queryfiles)." > $absprefix.$genomename1.queries.fsa";
+ print $logfh "CMD:$catcmd\n" if($options{'debug'});
+ print $logfh `$catcmd`;
+ my $deltafile = "$absprefix.$genomename1.filt.delta";
+ my $origdeltafile = "$absprefix.$genomename1.delta";
+ my $clusterfile = "$absprefix.$genomename1.cluster";
+ $deltafile =~ s/\\-/-/g;
+ $clusterfile =~ s/\\-/-/g;
+ #TODO: consider forking child processes here to provide simple parallelization
+ #print STDERR "Looking for existing delta file $deltafile\n";
+ if(($options{'skipsearch'}) && -e "$deltafile"){
+ print STDERR "Using existing delta file $deltafile\n";
+ print $logfh `touch $deltafile`;
+ }
+ else{
+ print STDERR ".";
+ &runsearch("$options{'directory'}/$genomename1","$absprefix.$genomename1.queries.fsa",$genomename1);
+ }
+ push @maffiles,&generateMAF($deltafile,$genomename1);
+ if($options{'duplications'}){
+ my $dupscmd = "$deltadupscmd $origdeltafile > $absprefix.$genomename1.dups.maf";
+ print $logfh "CMD:$dupscmd\n" if($options{'debug'});
+ print $logfh `$dupscmd`;
+ push @dupmaffiles,"$absprefix.$genomename1.dups.maf";
+ }
+ unlink "$absprefix.$genomename1.queries.fsa" if(! defined $options{'keeptmpfiles'});
+ #Keep for --plot
+ if(! defined $options{'plot'} && ! defined $options{'keepsearchfiles'} && ! defined $options{'duplications'}){
+ unlink "$deltafile" if(! defined $options{'keeptmpfiles'});
+ }
+ unlink "$absprefix.$genomename1.delta" if(! defined $options{'keeptmpfiles'});
+ unlink "$clusterfile" if(! defined $options{'keeptmpfiles'});
+ }
+print STDERR "\nFinished Nucmer ",`date`;
+# Progressive alignment steps
+# Produce multiple alignment blocks from pairwise input
+# Input: Set of MAF files for each pairwise comparison
+# Output: Single MAF file containing all alignment blocks
+my $mafoutput;
+my $pwfasta = "$absprefix.xmfa";
+my $allfsafile = "$absprefix.all.fsa";
+if($method eq "mugsywga"){
+ print STDERR "Starting MUGSYWGA: ",`date`;
+ my $pwdupsfasta = "$absprefix.dups.xmfa";
+ print $logfh `rm -f $allfsafile`;
+ foreach my $fsafile (keys %$seqfiles){
+ #Temp fix for headers
+ my $perlcmd = q|perl -ne 'if(/^\>([^\s\:]+)\:([^\s\:]+)/){if($1 ne $2){ print ">$1.$2 $1\n";} elsif(defined $1 && defined $2){print ">$1.$2\n";}else {print ">$1\n";}}else{die if(/\>/);print $_}'|;
+ print $logfh "CMD:cat $fsafile | $perlcmd >> $allfsafile\n";
+ print $logfh `cat $fsafile | $perlcmd >> $allfsafile`;
+ unlink $fsafile if(! defined $options{'keeptmpfiles'});
+ }
+ unlink "$pwfasta";
+ foreach my $maf (@maffiles){
+ my $maf2fasta = "$maf2fastacmd < $maf >> $pwfasta";
+ print $logfh "CMD:$maf2fasta\n" if($options{'debug'});
+ print $logfh `$maf2fasta`;
+ }
+ unlink "$pwdupsfasta";
+ foreach my $maf (@dupmaffiles){
+ my $maf2fasta = "$maf2fastacmd < $maf >> $pwdupsfasta";
+ print $logfh "CMD:$maf2fasta\n" if($options{'debug'});
+ print $logfh `$maf2fasta`;
+ unlink $maf if(! defined $options{'keeptmpfiles'});
+ }
+ }
+ if(scalar(@maffiles)==0 || -z "$pwfasta"){
+ open FILE, "+>$absprefix.maf" or die "Can't open $absprefix.maf";
+ print FILE "##maf version=1 scoring=mugsy\n";
+ print FILE "##eof maf\n";
+ close FILE;
+ $mafoutput="$absprefix.maf";
+ }
+ else{
+ $mafoutput = &runMugsywga($allfsafile,$pwfasta,$pwdupsfasta,$options{'distance'},$options{'minlength'});
+ print STDERR "\nFinished MUGSYWGA: ",`date`;
+ unlink $allfsafile if(! defined $options{'keeptmpfiles'});
+ }
+elsif($method eq "tba"){
+ print STDERR "Starting TBA: ",`date`;
+ #Munge tree format so that TBA is happy
+ #convert , to ' '
+ $treestring =~ s/,/ /g;
+ #remove distances
+ $treestring =~ s/\:-*\d+\.\d+e*-*\d*//g;
+ $treestring =~ s/\:-*\d+//g;
+ $treestring =~ s/\:-*\d+//g;
+ #remove ;
+ $treestring =~ s/\;//g;
+ die "No MAF files" if(scalar(@maffiles)==0);
+ #runTBA expects the MAF file names as one space-separated string
+ $mafoutput = &runTBA(join(' ',@maffiles),$treestring);
+ print STDERR "Finished TBA: ",`date`;
+ print STDERR "Unsupported multiple alignment method\n";
+foreach my $file (@maffiles){
+ if(! defined $options{'keepsearchfiles'}){
+ unlink $file if(! defined $options{'keeptmpfiles'});
+ }
+chdir($currdir) or die;
+if(defined $options{'refine'}){
+ if($method eq "mugsywga"){
+ print `mv $absprefix.maf $absprefix.maf.orig`;
+ print STDERR "Alignment completed. MAF output $absprefix.maf.orig\n";
+ print STDERR "Starting iterative refinement: ",`date`;
+ if($options{'refine'} eq ''){
+ $options{'refine'} = 'true';
+ }
+ my $refinecmd = "$mugsywgacmd --outfile $absprefix --seq $absprefix.all.fsa --aln $absprefix.xmfa --distance $options{'distance'} --minlength $options{'minlength'} --refine $options{'refine'} --infile $absprefix.maf.orig\n";
+ print $logfh "CMD:$refinecmd\n" if($options{'debug'});
+ my $ret = system($refinecmd);
+ print STDERR "\nFinished refinement: ",`date`;
+ #print `mv $absprefix.maf.refined $absprefix.maf`;
+ #Add mult to MAF file for easy parsing
+ #TODO make mult draft genome aware
+ open FILE, "$absprefix.maf.refined" or die "Can't open refined MAF $absprefix.maf.refined";
+ open OUTFILE, "+>$absprefix.maf" or die "Can't open MAF $absprefix.maf";
+ my $mult=0;
+ my @buffer;
+ while(my $line=<FILE>){
+ if($line =~ /^a\s+/){
+ if(scalar(@buffer)>0){
+ if($buffer[0] =~ /^a\s+/){
+ chomp $buffer[0];
+ $buffer[0] .= " mult=$mult\n";
+ }
+ print OUTFILE @buffer;
+ }
+ $mult=0;
+ @buffer=();
+ push @buffer, $line;
+ }
+ else{
+ if($line =~ /^s\s+/){
+ $mult++;
+ }
+ push @buffer,$line;
+ }
+ }
+ if($buffer[0] =~ /^a\s+/){
+ chomp $buffer[0];
+ $buffer[0] .= " mult=$mult\n";
+ }
+ print OUTFILE @buffer;
+ close FILE;
+ }
+if(! defined $options{'directory'}){
+ print `cp $absprefix.maf .`;
+print STDERR "Final output (MAF format): ./$options{'prefix'}.maf\n";
+ print STDERR "Final output (MAF format): $absprefix.maf\n";
+#TODO, create separate singletons and core MAF files
+#open MAFFILE, "$absprefix.maf" or die "Can't open maf file $absprefix.maf";
+#open SFILE, "+>$absprefix.singletons.maf" or die "Can't open file $absprefix.maf";
+#my $printsingle=0;
+#my $printcore=0;
+#while(my $line=<MAFFILE>){
+# if($line =~ /^a/){
+# if($line =~ /mult=(\d+)/){
+# if(defined $1){
+# if(mult==1){
+# $printsingle=1;
+# }
+# }
+# }
+# }
+#close MAFFILE;
+#Print plot
+ for(my $i=0;$i<@genomenodes-1;$i++){
+ my $genomename = $genomenodes[$i];#->attributes->{'nh_label'};
+ my $mugsyoutput = "$absprefix.mugsy.out";
+ my $mugsyoutputtrimmed = "$absprefix.mugsy.out.brkpts";
+ my $varoutput;# = $detectvariants ? "$mugsyoutput.var.list" : "";
+ my $mugsyresultsfile;
+ my $plotcmd = "cat $mafoutput | $mugsyinstall/plot.pl $absprefix $genomename $mugsyresultsfile $varoutput > $absprefix.$genomename.plot.gp";
+ print $logfh "CMD:$plotcmd\n" if($options{'debug'});
+ print `$plotcmd`;
+ print STDERR "Alignment and synteny plot (gnuplot format): $absprefix.$genomename.plot.gp\n";
+ }
+#if(!exists $options{'refine'}){
+ #print $logfh "Iterative refinement realigns each region by running a second iteration of Mugsy and can sometimes improve the alignment. To run a second iteration and produce an output file $absprefix.refined.maf:\n $mugsywgacmd --outfile $absprefix --seq $allfsafile --aln $pwfasta --distance $options{'distance'} --minlength $options{'minlength'} --refine true --infile $absprefix.maf\n";
+print STDERR "Finished ",`date`;
+#Utility functions
+#Produce a guide tree based on kmer distance between the genomes in the
+#given FASTA files, using MUSCLE in cluster-only mode.
+# $seqfiles - array ref of FASTA file paths, one per genome
+# $outfile  - path MUSCLE writes the tree to (removed first if present)
+#Each genome is reduced to a single pseudo-sequence: a ">species" header
+#file plus a sequence file in which every line shorter than 60 characters
+#is padded with N. The temporary header/sequence files are deleted unless
+#--keeptmpfiles is set.
+#Returns the tree as a single-line string ('' if the tree file is empty).
+#Dies if the tree file cannot be opened.
+sub getkmerdisttree{
+    my($seqfiles,$outfile) = @_;
+    my @files;
+    foreach my $seqfile (@$seqfiles){
+        my $speciesname = basename($seqfile);
+        $speciesname =~ s/\.[^.]+//g;
+        print $logfh "Writing $absprefix$speciesname.header\n" if($options{'debug'});
+        print $logfh `echo ">$speciesname" > $absprefix$speciesname.header`;
+        #One-liner run by a child perl: pad short lines with N up to 60 columns
+        my $fillfsacode = '$ns = "";$line = $_;chomp $line; if(length($line)<60){$ns = \'N\' x (60-length($line));} print $line,$ns,"\n"';
+        print $logfh `grep -h -v "^>" $seqfile | perl -ne '$fillfsacode' > $absprefix$speciesname.sequence`;
+        push @files,"$absprefix$speciesname.header";
+        push @files,"$absprefix$speciesname.sequence";
+    }
+    my $filestr = join(' ',@files);
+    unlink $outfile if(-e $outfile);
+    my $mcmd = "cat $filestr | $musclecmd -clusteronly -in - -tree1 $outfile 2>&1";
+    print $logfh "CMD:$mcmd\n" if($options{'debug'});
+    print $logfh `$mcmd`;
+    my $treestring = '';
+    open(my $treefh,'<',$outfile) or die "Can't open treefile $outfile: $!";
+    while(my $line=<$treefh>){
+        #Keep only the leading label and the branch length on lines that
+        #carry extra text between them
+        $line =~ s/(^\S+)\s+.*\:(-*\d+\.\d+)/$1:$2/;
+        $treestring .= $line;
+    }
+    close $treefh;
+    $treestring =~ s/\n//g;
+    foreach my $file (@files){
+        unlink $file if(! defined $options{'keeptmpfiles'});
+    }
+    return $treestring;
+}
+#Run the pairwise search program ($searchcmd: promer when --translated is
+#set, nucmer otherwise) on the sequences in $reffile vs. $queryfile, then
+#filter the resulting delta file.
+# $reffile   - FASTA file used as the reference
+# $queryfile - FASTA file used as the query
+# $prefix    - label appended to $absprefix to name the output files
+#Writes $absprefix.$prefix.delta and $absprefix.$prefix.filt.delta; with
+#--nofilter the second file is a plain copy of the first. Output of the
+#external commands is written to the log. The return value is not used by
+#the caller in this script.
+sub runsearch{
+    my($reffile,$queryfile,$prefix) = @_;
+    my $rawdelta = "$absprefix.$prefix.delta";
+    my $filtdelta = "$absprefix.$prefix.filt.delta";
+    #
+    my $searchrun = "$searchcmd $reffile $queryfile -p $absprefix.$prefix $options{'nucmeropts'} 2>&1";
+    print $logfh "CMD:$searchrun\n" if($options{'debug'});
+    print $logfh `$searchrun`;
+    #
+    #Run delta-filter to chain hits and exclude spurious matches
+    #-1 specifies intersection of LIS chaining of hits wrt ref and query; includes rearrangements but ignores duplications
+    #-m specifies union of LIS chaining of hits wrt ref and query; includes rearrangements and duplications
+    my $deltacmd;
+    if(! defined $options{'nofilter'}){
+        #-1 Filter for one-to-one alignments only, unless --colinear was requested
+        my $chainingopt = $options{'colinear'} ? '-m' : '-1';
+        $deltacmd = "$deltafiltcmd $chainingopt $rawdelta > $filtdelta";
+    }
+    else{
+        $deltacmd = "cp $rawdelta $filtdelta";
+    }
+    print $logfh "CMD:$deltacmd\n" if($options{'debug'});
+    print $logfh `$deltacmd`;
+}
+#Convert a filtered delta file to MAF.
+# $deltafile - path that must exist, used only to verify the search ran
+# $prefix    - genome label; input is $absprefix.$prefix.filt.delta and
+#              output is $absprefix.$prefix.maf
+#With --skipsearch an existing MAF file is reused (touched) instead of
+#being regenerated.
+#Returns the list of MAF files to align: for the "tba" method one
+#"<genome1>.<genome2>.sing.maf" per pair produced by splitmaf.pl, otherwise
+#the single file $absprefix.$prefix.maf.
+#Dies if $deltafile is missing or a split MAF file name cannot be parsed.
+sub generateMAF{
+    my($deltafile,$prefix) = @_;
+    die "Nucmer search failed. Can't find delta file $deltafile" if(! -e $deltafile);
+    my $maf = "$absprefix.$prefix.maf";
+    #Convert delta to MAF using $delta2mafcmd
+    #TODO: Reduce IO bottlenecks in this step. A lot of wasted time here.
+    #1)Write MAF files directly as part of nucmer to limit IO bottlenecks reading and writing files
+    #2)Also merge $fixnamescmd and $mafsortcmd into code that directly dumps MAF
+    #3)Support direct output of pairwise MAF from a multi-way comparison to make TBA happy
+    if($options{'skipsearch'} && -e $maf){
+        print STDERR "Using existing MAF file $maf\n";
+        print $logfh `touch $maf`;
+    }
+    else{
+        my $mafcmd;
+        if($method eq "tba"){
+            $mafcmd = "$delta2mafcmd $absprefix.$prefix.filt.delta | $fixnamescmd | $mafsortcmd /dev/stdin $prefix 1> $maf ";
+        }
+        else{
+            #Sort is not necessary?
+            $mafcmd = "$delta2mafcmd $absprefix.$prefix.filt.delta | $fixnamescmd 1> $maf ";
+        }
+        print $logfh "CMD:$mafcmd\n" if($options{'debug'});
+        print $logfh `$mafcmd`;
+    }
+    my @mafprocessed;
+    #This wrapper supports TBA for evaluation purposes
+    #TBA requires splitting the MAF
+    if($method eq "tba"){
+        #Create species specific MAF files for TBA
+        #Make sure absprefix ends in a '_' This is used for parsing
+        my $splitmafcmd = "cat $maf | $mugsyinstall/splitmaf.pl $absprefix"."_";
+        print $logfh "CMD:$splitmafcmd\n" if($options{'debug'});
+        #splitmaf.pl prints one output file name per line
+        my @splitfiles = `$splitmafcmd`;
+        foreach my $file (@splitfiles){
+            print $logfh "Processing MAF file $file\n";
+            chomp $file;
+            my($genomename1,$genomename2) = ($file =~ /_([^\.\/]+)\.([^\.\/]+)\.maf/);
+            die "Can't parse names from $file" if(!defined $genomename1 || !defined $genomename2);
+            #Note: TBA is picky about the input file names
+            #MUST be of the form "$genomename1.$genomename2.sing.maf"
+            #Can't use $absprefix for now
+            my $singmaf = "$genomename1.$genomename2.sing.maf";
+            #I've only been able to get singlecov to work with alignments
+            #that map one-to-one for some reason. It appears that
+            #singlecov does not handle multiple sequences per
+            #genome and will lead to removal of all regions that
+            #match multiple sequences. Besides, if delta-filter -1 is used then I think
+            #is singlecov redundant?
+            my $singcmd;
+            if(defined $options{'nofilter'}){
+                print STDERR "WARNING:singlecov removes regions that match multiple sequences and may trim aligned regions. Run without -nofilter to keep all best alignments between pairs of sequences\n";
+                $singcmd = "$singlecovcmd $file > $singmaf";
+            }
+            else{
+                $singcmd = "mv $file $singmaf";
+            }
+            #Escape any pipe characters so the shell treats them literally
+            $singcmd =~ s/\|/\\|/g;
+            print $logfh "CMD:$singcmd\n" if($options{'debug'});
+            if($options{'skipsearch'} && -e $singmaf){
+                print $logfh `touch $singmaf`;
+            }
+            else{
+                print $logfh `$singcmd`;
+            }
+            #No trailing space in the stored name: the caller joins the list
+            #with ' ' for TBA and later passes each entry to unlink
+            push @mafprocessed,$singmaf;
+        }
+        if(! defined $options{'keepsearchfiles'}){
+            unlink $maf if(! defined $options{'keeptmpfiles'});
+        }
+    }
+    else{#For mugsywga and others, pass through
+        push @mafprocessed,$maf;
+    }
+    return @mafprocessed;
+}
+# Run the Mugsy whole genome aligner ($mugsywgacmd).
+# $fsafile     - FASTA file passed as --seq
+# $pwfasta     - pairwise alignments passed as --aln
+# $pwdupsfasta - duplication alignments, appended to --aln only when
+#                --duplications is set
+# $distance    - value for --distance
+# $minlength   - value for --minlength
+# stdout/stderr of the aligner go to $absprefix.mugsywga.out and
+# $absprefix.mugsywga.stderr; both are removed on success unless --debug
+# or --keeptmpfiles is set. The alignment inputs are removed unless
+# --keeptmpfiles is set.
+# Returns the path "$absprefix.maf". Dies if the aligner exits non-zero.
+sub runMugsywga{
+    my($fsafile,$pwfasta,$pwdupsfasta,$distance,$minlength) = @_;
+    my $outputfile = "$absprefix.maf";
+    my $keeptmp = defined $options{'keeptmpfiles'};
+    #
+    my $colinearopt = (exists $options{'colinear'}) ? "--refine colinear" : "";
+    my $uniqueopt = (defined $options{'skipunique'}) ? "" : "--unique true";
+    my $dupsopt = ($options{'duplications'}) ? ",$pwdupsfasta --duplications true " : "";
+    my $nestedlcbs = (exists $options{'allownestedlcbs'}) ?"--allownestedlcbs true" : "";
+    my $runmugsywgacmd = "$mugsywgacmd --outfile $absprefix --seq $fsafile --aln $pwfasta$dupsopt --distance $distance --minlength $minlength $colinearopt $uniqueopt $nestedlcbs > $absprefix.mugsywga.out 2> $absprefix.mugsywga.stderr";
+    print $logfh "CMD:$runmugsywgacmd\n";# if($options{'debug'});
+    my $ret = system($runmugsywgacmd);
+    if($ret !=0){
+        die "system $runmugsywgacmd failed: $?:$!";
+    }
+    #Cleanup and return
+    if(!$options{'debug'} && !$keeptmp){
+        unlink "$absprefix.mugsywga.out";
+        unlink "$absprefix.mugsywga.stderr";
+    }
+    unlink "$pwfasta" if(!$keeptmp);
+    unlink "$pwdupsfasta" if(defined $options{'duplications'} && !$keeptmp);
+    return "$outputfile";
+}
+# Run TBA on a set of MAF files using the provided guide tree.
+# $maffiles   - space-separated string of MAF file names
+# $treestring - guide tree, passed to TBA as one quoted argument
+# TBA writes $absprefix.maf; its stdout/stderr go to $absprefix.tba.out
+# and $absprefix.tba.stderr. After TBA finishes the working directory is
+# changed back to $currdir and the MAF is piped through $labelblockscmd.
+# Returns the path of the labelled MAF, "$absprefix.maf.labelled".
+# Dies if TBA exits non-zero or the chdir fails.
+sub runTBA{
+    my($maffiles,$treestring) = @_;
+    my $outputfile = "$absprefix.maf";
+    #
+    #Run TBA straight up
+    my $runtbacmd = "$tbacmd \"$treestring\" $maffiles $outputfile 1> $absprefix.tba.out 2> $absprefix.tba.stderr";
+    print $logfh "CMD:$runtbacmd\n" if($options{'debug'});
+    my $ret = system($runtbacmd);
+    if($ret !=0){
+        die "system $runtbacmd failed: $?:$!";
+    }
+    chdir($currdir) or die;
+    #
+    #Add block labels to MAF output in the form label=# These are
+    #used in post-processing as a unique identifier to keep track of
+    #blocks
+    my $runlabelcmd = "cat $absprefix.maf | $labelblockscmd > $outputfile.labelled";
+    print $logfh "CMD:$runlabelcmd\n" if($options{'debug'});
+    print $logfh `$runlabelcmd`;
+    return "$outputfile.labelled";
+}
+#Append one FASTA record to a file.
+# $fname  - output file; opened in append mode so several records can be
+#           accumulated in the same file
+# $header - header text, written after a leading '>'
+# $seqs   - array ref of sequence lines, each written on its own line
+#Dies if the file cannot be opened or closed.
+#Headers are written unchanged, including any characters listed in
+#$fastaproblemchars (the original check for them was already disabled).
+sub printFASTA{
+    my($fname,$header,$seqs) = @_;
+    print $logfh "Writing file $fname\n" if($options{'debug'});
+    open(my $out,'>>',$fname) or die "Can't open file $fname: $!\n";
+    print {$out} ">$header\n";
+    foreach my $s (@$seqs){
+        print {$out} $s,"\n";
+    }
+    #Buffered write errors only surface at close
+    close($out) or die "Can't close file $fname: $!\n";
+}
+# Read a list of FASTA file paths, one per line, and collect the usable ones.
+# $tag      - path of the list file
+# $refArray - array ref; every listed path that exists and is readable is
+#             appended to it, in file order
+# Leading and trailing whitespace is stripped from each line and blank
+# lines are ignored. Listed paths that are missing or unreadable are
+# skipped with a warning on STDERR.
+# Dies if the list file itself cannot be opened.
+# @author - Mahesh Vangala
+sub getFastaFilesPath {
+    my ($tag,$refArray) = @_;
+    open(my $listfh,'<',$tag) or die "Error in opening the file, $tag, $!\n";
+    while(my $file = <$listfh>) {
+        $file =~ s/^\s+//;
+        $file =~ s/\s+$//;
+        next if($file eq '');
+        if(-e $file && -r $file) {
+            push @$refArray, $file;
+        }
+        else {
+            print STDERR "WARNING: skipping missing or unreadable FASTA file '$file' listed in $tag\n";
+        }
+    }
+    close $listfh;
+}
diff --git a/mugsy-seqan/projects/library/apps/Makefile b/mugsy-seqan/projects/library/apps/Makefile
new file mode 100644
index 0000000..4764fb5
--- /dev/null
+++ b/mugsy-seqan/projects/library/apps/Makefile
@@ -0,0 +1,48 @@
+# Link against runtime library on Linux systems
+OS_NAME=$(shell uname)
+ifeq ($(OS_NAME),Linux)
+ LDFLAGS += -lrt
+#check, if tbb_root is not absolute path (the filter keeps only /* paths)
+ifeq ($(filter /% $(SLASH)%, $(subst :, ,$(tbb_root)) ),)
+ # also changes related variables like work_dir
+ override tbb_root := $(CWD)$(SLASH)..
+ export TBB21_INSTALL_DIR := $(tbb_root)
+# explicitly compile for a 32 or 64 bit platform
+#CXXFLAGS += -m32
+#CXXFLAGS += -m64
+#-march=nocona -mfpmath=sse -msse2
+#CXXFLAGS += -O0 -g
+CXXFLAGS += -pedantic -W -Wall
+# Application binaries handled by this Makefile, each given as <dir>/<binary>.
+# 'all' depends on every entry, and the rm -f line at the end of the file
+# deletes each entry together with its .o file.
+TARGETS = dfi/dfi seqan_tcoffee/seqan_tcoffee seqcons/seqcons razers/paramChooser razers/razers pair_align/pair_align micro_razers/micro_razers tree_recon/tree_recon mugsy/mugsy
+# Per-application aliases. Every alias except 'mugsy' also depends on
+# check_seqan_base.
+mugsy: mugsy/mugsy
+all: check_seqan_base $(TARGETS)
+dfi: check_seqan_base dfi/dfi
+razers: check_seqan_base razers/razers razers/paramChooser
+micro_razers: check_seqan_base micro_razers/micro_razers
+seqan_tcoffee: check_seqan_base seqan_tcoffee/seqan_tcoffee
+seqcons: check_seqan_base seqcons/seqcons
+pair_align: check_seqan_base pair_align/pair_align
+tree_recon: check_seqan_base tree_recon/tree_recon
+ @if [ ! -d "$(SEQAN_BASE)/seqan" ]; then \
+ echo "The directory $(SEQAN_BASE)/seqan could not be found!"; \
+ exit 1; \
+ fi
+ rm -f $(TARGETS) $(TARGETS:=.o)
diff --git a/mugsy-seqan/projects/library/apps/mugsy/mugsy.cpp b/mugsy-seqan/projects/library/apps/mugsy/mugsy.cpp
new file mode 100644
index 0000000..64fe0f7
--- /dev/null
+++ b/mugsy-seqan/projects/library/apps/mugsy/mugsy.cpp
@@ -0,0 +1,6035 @@
+//This code has mixed conventions because it combines different
+//original sources. I've done my best to use the Seqan conventions
+//where I can, but there is quite a jumble between STL and Seqan data
+//#define SEQAN_PROFILE2 //more verbose. SEQAN_PROFILE must also be defined
+//#define SEQAN_TEST
+#define NDEBUG //define this to disable assert statements
+#define TIMING
+#ifdef TIMING
+#include <time.h>
+time_t now;
+time_t lasttime;
+//#define DEBUGGING //SVA custom debugging
+//#define DEBUGGING_GRAPH //SVA custom debugging
+//#define DEBUGGING2 //SVA verbose custom debugging
+//There is some overhead to capturing scoring info
+//Undef to turn off reporting of SP scores
+//#define SCORING
+#include <seqan/basic.h>
+#include <seqan/graph_msa.h>
+#include <seqan/graph_types.h>
+#include <seqan/graph_align.h>
+#include <seqan/modifier.h>
+#include <seqan/refinement.h>
+#include "rna_alphabet.h"
+#include <seqan/modifier.h>
+#include <seqan/misc/misc_cmdparser.h>
+//#include "sangiuoli/mummer/trunk/MUMmer3.20/src/tigr/delta.hh"
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <set>
+#include <queue>
+#include <list>
+#include <bitset>
+#include <algorithm>
+#include <cstdlib>
+#include <errno.h>
+//#include <stdio.h>
+//#include <stdlib.h>
+#include <libgen.h>
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/filtered_graph.hpp>
+#include <boost/graph/graph_utility.hpp>
+#include <boost/graph/connected_components.hpp>
+#include <boost/graph/strong_components.hpp>
+#include <boost/graph/topological_sort.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/graph/dijkstra_shortest_paths.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/graph/properties.hpp>
+#include <boost/graph/edmonds_karp_max_flow.hpp>
+#include <boost/graph/kolmogorov_max_flow.hpp>
+#include <boost/graph/push_relabel_max_flow.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/read_dimacs.hpp>
+#include <boost/graph/graph_utility.hpp>
+SEQAN_PROTIMESTART(__myProfileTime); // Profiling
+//Example transform in multiz-tba/trunk/transformcoords.cpp
+#include "transformcoords.h"
+using namespace seqan;
+using namespace std;
+//Plain record of four unsigned values.
+//NOTE(review): nothing in this part of the file reads or writes these fields,
+//so the per-field notes are inferred from the names only -- confirm against
+//the code that fills them.
+struct s_offset{
+ unsigned int offset; //presumably a start offset -- TODO confirm
+ unsigned int spanlen; //presumably the length of the span -- TODO confirm
+ unsigned int seqlen; //presumably the full sequence length -- TODO confirm
+ unsigned int orient; //presumably an orientation flag -- TODO confirm
+//Record of three ints.
+//NOTE(review): no use is visible in this part of the file; presumably an
+//interval (first,second) tagged with the number of the block it belongs to
+//-- confirm against callers.
+struct iloc{
+ int first;
+ int second;
+ int blocknum;
+//Alignment statistics as filled in and returned by alignmentEvaluationCustom().
+struct s_score{
+ unsigned int numGapEx; //number of gap extensions
+ unsigned int numGap; //number of gap openings
+ unsigned int numPairs; //number of aligned (non-gap vs non-gap) character pairs
+ unsigned int numIdents; //number of aligned pairs whose characters are equal
+ unsigned int alignLen; //number of alignment columns
+ unsigned int totalLen; //sum of the lengths of the input sequences
+ //NOTE(review): assigned from a TScoreValue total that includes gap scores;
+ //a negative total cannot be represented in this unsigned field -- confirm
+ unsigned int alignScore;
+ unsigned int seqCount; //number of sequences in the alignment
+ String<unsigned int> colCount; //copy of numIdentCols, length seqCount+1
+ String<unsigned int> pairCount; //copy of numPairs, indexed index1*alphabetSize+index2
+ //Comparator on fragment strings: s1 orders before s2 when s1 is longer,
+ //so sorting with it gives longest-first order.
+ struct vectorsizecmp {
+ bool operator()( const String<Fragment<> > & s1, const String<Fragment<> > & s2 ) const {
+ return length(s1) > length(s2);
+ }
+ };
+ //Comparator on integer keys that orders them by the value stored for each
+ //key in a map, largest value first. The map is held by pointer and is not
+ //owned; it must outlive the comparator.
+ template <typename TMap>
+ class lcblencmp{
+ public:
+ lcblencmp(TMap & m)
+ :myMap(&m)
+ {}
+ bool operator() ( const int i, const int j) const {
+ assert(myMap != NULL);
+ //Both keys must be present: the result of find() is dereferenced below
+ //without a check against end(); these asserts are the only guard
+ assert(myMap->find(i)!=myMap->end());
+ assert(myMap->find(j)!=myMap->end());
+ return (myMap->find(i)->second > myMap->find(j)->second);
+ }
+ TMap *myMap;
+ };
+ //Comparator on vertex descriptors of one graph: the vertex with the higher
+ //degree orders first. The graph is held by pointer and is not owned.
+ template<typename TGraph>
+ class vertexdegreecmp
+ {
+ public:
+ vertexdegreecmp(TGraph & g)
+ :myGraph(&g)
+ {}
+ template<typename TVertexDescriptor>
+ bool operator()( const TVertexDescriptor v1, const TVertexDescriptor v2 ) const {
+ return degree(*myGraph,v1) > degree(*myGraph,v2);
+ }
+ TGraph * myGraph;
+ };
+ //Comparator on edge descriptors: orders edges by the value stored for them
+ //in *posscores, smallest first. The container is held by pointer and is not
+ //owned.
+ template<typename TPosScores>
+ class edgeposscorecmp
+ {
+ public:
+ edgeposscorecmp(TPosScores & p)
+ :posscores(&p)
+ {}
+ template<typename TEdgeDescriptor>
+ bool operator()( const TEdgeDescriptor &e1, const TEdgeDescriptor &e2 ) const {
+ //Both edges must have an entry: find() is dereferenced without a check
+ //against end()
+ return posscores->find(e1)->second < posscores->find(e2)->second;
+ }
+ TPosScores * posscores;
+ };
+ //Comparator on pair-like values (anything with .first and .second):
+ //ascending by .first, ties broken ascending by .second.
+ template<typename TPos>
+ class poscmp
+ {
+ public:
+ poscmp()
+ {}
+ bool operator()( const TPos &e1, const TPos &e2 ) const {
+ if(e1.first==e2.first){
+ //return false if e2 is interval close
+ /*
+ if(e2.second == false){
+ return 0;
+ }
+ else{
+ return 1;
+ }
+ */
+ //The disabled variant above was replaced by a plain comparison of .second
+ return e1.second < e2.second;
+ }
+ else{
+ return e1.first < e2.first;
+ }
+ }
+ };
+ //Comparator on vertex descriptors of one graph: orders vertices by
+ //fragmentBegin(), smallest first. The sequence a vertex belongs to is not
+ //taken into account. The graph is held by pointer and is not owned.
+ template<typename TGraph>
+ class vertexposcmp
+ {
+ public:
+ vertexposcmp(TGraph & g)
+ :myGraph(&g)
+ {}
+ template<typename TVertexDescriptor>
+ bool operator()( const TVertexDescriptor v1, const TVertexDescriptor v2 ) const {
+ return fragmentBegin(*myGraph,v1) < fragmentBegin(*myGraph,v2);
+ }
+ TGraph * myGraph;
+ };
+ //Comparator on edge descriptors, ascending.
+ //Primary key: abs(cargo(e)), the consistency score.
+ //Secondary key (ties only): the value stored for the edge in *myScoreMap,
+ //the adjacency score. The secondary key is used only when a non-empty score
+ //map was supplied; a default-constructed comparator sorts on cargo alone.
+ template<typename TScoreMap>
+ class edgescorecmp
+ {
+ public:
+ edgescorecmp(TScoreMap * s)
+ :myScoreMap(s)
+ {}
+ edgescorecmp()
+ :myScoreMap(NULL)
+ {}
+ template<typename TEdgeDescriptor>
+ bool operator()( const TEdgeDescriptor e1, const TEdgeDescriptor e2 ) const {
+ if(myScoreMap!=NULL && myScoreMap->size()>0){
+ //When a map is in use both edges must have an entry in it
+ assert(myScoreMap->find(e1)!=myScoreMap->end());
+ assert(myScoreMap->find(e2)!=myScoreMap->end());
+ if(abs(cargo(e1)) == abs(cargo(e2))){
+ //Secondary sort on adjacency score
+ return myScoreMap->find(e1)->second < myScoreMap->find(e2)->second;
+ }
+ else{
+ //Primary sort on consistency score
+ return abs(cargo(e1))<abs(cargo(e2));
+ }
+ }
+ else{
+ //Primary sort on consistency score
+ return abs(cargo(e1))<abs(cargo(e2));
+ }
+ }
+ TScoreMap * myScoreMap;
+ };
+ //Comparator on pointers to blocks: ascending by begCoord.
+ //The sequence id given to the constructor is used only by the asserts,
+ //which check that both blocks lie on that sequence.
+ template<typename TBlock, typename TSize=unsigned int>
+ class blockorder
+ {
+ public:
+ blockorder(TSize s){
+ currentSeq = s;
+ }
+ bool operator()( const TBlock * s1, const TBlock * s2 ) const {
+ assert(s1->currentSeq == currentSeq);
+ assert(s2->currentSeq == currentSeq);
+ return s1->begCoord < s2->begCoord;
+ }
+ TSize currentSeq;
+ };
+ //A run of vertices on one sequence, together with the coordinate range
+ //[begCoord,endCoord) they span, an orientation and a component value.
+ template<typename TComponent = unsigned int, typename TSize = unsigned int,
+ typename TVertexDescriptor = unsigned int, typename TPos = unsigned int>
+ class SVABlock
+ {
+ public:
+ //Default constructor: the scalar members are left uninitialised
+ SVABlock()
+ {}
+ SVABlock(const SVABlock & s)
+ :begCoord(s.begCoord),
+ endCoord(s.endCoord),
+ orient(s.orient),
+ c(s.c),
+ currentSeq(s.currentSeq),
+ currV(s.currV)
+ {}
+ //Builds a block holding the single vertex v.
+ //inc: component value, s: sequence id, b/e: begin/end coordinate, o: orientation
+ SVABlock(TComponent inc, TSize s, TPos b, TPos e, char o, TVertexDescriptor v)
+ :begCoord(b),endCoord(e),orient(o),c(inc),currentSeq(s)
+ {
+ currV.push_back(v);
+ }
+ //Appends vertex v of graph g and widens [begCoord,endCoord) so that it
+ //also covers v's fragment. v must lie on currentSeq (asserted only).
+ template<typename TGraph>
+ void addVertex(TGraph & g, TVertexDescriptor v){
+ assert(sequenceId(g,v)==currentSeq);
+ if(fragmentBegin(g,v)<begCoord){
+ begCoord=fragmentBegin(g,v);
+ }
+ if(fragmentBegin(g,v)+fragmentLength(g,v)>endCoord){
+ endCoord=fragmentBegin(g,v)+fragmentLength(g,v);
+ }
+ currV.push_back(v);
+ }
+ TPos begCoord;
+ TPos endCoord;
+ char orient;//'+' or '-', consider changing to false, true to be consistent with fragment.reversed
+ TComponent c;
+ TSize currentSeq;
+ std::vector<TVertexDescriptor> currV; //member vertices, in insertion order
+ };
+//Reports the memory use of the calling process, read from /proc/self/stat.
+// vm_usage     - out: virtual memory size (the vsize field divided by 1024)
+// resident_set - out: resident set size (the rss page count times the page size in KB)
+//Both outputs are 0.0 when /proc/self/stat cannot be opened or parsed,
+//e.g. on systems without a Linux-style /proc.
+//NOTE(review): fields are split on whitespace, so a process name containing
+//a space shifts every later field -- confirm whether that can occur here.
+void process_mem_usage(double& vm_usage, double& resident_set)
+ using std::ios_base;
+ using std::ifstream;
+ using std::string;
+ vm_usage = 0.0;
+ resident_set = 0.0;
+ // 'file' stat seems to give the most reliable results
+ //
+ ifstream stat_stream("/proc/self/stat",ios_base::in);
+ // dummy vars for leading entries in stat that we don't care about
+ //
+ string pid, comm, state, ppid, pgrp, session, tty_nr;
+ string tpgid, flags, minflt, cminflt, majflt, cmajflt;
+ string utime, stime, cutime, cstime, priority, nice;
+ string O, itrealvalue, starttime;
+ // the two fields we want
+ //
+ unsigned long vsize = 0;
+ long rss = 0;
+ stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
+ >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
+ >> utime >> stime >> cutime >> cstime >> priority >> nice
+ >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
+ if(!stat_stream){
+ // open or extraction failed: vsize/rss hold nothing meaningful, so keep
+ // the 0.0 defaults assigned above instead of reporting garbage
+ return;
+ }
+ long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
+ vm_usage = vsize / 1024.0;
+ resident_set = rss * page_size_kb;
+// Connected components
+//Return the minimum distance separating two intervals s1-e1 and s2-e2, i.e.
+//the size of the gap between the end of the left-hand interval and the start
+//of the right-hand one. Ends are exclusive (callers pass begin+length).
+//If the intervals overlap, touch, or one contains the other, the distance is 0.
+template<typename TSize>
+inline unsigned int getIntervalDist(TSize &s1, TSize &e1, TSize &s2, TSize &e2){
+ if(e1<=s2){
+ //s1-e1 lies entirely to the left of s2-e2
+ return s2-e1;
+ }
+ if(e2<=s1){
+ //s2-e2 lies entirely to the left of s1-e1
+ return s1-e2;
+ }
+ //Overlapping or contained
+ return 0;
+//Smallest getIntervalDist() between the fragment of vertex u and the fragment
+//of any vertex in the range [vmapiter.first, vmapiter.second) that lies on
+//the same sequence as u.
+// g        - graph that u and the mapped vertices belong to
+// u        - query vertex
+// vmapiter - iterator range over (key, vertex) entries; only ->second is read
+//Entries on a different sequence are ignored. If no entry qualifies, the
+//maximum value of unsigned int is returned.
+template<typename TGraph,
+ typename TVertexDescriptor,
+ typename TGenomeVertexMapIter>
+inline unsigned int distance(TGraph const& g,
+ TVertexDescriptor const & u,
+ std::pair<TGenomeVertexMapIter,TGenomeVertexMapIter> &vmapiter){
+ typedef unsigned int TSize;
+ TSize mindist = std::numeric_limits<TSize>::max();
+ for (TGenomeVertexMapIter vit=vmapiter.first; vit!=vmapiter.second; ++vit){
+ if(sequenceId(g,u)==sequenceId(g,vit->second)){
+ //Fragments are handed over as [begin, begin+length)
+ TSize beg1 = fragmentBegin(g,u);
+ TSize end1 = beg1+fragmentLength(g,u);
+ TSize beg2 = fragmentBegin(g,vit->second);
+ TSize end2 = beg2+fragmentLength(g,vit->second);
+ TSize dist = getIntervalDist(beg1,end1,beg2,end2);
+ mindist = dist<mindist ? dist : mindist;
+ }
+ }
+ return mindist;
+//Recursive depth-first visit used by
+//connected_components_by_genome_ranked_RECURSIVE().
+//Does nothing if u is already marked in tokenMap. Otherwise u is marked and
+//assigned to component 'label', unless that component already holds a vertex
+//of u's genome and u is more than maxdist away from every such vertex on its
+//own sequence; in that case a new component is opened for u. The visit then
+//recurses over u's edges from highest to lowest abs(cargo).
+// g           - graph being decomposed
+// u           - vertex to visit
+// tokenMap    - visited flag per vertex
+// components  - out: component label per vertex
+// label       - component to extend; passed by value, so a split made here
+//               carries over to vertices reached from u but not to the caller
+// maxlabel    - highest label handed out so far; incremented on a split
+// genomeMap   - per component: multimap from genome to its member vertices
+// genomeNames - genome of each sequence, indexed by sequence id
+// maxdist     - largest distance at which a second vertex of the same genome
+//               may stay in the component
+//Progress messages are written to std::cout unconditionally.
+template<typename TSpec,
+ typename TVertexDescriptor,
+ typename TTokenMap,
+ typename TComponents,
+ typename TVal,
+ typename TGenomeVertexMap,
+ typename TNames,
+ typename TSize>
+inline void
+_cc_visit_g_ranked(Graph<TSpec> const& g,
+ TVertexDescriptor const u,
+ TTokenMap& tokenMap,
+ TComponents& components,
+ TVal label,
+ TVal &maxlabel, //changed to reference so that i can reassign
+ std::vector<TGenomeVertexMap> & genomeMap,
+ TNames &genomeNames,
+ TSize &maxdist)
+ typedef typename Iterator<Graph<TSpec>, AdjacencyIterator>::Type TAdjacencyIterator;
+ typedef typename EdgeDescriptor<Graph<TSpec> >::Type TEdgeDescriptor;
+ typedef typename Iterator<Graph<TSpec>, OutEdgeIterator>::Type TOutEdgeIterator;
+ //Add all edges from u to ccedges
+ if(getProperty(tokenMap, u) == false){
+ //TODO support for genomeidx in addition to sequenceid
+ assert(sequenceId(g,u)<length(genomeNames));
+ std::cout << "Connecting vertex " << u << " from sequence " << sequenceId(g,u) << std::endl;
+ assert(label<genomeMap.size());
+ //Multiple copies of this genome in the current component
+ TSize gname = genomeNames[sequenceId(g,u)];
+ if(genomeMap[label].find(genomeNames[sequenceId(g,u)]) != genomeMap[label].end()){
+ std::pair<typename TGenomeVertexMap::iterator,typename TGenomeVertexMap::iterator> vmapiter =genomeMap[label].equal_range(gname);
+ //distance() only considers copies on the same sequence as u
+ unsigned int dist = distance(g,u,vmapiter);
+ std::cout << "Multiple copies found for genome " << gname
+ << " seq " << sequenceId(g,u) << " on vertex " << u << std::endl;
+ std::cout << "Minimum distance to a copy: " << dist << std::endl;
+ for(typename TGenomeVertexMap::iterator it=vmapiter.first;it!=vmapiter.second;it++){
+ std::cout << "Copy " << it->second << " seq_id:" << sequenceId(g,it->second) << std::endl;
+ assert((TSize)genomeNames[sequenceId(g,it->second)]==gname);
+ assert(dist>=0);
+ assert(maxdist>0);
+ if(dist<=maxdist){
+ assert(sequenceId(g,it->second)==sequenceId(g,u));
+ }
+ }
+ //Start a new component
+ if(dist>maxdist){
+ //Increment and set maxlabel
+ ++maxlabel;
+ label=maxlabel;
+ std::cout << "Starting new component " << label << " for " << u << std::endl;
+ genomeMap.push_back(TGenomeVertexMap());
+ //Ensure genome is not present in the existing label
+ assert(label<genomeMap.size());
+ assert(label==genomeMap.size()-1);
+ assert(genomeMap[label].find(gname) == genomeMap[label].end());
+ }
+ }
+ std::cout << "Component " << label << " V:" << u << " seq:" << sequenceId(g,u) << std::endl;
+ assert(label<genomeMap.size());
+ //Add vertex,u to component,label
+ genomeMap[label].insert(std::make_pair(gname,u));
+ assignProperty(tokenMap, u, true);
+ assignProperty(components, u, label);
+ //Capture all edges for this vertex
+ std::vector<TEdgeDescriptor> ccedges;
+ for(TOutEdgeIterator itOut(g, u);!atEnd(itOut); ++itOut) {
+ //TODO if(!visited) shortcut
+ ccedges.push_back(*itOut);
+ }
+ assert(ccedges.size()==degree(g,u));
+ //Sort edges on consistency and visit most consistent edges first in a greedy fashion
+ //(the comparator is default-constructed, so only abs(cargo) is compared;
+ //the sort is ascending and the loop below walks it in reverse)
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ for(typename std::vector<TEdgeDescriptor>::reverse_iterator cit = ccedges.rbegin();cit!=ccedges.rend();++cit){
+ TVertexDescriptor s = getSource(*cit);
+ TVertexDescriptor t = getTarget(*cit);
+ assert(s==u || t==u);
+ //Recurse into whichever endpoint of the edge is not u
+ if(s!=u){
+ assert(getProperty(tokenMap,t)==true);
+ if (getProperty(tokenMap, s) == false) {
+ std::cout << " edge " << u << "-" << s;
+ std::cout << std::endl;
+ _cc_visit_g_ranked(g, s, tokenMap, components, label, maxlabel,genomeMap,genomeNames,maxdist);
+ }
+ }
+ else{
+ if(t!=u){
+ assert(getProperty(tokenMap,s)==true);
+ if (getProperty(tokenMap, t) == false) {
+ std::cout << " edge " << u << "-" << t;
+ std::cout << std::endl;
+ _cc_visit_g_ranked(g, t, tokenMap, components, label, maxlabel,genomeMap,genomeNames,maxdist);
+ }
+ }
+ else{
+ //s==t==u: self loops are not expected
+ assert(false);
+ }
+ }
+ }
+ }
+..summary:Decomposes an undirected graph into its connected components.
+..signature:connected_components(g, components)
+..param.g:In-parameter:An undirected graph.
+...type:Spec.Undirected graph
+..param.components:Out-parameter:A property map.
+...remarks:Each vertex is mapped to a component id. If two vertices share the same id they are in the same component.
+..returns: The number of components.
+//Recursive variant of the genome-aware connected components computation.
+//Edges are sorted by abs(cargo) and walked from highest to lowest; every
+//endpoint not yet visited starts a component that is filled by
+//_cc_visit_g_ranked(). Vertices that are still unvisited afterwards are
+//handled in a final pass.
+// g           - graph to decompose
+// components  - out: component label per vertex (cleared and resized here)
+// genomeNames - genome of each sequence, indexed by sequence id
+// maxdist     - distance threshold forwarded to _cc_visit_g_ranked()
+//Returns the value of the label counter after the last component.
+//Progress messages are written to std::cout unconditionally.
+//NOTE(review): the first message prints "on", the genome count and "genomes"
+//with no separating spaces.
+template<typename TSpec, typename TComponents, typename TNames, typename TSize2>
+typename Size<Graph<TSpec> >::Type
+connected_components_by_genome_ranked_RECURSIVE(Graph<TSpec> const& g,
+ TComponents& components,
+ TNames &genomeNames,
+ TSize2 maxdist)
+ typedef typename Size<Graph<TSpec> >::Type TSize;
+ typedef typename Iterator<Graph<TSpec>, EdgeIterator>::Type TEdgeIterator;
+ typedef typename Iterator<Graph<TSpec>, VertexIterator>::Type TVertexIterator;
+ typedef typename VertexDescriptor<Graph<TSpec> >::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<Graph<TSpec> >::Type TEdgeDescriptor;
+ typedef std::multimap<TSize,TVertexDescriptor> TGenomeVertexMap;
+ typedef typename Iterator<Graph<TSpec>, AdjacencyIterator>::Type TAdjacencyIterator;
+ typedef typename Iterator<Graph<TSpec>, OutEdgeIterator>::Type TOutEdgeIterator;
+ clear(components);
+ resizeVertexMap(g,components);
+ std::cout << "Calculating connected components on" << length(genomeNames) << "genomes" << std::endl;
+ // Initialization
+ String<bool> tokenMap;
+ fill(tokenMap, getIdUpperBound(_getVertexIdManager(g)), false);
+ // Genome tracker
+ //(starts with one empty map so that label 0 is valid)
+ std::vector<TGenomeVertexMap> genomeMap(1);
+ // Find connected components greedy on consistency score
+ TSize label = 0;
+ TSize maxlabel = label;
+ TEdgeIterator itE(g);
+ std::vector<TEdgeDescriptor> ccedges;
+ std::set<TVertexDescriptor> visited; //declared but never used below
+ for(;!atEnd(itE);goNext(itE)){
+ ccedges.push_back(*itE);
+ }
+ //Sort on consistency score
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ TVertexDescriptor s,t;
+ //From most consistent edges in G to least, determine CC
+ for(typename std::vector<TEdgeDescriptor>::reverse_iterator cit = ccedges.rbegin();cit!=ccedges.rend();cit++){
+ s = getSource(*cit);
+ if (getProperty(tokenMap, s) == false) {
+ std::cout << "Component" << maxlabel
+ << std::endl;
+ //Capture all vertices connected to s
+ _cc_visit_g_ranked(g, s, tokenMap, components, label, maxlabel, genomeMap,genomeNames,maxdist);
+ //The visit may have raised maxlabel; move on to a fresh label and
+ //give it an empty genome map
+ ++maxlabel;
+ label=maxlabel;
+ genomeMap.push_back(TGenomeVertexMap());
+ }
+ t = getTarget(*cit);
+ if (getProperty(tokenMap, t) == false) {
+ _cc_visit_g_ranked(g, t, tokenMap, components, label, maxlabel, genomeMap,genomeNames,maxdist);
+ ++maxlabel;
+ label=maxlabel;
+ genomeMap.push_back(TGenomeVertexMap());
+ }
+ }
+ //Capture all vertices with no edges degree==0
+ TVertexIterator it(g);
+ TVertexDescriptor u;
+ for(;!atEnd(it);goNext(it)) {
+ u = getValue(it);
+ if (getProperty(tokenMap, u) == false) {
+ std::cout << "Component" << maxlabel
+ << std::endl;
+ assert(degree(g,u)==0);
+ _cc_visit_g_ranked(g, u, tokenMap, components, label, maxlabel, genomeMap,genomeNames,maxdist);
+ ++maxlabel;
+ label=maxlabel;
+ genomeMap.push_back(TGenomeVertexMap());
+ }
+ }
+ return label;
+//Connected components, greedy on consistency score, limiting the anchors a
+//component may hold per genome.
+//Used to convert segment graph (V=genome segments on one genome) into
+//anchor graph (V=genome segments on multiple genomes)
+//Runs an iterative DFS with an explicit stack of (vertex, predecessor)
+//pairs. Traversal is ordered by edge score, largest to smallest. A vertex
+//joins the component of its predecessor unless that component already holds
+//an anchor of the same genome and the new anchor is at least maxdist away
+//from every such anchor on its sequence (the test is dist>=maxdist); in that
+//case a new component is started.
+// g           - graph to decompose
+// components  - out: component label per vertex, reset to 0 here
+// genomeNames - genome of each sequence, indexed by sequence id
+// maxdist     - distance threshold described above
+//Returns the highest label handed out plus one.
+//Progress messages are written to std::cout unconditionally.
+template<typename TSpec, typename TComponents, typename TNames, typename TSize2>
+typename Size<Graph<TSpec> >::Type
+connected_components_by_genome_ranked(Graph<TSpec> const& g,
+ TComponents& components,
+ TNames &genomeNames,
+ TSize2 maxdist){
+ typedef typename Size<Graph<TSpec> >::Type TSize;
+ typedef typename Iterator<Graph<TSpec>, EdgeIterator>::Type TEdgeIterator;
+ typedef typename Iterator<Graph<TSpec>, VertexIterator>::Type TVertexIterator;
+ typedef typename VertexDescriptor<Graph<TSpec> >::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<Graph<TSpec> >::Type TEdgeDescriptor;
+ typedef std::multimap<TSize,TVertexDescriptor> TGenomeVertexMap;
+ typedef typename Iterator<Graph<TSpec>, AdjacencyIterator>::Type TAdjacencyIterator;
+ typedef typename Iterator<Graph<TSpec>, OutEdgeIterator>::Type TOutEdgeIterator;
+ clear(components);
+ resizeVertexMap(g,components);
+ std::cout << "Calculating connected components" << std::endl;
+ // Initialization
+ //tokenMap is the visited flag; it is needed because 0 in 'components' is
+ //both the initial value and a valid label
+ String<bool> tokenMap;
+ fill(tokenMap, getIdUpperBound(_getVertexIdManager(g)), false);
+ fill(components,getIdUpperBound(_getVertexIdManager(g)), 0);
+ for(unsigned int i=0;i<getIdUpperBound(_getVertexIdManager(g));++i){
+ assignProperty(components,i,0);
+ assert(getProperty(components,i)==0);
+ }
+ // Genome tracker
+ //One multimap (genome -> member vertices) per label. It starts with one
+ //entry and grows by one for every new label, so genomeMap[maxlabel] exists
+ std::vector<TGenomeVertexMap> genomeMap(1);
+ //TODO
+ //Initial CC with maxdist
+ //Save all nodes,edges > maxdist
+ //Score edges by adjacency score
+ //Break edges < cutoff || keep only highest scoring node per genome
+ //Recompute CC
+ // Connected components
+ TEdgeIterator itE(g);
+ std::vector<TEdgeDescriptor> ccedges;
+ int maxlabel=-1;
+ for(;!atEnd(itE);goNext(itE)){
+ ccedges.push_back(*itE);
+ }
+ //Ascending by abs(cargo); the loop below walks the result in reverse
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ std::vector<std::pair<TVertexDescriptor,TVertexDescriptor> > stack;
+ //outer loop ensures we visit disconnected subgraphs, considering most consistent edges first
+ for(typename std::vector<TEdgeDescriptor>::reverse_iterator cit = ccedges.rbegin();cit!=ccedges.rend();cit++){
+ TVertexDescriptor s = getSource(*cit);
+ TVertexDescriptor t = getTarget(*cit);
+ std::cout << "Edge score:" << cargo(*cit) << std::endl;
+ //Seed the stack with every unvisited endpoint of this edge, paired with
+ //the opposite endpoint as its predecessor
+ if (getProperty(tokenMap, s) == false){
+ if(getProperty(tokenMap,t) == false) {
+ stack.push_back(std::make_pair(s, t));
+ stack.push_back(std::make_pair(t, s));
+ }
+ else{
+ stack.push_back(std::make_pair(s, t));
+ }
+ }
+ else{
+ if(getProperty(tokenMap,t) == false) {
+ stack.push_back(std::make_pair(t, s));
+ }
+ }
+ while(!stack.empty()){
+ //u and prev are copied out before pop_back() invalidates 'node'
+ std::pair<TVertexDescriptor,TVertexDescriptor> & node = stack.back();
+ TVertexDescriptor u = node.first;
+ TVertexDescriptor prev = node.second;
+ assert(sequenceId(g,u)<length(genomeNames));
+ TSize gname = genomeNames[sequenceId(g,u)];
+ stack.pop_back();
+ if(getProperty(tokenMap,u)==false){
+ assert(getProperty(components,u)==0);
+ std::cout << "New node " << u << " " << getProperty(components,u)
+ << std::endl;
+ //Encountered new node, assign label, track genome
+ assignProperty(tokenMap, u, true);
+ int label=-1;
+ if(getProperty(tokenMap,prev)==false){
+ //Use new label
+ //(the predecessor is unvisited, so there is no component to join)
+ ++maxlabel;
+ std::cout << "Starting new component " << maxlabel << " for " << u << std::endl;
+ label=maxlabel;
+ genomeMap.push_back(TGenomeVertexMap());
+ }
+ else{
+ //there is already a label
+ int prevlabel=getProperty(components,prev);
+ if(genomeMap[prevlabel].find(gname) != genomeMap[prevlabel].end()){
+ //there is already a genome, retrieve all the anchors in this genome to determine the anchors
+ std::pair<typename TGenomeVertexMap::iterator,typename TGenomeVertexMap::iterator> vmapiter =genomeMap[prevlabel].equal_range(gname);
+ //distance() only considers anchors on the same sequence as u
+ unsigned int dist = distance(g,u,vmapiter);
+ std::cout << "Multiple copies found for genome " << gname
+ << " seq " << sequenceId(g,u) << " on vertex " << u << std::endl;
+ std::cout << "Minimum distance to a copy: " << dist << std::endl;
+ for(typename TGenomeVertexMap::iterator it=vmapiter.first;it!=vmapiter.second;it++){
+ std::cout << "Copy " << it->second << " seq_id:" << sequenceId(g,it->second) << std::endl;
+ assert((TSize)genomeNames[sequenceId(g,it->second)]==gname);
+ assert(dist>=0);
+ if(dist<=maxdist){
+ assert(sequenceId(g,it->second)==sequenceId(g,u));
+ }
+ }
+ //Start a new component
+ //NOTE(review): the test is >=, whereas the recursive variant uses >
+ //-- confirm that splitting at dist==maxdist is intended
+ if(dist>=maxdist){
+ //Increment and set maxlabel
+ ++maxlabel;
+ std::cout << "Starting new component " << maxlabel << " for " << u << std::endl;
+ label=maxlabel;
+ genomeMap.push_back(TGenomeVertexMap());
+ //Ensure genome is not present in the existing label
+ assert((unsigned int)maxlabel<genomeMap.size());
+ assert(genomeMap[maxlabel].find(gname) == genomeMap[maxlabel].end());
+ }
+ else{
+ std::cout << "Adding to component " << getProperty(components,prev) << " for " << u << std::endl;
+ label=prevlabel;
+ }
+ }
+ else{
+ std::cout << "Adding to component " << getProperty(components,prev) << " for " << u << std::endl;
+ label=prevlabel;
+ }
+ }
+ assignProperty(components, u, label);
+ std::cout << "V:" << u << " component " << getProperty(components,u) << std::endl;
+ //assert(getProperty(components,u)>=0);
+ assert(getProperty(components,u) < genomeMap.size());
+ genomeMap[label].insert(std::make_pair(gname,u));
+ //Add all edges from u to ccedges
+ //(this local vector and the iterator 'cit' below shadow the outer ones)
+ std::vector<TEdgeDescriptor> ccedges;
+ //Edge iterator
+ TOutEdgeIterator itOut(g, u);
+ for(;!atEnd(itOut); ++itOut) {
+ if(getProperty(tokenMap,getSource(*itOut)) || getProperty(tokenMap,getTarget(*itOut))){
+ ccedges.push_back(*itOut);
+ }
+ }
+ //assert(ccedges.size()==degree(g,u));
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ //traverse scores low->high so highest scores are last on the stack
+ for(typename std::vector<TEdgeDescriptor>::iterator cit = ccedges.begin();cit!=ccedges.end();cit++){
+ TVertexDescriptor s = getSource(*cit);
+ TVertexDescriptor t = getTarget(*cit);
+ assert(s==u || t==u);
+ //Push whichever endpoint is not u, with u as its predecessor
+ if(s!=u){
+ assert(getProperty(tokenMap,t)==true);
+ if (getProperty(tokenMap, s) == false) {
+ std::cout << " edge " << u << "-" << s;
+ std::cout << std::endl;
+ std::cout << "Edge score:" << cargo(*cit) << std::endl;
+ stack.push_back(std::make_pair(s,u));
+ }
+ }
+ else{
+ if(t!=u){
+ assert(getProperty(tokenMap,s)==true);
+ if (getProperty(tokenMap, t) == false) {
+ std::cout << " edge " << u << "-" << t;
+ std::cout << std::endl;
+ std::cout << "Edge score:" << cargo(*cit) << std::endl;
+ stack.push_back(std::make_pair(t,u));
+ }
+ }
+ else{
+ //s==t==u: self loops are not expected
+ assert(false);
+ }
+ }
+ }
+ }
+ }
+ }
+ //Capture all vertices with no edges degree==0
+ //Each of them becomes a component of its own
+ TVertexIterator it(g);
+ TVertexDescriptor u;
+ for(;!atEnd(it);goNext(it)) {
+ u = getValue(it);
+ if (getProperty(tokenMap, u) == false) {
+ std::cout << "Component" << maxlabel
+ << std::endl;
+ assert(degree(g,u)==0);
+ ++maxlabel;
+ assignProperty(components, u, maxlabel);
+ genomeMap.push_back(TGenomeVertexMap());
+ TSize gname = genomeNames[sequenceId(g,u)];
+ assert(sequenceId(g,u)<length(genomeNames));
+ genomeMap[getProperty(components,u)].insert(std::make_pair(gname,u));
+ }
+ }
+ return maxlabel+1;
+//Connected components by iterative DFS, visiting edges from highest to
+//lowest abs(cargo). Same traversal as connected_components_by_genome_ranked()
+//but with no per-genome bookkeeping: a vertex always joins the component of
+//the visited predecessor through which it was reached.
+// g          - graph to decompose
+// components - out: component label per vertex, reset to 0 here
+//Returns the highest label handed out plus one.
+//Progress messages are written to std::cout unconditionally.
+template<typename TSpec, typename TComponents>
+typename Size<Graph<TSpec> >::Type
+connected_components_ranked(Graph<TSpec> const& g,
+ TComponents& components){
+ typedef typename Size<Graph<TSpec> >::Type TSize;
+ typedef typename Iterator<Graph<TSpec>, EdgeIterator>::Type TEdgeIterator;
+ typedef typename Iterator<Graph<TSpec>, VertexIterator>::Type TVertexIterator;
+ typedef typename VertexDescriptor<Graph<TSpec> >::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<Graph<TSpec> >::Type TEdgeDescriptor;
+ typedef typename Iterator<Graph<TSpec>, AdjacencyIterator>::Type TAdjacencyIterator;
+ typedef typename Iterator<Graph<TSpec>, OutEdgeIterator>::Type TOutEdgeIterator;
+ clear(components);
+ resizeVertexMap(g,components);
+ std::cout << "Calculating connected components" << std::endl;
+ // Initialization
+ //tokenMap is the visited flag; it is needed because 0 in 'components' is
+ //both the initial value and a valid label
+ String<bool> tokenMap;
+ fill(tokenMap, getIdUpperBound(_getVertexIdManager(g)), false);
+ fill(components,getIdUpperBound(_getVertexIdManager(g)), 0);
+ for(unsigned int i=0;i<getIdUpperBound(_getVertexIdManager(g));++i){
+ assignProperty(components,i,0);
+ assert(getProperty(components,i)==0);
+ }
+ TEdgeIterator itE(g);
+ std::vector<TEdgeDescriptor> ccedges;
+ int maxlabel=-1;
+ for(;!atEnd(itE);goNext(itE)){
+ ccedges.push_back(*itE);
+ }
+ //Ascending by abs(cargo); the loop below walks the result in reverse
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ std::vector<std::pair<TVertexDescriptor,TVertexDescriptor> > stack;
+ //outer loop ensures we visit disconnected subgraphs, considering most consistent edges first
+ for(typename std::vector<TEdgeDescriptor>::reverse_iterator cit = ccedges.rbegin();cit!=ccedges.rend();cit++){
+ TVertexDescriptor s = getSource(*cit);
+ TVertexDescriptor t = getTarget(*cit);
+ //Seed the stack with every unvisited endpoint of this edge, paired with
+ //the opposite endpoint as its predecessor
+ if (getProperty(tokenMap, s) == false){
+ if(getProperty(tokenMap,t) == false) {
+ stack.push_back(std::make_pair(s, t));
+ stack.push_back(std::make_pair(t, s));
+ }
+ else{
+ stack.push_back(std::make_pair(s, t));
+ }
+ }
+ else{
+ if(getProperty(tokenMap,t) == false) {
+ stack.push_back(std::make_pair(t, s));
+ }
+ }
+ while(!stack.empty()){
+ //u and prev are copied out before pop_back() invalidates 'node'
+ std::pair<TVertexDescriptor,TVertexDescriptor> & node = stack.back();
+ TVertexDescriptor u = node.first;
+ TVertexDescriptor prev = node.second;
+ stack.pop_back();
+ if(getProperty(tokenMap,u)==false){
+ assert(getProperty(components,u)==0);
+ std::cout << "New node " << u << " " << getProperty(components,u)
+ << std::endl;
+ assignProperty(tokenMap, u, true);
+ int label=-1;
+ if(getProperty(tokenMap,prev)==false){
+ //Use new label
+ //(the predecessor is unvisited, so there is no component to join)
+ ++maxlabel;
+ std::cout << "Starting new component " << maxlabel << " for " << u << std::endl;
+ label=maxlabel;
+ }
+ else{
+ //there is already a label
+ int prevlabel=getProperty(components,prev);
+ std::cout << "Adding to component " << getProperty(components,prev) << " for " << u << std::endl;
+ label=prevlabel;
+ }
+ assignProperty(components, u, label);
+ std::cout << "V:" << u << " component " << getProperty(components,u) << std::endl;
+ //Add all edges from u to ccedges
+ //(this local vector and the iterator 'cit' below shadow the outer ones)
+ std::vector<TEdgeDescriptor> ccedges;
+ //Edge iterator
+ TOutEdgeIterator itOut(g, u);
+ for(;!atEnd(itOut); ++itOut) {
+ if(getProperty(tokenMap,getSource(*itOut)) || getProperty(tokenMap,getTarget(*itOut))){
+ ccedges.push_back(*itOut);
+ }
+ }
+ //assert(ccedges.size()==degree(g,u));
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >());
+ //Pushed in ascending order, so the highest-scoring edge is popped first
+ for(typename std::vector<TEdgeDescriptor>::iterator cit = ccedges.begin();cit!=ccedges.end();cit++){
+ TVertexDescriptor s = getSource(*cit);
+ TVertexDescriptor t = getTarget(*cit);
+ assert(s==u || t==u);
+ //Push whichever endpoint is not u, with u as its predecessor
+ if(s!=u){
+ assert(getProperty(tokenMap,t)==true);
+ if (getProperty(tokenMap, s) == false) {
+ std::cout << " edge " << u << "-" << s;
+ std::cout << std::endl;
+ stack.push_back(std::make_pair(s,u));
+ }
+ }
+ else{
+ if(t!=u){
+ assert(getProperty(tokenMap,s)==true);
+ if (getProperty(tokenMap, t) == false) {
+ std::cout << " edge " << u << "-" << t;
+ std::cout << std::endl;
+ stack.push_back(std::make_pair(t,u));
+ }
+ }
+ else{
+ //s==t==u: self loops are not expected
+ assert(false);
+ }
+ }
+ }
+ }
+ }
+ }
+ //Capture all vertices with no edges degree==0
+ //Each of them becomes a component of its own
+ TVertexIterator it(g);
+ TVertexDescriptor u;
+ for(;!atEnd(it);goNext(it)) {
+ u = getValue(it);
+ if (getProperty(tokenMap, u) == false) {
+ std::cout << "Component" << maxlabel
+ << std::endl;
+ assert(degree(g,u)==0);
+ ++maxlabel;
+ assignProperty(components, u, maxlabel);
+ }
+ }
+ return maxlabel+1;
+..summary:Given a multiple alignment, this function calculates all kinds of alignment statistics.
+alignmentEvaluation(graph, score_type, gapExCount, gapCount, pairCount, numPairs, len)
+..param.graph:An alignment graph.
+...type:Spec.Alignment Graph
+..param.score_type:A score object.
+..param.gapExCount:Number of gap extensions.
+..param.gapCount:Number of gaps.
+..param.pairCount:Number of aligned pairs.
+..param.numPairs:Counter for each pair.
+..param.len:Alignment length.
+..returns:Score of the alignment.
+//Computes sum-of-pairs statistics for the multiple alignment held in g and
+//returns them as an s_score; the same values are also written to the
+//reference parameters.
+// g            - alignment graph; converted to a row-major character matrix
+// score_type   - scoring scheme supplying gap open, gap extend and pair scores
+// gapExCount   - out: number of gap extensions
+// gapCount     - out: number of gap openings
+// pairCount    - out: number of aligned (non-gap vs non-gap) pairs
+// pairIdent    - out: number of aligned pairs with equal characters
+// numPairs     - out: pair counts per character combination, indexed
+//                index1*alphabetSize+index2
+// numIdentCols - out: entry n counts the columns whose non-gap characters are
+//                all equal and number n; columns containing a mismatch and
+//                all-gap columns are counted in entry 0
+// len          - out: number of alignment columns
+// totalLen     - out: sum of the lengths of the sequences in g
+//All counters, pairIdent and totalLen included, are reset before counting,
+//so callers need not initialise them.
+//NOTE(review): the string set of g must not be empty; len is computed by
+//dividing by the number of sequences.
+template<typename TStringSet, typename TCargo, typename TSpec, typename TScore, typename TSize>
+//inline typename Value<TScore>::Type
+alignmentEvaluationCustom(Graph<Alignment<TStringSet, TCargo, TSpec> > const& g,
+ TScore const& score_type,
+ TSize& gapExCount,
+ TSize& gapCount,
+ TSize& pairCount,
+ TSize& pairIdent,
+ String<TSize>& numPairs,
+ String<TSize>& numIdentCols,
+ TSize& len,
+ TSize& totalLen)
+ typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef typename Value<typename Value<TStringSet>::Type>::Type TAlphabet;
+ TSize alphSize = ValueSize<TAlphabet>::VALUE;
+ s_score sscore;
+ // Initialization;
+ gapExCount = 0;
+ gapCount = 0;
+ pairCount = 0;
+ //pairIdent and totalLen are accumulated below with ++ and +=, so they are
+ //reset here like the other counters
+ pairIdent = 0;
+ totalLen = 0;
+ clear(numPairs);
+ // Convert the graph
+ String<char> mat;
+ convertAlignment(g, mat);
+ char gapChar = gapValue<char>();
+ TScoreValue gap = scoreGapExtend(score_type);
+ TScoreValue gapOpen = scoreGapOpen(score_type);
+ TSize nseq = length(stringSet(g));
+ //mat holds nseq rows of len characters each: row j, column k is mat[j*len+k]
+ len = length(mat) / nseq;
+ for(TSize i = 0; i<nseq; ++i) {
+ totalLen += length(stringSet(g)[i]);
+ }
+ fill(numIdentCols, nseq+1, 0);
+ for(TSize j=0; j<=nseq; ++j) {
+ assert(numIdentCols[j]==0);
+ }
+ //Column pass: classify every column by how many identical characters it holds
+ char c;
+ for(TSize k=0;k<len; ++k) {
+ TSize numIdents=0;
+ for(TSize j=0; j<nseq; ++j) {
+ if (value(mat, j*len+k) != gapChar) {
+ if(numIdents==0){
+ //first non-gap character of the column becomes the reference
+ c = TAlphabet(value(mat, j*len+k));
+ ++numIdents;
+ }
+ else{
+ if(TAlphabet(value(mat, j*len+k))==c){
+ ++numIdents;
+ }
+ else{
+ //mismatch: the column is counted in entry 0
+ numIdents=0;
+ break;
+ }
+ }
+ }
+ }
+ assert(numIdents<=nseq);
+ numIdentCols[numIdents]++;
+ }
+ //Pair pass: sum-of-pairs over every pair of rows (i,j) with i<j.
+ //The gap-open flags are not reset between row pairs or on columns where
+ //both rows hold a gap
+ bool gapOpeni = false;
+ bool gapOpenj = false;
+ TScoreValue totalScore = 0;
+ fill(numPairs, alphSize * alphSize, 0);
+ for(TSize i = 0; i<nseq-1; ++i) {
+ for(TSize j=i+1; j<nseq; ++j) {
+ for(TSize k=0;k<len; ++k) {
+ if (value(mat, i*len+k) != gapChar) {
+ if (value(mat, j*len + k) != gapChar) {
+ gapOpeni = false;
+ gapOpenj = false;
+ ++pairCount;
+ if(TAlphabet(value(mat, i*len+k)) == TAlphabet(value(mat, j*len + k))){
+ ++pairIdent;
+ }
+ TSize index1 = ordValue(TAlphabet(value(mat, i*len+k)));
+ TSize index2 = ordValue(TAlphabet(value(mat, j*len + k)));
+ value(numPairs, index1 * alphSize + index2) += 1;
+ totalScore += score(const_cast<TScore&>(score_type), TAlphabet(value(mat, i*len+k)), TAlphabet(value(mat, j*len + k)));
+ } else {
+ //character in row i against a gap in row j
+ if (gapOpenj) {
+ ++gapExCount;
+ totalScore += gap;
+ } else {
+ gapOpenj = true;
+ ++gapCount;
+ totalScore += gapOpen;
+ }
+ }
+ } else if (value(mat, j*len + k) != gapChar) {
+ //gap in row i against a character in row j
+ if (gapOpeni) {
+ ++gapExCount;
+ totalScore += gap;
+ } else {
+ ++gapCount;
+ gapOpeni = true;
+ totalScore += gapOpen;
+ }
+ }
+ }
+ }
+ }
+ //NOTE(review): alignScore is an unsigned int, so a negative totalScore
+ //cannot be represented in the returned record -- confirm
+ sscore.alignScore = totalScore;
+ sscore.numGap = gapCount;
+ sscore.numGapEx = gapExCount;
+ sscore.numPairs = pairCount;
+ sscore.numIdents = pairIdent;
+ sscore.alignLen = len;
+ sscore.totalLen = totalLen;
+ sscore.colCount = numIdentCols;
+ sscore.seqCount = nseq;
+ assert(length(numIdentCols)==nseq+1);
+ sscore.pairCount = numPairs;
+ //return totalScore;
+ return sscore;
+//Read set of LCBs from filename and append one vertex list per LCB to lcbs
+//File format, as parsed below, is
+//I seq1 orient start-end seq2 orient start-end ....
+//V v1 v2 v3 ;
+//List of anchor ids (keys of block2fragMap) in each LCB on a line ending with ;
+//start-end are only used when checkbounds is true; otherwise every vertex of a listed anchor that lies on a listed sequence is kept
+template<typename TVertexDescriptor,
+ typename TNames,
+ typename TVertexOrientMap,
+ typename TVertexSeqMap,
+ typename TGraph>
+void doReadBlockFile(const std::string & filename, // path of the LCB file
+ std::map<unsigned int, std::set<TVertexDescriptor> > & block2fragMap, // anchor id -> vertices of that anchor; ids on V lines must be present (asserted)
+ std::vector<std::vector<TVertexDescriptor> > & lcbs, // out: one vertex list appended per ';'-terminated V record
+ TNames &sequenceNames, // sequence names, indexed by the values of vertexSeqMap
+ TVertexOrientMap & vertexOrientMap, // out: vertex -> orientation character taken from the preceding I line
+ TVertexSeqMap & vertexSeqMap, // vertex -> index into sequenceNames
+ TGraph & g, // graph queried with fragmentBegin()/fragmentLength()
+ bool checkbounds){ // true: keep a vertex only if its fragmentBegin lies inside the start-end range of its sequence
+ std::ifstream file;
+ file.open(filename.c_str(), std::ios_base::in | std::ios_base::binary); // NOTE(review): the open is not checked; an unreadable file simply yields zero LCBs
+ typedef std::ifstream TFile;
+ typedef Value<TFile>::Type TValue; // not used in this function
+ std::string line;
+ std::vector<unsigned int> currblock; // anchor ids collected from the current V line
+ std::vector<unsigned int> currlcb; // vertices accepted for the LCB being built
+ //Map sequence name -> char
+ std::map<std::string,char> sequenceOrientMap;
+ std::map<std::string,std::pair<unsigned int, unsigned int> > sequenceCoordsMap; // sequence name -> (start,end) from the I line
+ std::map<String<char>,int> seqNamesIdxMap; // only referenced by the commented-out block further down
+ for(int i=0;i<length(sequenceNames);++i){
+ seqNamesIdxMap[sequenceNames[i]]=i;
+ }
+ unsigned int vertexcount = 0;
+ while(file){ // the body also runs once for the final failed getline, with an empty token
+ getline(file,line);
+ std::istringstream in(line);
+ std::string c;
+ in >> c;
+ if(c == "V"){
+ while(in>>c){
+ if(c == ";"){ // end of the LCB: expand the collected anchors into vertices
+ for(std::vector<unsigned int>::iterator it=currblock.begin();it!=currblock.end();it++){
+ assert(block2fragMap.find(*it)!=block2fragMap.end());
+ //currlcb.insert(currlcb.end(),block2fragMap[*it].begin(),block2fragMap[*it].end());
+ for(typename std::set<TVertexDescriptor>::iterator vit=block2fragMap[*it].begin();vit!=block2fragMap[*it].end();++vit){
+ assert(vertexSeqMap.find(*vit)!=vertexSeqMap.end());
+ std::string sname(toCString(sequenceNames[vertexSeqMap[*vit]]));
+ //Check if segment is reported as part of the LCB
+ //The LCB identification step may not report all sequences that are part of the anchor
+ if(sequenceOrientMap.find(sname) != sequenceOrientMap.end()){
+ if(!checkbounds || fragmentBegin(g,*vit)>=sequenceCoordsMap[sname].first && fragmentBegin(g,*vit)<=sequenceCoordsMap[sname].second){ // && binds tighter than ||: the range test only applies when checkbounds is true
+ vertexOrientMap[*vit] = sequenceOrientMap[sname];
+ //add node to block
+ currlcb.push_back(*vit);
+ std::cout << "Adding segment V:" << *vit << " from anchor:" << *it << std::endl;
+ }
+ else{
+ std::cout << "Skipping out-of-bounds anchor segment " << *vit << " len:" << fragmentLength(g,*vit) << " from anchor " << *it << " on sequence " << sname << " " << sequenceOrientMap[sname]
+ << " fragmentBegin:" << fragmentBegin(g,*vit) << " bounds:" << sequenceCoordsMap[sname].first << "-" << sequenceCoordsMap[sname].second << " " << vertexOrientMap[*vit] << std::endl;
+ }
+ }
+ else{ // the vertex lies on a sequence that the preceding I line did not list
+ std::cout << "Skipping anchor segment " << *vit << " len:" << fragmentLength(g,*vit) << " from anchor " << *it << " on sequence " << sname << std::endl;
+ for(std::map<std::string,char>::iterator sit=sequenceOrientMap.begin();sit!=sequenceOrientMap.end();++sit){
+ std::cerr << sit->first << " " << sit->second << std::endl;
+ }
+ //currlcb.push_back(*vit);
+ //assert(false);
+ }
+ }
+ }
+ vertexcount = vertexcount + currlcb.size();
+ lcbs.push_back(currlcb); // pushed even when no vertex was accepted
+ currblock.clear();
+ currlcb.clear();
+ }
+ else{
+ currblock.push_back(atoi(c.c_str())); // token is an anchor id
+ //Update vertex orientation
+ }
+ }
+ currblock.clear(); // ids of a V line that lacks the closing ';' are dropped
+ }
+ else{
+ sequenceOrientMap.clear(); // every line that does not start with V resets the per-LCB sequence information
+ sequenceCoordsMap.clear();
+ if(c == "I"){
+ while(in>>c){
+ if(c != ";"){
+ //read sequence name
+ std::string seqname=c;
+ char orient;
+ std::string coords;
+ assert(seqname != ";");
+ //read orient
+ in >> orient;
+ assert(orient != ';');
+ assert(orient == '+' || orient == '-');
+ sequenceOrientMap[seqname] = orient;
+ //read coords
+ in >> coords;
+ std::string start;
+ std::string end;
+ std::istringstream coordsin(coords);
+ getline(coordsin, start, '-'); // coords has the form start-end
+ getline(coordsin, end, '-');
+ unsigned int startcoord = atoi(start.c_str());
+ unsigned int endcoord = atoi(end.c_str());
+ /*
+ if(orient == '-'){
+ int slen = length(getValueById(stringSet(g), seqNamesIdxMap[seqname]));
+ unsigned int tmpstartcoord=startcoord;
+ startcoord = slen-endcoord;
+ endcoord = slen-startcoord;
+ }
+ */
+ sequenceCoordsMap[seqname] = make_pair(startcoord,endcoord);
+ }
+ }
+ }
+ }
+ }
+ std::cerr << "Read " << lcbs.size() << " LCBs containing " << vertexcount << " segments " << std::endl;
+ //Sort based on length is done by the caller, not here
+//Reads the LCB file through doReadBlockFile() and then reorders lcbs by decreasing total fragment length
+//filename: LCB file to parse
+//block2fragMap: anchor id -> vertices, handed to doReadBlockFile()
+//lcbs: out, one vertex list per LCB, longest first on return
+//sequenceNames, vertexOrientMap: handed to doReadBlockFile()
+//g: graph used for sequenceId() and fragmentLength()
+//checkbounds: handed to doReadBlockFile()
+template<typename TVertexDescriptor,
+ typename TNames,
+ typename TVertexOrientMap,
+ typename TGraph>
+void readBlockFile(const std::string & filename,
+ std::map<unsigned int, std::set<TVertexDescriptor> > & block2fragMap,
+ std::vector<std::vector<unsigned int> > & lcbs,
+ TNames &sequenceNames,
+ TVertexOrientMap & vertexOrientMap,
+ TGraph & g,
+ bool checkbounds=false){
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ //Record the sequence id of every vertex for the name lookups in doReadBlockFile()
+ std::map<TVertexDescriptor,unsigned> vertexSeqMap;
+ for(TVertexIterator vtx(g);!atEnd(vtx);goNext(vtx)) {
+ vertexSeqMap[*vtx] = sequenceId(g,*vtx);
+ }
+ doReadBlockFile(filename,block2fragMap,lcbs,sequenceNames,vertexOrientMap,vertexSeqMap,g,checkbounds);
+ std::cerr << "Sorting LCBs by length" << std::endl;
+ //Total fragment length of every LCB, keyed by its position in lcbs
+ std::map<int,int> lengthOfLcb;
+ std::vector<int> order;
+ for(unsigned int lcb=0;lcb<lcbs.size();lcb++){
+ int bases=0;
+ for(unsigned int v=0;v<lcbs[lcb].size();v++){
+ bases += fragmentLength(g,lcbs[lcb][v]);
+ }
+ assert(bases!=0);
+ lengthOfLcb[lcb] = bases;
+ order.push_back(lcb);
+ }
+ assert(lengthOfLcb.size()==lcbs.size());
+ assert(order.size()==lcbs.size());
+ //Sort the positions, then rebuild the list in that order
+ sort(order.begin(),order.end(),lcblencmp<std::map<int,int> >(lengthOfLcb));
+ std::vector<std::vector<unsigned int> > sorted;
+ for(unsigned int k=0;k<order.size();k++){
+ const int current = order[k];
+ assert((unsigned int)current<lcbs.size());
+ sorted.push_back(lcbs[current]);
+ if(k>0){
+ const int previous = order[k-1];
+ std::cout << "LCB: " << current << " " << lengthOfLcb[current] << " <= " << lengthOfLcb[previous] << std::endl;
+ assert(lengthOfLcb[current]<=lengthOfLcb[previous]);
+ }
+ }
+ lcbs.swap(sorted);
+template<typename TSeqs> // Writes Mercator input files: G<i>.chroms, G<i>.anchors and G<a>-G<b>.hits
+void do_segmentation_MERCATOR(){ // NOTE(review): nSeq, seqSet, blocks, TBlock, bit2 and blocksbycomponent are not declared in this function; verify where they come from and whether this template is ever instantiated
+ std::fstream strm; // reused for every output file
+ //(2)Support for Mercator
+ //G.chroms
+ std::string genomestr; // accumulates "G0 G1 ..." but is not read again in this function
+ for(unsigned int i=0;i<nSeq;i++){
+ std::ostringstream currfilename;
+ currfilename << "G" << i;
+ genomestr += currfilename.str();
+ genomestr += " ";
+ currfilename << ".chroms";
+ strm.open(currfilename.str().c_str(), std::ios_base::out | std::ios_base::trunc);
+ strm << "S" << i << "\t" //seqname
+ << length(seqSet[i]) << std::endl; //chromLength
+ strm.close();
+ }
+ for(unsigned int i=0;i<nSeq;i++){ // one anchors file per sequence
+ std::ostringstream currfilename;
+ currfilename << "G" << i << ".anchors";
+ strm.open(currfilename.str().c_str(), std::ios_base::out | std::ios_base::trunc);
+ typename std::vector<TBlock>::const_iterator bit = blocks.begin();
+ for(bit = blocks.begin();
+ bit!=blocks.end();
+ bit++){
+ if(bit->currentSeq==i){
+ //mercator format
+ strm << bit->currentSeq << "." << bit->c << "\t" //anchorname
+ << "S" << bit->currentSeq << "\t" //seqname
+ << bit->orient << "\t" //strand
+ <<bit->begCoord << " " << bit->endCoord << "\t" //startCoord endCoord 0-based half open interval [start, end)
+ << 1 << std::endl; //isCoding
+ }
+ }
+ strm.close();
+ }
+ //G1-G2.hits
+ std::map<std::pair<unsigned int,unsigned int>,std::set<int> > hitMap; // (smaller seq, larger seq) -> component ids shared by blocks on both
+ std::map<std::pair<unsigned int,unsigned int>,std::set<int> >::iterator hmit;
+ bool inserted; // assigned below but never read
+ bit2 = blocksbycomponent.begin();
+ for(; bit2!= blocksbycomponent.end();bit2++){//all cc
+ std::vector<TBlock> currblocks = bit2->second;
+ typename std::vector<TBlock>::iterator it1,it2;
+ for(it1=currblocks.begin();it1!=currblocks.end();it1++){
+ for(it2=currblocks.begin();it2!=currblocks.end();it2++){
+ if(it1!=it2){
+ std::pair<unsigned int,unsigned int> key; // ordered so that first <= second
+ if(it1->currentSeq<it2->currentSeq){
+ key = std::make_pair(it1->currentSeq,it2->currentSeq);
+ }
+ else{
+ key = std::make_pair(it2->currentSeq,it1->currentSeq);
+ }
+ std::pair<std::map<std::pair<unsigned int,unsigned int>,std::set<int> >::iterator,bool> s
+ = hitMap.insert(std::make_pair(key,std::set<int>())); // insert() leaves an existing entry untouched and returns its iterator
+ hmit = s.first;
+ inserted = s.second;
+ hmit->second.insert(it1->c);
+ }
+ }
+ }
+ }
+ for(hmit=hitMap.begin();hmit!=hitMap.end();hmit++){ // one hits file per sequence pair
+ std::ostringstream currfilename;
+ currfilename << "G" << hmit->first.first << "-" << "G" << hmit->first.second << ".hits";
+ strm.open(currfilename.str().c_str(), std::ios_base::out | std::ios_base::trunc);
+ for(std::set<int>::iterator it=hmit->second.begin();it!=hmit->second.end();it++){
+ strm << hmit->first.first << "." << *it << "\t" //anchorName1
+ << hmit->first.second << "." << *it << "\t" //anchorName2
+ << 1 << "\t"
+ << 1 << std::endl;
+ }
+ strm.close();
+ }
+//Computes LCBs with the external Enredo program
+//(1) writes <outfile>enredo.idx (sequence index and name) and <outfile>enredo.anchors (one line per block),
+//(2) sorts the anchors file with sort(1),
+//(3) runs $MUGSY_INSTALL/enredo piped through $MUGSY_INSTALL/enredo2mugsy.pl,
+//(4) reads the resulting LCB list with readBlockFile(), with bounds checking enabled
+//blocks: oriented blocks written as anchors
+//lcbs: out, one vertex list per LCB as filled in by readBlockFile()
+//block2fragMap, vertexOrientMap, g: handed to readBlockFile()
+//distance, minlen: used for --max-gap-length and --min-length unless ENREDO_OPTS is set and non-empty
+//msaOpt: msaOpt.outfile is the prefix of the two files written here
+//seqSet: only its length (number of sequences) is used
+//sequenceNames, genomeNames: labels indexed by sequence id
+//Exits the process with status 1 if MUGSY_INSTALL is unset or an external command fails
+template<typename TBlock,
+ typename TVertexDescriptor,
+ typename TMSAOptions,
+ typename TSeqs,
+ typename TNames,
+ typename TGenomeNames,
+ typename TVertexOrientMap,
+ typename TGraph>
+void do_segmentation_ENREDO(std::vector<TBlock> & blocks,
+ std::vector<std::vector<unsigned int> > & lcbs,
+ std::map<unsigned int, std::set<TVertexDescriptor> > & block2fragMap,
+ std::string distance,
+ std::string minlen,
+ TMSAOptions const &msaOpt,
+ TSeqs & seqSet,
+ TNames & sequenceNames,
+ TGenomeNames & genomeNames,
+ TVertexOrientMap & vertexOrientMap,
+ TGraph & g){
+ /*
+ From Enredo README
+ The input file contains the result of mapping a set of anchors onto several
+ genomes. Anchors are expected to be sorted by organism, chromosome and
+ position. Each line should correspond to an anchor and each line contains 6
+ values separated by tabs. The six values are: the anchor name (a string),
+ the species name (a string), the chromosome name (a string), the start
+ position (an integer value), the end position (an integer value), the strand
+ (either + or -) and the score (a real value). Here is an example:
+ A1 Spcs1 X 53 85 + 123
+ B1 Spcs1 X 458 498 + 11
+ C1 Spcs1 X 3601 3639 + 434
+ B1 Spcs1 X 5480 5520 + 1
+ D1 Spcs1 X 6479 6510 + 41
+ A Spcs1 Y 1379 4410 + 1567
+ E Spcs1 Y 5879 5910 + 311
+ E Spcs1 Y 6479 6510 + 217
+ D Spcs1 Y 6567 6593 + 135
+ Note: the list and the example above show seven fields, and seven fields are written below.
+ */
+ std::fstream strm;
+ std::fstream strm2;
+ String<char> pf = msaOpt.outfile;
+ char * pfilename = toCString(pf);
+ std::string projfilename(pfilename);
+ projfilename = projfilename + "enredo.anchors";
+ std::string idxfilename(pfilename);
+ idxfilename = idxfilename + "enredo.idx";
+ std::cerr << "Writing ENREDO anchors to " << projfilename.c_str() << std::endl;
+ strm.open(idxfilename.c_str(), std::ios_base::out | std::ios_base::trunc);
+ strm2.open(projfilename.c_str(), std::ios_base::out | std::ios_base::trunc);
+ //(1)enredo anchors file
+ //G.anchors
+ unsigned int nseq = length(seqSet);
+ for(unsigned int i=0;i<nseq;i++){
+ std::vector<TBlock *> seqblocks;
+ int idx=0;
+ strm << i << " " << sequenceNames[i] << std::endl;
+ for(typename std::vector<TBlock>::const_iterator bit = blocks.begin(); bit!=blocks.end();bit++){
+ if(bit->currentSeq==i){
+ seqblocks.push_back(&(blocks[idx]));
+ }
+ idx++;
+ }
+ std::sort(seqblocks.begin(),seqblocks.end(),blockorder<TBlock,unsigned int>(i));
+ unsigned int blen = seqblocks.size();
+ //if(blen>1){
+ for(unsigned int j=0;j<blen;j++){
+ //fields: anchor (component id), sequence index, genome name, start, end, strand, score (block length)
+ strm2 << seqblocks[j]->c << "\t"
+ << i << "\t"
+ << genomeNames[i] << "\t"
+ //1-start base coordinates
+ << seqblocks[j]->begCoord+1 << "\t" << seqblocks[j]->endCoord << "\t"
+ << seqblocks[j]->orient << "\t"
+ << seqblocks[j]->endCoord - seqblocks[j]->begCoord
+ << std::endl;
+ //}
+ }
+ }
+ /*
+ unsigned int nSeq = length(seqSet);
+ for(unsigned int i=0;i<nSeq;i++){
+ strm << i << " " << sequenceNames[i] << std::endl;
+ typename std::vector<TBlock>::const_iterator bit = blocks.begin();
+ for(bit = blocks.begin();
+ bit!=blocks.end();
+ bit++){
+ if(bit->currentSeq==i){
+ //enredo format
+ strm2 << bit->c << "\t"
+ << bit->currentSeq << "\t"
+ << genomeNames[i] << "\t";
+ strm2 << bit->begCoord << "\t"
+ << bit->endCoord << "\t";
+ strm2 << bit->orient << "\t"
+ << bit->endCoord-bit->begCoord<< std::endl;
+ }
+ }
+ }
+ */
+ strm.close();
+ strm2.close();
+ //(2)Sort anchors
+ std::string sortedprojfilename(projfilename+".sorted");
+ std::string sortcmd = "sort -k 2,3 -k 4n,4n < " + projfilename + " > " + sortedprojfilename;
+ int res = system(sortcmd.c_str());
+ if(res!=0){
+ perror("Could not run system command: ");
+ std::cerr << sortcmd.c_str() << std::endl
+ << "SYSTEM:" << res << std::endl;
+ exit(1);
+ }
+ //(3)Run Enredo
+ //std::getenv() returns NULL for an unset variable and constructing a std::string from NULL is
+ //undefined behaviour, so the pointer is checked before it is used
+ const char * mugsyinstallenv = std::getenv("MUGSY_INSTALL");
+ if(mugsyinstallenv==NULL || mugsyinstallenv[0]=='\0'){
+ std::cerr << "MUGSY_INSTALL environment variable is not set" << std::endl;
+ exit(1);
+ }
+ std::string mugsyinstall(mugsyinstallenv);
+ std::string cmd = mugsyinstall+"/enredo ";
+ std::string stdoutfilename(boost::lexical_cast<std::string>(getpid())+"lcbs.out");
+ std::string stderrfilename(boost::lexical_cast<std::string>(getpid())+"synchain-mugsy.out");
+ char * enredoenvopts = std::getenv("ENREDO_OPTS");
+ std::string enredoopts;
+ if(enredoenvopts==NULL || strlen(enredoenvopts)==0){
+ enredoopts = std::string(" --min-score 0 --max-ratio 0 ") + std::string(" --min-length ") + minlen + std::string(" --max-gap-length ") + distance + std::string(" --min-anchors 1 ");
+ }
+ else{
+ enredoopts = std::string(enredoenvopts);
+ }
+ cmd = cmd + enredoopts + " " + sortedprojfilename
+ + " | "+mugsyinstall+"/enredo2mugsy.pl "+idxfilename+" > "+stdoutfilename+" 2> "+stderrfilename;
+ assert(cmd.length()>0);
+ //#ifdef DEBUGGING
+ std::cerr << "Running " << cmd.c_str() << std::endl;
+ //#endif
+ res = system(cmd.c_str());
+ if(res!=0){
+ perror("Could not run system command: ");
+ std::cerr << cmd.c_str() << std::endl
+ << "SYSTEM:" << res << std::endl;
+ exit(1);
+ }
+ assert(res==0);
+ //(4) Read output file to obtain list of LCBs
+ readBlockFile(stdoutfilename,
+ block2fragMap,
+ lcbs,
+ sequenceNames,
+ vertexOrientMap,
+ g,
+ true); //must check bounds
+//Projects the blocks onto every sequence and writes one line per pair of neighbouring blocks (n->n+1)
+//Line format: c1 c2 seqname distance genomename orient1 orient2 beg1 end1 beg2 end2
+//A sequence carrying exactly one block gets a single line pairing the block with itself at distance 0;
+//a sequence without blocks produces no output
+//projfilename: output file, truncated if it already exists
+//blocks: blocks to project; pointers into this vector are sorted with blockorder, the vector itself is not reordered
+//sequenceNames, genomeNames: labels indexed by sequence id
+template<typename TBlock,
+ typename TNames,
+ typename TGenomeNames>
+void writeProjectionFile(std::string projfilename,
+ std::vector<TBlock> & blocks,
+ TNames & sequenceNames,
+ TGenomeNames & genomeNames){
+ std::fstream out;
+ out.open(projfilename.c_str(), std::ios_base::out | std::ios_base::trunc);
+ const unsigned int numSeqs = length(sequenceNames);
+ assert(numSeqs==length(genomeNames));
+ for(unsigned int seq=0;seq<numSeqs;seq++){
+ //Collect the blocks that lie on this sequence and order them along it
+ std::vector<TBlock *> onSeq;
+ for(unsigned int b=0;b<blocks.size();b++){
+ if(blocks[b].currentSeq==seq){
+ onSeq.push_back(&(blocks[b]));
+ }
+ }
+ std::sort(onSeq.begin(),onSeq.end(),blockorder<TBlock,unsigned int>(seq));
+ const unsigned int numBlocks = onSeq.size();
+ if(numBlocks==1){
+ //Lone block: reported as its own neighbour
+ TBlock * only = onSeq[0];
+ out << only->c << " " << only->c << " " << sequenceNames[seq] << " " << 0 << " " << genomeNames[seq] << " "
+ << only->orient << " " << only->orient << " "
+ << only->begCoord << " " << only->endCoord << " "
+ << only->begCoord << " " << only->endCoord
+ << std::endl;
+ }
+ //Neighbouring pairs; the body never runs for fewer than two blocks
+ for(unsigned int j=0;j+1<numBlocks;j++){
+ TBlock * left = onSeq[j];
+ TBlock * right = onSeq[j+1];
+ assert(left->currentSeq==right->currentSeq);
+ const unsigned int cdist = std::abs((long int)(left->endCoord - right->begCoord));
+ out << left->c << " " << right->c << " " << sequenceNames[seq] << " " << cdist << " " << genomeNames[seq] << " "
+ << left->orient << " " << right->orient << " "
+ << left->begCoord << " " << left->endCoord << " "
+ << right->begCoord << " " << right->endCoord
+ << std::endl;
+ }
+ }
+ out.close();
+//Computes LCBs with the external synchain-mugsy chaining program
+//(1) writes the projection file <outfile>projections.out with writeProjectionFile(),
+//(2) pipes it through $MUGSY_INSTALL/synchain-mugsy,
+//(3) reads the resulting LCB list with readBlockFile() and removes the temporary files
+//blocks: oriented blocks to project onto their sequences
+//lcbs: out, one vertex list per LCB as filled in by readBlockFile()
+//block2fragMap, vertexOrientMap, g: handed to readBlockFile()
+//distance, minlen: passed verbatim on the command line (minlen is passed twice)
+//msaOpt: msaOpt.outfile is the prefix of the projection file name
+//sequenceNames, genomeNames: labels written to the projection file
+//Exits the process with status 1 if MUGSY_INSTALL is unset or the command fails
+template<typename TBlock,
+ typename TVertexDescriptor,
+ typename TMSAOptions,
+ typename TNames,
+ typename TGenomeNames,
+ typename TVertexOrientMap,
+ typename TGraph>
+void do_segmentation_MUGSY( std::vector<TBlock> & blocks,
+ std::vector<std::vector<unsigned int> > & lcbs,
+ std::map<unsigned int, std::set<TVertexDescriptor> > & block2fragMap,
+ std::string distance,
+ std::string minlen,
+ TMSAOptions const &msaOpt,
+ TNames & sequenceNames,
+ TGenomeNames & genomeNames,
+ TVertexOrientMap & vertexOrientMap,
+ TGraph & g){
+ String<char> pf = msaOpt.outfile;
+ char * pfilename = toCString(pf);
+ //(1)Write projection file
+ std::string projfilename(pfilename);
+ projfilename = projfilename + "projections.out";
+ writeProjectionFile(projfilename,blocks,sequenceNames,genomeNames);
+ //(2)Run synchain-mugsy using the projection
+ //std::getenv() returns NULL for an unset variable and constructing a std::string from NULL is
+ //undefined behaviour, so the pointer is checked before it is used
+ const char * mugsyinstallenv = std::getenv("MUGSY_INSTALL");
+ if(mugsyinstallenv==NULL || mugsyinstallenv[0]=='\0'){
+ std::cerr << "MUGSY_INSTALL environment variable is not set" << std::endl;
+ exit(1);
+ }
+ std::string mugsyinstall(mugsyinstallenv);
+ std::string cmd = "cat "+projfilename+" | "+mugsyinstall+"/synchain-mugsy ";
+ //Temporary file names are prefixed with the process id
+ std::string stdoutfilename(boost::lexical_cast<std::string>(getpid())+"lcbs.out");
+ std::string stderrfilename(boost::lexical_cast<std::string>(getpid())+"synchain-mugsy.out");
+ cmd = cmd + distance + " " + minlen + " "+ minlen
+ + " > "+stdoutfilename+" 2> "+stderrfilename;
+ assert(cmd.length()>0);
+ //#ifdef DEBUGGING
+ std::cerr << "Running " << cmd.c_str() << std::endl;
+ //#endif
+ #ifdef TIMING
+ time(&now);
+ std::cerr << "TIME PRE-SYNCHAIN:" << lasttime << " " << now << " " << now-lasttime << std::endl;
+ lasttime=now;
+ #endif
+ int res = system(cmd.c_str());
+ #ifdef TIMING
+ time(&now);
+ std::cerr << "TIME SYNCHAIN:" << lasttime << " " << now << " " << now-lasttime << std::endl;
+ lasttime=now;
+ #endif
+ if(res!=0){
+ perror("Could not run system command: ");
+ std::cerr << cmd.c_str() << std::endl
+ << "SYSTEM:" << res << std::endl;
+ exit(1);
+ }
+ //(3) Read output file to obtain list of LCBs
+ readBlockFile(stdoutfilename,
+ block2fragMap,
+ lcbs,
+ sequenceNames,
+ vertexOrientMap,
+ g,
+ false); //No bounds check
+ //The command succeeded (a failure exits above), so the temporary files can go
+ unlink(stdoutfilename.c_str());
+ unlink(stderrfilename.c_str());
+ unlink(projfilename.c_str());
+ #ifdef TIMING
+ time(&now);
+ std::cerr << "TIME POST-SYNCHAIN:" << lasttime << " " << now << " " << now-lasttime << std::endl;
+ lasttime=now;
+ #endif
+//Build ungapped profiles from the segment graph: one TBlock per (component, sequence) pair
+//Output is array of TBlocks needed for mugsy-chaining, stored in blocksbycomponent and indexed by componentVertexMap
+template<typename TGraph,
+ typename TComponentMap,
+ typename TComponent,
+ typename TBlock,
+ typename TName,
+ typename TLoc,
+ typename TNames
+ >
+void convertCC2Blocks(TGraph &g, // segment graph; its data_pvMap is traversed twice
+ TComponentMap& component, // vertex -> component id
+ std::map<std::pair<TComponent,TComponent>,TBlock *> & componentVertexMap, // out: (component, sequence id) -> block stored in blocksbycomponent
+ std::vector<std::vector<TBlock> > & blocksbycomponent, // in/out: indexed by component id, so it must already hold one entry per component (asserted)
+ std::map<TName,std::vector<TLoc> >&aintervals, // only referenced by the commented-out block below
+ TNames & sequenceNames){ // only referenced by the commented-out block below
+ typedef typename Id<TGraph>::Type TIdType;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ //typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef typename Size<TGraph>::Type TSize;
+ typedef std::pair<TIdType, TSize> TKey;
+ typedef std::map<TKey, TVertexDescriptor> TPosToVertexMap;
+ typedef FragmentInfo<TIdType, TSize> TFragmentInfo;
+ // data_pvMap is an STL Map to retrieve a vertex given SeqId, Position
+ // first.first == seqId
+ // first.second == pos
+ // second == VertexDescriptor
+ typename TPosToVertexMap::const_iterator it1 = g.data_pvMap.begin();
+ typename TPosToVertexMap::const_iterator it1End = g.data_pvMap.end();
+ typedef typename Position<TGraph>::Type TPos;
+ std::map<TComponent,int> seqsPerComponent;
+ TPos begCoord,endCoord;
+ char orient='?'; // blocks are created unoriented
+ int lostbp=0; // only updated inside the commented-out block below
+ int numlostv=0; // only updated inside the commented-out block below
+ //Track number of sequences per component (one count per non-nil data_pvMap entry of the component)
+ for(;it1!=it1End;++it1) {
+ TVertexDescriptor currV = it1->second;
+ if(currV != getNil<TVertexDescriptor>()){
+ assert(getProperty(component,currV)==component[currV]);
+ TComponent c = getProperty(component, currV);
+ assert(c < blocksbycomponent.size());
+ if(seqsPerComponent.find(c)!=seqsPerComponent.end()){
+ seqsPerComponent[c]++;
+ }
+ else{
+ seqsPerComponent.insert(std::make_pair(c,1));
+ }
+ }
+ }
+ for(it1 = g.data_pvMap.begin();it1!=it1End;++it1) { // second pass: create or extend the block of each (component, sequence)
+ TVertexDescriptor currV = it1->second;
+ if(currV != getNil<TVertexDescriptor>()){
+ assert(getProperty(component,currV)==component[currV]);
+ TComponent c = getProperty(component, currV);
+ assert(c < blocksbycomponent.size());
+ TSize currentSeq = sequenceId(g,currV);
+ if(seqsPerComponent[c] > 1){ // components with a single counted entry produce no block
+ std::cout << "Component " << c << " V:" << currV << " seq:" << currentSeq << " degree:" << degree(g,currV) << " coord:" << fragmentBegin(g,currV) << std::endl;
+ //First block for currentseq
+ typename std::map<std::pair<TComponent,TComponent>,TBlock *>::iterator fit = componentVertexMap.find(std::make_pair(c,currentSeq));
+ if(fit==componentVertexMap.end()){
+ begCoord = fragmentBegin(g,currV);
+ assert((int)begCoord>=0);
+ endCoord = begCoord+fragmentLength(g,currV);
+ orient='?';
+ typename std::vector<TBlock>::iterator bit = blocksbycomponent[c].insert(blocksbycomponent[c].end(),
+ TBlock(c,currentSeq,begCoord,endCoord,orient,currV));
+ componentVertexMap[std::make_pair(c,currentSeq)] = &(*bit); // NOTE(review): pointer into the vector; a later insert into the same vector may reallocate and invalidate pointers stored earlier - confirm the caller reserves capacity
+ //blocksbycomponent[c].push_back(TBlock(c,currentSeq,begCoord,endCoord,orient,currV));
+ //unsigned int idx = blocksbycomponent[c].size()-1;
+ //componentVertexMap[std::make_pair(c,currentSeq)] = &(blocksbycomponent[c][idx]);
+ std::cout << "Adding component " << c << " seq:" << currentSeq << " coords"
+ << begCoord << "-" << endCoord << " o:" << orient << " V:" << currV << std::endl;
+ }
+ else{
+ //Block already inserted
+ TBlock * blk = fit->second;
+ blk->addVertex(g,currV);
+ std::cout << "Adding vertex to component " << c << " seq:" << currentSeq
+ << " coords:" << begCoord << "-" << endCoord << " V:" << currV << std::endl; // NOTE(review): begCoord/endCoord still hold the values of the most recently created block, not of blk
+ }
+ }
+ else{
+ //Repetitive sequence
+ /* Remove to improve reporting of unique sequences
+ if(degree(g,currV)>0){
+ lostbp = lostbp + fragmentLength(g,currV);
+ numlostv++;
+ typename std::map<TName,std::vector<TLoc > >::iterator ait = aintervals.find(sequenceNames[currentSeq]);
+ if(ait==aintervals.end()){
+ aintervals.insert(std::make_pair(sequenceNames[currentSeq],std::vector<TLoc >()));
+ }
+ ait = aintervals.find(sequenceNames[currentSeq]);
+ assert(ait!=aintervals.end());
+ TLoc t1,t2;
+ t1.first = fragmentBegin(g,currV);
+ t1.second = 1;
+ t1.blocknum = 0;
+ ait->second.push_back(t1);
+ t2.first = t1.first+fragmentLength(g,currV);
+ t2.second = -1;
+ t2.blocknum = 0;
+ ait->second.push_back(t2);
+ }
+ */
+ }
+ }
+ }
+ std::cerr << "Disconnected " << numlostv << " vertices marking " // both counters are still 0 here because their updates are commented out
+ << lostbp << " aligned bp" << std::endl;
+//Assign orientation ('+' or '-') to every block of every component using a greedy approach
+//Start assignment of edges with best consistency score.
+//Break ties with positional score
+//?TODO?: Break inconsistent edges
+//Output:Blocks: std::vector<TBlock>, all blocks appended component by component
+// vertexOrientMap: map vertex->orient
+template<typename TGraph,
+ typename TBlock,
+ typename TVertexOrientMap,
+ typename TEdgeDescriptor>
+void assignBlockOrientation(TGraph &g, // alignment graph; the sign of cargo(edge) gives the relative orientation (cargo>0 same, otherwise opposite)
+ std::vector<std::vector<TBlock> > &blocksbycomponent, // in/out: blocks per component; their orient fields are set in place
+ std::vector<TBlock> &blocks, // out: copies of all blocks are appended
+ TVertexOrientMap &vertexOrientMap, // out: vertex -> orientation of the block containing it
+ std::map<TEdgeDescriptor,float> &posScores){ // positional edge scores handed to the edgescorecmp comparator
+ typedef unsigned int TSize;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typename std::vector<std::vector<TBlock> >::iterator bit2 = blocksbycomponent.begin();
+ //Track number of conflicting orientation assignments
+ int conflicts=0;
+ std::set<TEdgeDescriptor> conflictEdges;
+ bool removeConflictingEdges=false; // hard-coded: the removal loop at the end never runs
+ for(; bit2!= blocksbycomponent.end();bit2++){//all cc
+ if(bit2->size()>0){
+ std::vector<TEdgeDescriptor> ccedges;
+ //std::vector<TBlock> * currblocks = bit2;
+ int unorientedSegments=bit2->size();
+ std::cout << "Examining block with " << unorientedSegments << " sequences" << std::endl;
+ std::map<TSize,TBlock *> seqBlockMap; // sequence id -> block of this component on that sequence
+ //capture all edges in component
+ //For all segments on seqs i+1->numseqs{
+ for(unsigned int i=0;i<bit2->size();i++){
+ for(unsigned int j=i+1;j<bit2->size();j++){
+ //all members of block
+ for(typename std::vector<TVertexDescriptor>::iterator vit=bit2->at(i).currV.begin();vit!=bit2->at(i).currV.end();vit++){
+ TVertexDescriptor currV0 = *vit;
+ //assert(degree(g,currV0)>0);
+ for(typename std::vector<TVertexDescriptor>::iterator vit1=bit2->at(j).currV.begin();vit1!=bit2->at(j).currV.end();vit1++){
+ TVertexDescriptor currV = *vit1;
+ //assert(degree(g,currV)>0);
+ TEdgeDescriptor ed = findEdge(g,currV0,currV);
+ if(ed!=0){ // 0 means there is no edge between the two vertices
+ ccedges.push_back(ed);
+ }
+ else{
+ assert(findEdge(g,currV,currV0)==0);
+ }
+ }
+ }
+ }
+ seqBlockMap.insert(std::make_pair(bit2->at(i).currentSeq,&bit2->at(i)));
+ std::cout << "Block " << i << " seq:" << bit2->at(i).currentSeq << " vertices:" << bit2->at(i).currV.size() << std::endl;
+ for(typename std::vector<TVertexDescriptor>::iterator vit=bit2->at(i).currV.begin();vit!=bit2->at(i).currV.end();vit++){
+ TVertexDescriptor currV0 = *vit;
+ std::cout << "V:"<<currV0 << std::endl;
+ }
+ }
+ assert(seqBlockMap.size()==bit2->size()); // one block per sequence within a component
+ std::cout << "Number of edges " << ccedges.size() << std::endl;
+ //Sort edges in order of
+ //(1) consistency
+ //(2) posscore
+ //This way the most consistent and syntenic edges should determine the
+ //relative orientation of segments in the block
+ sort(ccedges.begin(),ccedges.end(),edgescorecmp<std::map<TEdgeDescriptor,float> >(&posScores));
+ //traverse edges in decreasing order ranked by consistency, posScores
+ typename std::vector<TEdgeDescriptor>::reverse_iterator eit=ccedges.rbegin();
+ TEdgeDescriptor ed = *eit; // NOTE(review): dereferenced without checking that ccedges is non-empty
+ TVertexDescriptor v1 = getSource(ed);
+ TVertexDescriptor v2 = getTarget(ed);
+ assert(seqBlockMap.find(sequenceId(g,v1))!=seqBlockMap.end());
+ assert(seqBlockMap.find(sequenceId(g,v2))!=seqBlockMap.end());
+ TBlock * blockv1 = seqBlockMap[sequenceId(g,v1)];
+ TBlock * blockv2 = seqBlockMap[sequenceId(g,v2)];
+ assert(blockv2->orient == '?');
+ assert(blockv1->orient == '?');
+ blockv1->orient = '+'; // the source block of the best edge fixes the reference strand
+ assert(cargo(ed)!=0);
+ if(cargo(ed)>0){
+ std::cout << " SAME ORIENT " << blockv1->orient << std::endl;
+ blockv2->orient = blockv1->orient;
+ }
+ else{
+ std::cout << " OPPOSITE ORIENT OF " << blockv1->orient << std::endl;
+ blockv2->orient = (blockv1->orient=='+'?'-':'+');
+ }
+ std::cout << "Examining edge " << " " << v1 << "-" << v2
+ << blockv1->orient << " " << blockv2->orient
+ << std::endl;
+ //check all currV in blockv1,blockv2 for consistency with this assignment
+ //First two blocks are oriented relative to each other
+ unorientedSegments-=2;
+ eit++;
+ //Propogate relative orientation through the component graph
+ while(unorientedSegments>0){ // NOTE(review): does not terminate if some block is never reached through an edge from an oriented block
+ std::cout << "Num blocks unoriented " << unorientedSegments << std::endl;
+ for(;eit!=ccedges.rend();eit++){
+ ed = *eit;
+ v1 = getSource(ed);
+ v2 = getTarget(ed);
+ assert(v1!=v2);
+ assert(seqBlockMap.find(sequenceId(g,v1))!=seqBlockMap.end());
+ assert(seqBlockMap.find(sequenceId(g,v2))!=seqBlockMap.end());
+ blockv1 = seqBlockMap[sequenceId(g,v1)];
+ blockv2 = seqBlockMap[sequenceId(g,v2)];
+ std::cout << "Examining edge:" << " " << v1 << "-" << v2
+ << blockv1->orient << " " << blockv2->orient
+ << std::endl;
+ if(blockv1->orient == '?'){
+ if(blockv2->orient != '?'){
+ //assignment
+ unorientedSegments--;
+ if(cargo(ed)>0){
+ std::cout << " SAME ORIENT " << blockv2->orient << std::endl;
+ blockv1->orient = blockv2->orient;
+ }
+ else{
+ std::cout << " OPPOSITE ORIENT OF " << blockv2->orient << std::endl;
+ blockv1->orient = (blockv2->orient=='+'?'-':'+');
+ }
+ }
+ else{
+ //no assignment
+ }
+ }
+ else{
+ if(blockv2->orient == '?'){
+ if(blockv1->orient != '?'){ // always true in this branch
+ //assignment
+ unorientedSegments--;
+ if(cargo(ed)>0){
+ std::cout << " SAME ORIENT " << blockv1->orient << std::endl;
+ blockv2->orient = blockv1->orient;
+ }
+ else{
+ std::cout << " OPPOSITE ORIENT OF " << blockv1->orient << std::endl;
+ blockv2->orient = (blockv1->orient=='+'?'-':'+');
+ }
+ }
+ else{
+ //no assignment
+ }
+ }
+ else{
+ //already assigned
+ //check
+ assert(cargo(ed)!=0);
+ if(cargo(ed)>0){
+ if(blockv1->orient!=blockv2->orient){
+ std::cout << "Conflicting orientation. Edge:" << cargo(ed)
+ << " for vertices V1:" << v1 << "," << blockv1->orient
+ << " V2:" << v2 << "," << blockv2->orient << std::endl;
+ //TODO break edge?
+ conflicts++; // an edge is counted again on every pass that revisits it; conflictEdges holds it once
+ conflictEdges.insert(ed);
+ }
+ }
+ else{
+ if(blockv1->orient==blockv2->orient){
+ std::cout << "Conflicting orientation. Edge:" << cargo(ed)
+ << " for vertices V1:" << v1 << "," << blockv1->orient
+ << " V2:" << v2 << "," << blockv2->orient << std::endl;
+ //TODO break edge?
+ conflicts++;
+ conflictEdges.insert(ed);
+ }
+ }
+ }
+ }
+ }
+ //Start search again at beginning of list of edges
+ eit=ccedges.rbegin();
+ }
+ } //if
+ //copy final output
+ blocks.insert(blocks.end(),bit2->begin(),bit2->end());
+ for(unsigned int i=0;i<bit2->size();i++){
+ //all members of block
+ for(typename std::vector<TVertexDescriptor>::iterator vit=bit2->at(i).currV.begin();vit!=bit2->at(i).currV.end();vit++){
+ vertexOrientMap[*vit] = bit2->at(i).orient;
+ }
+ }
+ } //for all CC
+ //Resolve conflicts if necessary
+ if(removeConflictingEdges){
+ for(typename std::set<TEdgeDescriptor>::iterator eit=conflictEdges.begin();eit!=conflictEdges.end();++eit){
+ removeEdge(g,*eit);
+ }
+ }
+ std::cerr << "Num conflicts: " << conflicts << " when assigning orientation" << std::endl;
+//Fragments can be oriented
+//Builds the fragment that aligns vertex vd1 (sequence id1) with vertex vd2 (sequence id2)
+//frag: out, the constructed fragment
+//g: graph queried with fragmentBegin()/fragmentLength()
+//vd1len, vd2len: lengths used to mirror the coordinates of a '-' oriented segment
+//offset1, offset2: subtracted from the respective begin coordinate
+//orient1, orient2: '+' or '-'; a '-' segment starts at len-(begin+length) of its own vertex
+//The last constructor argument is true when the two orientations differ
+//The fragment length is always taken from vd1
+template<typename TFragment,
+ typename TGraph,
+ typename TVertexDescriptor,
+ typename TSize>
+void buildFrag(TFragment & frag,
+ TGraph & g,
+ TVertexDescriptor vd1,
+ TVertexDescriptor vd2,
+ TSize id1,
+ int vd1len,
+ unsigned offset1,
+ char orient1,
+ TSize id2,
+ int vd2len,
+ unsigned offset2,
+ char orient2){
+ if(orient1 == '-'){
+ if(orient2 == '+'){
+ //id1:- id2:+
+ frag = TFragment(id1,
+ vd1len-(fragmentBegin(g,vd1)+fragmentLength(g,vd1))-offset1,
+ id2,
+ fragmentBegin(g,vd2)-offset2,
+ fragmentLength(g,vd1),
+ true);
+ }
+ else{
+ //id1:- id2:-
+ assert(orient1 == '-');
+ assert(orient2 == '-');
+ frag = TFragment(id1,
+ vd1len-(fragmentBegin(g,vd1)+fragmentLength(g,vd1))-offset1,
+ id2,
+ vd2len-(fragmentBegin(g,vd2)+fragmentLength(g,vd2))-offset2,
+ fragmentLength(g,vd1),
+ false);
+ }
+ }
+ else{
+ if(orient2 == '-'){
+ //id1:+ id2:-
+ assert(orient1 == '+');
+ assert(orient2 == '-');
+ //The mirrored start of vd2 uses vd2's own length, as in the -/- case above
+ //(this previously used fragmentLength(g,vd1))
+ frag = TFragment(id1,
+ fragmentBegin(g,vd1)-offset1,
+ id2,
+ vd2len-(fragmentBegin(g,vd2)+fragmentLength(g,vd2))-offset2,
+ fragmentLength(g,vd1),
+ true);
+ }
+ else{
+ //id1:+ id2:+
+ assert(orient1 == '+');
+ assert(orient2 == '+');
+ frag = TFragment(id1,
+ fragmentBegin(g,vd1)-offset1,
+ id2,
+ fragmentBegin(g,vd2)-offset2,
+ fragmentLength(g,vd1),
+ false);
+ }
+ }
+ //Sanity checks: both segments must lie inside their sequences
+ assert(frag.begin1 >=0);
+ assert(frag.begin2 >=0);
+ assert(frag.begin1+frag.len <=vd1len);
+ assert(frag.begin2+frag.len <=vd2len);
+//Turn every edge of g into one pairwise match plus one score.
+//Matches and fragment graph both keep their coordinates on the leading strand
+template<typename TGraph,
+ typename TString,
+ typename TSpec,
+ typename TFragmentString,
+ typename TScoreValues>
+void buildMatchesFromGraph(TGraph &g,
+ StringSet<TString, TSpec> &seqSet,
+ TFragmentString &currmatches,
+ TScoreValues &currscores){
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef typename Size<StringSet<TString, TSpec> >::Type TSize;
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ typedef Fragment<> TFragment;
+ TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
+ TEdgeIterator edgeIt(g);
+ while(!atEnd(edgeIt)){
+ //End points of the current edge and the sequences they lie on
+ TVertexDescriptor srcV = getSource(*edgeIt);
+ TVertexDescriptor dstV = getTarget(*edgeIt);
+ TSize srcSeq = sequenceId(g,srcV);
+ TSize dstSeq = sequenceId(g,dstV);
+ assert(srcV!=nilVertex);
+ assert(dstV!=nilVertex);
+ assert(srcV!=dstV);
+ //findEdge() implemented in graph_impl_undirected.h
+ //traced from data_align object in graph_impl_align.h
+ TEdgeDescriptor ed = findEdge(g,srcV,dstV);
+ assert(ed);
+ //An edge always joins two different sequences
+ assert(srcSeq!=dstSeq);
+ int srcLen = length(getValueById(stringSet(g), srcSeq));
+ int dstLen = length(getValueById(stringSet(g), dstSeq));
+ //A negative cargo selects '-' for the second sequence, the first is always '+'
+ char dstOrient = (int)(cargo(ed)<0) ? '-' : '+';
+ TFragment currfrag;
+ buildFrag(currfrag,g,srcV,dstV,
+ srcSeq,srcLen,0,'+',
+ dstSeq,dstLen,0,dstOrient);
+ assert(currfrag.begin1 >=0);
+ assert(currfrag.begin2 >=0);
+ assert(currfrag.begin1+currfrag.len <=length(getValueById(stringSet(g), srcSeq)));
+ assert(currfrag.begin2+currfrag.len <=length(getValueById(stringSet(g), dstSeq)));
+ assert(currfrag.len==fragmentLength(g,srcV));
+ assert(currfrag.len==fragmentLength(g,dstV));
+ //buildFrag mirrored begin2 for reversed fragments, mirror it back
+ if(currfrag.reversed){
+ currfrag.begin2 = length(seqSet[currfrag.seqId2]) - (currfrag.begin2+currfrag.len);
+ }
+ appendValue(currmatches, currfrag);
+ appendValue(currscores, fragmentLength(g,srcV));
+ goNext(edgeIt);
+ }
+//Populate a TFragmentString(currmatches)
+//based on edges in TGraph(g) that connect vertices in TLCB(lit)
+//Only vertices with sequenceId in seqIdMap are considered
+//TODO: Current impl iterates over all pairs of vertices. Performance
+//can be improved using a BFS
+//
+//Parameters
+// g graph, queried with findEdge(), sequenceId(), fragmentBegin(),
+// fragmentLength() and cargo()
+// seqSet used with idToPosition() to turn a sequence id into a position
+// seqLenMap indexed by sequence id, gives the length used to mirror
+// coordinates for orientation '-'
+// lit container of vertices, every ordered pair (vit,vit2) with
+// vit!=vit2 is tested
+// offsets indexed by sequence position, .orient and .offset are read
+// seqIdMap sequence position -> sequence index stored in the fragment
+// currmatches (out) one TFragment is appended per accepted pair
+// currscores (out) fragmentLength(g,vd1) is appended per accepted pair
+//
+//Every fragment is created with reversed=false, whatever the orientations are.
+//NOTE(review): both loops run over the whole of lit, so an edge vd1--vd2 is
+//presumably visited twice, as (vd1,vd2) and as (vd2,vd1) - confirm this is intended
+template<typename TGraph,
+ typename TString,
+ typename TSpec,
+ typename TSeqLenMap,
+ typename TLCB,
+ typename TSize,
+ typename TOffsets,
+ typename TFragmentString,
+ typename TScoreValues>
+void buildMatchesFromGraph(TGraph &g,
+ StringSet<TString, TSpec> &seqSet,
+ TSeqLenMap &seqLenMap,
+ TLCB &lit,
+ std::map<TSize,TOffsets> &offsets,
+ std::map<TSize,TSize> &seqIdMap,
+ TFragmentString &currmatches,
+ TScoreValues &currscores){
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typename TLCB::const_iterator vit;
+ typename TLCB::const_iterator vit2;
+ typename TLCB::const_iterator vit_end;
+ typename TLCB::const_iterator vit2_end;
+ typedef Fragment<> TFragment;
+ typedef typename Id<TGraph>::Type TId;
+#ifdef NDEBUG
+ ;
+ //nilVertex is only referenced by the assert() calls below
+ TVertexDescriptor nilVertex = getNil<TVertexDescriptor>();
+ //if edge(vit,vit2) present in input graph
+ //then mark as present in output graph
+ //Add vertex from graph g to LCB graph
+ vit_end = lit.end();
+ vit2_end = lit.end();
+ std::cout << "Building matches from graph for LCB" << std::endl;
+ TVertexDescriptor vd1,vd2;
+ TSize vd1seq,vd2seq;
+ TEdgeDescriptor ed;
+ for(vit = lit.begin();vit!=vit_end;vit++){
+ vd1 = *vit;
+ vd1seq = sequenceId(g,vd1);
+ for(vit2 = lit.begin();vit2!=vit2_end;vit2++){
+ //TODO see if shortcircuit on id1!=id2 improves performance here
+ if(vit != vit2){
+ assert(vd1!=nilVertex);
+ //assert(degree(g,vd1)>0);
+ vd2 = *vit2;
+ vd2seq = sequenceId(g,vd2);
+ assert(vd2!=nilVertex);
+ //assert(degree(g,vd2)>0);
+ assert(vd1!=vd2);
+ //findEdge() implemented in graph_impl_undirected.h
+ //traced from data_align object in graph_impl_align.h
+ ed = findEdge(g,vd1,vd2);
+ if(!ed){
+ }
+ //There is an alignment between vd1 and vd2
+ if(ed){
+ assert(vd1seq!=vd2seq);
+ //id1,id2 are positions in seqSet, vd1seq,vd2seq are sequence ids
+ TId id1 = idToPosition(seqSet, vd1seq);
+ TId id2 = idToPosition(seqSet, vd2seq);
+ assert(id1!=id2);
+ if(seqIdMap.find(id1)!=seqIdMap.end() &&
+ seqIdMap.find(id2)!=seqIdMap.end()){ //Check edge and if sequence was not trimmed out of LCB due to length
+ assert(seqIdMap.find(id1)!=seqIdMap.end());
+ assert(seqIdMap.find(id2)!=seqIdMap.end());
+ int vd1len = seqLenMap[vd1seq];
+ int vd2len = seqLenMap[vd2seq];
+ //assert(vd1len == length(getValueById(stringSet(g), vd1seq)));
+ //assert(vd2len == length(getValueById(stringSet(g), vd2seq)));
+ //length(getValueById(stringSet(g), idToPosition(seqSet,vd1seq))) //length(getValueById(stringSet(g),id1));
+ //length(getValueById(stringSet(g), idToPosition(seqSet,vd2seq))) //length(getValueById(stringSet(g),id2));
+ //NOTE(review): seqIdMap was checked with id1/id2 above but is indexed
+ //with vd1seq/vd2seq here; std::map::operator[] inserts a default entry
+ //when the key is absent, which would happen if id and position differ
+ std::cout << " seqs:"
+ << seqIdMap[vd1seq] << ":" << offsets[id1].orient
+ << " "
+ << seqIdMap[vd2seq] << ":" << offsets[id2].orient
+ << " lengths:" << vd1len << " " << vd2len
+ << " coords:"
+ << fragmentBegin(g,vd1) << "-" << fragmentBegin(g,vd1) + fragmentLength(g,vd1)
+ << ","
+ << fragmentBegin(g,vd2) << "-" << fragmentBegin(g,vd2) + fragmentLength(g,vd1)
+ << " offset1 " << offsets[id1].offset
+ << " offset2 " << offsets[id2].offset
+ << " edge weight:" << cargo(ed)
+ << std::endl;
+ //Coordinates of a '-' sequence are mirrored with its length before
+ //the LCB offset is subtracted
+ if(offsets[id1].orient == '-'){
+ if(offsets[id2].orient=='+'){
+ //id1:- id2:+
+ assert(offsets[id1].orient == '-');
+ assert(offsets[id2].orient == '+');
+ appendValue(currmatches, TFragment(seqIdMap[id1],
+ vd1len-(fragmentBegin(g,vd1)+fragmentLength(g,vd1))-offsets[id1].offset,
+ seqIdMap[id2],
+ fragmentBegin(g,vd2)-offsets[id2].offset,
+ fragmentLength(g,vd1),
+ false));
+ }
+ else{
+ //id1:- id2:-
+ assert(offsets[id1].orient == '-');
+ assert(offsets[id2].orient == '-');
+ appendValue(currmatches, TFragment(seqIdMap[id1],
+ vd1len-(fragmentBegin(g,vd1)+fragmentLength(g,vd1))-offsets[id1].offset,
+ seqIdMap[id2],
+ vd2len-(fragmentBegin(g,vd2)+fragmentLength(g,vd2))-offsets[id2].offset,
+ fragmentLength(g,vd1),
+ false));
+ }
+ appendValue(currscores, fragmentLength(g,vd1));
+ }
+ else{
+ if(offsets[id2].orient == '-'){
+ //id1:+ id2:-
+ assert(offsets[id1].orient == '+');
+ assert(offsets[id2].orient == '-');
+ //Uses the fragment length of vd1 to mirror vd2
+ appendValue(currmatches, TFragment(seqIdMap[id1],
+ fragmentBegin(g,vd1)-offsets[id1].offset,
+ seqIdMap[id2],
+ vd2len-(fragmentBegin(g,vd2)+fragmentLength(g,vd1))-offsets[id2].offset,
+ fragmentLength(g,vd1),
+ false));
+ }
+ else{
+ //id1:+ id2:+
+ assert(offsets[id1].orient == '+');
+ assert(offsets[id2].orient == '+');
+ appendValue(currmatches, TFragment(seqIdMap[id1],
+ fragmentBegin(g,vd1)-offsets[id1].offset,
+ seqIdMap[id2],
+ fragmentBegin(g,vd2)-offsets[id2].offset,
+ fragmentLength(g,vd1),
+ false));
+ }
+ appendValue(currscores, fragmentLength(g,vd1));
+ }
+ }
+ else{
+ //Ignore matches, one of the sequences was trimmed from the LCB, probably due to length
+ if(seqIdMap.find(id1)==seqIdMap.end()){
+ std::cout << "Ignoring match. Trimmed seq V" << vd1 << "-V" << vd2 << " S1" <<id1 << std::endl;
+ }
+ if(seqIdMap.find(id2)==seqIdMap.end()){
+ std::cout << "Ignoring match. Trimmed seq V" << vd1 << "-V" << vd2 << " S2" <<id2 << std::endl;
+ }
+ }
+ }
+ else{
+ //Ignore matches
+ //Segments vd1,vd2 are connected in the component
+ //but not directly via an alignment edge
+ //Nothing is emitted in this branch, the body below is commented out
+ TId id1 = idToPosition(seqSet, vd1seq);
+ TId id2 = idToPosition(seqSet, vd2seq);
+ if(id1 != id2
+ && seqIdMap.find(id1)!=seqIdMap.end()
+ && seqIdMap.find(id2)!=seqIdMap.end()){
+ assert(seqIdMap.find(id1)!=seqIdMap.end());
+ assert(seqIdMap.find(id2)!=seqIdMap.end());
+ /*
+ std::cout << "Ignoring match. Indirect connections V" << vd1 << "-V" << vd2 << " S1" <<id1 << "-" << " S2" << id2 << std::endl;
+ std::cout << "Ignored seqs:"
+ << seqIdMap[vd1seq] << ":" << offsets[id1].orient
+ << " "
+ << seqIdMap[vd2seq] << ":" << offsets[id2].orient
+ << " coords:"
+ << fragmentBegin(g,vd1) << "-" << fragmentBegin(g,vd1) + fragmentLength(g,vd1)
+ << ","
+ << fragmentBegin(g,vd2) << "-" << fragmentBegin(g,vd2) + fragmentLength(g,vd1)
+ << " offset1 " << offsets[id1].offset
+ << " offset2 " << offsets[id2].offset
+ << std::endl;
+ */
+ }
+ }
+ }
+ }
+ }
+//Set currseqs,offsets
+//Copies [min,max) of sequence i into str and stores orientation, offset,
+//span length and sequence length in offsets[i]. With orientation '-' the
+//substring is reverse complemented and the offset is expressed relative to
+//the reverse strand. i is translated through seqIdMap when it has an entry.
+template<typename TSeqID,
+ typename TString,
+ typename TSpec,
+ typename TMap,
+ typename TOffsets>
+void setLCBProps(TSeqID i,
+ char currorient,
+ unsigned int min,
+ unsigned int max,
+ TString & str,
+ StringSet<TString, TSpec> &seqSet,
+ TMap & seqIdMap,
+ TOffsets &offsets){
+ //Index into seqSet, mapped if a mapping for i is present
+ const bool hasMapping = seqIdMap.find(i)!=seqIdMap.end();
+ TSeqID seqidx = i;
+ if(hasMapping){
+ seqidx = seqIdMap[i];
+ }
+ const bool forward = (currorient == '+');
+ if(!forward){
+ assert(currorient == '-');
+ //Report and validate the range before it is reversed
+ std::cout << "REVERSING SEQUENCE " << i << " of length " << length(seqSet[seqidx])
+ << " " << min << " - " << max << " " << std::endl;
+ assert(min<max);
+ assert(int(min)>=0);
+ assert(max<=length(seqSet[seqidx]));
+ }
+ //Substring of the LCB on the leading strand
+ str = infix(seqSet[seqidx],min,max);
+ if(!forward){
+ //Reverse complement
+ convertInPlace(str, FunctorComplement<Dna5>());
+ reverseInPlace(str);
+ //Now relative to - strand
+ //Offsets assumed relative to matching strand
+ //Also MAF stores coordinates relative to matching strand
+ int leadingmin = min;
+ min = length(seqSet[seqidx]) - max;
+ max = length(seqSet[seqidx]) - leadingmin;
+ }
+ //Record the properties of sequence i, min/max are strand relative here
+ offsets[i].orient = forward ? '+' : '-';
+ offsets[i].offset = min;
+ offsets[i].spanlen = max-min;
+ offsets[i].seqlen = length(seqSet[seqidx]);
+//Determines orientation for sequence i in LCB lit
+//Uses the vertexOrientMap that was previously built using relative orientation
+//of most consistent matches in the original alignment graph
+//
+//Parameters
+// g graph, queried with sequenceId(), fragmentBegin(), fragmentLength()
+// seqSet used with idToPosition() to compare a vertex' sequence with i
+// lit accessed with ->begin()/->end(), yields the vertices of the LCB
+// i sequence position to look for
+// currorient (in/out) '?' on entry means not yet known, it is then set from the
+// first vertex found on sequence i
+// min,max (in/out) extended to span all fragments found on sequence i,
+// the caller supplies the initial values
+// alnlen (in/out) incremented by the length of every fragment found
+// vertexOrientMap orientation per vertex, expected to hold '+' or '-'
+//Returns true if at least one vertex of lit lies on sequence i
+template<typename TGraph,
+ typename TString,
+ typename TSpec,
+ typename TLCB,
+ typename SeqID,
+ typename TVertexOrientMap>
+bool getLCBProps(TGraph &g,
+ StringSet<TString, TSpec> &seqSet,
+ TLCB &lit,
+ SeqID i,
+ char & currorient,
+ unsigned int &min,
+ unsigned int &max,
+ unsigned int &alnlen,
+ TVertexOrientMap &vertexOrientMap){
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ //NOTE(review): the iterator type is fixed to std::vector<unsigned int>, so
+ //*lit is presumably such a vector - confirm against the callers
+ std::vector<unsigned int>::const_iterator vit;
+ std::vector<unsigned int>::const_iterator vit_end;
+ bool seqPresent=false;
+ //Hard coded switch, the majority rule block below is never entered while false
+ bool resetOrientMajorityRule=false;
+ if(resetOrientMajorityRule){
+ //Resolve conflicts in orientation, use a majority rule to assign block orientation
+ //TODO, determine what best to do with misoriented, conflicting blocks
+ int plusorient=0;
+ int minusorient=0;
+ vit_end = lit->end();
+ for(vit = lit->begin();vit!=vit_end;++vit){
+ TVertexDescriptor vd1 = *vit;
+ if(idToPosition(seqSet, sequenceId(g,vd1))==i){
+ seqPresent=true;
+ if(vertexOrientMap[vd1] == '+'){
+ plusorient++;
+ }
+ else{
+ assert(vertexOrientMap[vd1] == '-');
+ minusorient++;
+ }
+ }
+ }
+ //Ties are resolved in favour of '+'
+ if(plusorient>=minusorient){
+ currorient = '+';
+ }
+ else{
+ currorient = '-';
+ }
+ }
+ else{
+ //Orient already set in vertexOrientMap
+ }
+ //Currently, this method will use the first encountered
+ //orientation for sequence $i in LCB $lit
+ vit_end = lit->end();
+ for(vit = lit->begin();vit!=vit_end;++vit){
+ TVertexDescriptor vd1 = *vit;
+ if(idToPosition(seqSet, sequenceId(g,vd1))==i){
+ seqPresent=true;
+ //Determine orientation for the block
+ assert(vertexOrientMap.find(vd1) != vertexOrientMap.end());
+ assert(vertexOrientMap[vd1] != '?');
+ if(currorient != '?'){
+ //All vertices in a block should have the same orientation
+ if(vertexOrientMap[vd1] != currorient){
+ //There is a conflict
+ //It is only reported, vertexOrientMap is left untouched unless the
+ //majority rule switch is on
+ std::cout << "Conflicting orientation on seq:" << i << " currorient:" << currorient << " V:" << vd1 << " expecting:" << vertexOrientMap[vd1] << std::endl;
+ if(resetOrientMajorityRule){
+ vertexOrientMap[vd1] = currorient;
+ }
+ else{
+ //if(msaOpt.refine == "colinear"){
+ //assert(false);
+ //}
+ }
+ }
+ }
+ else{
+ if(resetOrientMajorityRule){
+ assert(false);//should be using majority rule code now
+ }
+ else{
+ //First vertex on sequence i decides the orientation
+ currorient = vertexOrientMap[vd1];
+ }
+ }
+ assert(currorient != '?');
+ std::cout << "Determining orient for LCB " << " seq:" << i << " V:" << vd1 << " orient:" << currorient
+ << " block:" << std::endl;
+ std::cout << "Coords:" << fragmentBegin(g,vd1) << "-" << fragmentBegin(g,vd1)+fragmentLength(g,vd1)
+ << " min:" << min << " max:" << max << std::endl;
+ //Min max are always on the leading strand here
+ assert((int)fragmentBegin(g,vd1)>=0);
+ //Accumulate aligned length and widen the span [min,max)
+ alnlen = alnlen + fragmentLength(g,vd1);
+ min = (fragmentBegin(g,vd1)<min) ? fragmentBegin(g,vd1) : min;
+ max = (fragmentBegin(g,vd1)+fragmentLength(g,vd1)>max) ? fragmentBegin(g,vd1)+ fragmentLength(g,vd1) : max;
+ }
+ }
+ return seqPresent;
+//Populate TFragmentString(currmatches) for TLCB(currlcb) using TGraph(g)
+//TODO:retrieve LCBSegments from initial match set (TFragmentString matches)
+//rather than the alignment graph
+//
+//For every sequence of seqSetv that is present in the LCB with an aligned
+//length and a span of at least MIN_FRAGMENT_SIZE, the spanned substring is
+//appended to currseqs, its name to currnameSet and its orientation/offset are
+//stored in offsets. Matches between the retained sequences are then built with
+//buildMatchesFromGraph().
+//
+//Parameters not described inline
+// g alignment graph
+// vertexOrientMap orientation per vertex, passed to getLCBProps()
+// currlcb accessed with ->begin()/->end(), the vertices of the LCB
+// lcbid only used in diagnostic output
+// sequenceNames sequenceNames[i] is the name appended for sequence i
+// currseqs (out) substrings of the retained sequences
+// currmatches (out) filled by buildMatchesFromGraph()
+// currscores (out) filled by buildMatchesFromGraph()
+// currnameSet (out) names parallel to currseqs
+// offsets (out) filled by setLCBProps()
+// coveredSet (out) receives vertices of vseqs[i] that start inside the span
+// vseqs per sequence list of vertices; the scan stops at the first
+// vertex starting at or after max, so the lists are presumably
+// sorted by fragmentBegin - confirm against the caller
+template<typename TGraph,
+ typename TString,
+ typename TSpec,
+ typename TString2,
+ typename TSpec2,
+ typename TMap,
+ typename TVertexOrientMap,
+ typename TLCB,
+ typename TNames,
+ typename TSequence,
+ typename TFragmentString,
+ typename TScoreValues,
+ typename TSize,
+ typename TOffsets,
+ typename TCoveredSet,
+ typename TSortedV>
+void retrieveLCBSegments(TGraph & g,
+ StringSet<TString, TSpec> &seqSetv, //can include placeholder seqs, ie. no seq strings
+ StringSet<TString2, TSpec2> &seqSetReal, //can include virtual seqs, must have strings
+ TMap & seqIdxMap, //mapping between seqSetv->seqSetReal
+ TVertexOrientMap &vertexOrientMap,
+ TLCB &currlcb,
+ unsigned int lcbid,
+ TNames & sequenceNames,
+ StringSet<TSequence, Owner<> > & currseqs,
+ TFragmentString &currmatches,
+ TScoreValues &currscores,
+ TNames & currnameSet,
+ std::map<TSize,TOffsets> &offsets,
+ TCoveredSet & coveredSet,
+ TSortedV & vseqs,
+ unsigned int MIN_FRAGMENT_SIZE){
+ assert(MIN_FRAGMENT_SIZE>=1);
+ //Empty statement that references lcbid
+ if(lcbid>0){};
+ //Vars
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ //Need to trim seqSet to members and subsequences present in the current LCB
+ //Map to track old seqid to newseqid
+ std::map<TSize,TSize> seqIdMap;
+ std::map<TSize,TSize> seqLenMap;
+ //Working copy of the LCB vertices, extended below with spanned vertices
+ std::set<TVertexDescriptor> currlcbset;
+ typename std::set<TVertexDescriptor>::iterator pos;
+ typedef typename Id<TGraph>::Type TId;
+ typename std::vector<TVertexDescriptor>::const_iterator vit;
+ typename std::vector<TVertexDescriptor>::const_iterator vit_end;
+ vit_end = currlcb->end();
+ for(vit = currlcb->begin();vit!=vit_end;vit++){
+ currlcbset.insert(*vit);
+ }
+ //Determine sequences present in LCB
+ //and calculate spanning coords min,max
+ for(TSize i = 0; i<length(seqSetv); ++i) {
+ //Change to intmax
+ //Start values for the accumulators updated by getLCBProps()
+ unsigned int min=std::numeric_limits<unsigned int>::max();
+ unsigned int max=0;
+ unsigned int alnlen=0;
+ char currorient = '?';
+ //
+ //Filter LCBs so that we only include sequences that span >
+ if(getLCBProps(g,seqSetv,currlcb,
+ i,currorient,min,max,alnlen,
+ vertexOrientMap)){
+ if(alnlen>=MIN_FRAGMENT_SIZE &&
+ max-min>=MIN_FRAGMENT_SIZE){
+ assert(currorient!='?');
+ assert(max>0);
+ assert((int)min>=0);
+ assert(min<max);
+ std::cout << "LCB:" << lcbid << " seq:" << i << " "
+ << min << "-" << max << " spanlen:" << max-min
+ << " alnlen:" << alnlen
+ << " orient: " << currorient << std::endl;
+#ifdef NDEBUG
+ ;
+ //nSeq is only read by the assert() further down
+ unsigned int nSeq = length(currseqs);
+ //Subsequence of lcb on seq $i
+ TString lcbseqstr;
+ setLCBProps(i,currorient,min,max,lcbseqstr,
+ seqSetReal,
+ seqIdxMap,
+ offsets);
+ //Save association between current seq $i
+ //and position in $currseqs
+ assert(length(lcbseqstr)==max-min);
+ appendValue(currseqs,lcbseqstr);
+ appendValue(currnameSet,sequenceNames[i]);
+ assert(length(currnameSet)==length(currseqs));
+ seqIdMap.insert(std::make_pair(i,length(currseqs)-1));
+ //Remember the full length of the (possibly mapped) sequence
+ if(seqIdxMap.find(i)!=seqIdxMap.end()){
+ seqLenMap.insert(std::make_pair(i,length(getValueById(stringSet(g),idToPosition(seqSetReal, seqIdxMap[i])))));
+ }
+ else{
+ seqLenMap.insert(std::make_pair(i,length(getValueById(stringSet(g),idToPosition(seqSetReal, i)))));
+ }
+ //Make sure that we have added one seq
+ assert(length(currseqs)==nSeq+1);
+ //Using sort vertices on seqs, capture any missing vertices that are spanned by the LCB
+ //TODO, use findVertex and index to avoid looking through all vertices
+ //
+ //TVertexDescriptor act_knot = findVertex(ali_g,seq_id,begin_pos);
+ if(i<vseqs.size()){
+ for(vit = vseqs[i].begin();vit!=vseqs[i].end();++vit){
+ //Check that vertex is not already aligned
+ assert(sequenceId(g,*vit)==i);
+ //Only the begin coordinate is tested against [min,max)
+ if(fragmentBegin(g,*vit)>=min){
+ if(fragmentBegin(g,*vit)<max){
+ currlcbset.insert(*vit);
+ coveredSet.insert(*vit);
+ }
+ else{
+ //past max, we can stop looking
+ break;
+ }
+ }
+ }
+ }
+ else{
+ //Vseqs not populated
+ }
+ }
+ else{
+ //Seq fragment is too short to include in LCB
+ }
+ }
+ else{
+ //Seq not present in LCB
+ //This is ok
+ }
+ }
+ assert(seqIdMap.size()==length(currseqs));
+ std::cerr << "LCB Init of segments done: " << SEQAN_PROTIMEUPDATE(__myProfileTime)
+ << " seconds" << std::endl;
+ std::cerr << "LCB Building segments" << std::endl;
+ //All coordinates for fragments
+ //must be relative to the orientation determined previously
+ //offsets array is always relative to the matching strand
+ typedef Fragment<> TFragment;
+ //The extended vertex set is used here, not currlcb itself
+ buildMatchesFromGraph(g,
+ seqSetv,
+ seqLenMap,
+ currlcbset,//currlcb,
+ offsets,
+ seqIdMap,
+ currmatches,
+ currscores);
+ //TODO double check that matches all match expected values in lcbseqstr here
+ std::cerr << "LCB Building segments done: " << SEQAN_PROTIMEUPDATE(__myProfileTime)
+ << " seconds" << std::endl;
+//TODO retrieve LCBSegments from initial match set (TFragmentString matches)
+//Rather than the alignment graph
+//
+//Convenience overload for callers that have a single sequence set: seqSet is
+//passed for both sequence set arguments of the full overload, together with an
+//empty sequence index map. All other arguments are forwarded unchanged.
+template<typename TGraph,
+ typename TString,
+ typename TSpec,
+ typename TVertexOrientMap,
+ typename TLCB,
+ typename TNames,
+ typename TSequence,
+ typename TFragmentString,
+ typename TScoreValues,
+ typename TSize,
+ typename TOffsets,
+ typename TCoveredSet,
+ typename TV>
+void retrieveLCBSegments(TGraph & g,
+ StringSet<TString, TSpec> &seqSet,
+ TVertexOrientMap &vertexOrientMap,
+ TLCB &currlcb,
+ unsigned int lcbid,
+ TNames & sequenceNames,
+ StringSet<TSequence, Owner<> > & currseqs,
+ TFragmentString &currmatches,
+ TScoreValues &currscores,
+ TNames & currnameSet,
+ std::map<TSize,TOffsets> &offsets,
+ TCoveredSet &coveredset,
+ TV & vseqs,
+ unsigned int MIN_FRAGMENT_SIZE){
+ //dummy empty map
+ std::map<TSize,TSize> seqIdxMap;
+ //seqSet serves as both the placeholder set and the real set
+ retrieveLCBSegments(g,
+ seqSet,
+ seqSet,
+ seqIdxMap,
+ vertexOrientMap,
+ currlcb,lcbid,
+ sequenceNames,
+ currseqs,
+ currmatches,
+ currscores,
+ currnameSet,
+ offsets,
+ coveredset,
+ vseqs,
+//Read all alignments of MAF file maffile, rewrite start, strand, src and
+//srcSize of every component and write the alignments to outstrm.
+//
+// maffile path of the MAF file to read
+// outstrm open stream that receives the alignments through mafWrite()
+// currnameSet name written as src, per sequence
+// offsets value added to the start of each component, per sequence
+// orients strand written for each component, per sequence
+// seqlens srcSize written for each component, per sequence
+//
+//All four maps are indexed with the chrName part that parseSrcName() extracts
+//from the src field of a component.
+//NOTE(review): c->src is pointed at the string owned by currnameSet, so the
+//rewritten alignments must not outlive that map - confirm nothing frees or
+//keeps c->src afterwards
+void transformMAF(const char * maffile,
+ FILE * outstrm,
+ std::map<std::string,std::string> &currnameSet,
+ std::map<std::string,unsigned int> &offsets,
+ std::map<std::string,char> &orients,
+ std::map<std::string,unsigned int> & seqlens){
+ struct mafFile *mf;
+ std::cout << "Transforming maf file " << maffile << std::endl;
+ mf = mafOpen(maffile, 0);
+ struct mafAli *a, *A, *last_a;
+ struct mafComp *c;
+ A = last_a = NULL;
+ //Chain all alignments of the file into a list headed by A
+ while ((a = mafNext(mf)) != NULL) {
+ if ((c = a->components) == NULL)
+ assert(false);//fatal("empty maf entry");
+ if (last_a == NULL)
+ A = a;
+ else
+ last_a->next = a;
+ last_a = a;
+ }
+ if(A==NULL){
+ std::cout << "can't find any alignments" << std::endl;
+ }
+ else{
+ //Do transform
+ char chrName[200], species_name[200];
+ for (a = A; a != NULL; a = a->next) {
+ for(c=a->components; c!=NULL; c=c->next) {
+ //Update coordinates
+ parseSrcName(c->src, species_name, chrName);
+ //Key shared by all per sequence maps
+ const std::string seqname(chrName);
+ assert(currnameSet.find(seqname)!=currnameSet.end());
+ //From UCSC FAQ The start of the aligning
+ //region in the source sequence. This is a
+ //zero-based number. If the strand field is
+ //'-' then this is the start relative to the
+ //reverse-complemented source sequence.
+ //TODO
+ //Confirm reverse alignments during refine are not handled properly
+ //Both strands are shifted by the same offset, relative to the
+ //matching strand of the original match
+ assert(c->strand == '+' || c->strand == '-');
+ c->start = c->start+offsets[seqname];
+ c->strand = orients[seqname];
+ c->src = (char *)currnameSet[seqname].c_str();
+ c->srcSize = seqlens[seqname];
+ }
+ mafWrite(outstrm, a);
+ }
+ }
+ //Release the reader on both paths, also when no alignment was found
+ mafFileFree(&mf);
+//Run a nested mugsy process on the FASTA files of one alignment block.
+//
+// outputdir passed to the child as --directory
+// fastafiles space separated FASTA file names, appended to the command line
+// prefix passed as --prefix; prefix+".mugsy.log" is removed on success
+// outprefix prefix of the files that capture stdout/stderr of the child,
+// both files are removed on success
+// msaOpt only msaOpt.distance is forwarded to the child
+//
+//The command is built as a single string and run with system(); the
+//arguments are not quoted. The environment variable MUGSY_INSTALL must point
+//to the directory holding the mugsy executable, nothing is run otherwise.
+template<typename TScore>
+void runIterativeMUGSY(std::string & outputdir,
+ const std::string & fastafiles,
+ std::string & prefix,
+ const std::string & outprefix,
+ MsaOptions<Dna5 , TScore> const& msaOpt){
+ const char * mugsyinstall = std::getenv("MUGSY_INSTALL");
+ if(mugsyinstall == NULL){
+ //getenv returns NULL for an unset variable and streaming a null char*
+ //into an ostream is undefined behaviour
+ std::cerr << "MUGSY_INSTALL is not set, can't run mugsy refinement" << std::endl;
+ assert(mugsyinstall != NULL);
+ return;
+ }
+ std::ostringstream refinecmd;
+ refinecmd << mugsyinstall
+ << "/mugsy "
+ << "--debug 5 --log refine.log "
+ << " --distance " << msaOpt.distance
+ << " --minlength 15"
+ << " --nucmeropts \"-l 10 -c 15\"" //relax matchlen
+ //TODO consider removing this option during refine to allow for refinement of short blocks
+ << " --skipunique --directory " << outputdir
+ << " --skiprefine --colinear "
+ << " --prefix " << prefix
+ << " " << fastafiles
+ << " 1>"
+ << outprefix << "mugsyrefine.stdout"
+ << " 2>" << outprefix << "mugsyrefine.stderr";
+ std::cout << refinecmd.str() << std::endl;
+ int ret = system(refinecmd.str().c_str());
+ if(ret!=0){
+ //Keep the captured output for inspection when the child failed
+ std::cerr << refinecmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ else{
+ ;
+ //Locals are not named stdout/stderr, those may be macros from <cstdio>
+ std::string stdoutfile(outprefix+"mugsyrefine.stdout");
+ std::string stderrfile(outprefix+"mugsyrefine.stderr");
+ std::string logfile(prefix+".mugsy.log");
+ unlink(stdoutfile.c_str());
+ unlink(stderrfile.c_str());
+ unlink(logfile.c_str());
+ }
+ assert(ret==0);
+//Refinement using Mugsy
+//Also support for fsa,pecan,lagan aligners. They must be in your path
+//TODO, save label,dups from original MAF
+//
+//Reads all alignment blocks of maffile. For every block with more than one
+//component the ungapped sequences are written to temporary FASTA files, the
+//aligner selected by msaOpt.refine is run on them and, when a non-empty MAF
+//file results, it is mapped back with transformMAF() and appended to
+//msaOpt.outfile+".maf.refined". Blocks with a single component are written
+//to that file unchanged.
+//
+// maffile MAF file holding the blocks to refine
+// msaOpt msaOpt.outfile names the output, msaOpt.refine selects the aligner
+// ("pecan", "mlagan", "fsa", anything else runs runIterativeMUGSY)
+//
+//NOTE(review): the result of fopen() for the refined file is not checked and
+//no fclose()/mafFileFree() is visible in this function - confirm
+//NOTE(review): the pecan and mlagan branches write to pecan.<pid>.mfa and
+//lagan.<pid>.mfa; nothing visible here converts those into the MAF file that
+//is read back, so such blocks presumably produce no output - confirm
+template<typename TScore>
+void refineMSA(const char * maffile,
+ MsaOptions<Dna5 , TScore> const& msaOpt){
+ std::fstream strmmaf;
+ FILE * strmmafrefined;
+ struct mafFile *mf;
+ mf = mafOpen(maffile, 0);
+ struct mafAli *a, *A, *last_a;
+ struct mafComp *c;
+ A = last_a = NULL;
+ //Chain all alignments of the file into a list headed by A
+ while ((a = mafNext(mf)) != NULL) {
+ if ((c = a->components) == NULL)
+ assert(false);//fatal("empty maf entry");
+ if (last_a == NULL)
+ A = a;
+ else
+ last_a->next = a;
+ last_a = a;
+ }
+ if(A==NULL){
+ std::cout << "can't find any alignments" << std::endl;
+ }
+ else{
+ //dirname() may modify its argument, so it is given a writable copy
+ std::string outfile(msaOpt.outfile);
+ std::vector<char> writable(outfile.size() + 1);
+ std::copy(outfile.begin(), outfile.end(), writable.begin());
+ std::string outputdir(dirname(&writable[0]));
+ if(outputdir[outputdir.length()-1] != '/'){
+ outputdir = outputdir + '/';
+ }
+ strmmafrefined = fopen(std::string(outfile+".maf.refined").c_str(),"w");//, std::ios_base::out | std::ios_base::trunc);
+ mafWriteStart(strmmafrefined, "mugsy_refined");
+ //Do transform
+ char chrName[200], species_name[200];
+ int lcbid=0;
+ int COL_WIDTH=60;
+ for (a = A; a != NULL; a = a->next) {
+ //Properties of the components of this block, keyed by c->src
+ std::map<std::string,unsigned int> curroffsets;
+ std::map<std::string,unsigned int> currspanlens;
+ std::map<std::string,unsigned int> currseqlens;
+ std::map<std::string,char> currorients;
+ std::map<std::string,std::string> currnameSetv;
+ int ncol = a->textSize;
+ int i=0;
+ //Temporary file names are made unique with the pid and a block counter
+ std::ostringstream tmpgraph;
+ tmpgraph << "MUGTMP" << getpid() << "_" << lcbid;
+ std::vector<std::string> fnames;
+ for(c=a->components; c!=NULL; c=c->next) {
+ std::fstream strmfsa;
+ std::string fname(tmpgraph.str());
+ fname = outputdir+fname + "_S"+boost::lexical_cast<std::string>(i) + ".fsa";
+ strmfsa.open(fname.c_str(), std::ios_base::out | std::ios_base::trunc);
+ fnames.push_back(fname);
+ parseSrcName(c->src, species_name, chrName);
+ //Write FASTA
+ if(msaOpt.refine=="fsa"){
+ //Write XMFA style
+ strmfsa << ">" << tmpgraph.str() << "_S" <<boost::lexical_cast<std::string>(i)
+ << "." << c->src << ":" << 1 << "-" << c->size << " " << c->strand << " " << c->size << std::endl;
+ }
+ else{
+ strmfsa << ">" << c->src << std::endl ;
+ }
+ //Write the aligned text without gap characters, COL_WIDTH residues per line
+ int col=0;
+ int j=0;
+ for (col = j = 0; j < ncol; ++j) {
+ if(c->text[j]=='-'){
+ }
+ else{
+ strmfsa << c->text[j];
+ ++col;
+ if (col == COL_WIDTH) {
+ strmfsa << std::endl;
+ col = 0;
+ }
+ }
+ }
+ if (col != 0){
+ strmfsa << std::endl;
+ }
+ //Remember the original coordinates so the refined block can be mapped back
+ std::string sname(c->src);
+ curroffsets[sname] = c->start;
+ currspanlens[sname] = c->size;
+ currseqlens[sname] = c->srcSize;
+ currorients[sname] = c->strand;
+ currnameSetv[sname] = sname;
+ ++i;
+ strmfsa.close();
+ }
+ //Require more than one sequence
+ if(fnames.size()>1){
+ //
+ std::ostringstream fastafiles;
+ for(int k=0;k<(int)fnames.size();k++){
+ fastafiles << fnames[k] << " " ;
+ }
+ //Output MAF file
+ std::string prefix("MGREF");
+ if(a->label>=0){
+ prefix = prefix + boost::lexical_cast<std::string>(a->label);
+ }
+ //This local maffile hides the function parameter of the same name
+ std::string maffile(outputdir+"/"+prefix+".maf");
+ //Clean up old maf with same name
+ unlink(maffile.c_str());
+ //Run refinement
+ //Support for other aligners is provided for evaluation
+ if(msaOpt.refine=="pecan"){
+ //Support for pecan aligner
+ //The guide tree is written to the fixed path /tmp/pecan.tree
+ std::ostringstream treecmd;
+ //treecmd << "cat " << fastafiles.str() << " | /usr/local/projects/angiuoli/developer/sangiuoli/muscle/trunk/muscle -clusteronly -in - -tree1 /tmp/pecan.tree 1> /dev/null 2> /dev/null";
+ treecmd << "cat " << fastafiles.str() << " | muscle -clusteronly -in - -tree1 /tmp/pecan.tree 1> /dev/null 2> /dev/null";
+ int ret = system(treecmd.str().c_str());
+ if(ret!=0){
+ std::cerr << treecmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ std::ostringstream refinecmd;
+ //refinecmd << "java -cp /usr/local/projects/angiuoli/developer/sangiuoli/pecan_v0.8/pecan_v0.8.jar bp.pecan.Pecan -J /usr/local/projects/angiuoli/developer/sangiuoli/exonerate-2.2.0-x86_64/bin/exonerate -E `cat /tmp/pecan.tree | perl -ne 'chomp;print'` -F " << fastafiles.str() << " >> pecan." << getpid() << ".mfa";
+ refinecmd << "java -cp pecan_v0.8.jar bp.pecan.Pecan -J exonerate -E `cat /tmp/pecan.tree | perl -ne 'chomp;print'` -F " << fastafiles.str() << " >> pecan." << getpid() << ".mfa";
+ ret = system(refinecmd.str().c_str());
+ if(ret!=0){
+ std::cerr << refinecmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ else{
+ }
+ }
+ else if(msaOpt.refine == "mlagan"){
+ std::ostringstream refinecmd;
+ //refinecmd << "/usr/local/projects/angiuoli/developer/sangiuoli/lagan20/mlagan.sh " << fastafiles.str() << " >> lagan."<< getpid() << ".mfa 2> /dev/null";
+ refinecmd << "mlagan.sh " << fastafiles.str() << " >> lagan."<< getpid() << ".mfa 2> /dev/null";
+ int ret = system(refinecmd.str().c_str());
+ if(ret!=0){
+ std::cerr << refinecmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ }
+ else if(msaOpt.refine == "fsa"){
+ ostringstream refinecmd;
+ //refinecmd << "/usr/local/projects/angiuoli/developer/sangiuoli/fsa-1.15.3/src/main/fsa --fast --noindel2 --refinement 0 " << fastafiles.str() << " > fsa." << getpid() << ".mfa 2>> test.fsa.stderr";
+ //refinecmd << "fsa --anchored --maxram 15000 --fast --noindel2 --refinement 0 " << fastafiles.str() << " > fsa." << getpid() << ".mfa 2>> test.fsa.stderr";
+ refinecmd << "fsa --fast --noindel2 --refinement 0 " << fastafiles.str() << " > fsa." << getpid() << ".mfa 2>> test.fsa.stderr";
+ int ret = system(refinecmd.str().c_str());
+ if(ret!=0){
+ std::cerr << refinecmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ //Terminate the XMFA block with '=' and convert it to MAF
+ //NOTE(review): std::getenv may return NULL when MUGSY_INSTALL is unset
+ ostringstream convertcmd;
+ convertcmd << "echo '=' >> fsa." << getpid() << ".mfa;"
+ << std::getenv("MUGSY_INSTALL") << "/xmfa2maf.pl < fsa." << getpid() << ".mfa > " << maffile.c_str() << " 2>> test.maf.stderr";
+ ret = system(convertcmd.str().c_str());
+ if(ret!=0){
+ std::cerr << convertcmd.str() << std::endl
+ << "SYSTEM:" << ret << std::endl;
+ }
+ }
+ else{
+ runIterativeMUGSY(outputdir,fastafiles.str(),prefix,tmpgraph.str(),msaOpt);
+ }
+ //Progress indicator, one dot per refined block
+ std::cerr << ".";
+ //Need to clean up here to prevent huge proliferation of files
+ ;
+ for(int k=0;k<(int)fnames.size();k++){
+ unlink(fnames[k].c_str());
+ }
+ //Library call added to multiz for parsing
+ //Only a MAF file that exists and is not empty is transformed
+ FILE* intFileDescriptor ;
+ struct stat stat_FileStatistics ;
+ intFileDescriptor = fopen(maffile.c_str(), "r");
+ if(intFileDescriptor != NULL){
+ fstat(fileno(intFileDescriptor), &stat_FileStatistics) ;
+ unsigned long size = stat_FileStatistics.st_size ;
+ fclose(intFileDescriptor);
+ if(size>0){
+ assert(currnameSetv.size()==fnames.size());
+ transformMAF(maffile.c_str(),
+ strmmafrefined,
+ currnameSetv,
+ curroffsets,
+ currorients,
+ currseqlens);
+ ;
+ unlink(maffile.c_str());
+ }
+ else{
+ //Refined MAF file has zero length
+ }
+ }
+ ++lcbid;
+ }
+ else{
+ //Single component, nothing to refine
+ mafWrite(strmmafrefined, a);
+ }
+ //mafAliFree(&a);
+ }
+ }
+template<typename TStringSet,
+ typename TCargo,
+ typename TSpec,
+ typename TLCB,
+ typename TStringSet1,
+ typename TNames,
+ typename TGenomeNames,
+ typename TVertexOrientMap,
+ typename TIntervals,
+ typename TScore>
+void generateLCBs(Graph<Alignment<TStringSet, TCargo, TSpec> > &g,
+ TStringSet1 &seqSet,
+ TNames &sequenceNames,
+ TGenomeNames &genomeNames,
+ TVertexOrientMap &vertexOrientMap,
+ TIntervals & aintervals,
+ MsaOptions<Dna5 , TScore> const& msaOpt){
+ //Configurable options
+ bool useadjscores=false; //Generate and use adjacency scores
+ bool bpanalysis=(msaOpt.segmentation=="none") ? false : true; //Set to false to skip breakpoint analysis entirely, each CC in segment graph will be an LCB
+ //*******
+ //Retrieve initial set of alignment blocks(LCBs) from segment graph
+ //A block is a set of segments that are connected in the segment graph
+ std::cerr << "Converting segments to multi-genome anchors "
+ << length(seqSet) << " "
+ << length(sequenceNames) << " "
+ << length(genomeNames) << " "
+ << vertexOrientMap.size() << " "
+ << LCBs.size() << " "
+ << numVertices(g) << " "
+ << numEdges(g) << std::endl;
+ typedef Dna5 TAlphabet;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef typename Size<TStringSet>::Type TSize;
+ typedef typename Value<TStringSet1>::Type TString;
+ typedef typename Value<TNames>::Type TName;
+ //typedef Graph<Alignment<TStringSet, TSize> > TGraph;
+ //Using int to support negative edge scores
+ typedef Graph<Alignment<TStringSet, int> > TGraph;
+ typedef typename Id<TGraph>::Type TId;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef typename EdgeType<TGraph>::Type TEdgeStump;
+ typedef typename Iterator<String<TEdgeStump*> const, Rooted>::Type TIterConst;
+ typedef typename Iterator<String<TEdgeStump*>, Rooted>::Type TIter;
+ //
+ typedef std::map<unsigned int, unsigned int> TComponentLength;
+ // Strongly Connected Components, topological sort, and length of each component
+ typedef String<unsigned int> TComponentMap;
+ typedef typename Value<TComponentMap>::Type TComponent;
+ typedef typename Position<TGraph>::Type TPos;
+ typedef SVABlock<TComponent,TSize,TVertexDescriptor,TPos> TBlock;
+ TComponentMap component;
+ typedef typename Value<TComponentMap>::Type TComponent;
+ //Hold input blocks that will be used to generate LCBs
+ std::map<std::pair<TComponent,TComponent>,TBlock *> componentVertexMap;
+ std::vector<std::vector<TBlock> > blocksbycomponent;
+ TSize numComponents;
+ //Greedy algorithm for resolving conflicts in connecting segments
+ //into blocks. Conflicts arise when there are more than 2 segments
+ //connected from the same genome seperated by < msaOpt.poscombinewindow
+ //Considering two methods
+ //(1)Connect using best positional score first.
+ //Derive positional score from an intial clustering
+ //Break gaps that violate constraints using a mincut
+ //(2)Connect using best consistency score
+ //Start new cluster whenever a repeat/dup is to be added
+ std::map<TEdgeDescriptor,float> posScores;
+ if(useadjscores){
+ //Adjacency scoring is optional, off by default
+ //(1) Using adjacency and consistency score
+ std::cerr << "Not implemented" << std::endl;
+ exit(1);
+ /*
+ numComponents = convertSegments2BlocksAdjacency(g,
+ component,
+ componentVertexMap,
+ blocksbycomponent,
+ seqSet,
+ genomeNames,
+ posScores,
+ msaOpt,
+ cuts);
+ convertCC2Blocks(g,
+ component,
+ componentVertexMap,
+ blocksbycomponent,
+ aintervals,
+ sequenceNames);
+ */
+ }
+ else{
+ //(2) Using consistency score
+ std::cerr << "Greedy CC on consistency score " << std::endl;
+ //numComponents = connected_components_by_genome_ranked_RECURSIVE(g, component, genomeNames, 100);
+ //std::cerr << "numc ranked recur" << numComponents << std::endl;
+ //numComponents = connected_components(g,component);
+ //std::cerr << "numc reg" << numComponents << std::endl;
+ //Convert segment graph (V=genome segments on one genome) into
+ //anchor graph (V=genome segments on multiple genomes)
+ numComponents = connected_components_by_genome_ranked(g, component, genomeNames, msaOpt.anchorwin);
+ std::cerr << "Num components:" << numComponents << std::endl;
+ blocksbycomponent.resize(numComponents);
+ //Collapse CC into blocks
+ convertCC2Blocks(g,
+ component,
+ componentVertexMap,
+ blocksbycomponent,
+ aintervals,
+ sequenceNames);
+ //Ensure there are not blocks with 2 seqs from the same genome
+ typename std::vector<std::vector<TBlock> >::iterator bit2 = blocksbycomponent.begin();
+ for(; bit2!= blocksbycomponent.end();bit2++){//all cc
+ if(bit2->size()>0){
+ for(unsigned int i=0;i<bit2->size();i++){
+ for(unsigned int j=i+1;j<bit2->size();j++){
+ for(typename std::vector<TVertexDescriptor>::iterator vit=bit2->at(i).currV.begin();vit!=bit2->at(i).currV.end();vit++){
+ TVertexDescriptor currV0 = *vit;
+ //assert(degree(g,currV0)>0);
+ for(typename std::vector<TVertexDescriptor>::iterator vit1=bit2->at(j).currV.begin();vit1!=bit2->at(j).currV.end();vit1++){
+ TVertexDescriptor currV = *vit1;
+ if(sequenceId(g,currV0)!=sequenceId(g,currV)){
+ std::cout << "V1:" << currV0 << " " << " V2:" << currV
+ << " seq: " << sequenceId(g,currV0) << " " << sequenceId(g,currV)
+ << " genome:" << genomeNames[sequenceId(g,currV0)] << "-" << genomeNames[sequenceId(g,currV)] << std::endl;
+ assert(genomeNames[sequenceId(g,currV0)]!=genomeNames[sequenceId(g,currV)]);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ //
+ //
+ //Report some stats after building blocks
+ unsigned totalavaillen=0;
+ //unsigned totalmatchingbp=0;
+ unsigned totalseqlen=0;
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator itV2(g);
+ for(;!atEnd(itV2);goNext(itV2)){
+ if(degree(g,*itV2)>0){
+ totalavaillen+=fragmentLength(g,*itV2);
+ }
+ }
+ //TVertexIterator itV(g);
+ //for(;!atEnd(itV);goNext(itV)){
+ //if(degree(g,*itV)>0){
+ //totalmatchingbp+=fragmentLength(g,*itV);
+ //}
+ //}
+ TSize seqSetLen = length(seqSet);
+ for(unsigned int i=0;i<seqSetLen;i++){
+ totalseqlen+=length(seqSet[i]);
+ }
+ //std::cerr << "Excluded unique, repeat/duplicated bp:" << totalmatchingbp-totalavaillen
+ //<< " "
+ //<< "=" << (float)(totalmatchingbp-totalavaillen)/totalseqlen << std::endl;
+ std::cerr << "Percentage matching bp (not including matching repeats/dups):"
+ << totalavaillen << "/" << totalseqlen
+ << "=" << (float)totalavaillen/totalseqlen << std::endl;
+ //assert(totalavaillen<=totalmatchingbp);
+ std::cerr << "Anchor conversion done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cerr << "Num anchors: " << numComponents << std::endl;
+ //*********
+ //Need to rescore because edges may have been removed
+ //invalidating edge pointers
+ posScores.clear();
+ if(useadjscores){
+ //Score for positional conservation
+ std::cerr << "Rescoring for positional conservation" << std::endl;
+ std::cerr << "Not implemented" << std::endl;
+ exit(1);
+ //scorePosCons(g,
+ // component,
+ // numComponents,
+ // posScores,
+ // msaOpt.posscorewindow);
+ }
+ //*********
+ //Assign orientation to LCBs
+ std::cerr << "Assigning orientation to " << blocksbycomponent.size() << " anchors" << std::endl;
+ std::fstream rawstrm;
+ rawstrm.open("refinegraphpreorient.out", std::ios_base::out | std::ios_base::trunc);
+ write(rawstrm,g,seqSet,Raw());
+ rawstrm.close();
+ //Array of blocks
+ std::vector<TBlock> blocks;
+ //Assign orientation to blocks
+ //+ reversed==false
+ //- reversed==true
+ //Save the orientation using vertexOrientMap
+ //Need a map so we can lookup orientation for each vertex in a block
+ //std::map<TVertexDescriptor,char> vertexOrientMap;
+ assert(vertexOrientMap.size()==0); //expecting empty map to start
+ assignBlockOrientation(g,
+ blocksbycomponent,
+ blocks,
+ vertexOrientMap,
+ posScores);
+ posScores.clear();
+ blocksbycomponent.clear();
+ componentVertexMap.clear();
+ clear(component);
+ std::cerr << "Orientation done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cerr << "Building an orthology map using " << blocks.size() << " anchors" << std::endl;
+ //Determine collinear runs, ie LCBs
+ //Save runs in the LCBs vector
+ //Each LCB is a set of TVertexDescriptors
+ //std::vector<std::vector<TVertexDescriptor> > LCBs;
+ std::map<unsigned int, std::set<TVertexDescriptor> > block2fragMap;
+ typename std::vector<TBlock>::const_iterator bit = blocks.begin();
+ for(bit = blocks.begin();
+ bit!=blocks.end();
+ bit++){
+ typename std::vector<TVertexDescriptor>::const_iterator dvit;
+ for(dvit = bit->currV.begin();dvit!=bit->currV.end();++dvit){
+ assert((*dvit)!=getNil<TVertexDescriptor>());
+ }
+ if(block2fragMap.find(bit->c) == block2fragMap.end()){
+ block2fragMap.insert(std::make_pair(bit->c,std::set<TVertexDescriptor>()));
+ block2fragMap[bit->c].insert(bit->currV.begin(),bit->currV.end());
+ }
+ else{
+ block2fragMap[bit->c].insert(bit->currV.begin(),bit->currV.end());
+ }
+ }
+ if(!bpanalysis){
+ //Assign each block as an LCB
+ //Useful for some simple testing
+ typename std::map<unsigned int, std::set<TVertexDescriptor> >::iterator it;
+ for(it = block2fragMap.begin();it!=block2fragMap.end();it++){
+ std::vector<unsigned int> currlcb;
+ currlcb.insert(currlcb.end(),it->second.begin(),it->second.end());
+ LCBs.push_back(currlcb);
+ }
+ }
+ else{
+ //Read LCBs/blocks from input file if specified
+ if(msaOpt.blockfile.length()>0){
+ readBlockFile(msaOpt.blockfile,
+ block2fragMap,
+ LCBs,
+ sequenceNames,
+ vertexOrientMap,
+ g,
+ false);
+ }
+ else{
+ //Calculate LCBs using segmentation method
+ //Enredo, Mercator, or MUGSY
+ std::string diststr(msaOpt.distance);
+ std::string minlenstr(msaOpt.minlength);
+ if(msaOpt.segmentation == "enredo"){
+ do_segmentation_ENREDO(blocks,
+ LCBs,
+ block2fragMap,
+ diststr,
+ minlenstr,
+ msaOpt,
+ seqSet,
+ sequenceNames,
+ genomeNames,
+ vertexOrientMap,
+ g);
+ }
+ else{
+ if(msaOpt.segmentation == "mercator"){
+ //TODO, fix this tested but now broken do_segmentation_MERCATOR()
+ std::cerr << "Mercator segmentation not implemented" << std::endl;
+ }
+ else{
+ if(msaOpt.refine == "colinear"){
+ //For refinement, assume all within a single LCB
+ typename std::map<unsigned int, std::set<TVertexDescriptor> >::iterator it;
+ std::vector<unsigned int> currlcb;
+ for(it = block2fragMap.begin();it!=block2fragMap.end();it++){
+ currlcb.insert(currlcb.end(),it->second.begin(),it->second.end());
+ }
+ LCBs.push_back(currlcb);
+ }
+ else{
+ //Default method is our own
+ do_segmentation_MUGSY(blocks,
+ LCBs,
+ block2fragMap,
+ diststr,
+ minlenstr,
+ msaOpt,
+ sequenceNames,
+ genomeNames,
+ vertexOrientMap,
+ g);
+ }
+ }
+ }
+ }
+ }
+ blocks.clear();
+ //Print out LCBs
+ for(typename std::vector<std::vector<unsigned int> >::const_iterator lit = LCBs.begin();lit!=LCBs.end();lit++){
+ std::vector<unsigned int>::const_iterator vit;
+ for(vit = lit->begin();vit!=lit->end();vit++){
+ std::cout << *vit << ",";
+ assert(*vit!=getNil<TVertexDescriptor>());//Null vertex
+ }
+ std::cout << std::endl;
+ }
+ std::cerr << "LCB identification done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+template<typename TSequenceSet, //getGuideTree: hand back in currguideTree a guide tree for the ids in curridset; built once per distinct id set, afterwards served from seqguideTrees
+ typename TIds, //id container; walked below through std::set<unsigned int>::iterator and queried with find()
+ typename TTreeMap, //cache type; used through find(), insert() and operator[] with a std::string key
+ typename TDistanceValue>
+void getGuideTree(TSequenceSet &seqSet, //sequences addressed by position; those whose position is in curridset feed the tree
+ TIds &curridset, //ids of the sequences taking part
+ TTreeMap &seqguideTrees, //in/out: trees built so far, keyed by the ":"-terminated id list
+ Graph<Tree<TDistanceValue> > &currguideTree){ //out: the tree for curridset
+ typedef String<TDistanceValue> TDistanceMatrix;
+ TDistanceMatrix distanceMatrix; //not referenced again in this function
+ typedef Dna5 TAlphabet;
+ typedef unsigned TSize;
+ typedef String<TAlphabet> TSequence;
+ //Matrix restricted to the sequences selected by curridset
+ TDistanceMatrix currdistanceMatrix;
+ typedef typename Value<TDistanceMatrix>::Type TValue;
+ typedef typename Iterator<TDistanceMatrix>::Type TMatrixIterator;
+ //This assumes the best estimate is obtained by kmers across whole genome
+ //Alternatively, build guide tree using subsequence of the LCBs for LCBs of length
+ //greater than some cutoff to avoid needlessly building trees for extremely short LCBs
+ std::ostringstream curridsetstr; //cache key: every id followed by ":" in the container's iteration order
+ for(std::set<unsigned int>::iterator it = curridset.begin();it!=curridset.end();it++){
+ curridsetstr << *it << ":";
+ }
+ std::string curridsetstring = curridsetstr.str();
+ if(seqguideTrees.find(curridsetstring) == seqguideTrees.end()){ //cache miss: build the tree
+ std::cout << "Generating a new guide tree for " << curridsetstring << std::endl;
+ //Copy genome string for sequences present in the current LCB
+ StringSet<TSequence, Owner<> > currStringSet;
+ for(TSize i = 0; i<length(seqSet); ++i) {
+ if(curridset.find(i) != curridset.end()){
+ appendValue(currStringSet,seqSet[i]);
+ }
+ }
+ //if (empty(distanceMatrix)) getDistanceMatrix(currG, currdistanceMatrix, KmerDistance());
+ getKmerSimilarityMatrix(currStringSet, currdistanceMatrix, 3, TAlphabet()); //third argument 3: presumably the k-mer size -- confirm against getKmerSimilarityMatrix
+ // Similarity to distance conversion
+ TMatrixIterator matIt = begin(currdistanceMatrix);
+ TMatrixIterator endMatIt = end(currdistanceMatrix);
+ for(;matIt != endMatIt;++matIt) value(matIt) = (1 - (*matIt)) * 100; //each similarity s is overwritten with (1-s)*100
+ upgmaTree(currdistanceMatrix, currguideTree, UpgmaMin());
+ //njTree(currdistanceMatrix, currguideTree);
+ //Save tree for the combination specified by curridset
+ seqguideTrees.insert(std::make_pair(curridsetstring,currguideTree));
+ clear(currdistanceMatrix);
+ }
+ else{ //cache hit: copy the stored tree into the output argument
+ std::cout << "Using existing guide tree for " << curridsetstring << std::endl;
+ currguideTree=seqguideTrees[curridsetstring];
+ }
+ // else{
+ //use guidetree that covers all sequences
+ //assert(numVertices(seqguideTree)>0);
+ //currguideTree = seqguideTree;
+ //assert(false);
+ //}
+ std::cout << "LCB Guide tree done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+template<typename TGraph, //alignSingleLCB: triplet-extend the match graph of one LCB, align it progressively along the guide tree, dump both graphs to files, and return the score record
+ typename TGraph2,
+ typename TSize,
+ typename TStringSet,
+ typename TGuideTree,
+ typename TScore>
+s_score alignSingleLCB(TGraph &currG, //in/out: match graph of this LCB; modified by the triplet extension
+ TGraph2 &currgOut, //out: alignment graph filled by progressiveAlignment
+ TSize lcbid, //id used in log lines and in the names of the dump files
+ TStringSet &currseqs, //sequences of this LCB; their count selects the extension variant
+ TGuideTree &currguideTree, //guide tree steering the progressive alignment
+ MsaOptions<Dna5 , TScore> const& msaOpt){ //options; refine is read here, sc only under SCORING
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef Dna5 TAlphabet;
+ //Inline refinement is currently disabled: the flag is computed but nothing acts on it
+ bool inlinerefine= (msaOpt.refine=="true") ? true : false; //Compute iterative refinement inline
+ if(lcbid>0){} //empty body; presumably only keeps lcbid "used" for the compiler
+ if(inlinerefine){} //empty body; presumably only keeps inlinerefine "used" for the compiler
+ TSize nSeq = length(currseqs);
+ TSize threshold = 100; //below this many sequences the plain extension is used, otherwise the guide-tree variant with threshold/2
+ std::cout << "LCB Performing triplet extension " << lcbid << std::endl;
+ if (nSeq < threshold) tripletLibraryExtension(currG);
+ else tripletLibraryExtension(currG, currguideTree, threshold / 2);
+ std::cout << "LCB Triplet extension done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ //
+ //*******
+ //Alignment
+ //*******
+ std::cout << "LCB Performing progressive alignment" << std::endl;
+ std::fstream rawstrm2; //dump of the extended input graph, written to lcbgraph<lcbid>.out in the working directory
+ std::string lcbgname = "lcbgraph"+boost::lexical_cast<std::string>(lcbid)+".out";
+ rawstrm2.open(lcbgname.c_str(), std::ios_base::out | std::ios_base::trunc);
+ write(rawstrm2,currG,currseqs,Raw());
+ rawstrm2.close();
+ //Perform the alignment
+ progressiveAlignment(currG, currguideTree, currgOut);
+ std::cout << "LCB progressive alignment done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::fstream rawstrm3; //dump of the aligned graph, written to alignedlcbgraph<lcbid>.out
+ std::string alcbgname = "alignedlcbgraph"+boost::lexical_cast<std::string>(lcbid)+".out";
+ rawstrm3.open(alcbgname.c_str(), std::ios_base::out | std::ios_base::trunc);
+ write(rawstrm3,currgOut,currseqs,Raw());
+ rawstrm3.close();
+ s_score sscore; //returned as declared when SCORING is off; NOTE(review): its initial contents depend on the s_score definition -- confirm
+#ifdef SCORING
+ // Counters handed to alignmentEvaluationCustom by reference
+ //TScoreValue gop = msaOpt.sc.data_gap_open;
+ //TScoreValue gex = msaOpt.sc.data_gap_extend;
+ //TSize alphSize = ValueSize<TAlphabet>::VALUE;
+ // Print the alignment information
+ TSize numGapEx = 0;
+ TSize numGap = 0;
+ TSize numPairs = 0;
+ TSize numIdents = 0;
+ TSize alignLen = 0;
+ TSize totalLen = 0;
+ String<TSize> pairCount;
+ String<TSize> colCount;
+ //TScoreValue alignScore;
+ sscore = alignmentEvaluationCustom(currgOut,
+ msaOpt.sc,
+ numGapEx,
+ numGap,
+ numPairs,
+ numIdents,
+ pairCount,
+ colCount,
+ alignLen,
+ totalLen);
+ /*
+ sscore.alignScore = alignScore;
+ sscore.numGap = numGap;
+ sscore.numGapEx = numGapEx;
+ sscore.numPairs = numPairs;
+ sscore.numIdents = numIdents;
+ sscore.alignLen = alignLen;
+ sscore.totalLen = totalLen;
+ sscore.colCount = colCount;
+ sscore.seqCount = nSeq;
+ assert(length(colCount)==nSeq+1);
+ sscore.pairCount = pairCount;
+ */
+#ifdef DEBUGGING2
+ TSize alphSize = ValueSize<TAlphabet>::VALUE;
+ TScoreValue gop = msaOpt.sc.data_gap_open;
+ TScoreValue gex = msaOpt.sc.data_gap_extend;
+ std::cout << "LCBID:" << lcbid << std::endl;
+ std::cout << "Scoring parameters:" << std::endl;
+ std::cout << "*Gap opening: " << gop << std::endl;
+ std::cout << "*Gap extension: " << gex << std::endl;
+ std::cout << "*Scoring matrix: " << std::endl;
+ std::cout << " ";
+ for(TSize col = 0; col<alphSize; ++col) std::cout << TAlphabet(col) << ',';
+ std::cout << std::endl;
+ for(TSize row = 0; row<alphSize; ++row) {
+ for(TSize col = 0; col<alphSize; ++col) {
+ if (col == 0) std::cout << TAlphabet(row) << ": ";
+ //The score output is commented out, so this table prints row labels and commas only
+ if (col < alphSize - 1) std::cout << ',';
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ std::cout << "Alignment Score: " << sscore.alignScore << std::endl;
+ std::cout << "Alignment Length: " << alignLen << std::endl;
+ std::cout << "#Match-Mismatch pairs: " <<numPairs << std::endl;
+ std::cout << "Score contribution by match-mismatch pairs: " << (sscore.alignScore - (((TScoreValue) numGap * gop) + ((TScoreValue) numGapEx * gex))) << std::endl;
+ std::cout << "#Gap extensions: " << numGapEx << std::endl;
+ std::cout << "Score contribution by gap extensions: " << ((TScoreValue) numGapEx * gex) << std::endl;
+ std::cout << "#Gap openings: " << numGap << std::endl;
+ std::cout << "Score contribution by gap openings: " << ((TScoreValue) numGap * gop) << std::endl;
+ std::cout << std::endl;
+ std::cout << "#Pairs: " << std::endl;
+ std::cout << " ";
+ for(TSize col = 0; col<alphSize; ++col) std::cout << TAlphabet(col) << ',';
+ std::cout << std::endl;
+ for(TSize row = 0; row<alphSize; ++row) {
+ for(TSize col = 0; col<alphSize; ++col) {
+ if (col == 0) std::cout << TAlphabet(row) << ": ";
+ std::cout << value(pairCount, row * alphSize + col); //pairCount is read as a flat alphSize x alphSize table, row-major
+ if (col < alphSize - 1) std::cout << ',';
+ }
+ std::cout << std::endl;
+ }
+ return sscore;
+template<typename TName, //saveInterval: for every sequence name of an LCB append a start record and an end record of its aligned span to aintervals[name]
+ typename TLoc, //record type; must expose the members first, second and blocknum
+ typename TNames>
+void saveInterval(std::map<TName,std::vector<TLoc> >&aintervals, //in/out: per-name list of boundary records; missing names are created
+ TNames &currnameSet, //names of the sequences in this LCB; index i pairs with index i of the four vectors below
+ std::vector<unsigned int> &curroffsets, //start offset of each span, measured on the strand given by currorients
+ std::vector<unsigned int> &currspanlens, //length of each span
+ std::vector<unsigned int> &currseqlens, //full length of each source sequence
+ std::vector<char> &currorients, //'+' keeps the offset as is; any other value takes the mirrored branch
+ int lcbid, //stored in blocknum of both records
+ bool dup=false){ //true: write duplication markers (1/-1); false: alignment markers (2/0)
+ for(int i=0;i<(int)length(currnameSet);i++){
+ TName n = currnameSet[i];
+ std::cout << "Saving aligned intervals for " << n << std::endl;
+ typename std::map<TName,std::vector<TLoc > >::iterator ait = aintervals.find(n);
+ if(ait==aintervals.end()){
+ aintervals.insert(std::make_pair(n,std::vector<TLoc >()));
+ }
+ ait = aintervals.find(n); //look up again so ait is valid whether or not the entry was just created
+ assert(ait!=aintervals.end());
+ int fmin,fmax;
+ //assert(curroffsets[i]>=0);
+ std::cout << "Interval orient " << currorients[i] << " offset" << curroffsets[i] << " spans:" << currspanlens[i] << " len: " << currseqlens[i] << std::endl;
+ if(currorients[i] == '+'){
+ fmin=curroffsets[i];
+ fmax=curroffsets[i]+currspanlens[i];
+ }
+ else{ //mirror the span: coordinates are subtracted from the sequence length
+ fmax=currseqlens[i]-curroffsets[i];
+ fmin=currseqlens[i]-(curroffsets[i]+currspanlens[i]); //unsigned arithmetic stored into an int; the assert below catches a negative result in debug builds
+ }
+ std::cout << "Intervals for " << fmin << "-" << fmax << " " << currseqlens[i] << std::endl;
+ assert(fmin>=0);
+ assert(fmax<=(int)currseqlens[i]);
+ //Values written to TLoc::second:
+ //-1 - duplication end
+ //0 align end
+ //1 duplication start
+ //2 align start
+ if(dup){
+ //ait->second.push_back(make_pair(fmin,1));//align start
+ //ait->second.push_back(make_pair(fmax,-1));//align end
+ TLoc t1,t2;
+ t1.first = fmin;
+ t1.second = 1;
+ t1.blocknum = lcbid;
+ ait->second.push_back(t1);
+ t2.first = fmax;
+ t2.second = -1;
+ t2.blocknum = lcbid;
+ ait->second.push_back(t2);
+ }
+ else{
+ //ait->second.push_back(make_pair(fmin,2));//align start
+ //ait->second.push_back(make_pair(fmax,0));//align end
+ TLoc t1,t2;
+ t1.first = fmin;
+ t1.second = 2;
+ t1.blocknum = lcbid;
+ ait->second.push_back(t1);
+ t2.first = fmax;
+ t2.second = 0;
+ t2.blocknum = lcbid;
+ ait->second.push_back(t2);
+ }
+ }
+template<typename TGraph, //alignLCBs: index known duplication intervals, then for each LCB extract its segments, build a match graph, align it with alignSingleLCB, and optionally record intervals and write a MAF block
+ typename TLCBs,
+ typename TStringSet1,
+ typename TStringSet2,
+ typename TNames,
+ typename TGenomeNames,
+ typename TVertexOrientMap,
+ typename TStream1,
+ typename TName,
+ typename TLoc,
+ typename TScore>
+std::vector<s_score> alignLCBs(TGraph &g, //segment graph holding the vertices referenced by the LCBs
+ TLCBs &LCBs, //in/out: vertex lists, one per LCB; already covered vertices are dropped from each list unless nested LCBs are allowed
+ TStringSet1 &seqSet, //sequences, parallel to sequenceNames (asserted below)
+ TStringSet2 &genomeSeqSet, //sequence set handed to getGuideTree
+ TNames &sequenceNames, //name of each entry of seqSet
+ TGenomeNames &genomeNames, //per-sequence value inserted into the id set passed to getGuideTree
+ TVertexOrientMap &vertexOrientMap, //passed through to retrieveLCBSegments
+ TStream1 &strmmaf, //MAF output; written only when is_open()
+ std::map<TName,std::vector<TLoc> > &aintervals, //in: boundary records used for the duplication trees; out: extended through saveInterval when msaOpt.unique is "true"
+ MsaOptions<Dna5 , TScore> const &msaOpt){ //options read here: allownestedlcbs, minlength, unique, duplications
+ typedef double TDistanceValue;
+ typedef unsigned TSize;
+ typedef Dna5 TAlphabet;
+ typedef String<TDistanceValue> TDistanceMatrix;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef typename Value<TScore>::Type TScoreValue;
+ std::vector<s_score> allscores; //return value
+ TDistanceMatrix distanceMatrix; //not referenced again in this function
+ Graph<Tree<TDistanceValue> > seqguideTree; //not referenced again in this function
+ std::map<std::string, Graph<Tree<TDistanceValue> > > seqguideTrees; //guide-tree cache shared by all LCBs of this call
+ TSize nSeq = length(seqSet);
+ TSize nGenomes=0; //largest value in genomeNames plus one; not referenced after it is computed
+ for(TSize i=0;i<length(genomeNames);i++){
+ nGenomes = (genomeNames[i] > nGenomes) ? genomeNames[i] : nGenomes;
+ }
+ nGenomes = nGenomes+1;
+ std::cerr << "Saving interval tree marking location of duplications" << std::endl;
+ //One interval tree per sequence, built from the duplication
+ //start/end records (second==1 / second==-1) found in aintervals
+ typedef IntervalAndCargo<int,TSize> TInterval;
+ typedef Graph<Directed<void,WithoutEdgeId> > TIGraph;
+ typedef IntervalTreeNode<TInterval> TNode;
+ typedef String<TNode> TPropertyMap;
+ String<String<TInterval> > dintervals;
+ resize(dintervals,length(seqSet));
+ String<TIGraph> dgs;
+ String<TPropertyMap> dpms;
+ for(int i=0;i<(int)length(seqSet);i++){
+ std::map<int,std::pair<int,int> > tmpintervals; //blocknum -> (start,end), assembled from the separate start and end records
+ std::map<int,std::pair<int,int> >::iterator pos;
+ bool inserted=false;
+ typename std::map<TName,std::vector<TLoc> >::iterator ait=aintervals.find(sequenceNames[i]);
+ if(ait!=aintervals.end()){
+ for(typename std::vector<TLoc>::iterator pit = ait->second.begin();pit!=ait->second.end();pit++){
+ boost::tie(pos, inserted) = tmpintervals.insert(std::make_pair(pit->blocknum,std::make_pair(0,0))); //NOTE(review): runs for every record, so a record with second 0 or 2 still leaves a (0,0) entry for its block -- confirm this is intended
+ if(pit->second==1){
+ pos->second.first = pit->first;
+ }
+ else{
+ if(pit->second==-1){
+ pos->second.second = pit->first;
+ }
+ }
+ }
+ }
+ for(std::map<int,std::pair<int,int> >::iterator it = tmpintervals.begin();it!=tmpintervals.end();++it){
+ //std::cout << it->second.first << " " << it->second.second << " bnum:" << it->first << std::endl;
+ appendValue(dintervals[i],IntervalAndCargo<int,unsigned int>(it->second.first,it->second.second,it->first)); //arguments are start, end, block number; presumably begin/end/cargo in that order -- confirm
+ }
+ }
+ resize(dgs,nSeq);
+ resize(dpms,nSeq);
+ for(unsigned int i=0;i<nSeq;i++){
+ unsigned center = length(seqSet[i])/2; //midpoint of the sequence, passed as the last argument of createIntervalTree
+ createIntervalTree(dgs[i],dpms[i],dintervals[i],center);
+ //intervals for sequence i are not needed anymore
+ clear(dintervals[i]);
+ }
+ std::cerr << "Saving interval tree done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cerr << "Sorting vertices on each seq" << std::endl;
+ //Group the vertices that have at least one edge by sequence id
+ std::vector<std::vector<TVertexDescriptor> > vseqs;
+ vseqs.resize(nSeq);
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator it(g);
+ for(;!atEnd(it);goNext(it)) {
+ TVertexDescriptor v = *it;
+ if(degree(g,v)>0){
+ assert(sequenceId(g,*it)<vseqs.size());
+ vseqs[sequenceId(g,v)].push_back(v);
+ }
+ }
+ for(unsigned int i=0;i<nSeq;i++){
+ //Order each group with vertexposcmp
+ sort(vseqs[i].begin(),vseqs[i].end(),vertexposcmp<TGraph>(g));
+ }
+ std::cerr << "Sorting vertices done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ unsigned int lcbid=0; //advanced only when an LCB is actually aligned (see the ++lcbid in the alignSingleLCB call)
+ //
+ //Loop over each LCB and align
+ std::cerr << "Aligning " << LCBs.size() << " LCBs" << std::endl;
+ //For tracking if an anchor has been aligned in an LCB
+ std::set<TVertexDescriptor> coveredSet;
+ // TODO, for parallel mugsy, refactor into a sub-process and parallel loop
+ for(typename std::vector<std::vector<TVertexDescriptor> >::iterator lit = LCBs.begin();lit!=LCBs.end();lit++){
+ std::cout << "LCB:" << lcbid << " num_anchors:" << lit->size() << std::endl;
+ std::cout << "LCB of size " << lit->size() << std::endl;
+ std::cout << "LCB Initializing segments for LCB" << std::endl;
+ if(lcbid%1000==0){ //progress dot on stdout
+ std::cout << ".";
+ }
+ //Matches, scores, seqs, ids for current LCB
+ typedef String<Fragment<> > TFragmentString;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef String<TScoreValue> TScoreValues;
+ typedef String<TAlphabet> TSequence;
+ TFragmentString currmatches;
+ TScoreValues currscores;
+ StringSet<TSequence, Owner<> > currseqs;
+ TNames currnameSet;
+ //For tracking substrings
+ //std::map<TSize,unsigned int> offsets;
+ //std::map<TSize,unsigned int> spanlens;
+ //std::map<TSize,unsigned int> seqlens;
+ //std::map<TSize,char> orients;
+ std::map<TSize,s_offset> offsets; //keyed by the index into seqSet/sequenceNames (read below as offsets[row])
+ //Copy links between set of vertices in LCB $lit
+ //from Graph $g and store in $currmatches,$currscores,$currseqs
+ std::vector<TVertexDescriptor> currlcb;
+ std::set<TVertexDescriptor> currlcbset; //not referenced again in this function
+ //Unless allownestedlcbs, each anchor vertex can contribute to
+ //exactly one LCB; the longest LCB spanning the anchor
+ //If the vertex is reported in subsequent LCBs it will be skipped
+ if(msaOpt.allownestedlcbs == "true"){
+ //skip checks if anchor has already been aligned
+ //default for allownestedlcbs is false
+ }
+ else{
+ for(std::vector<unsigned int>::const_iterator vit = lit->begin();vit!=lit->end();++vit){
+ if(coveredSet.find(*vit)==coveredSet.end()){
+ currlcb.push_back(*vit);
+ }
+ }
+ *lit=currlcb; //the caller's LCB list is overwritten with the filtered vertices
+ }
+ retrieveLCBSegments(g, //presumably fills currseqs, currmatches, currscores, currnameSet and offsets and extends coveredSet -- confirm against its definition
+ seqSet,
+ vertexOrientMap,
+ lit,
+ lcbid,
+ sequenceNames,
+ currseqs,
+ currmatches,
+ currscores,
+ currnameSet,
+ offsets,
+ coveredSet,
+ vseqs,
+ boost::lexical_cast<int>(msaOpt.minlength));
+ //TODO, Add matches for duplications and overlapping matches
+ std::cout << "LCB: " << lcbid
+ << " vertices:" << lit->size()
+ << " seqset:" << length(seqSet)
+ << " offsets:" << offsets.size()
+ << " currseqs:" << length(currseqs)
+ << " sequenceNames:" << length(sequenceNames)
+ << " currnameset:" << length(currnameSet)
+ << std::endl;
+ assert(length(sequenceNames)==length(seqSet));
+ assert(length(currnameSet)==length(currseqs));
+ if(length(currseqs)>1 && length(currmatches)>0){ //an LCB with fewer than two sequences or without matches is skipped and does not consume an lcbid
+ assert(length(currmatches)>0);
+ std::cout << "LCB Building alignment graph" << std::endl;
+ //Build new graph from matches
+ //Since LCBs contain no reversals
+ //All matches should be relative to the forward/leading strand only
+ //(non-reversed here)
+ TGraph currG(currseqs);
+ buildAlignmentGraph(currmatches, currscores, currG, FractionalScore());
+ std::cout << "Graph built V:" << numVertices(currG) << " E:" << numEdges(currG)
+ << " number of seqs:" << length(currseqs)
+ << " number of matches:" << length(currmatches)
+ << " number of scores:" << length(currscores)
+ << std::endl;
+ //Double check edge weights
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ TEdgeIterator itE(currG);
+ //Undo Hack that stores reverse complement matches using
+ //negative edge weights
+ for(;!atEnd(itE);goNext(itE)){
+ if(cargo(value(itE))<0){
+ cargo(value(itE)) = cargo(value(itE))*-1;
+ }
+ }
+ //TESTING Test code to detect additional matches between
+ //disconnected vertices in the LCB
+ bool hashnonmatches=false; //hard-coded off, so the branch below never runs
+ if(hashnonmatches){
+ std::map<std::string,std::vector<TVertexDescriptor> > vhash; //fragment text -> edge-less vertices carrying exactly that text
+ typename std::map<std::string,std::vector<TVertexDescriptor> >::iterator vhashpos;
+ bool inserted;
+ //Attempt to combine non-matching vertices
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ std::cout << "Attempting to add addl edges" << std::endl;
+ int newedges=0;
+ TVertexIterator itV(currG);
+ for(;!atEnd(itV);goNext(itV)){
+ if(degree(currG,*itV)==0){
+ String<char> sseq = infix(currseqs[sequenceId(currG,*itV)],fragmentBegin(currG,*itV),fragmentBegin(currG,*itV)+fragmentLength(currG,*itV));
+ char * c = toCString(sseq);
+ std::string cstr(c);
+ std::cout << sequenceId(currG,*itV) << " " << cstr << std::endl;
+ boost::tie(vhashpos, inserted) = vhash.insert(std::make_pair(cstr,std::vector<TVertexDescriptor>()));
+ vhashpos->second.push_back(*itV);
+ }
+ }
+ for(typename std::map<std::string,std::vector<TVertexDescriptor> >::iterator hit=vhash.begin();hit!=vhash.end();++hit){
+ for(typename std::vector<TVertexDescriptor>::iterator vit1=hit->second.begin();vit1!=hit->second.end();++vit1){
+ for(typename std::vector<TVertexDescriptor>::iterator vit2=vit1+1;vit2!=hit->second.end();++vit2){
+ assert(vit1!=vit2);
+ addEdge(currG,*vit1,*vit2,1); //every pair of identical fragments gets an edge with cargo 1
+ newedges++;
+ }
+ }
+ }
+ std::cout << "Added " << newedges << " new edges" << std::endl;
+ }
+ TEdgeIterator itE2(currG);
+ for(;!atEnd(itE2);goNext(itE2)){
+ assert(cargo(value(itE2))>0);
+ }
+ //Map between LCB array (curr*) and segment graph array
+ std::vector<unsigned int> curroffsets;
+ std::vector<unsigned int> currspanlens;
+ std::vector<unsigned int> currseqlens;
+ std::vector<char> currorients;
+ assert(length(currseqs)==length(currnameSet));
+ currorients.resize(length(currnameSet));
+ currseqlens.resize(length(currnameSet));
+ currspanlens.resize(length(currnameSet));
+ curroffsets.resize(length(currnameSet));
+ String<int> relevant_segments; //results of the duplication-tree queries below
+ std::set<unsigned int> currgenomes; //List of sequence ids
+ std::fstream strminfo; //per-LCB mapping file lcbgraph<lcbid>.info; lcbid here is the value before the ++lcbid below, so its number is one less than the matching .out file
+ std::string lcbgname = "lcbgraph"+boost::lexical_cast<std::string>(lcbid)+".info";
+ strminfo.open(lcbgname.c_str(), std::ios_base::out | std::ios_base::trunc);
+ //TODO, refactor using a id map
+ for(TSize currrow = 0; currrow<length(currnameSet); ++currrow) {
+ for(TSize row = 0; row<length(sequenceNames); ++row) {
+ if(currnameSet[currrow]==sequenceNames[row]){ //match an LCB-local row to its original row by name
+ strminfo << "SeqId:" << currrow << " " << currnameSet[currrow]
+ << " origId:" << row
+ << " offset:" << offsets[row].offset
+ << " orient:" << offsets[row].orient << std::endl;
+ curroffsets[currrow] = offsets[row].offset;
+ currspanlens[currrow] = offsets[row].spanlen;
+ currseqlens[currrow] = offsets[row].seqlen;
+ currorients[currrow] = offsets[row].orient;
+ //See if we overlap a duplication: query the tree at both ends of the span
+ findIntervals(dgs[row],dpms[row],offsets[row].offset,relevant_segments); //NOTE(review): if findIntervals resets its result argument, only the hits of the last call survive in relevant_segments -- confirm
+ findIntervals(dgs[row],dpms[row],offsets[row].offset+offsets[row].spanlen,relevant_segments);
+ currgenomes.insert(genomeNames[row]);
+ std::cout << currnameSet[currrow] << " " << currorients[currrow] << std::endl;
+ }
+ }
+ }
+ strminfo.close();
+ std::cout << "LCB Building alignment graph done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cout << "LCB Scoring matches" << std::endl;
+ //Use or build guide tree based on genome sequences
+ typedef double TDistanceValue;
+ Graph<Tree<TDistanceValue> > currguideTree;
+ assert(currgenomes.size()>0);
+ getGuideTree(genomeSeqSet,currgenomes,seqguideTrees,currguideTree);
+ TGraph currgOut(currseqs);
+ s_score sscores = alignSingleLCB(currG,
+ currgOut,
+ ++lcbid, //pre-increment: ids handed to alignSingleLCB start at 1, and the code below sees the incremented value
+ currseqs,
+ currguideTree,
+ msaOpt);
+#ifdef SCORING
+ //Map to orig seq ids
+ //TODO, refactor using a id map
+ String<TSize> colCount;
+ resize(colCount,nSeq+1); //NOTE(review): no fill value is given and the loop below accumulates with += -- confirm that resize zero-initialises the elements
+ for(TSize i=0;i<nSeq;++i){
+ for(TSize j=0;j<length(currseqs);++j){
+ if(sequenceNames[i]==currnameSet[j]){
+ colCount[i+1] += sscores.colCount[j+1];
+ }
+ }
+ }
+ sscores.colCount=colCount;
+ allscores.push_back(sscores);
+ if(msaOpt.unique == "true"){
+ //Record the aligned spans of this LCB (saveInterval is called with dup left at its default)
+ saveInterval(aintervals,
+ currnameSet,
+ curroffsets,
+ currspanlens,
+ currseqlens,
+ currorients,
+ lcbid);
+ }
+ if(strmmaf.is_open()){
+ //Optionally write output
+ //mafformat defined in refinement/graph_impl_align.h
+ std::ostringstream lcblabel; //" label=<lcbid>" followed by one " dup=d<block>" per distinct entry of relevant_segments
+ lcblabel << " label=" << lcbid;
+ if(msaOpt.duplications == "true"){
+ std::set<int> dblocks; //de-duplicates the values collected in relevant_segments
+ for(unsigned int i=0;i<length(relevant_segments);i++){
+ dblocks.insert(relevant_segments[i]);
+ }
+ for(std::set<int>::iterator dit=dblocks.begin();dit!=dblocks.end();++dit){
+ //std::cout << "LCB:" <<lcbid << " overlaps duplicated block d" << *dit << std::endl;
+ lcblabel << " dup=d" << *dit;
+ }
+ }
+ write(strmmaf,currgOut,currnameSet,MafFormat(),curroffsets,currspanlens,currseqlens,currorients,lcblabel.str());
+ strmmaf.flush();
+ }
+ }
+ }
+ std::cerr << "Aligning LCBs done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ return allscores;
+// wholeGenomeAlignment
+//
+// Scores the existing match edges of the alignment graph g for consistency
+// (graphBasedTripletLibraryExtension; the edge count is asserted to be
+// unchanged), calls generateLCBs() and then alignLCBs(), handing the latter
+// the MAF output stream. Progress, timing and memory usage are logged to
+// std::cerr / cout.
+// When compiled with SCORING, totals over the per-LCB scores returned by
+// alignLCBs() are written to "<msaOpt.outfile>.scores".
+//
+// g               alignment graph whose edges are scored
+// seqSet          sequences; indexed 0..length(seqSet)-1
+// genomeSeqSet    forwarded to alignLCBs()
+// sequenceNames   one name per sequence, indexed like seqSet
+// genomeNames     forwarded to generateLCBs()/alignLCBs(); its length is
+//                 compared with the sequence count in the SCORING report
+// msaOpt          options; outfile, sc.data_gap_open and sc.data_gap_extend
+//                 are read directly here
+// LCBs            passed to generateLCBs() and then to alignLCBs()
+// vertexOrientMap forwarded to generateLCBs()/alignLCBs()
+// strmmaf         output stream handed to alignLCBs()
+// aintervals      forwarded to generateLCBs()/alignLCBs()
+template<typename TGraph,
+ typename TStringSet,
+ typename TStringSet2,
+ typename TNames,
+ typename TGenomeNames,
+ typename TScore,
+ typename TLCBs,
+ typename TVMap,
+ typename TStream,
+ typename TIMap>
+void wholeGenomeAlignment(TGraph &g,
+ TStringSet &seqSet,
+ TStringSet2 &genomeSeqSet,
+ TNames &sequenceNames,
+ TGenomeNames &genomeNames,
+ MsaOptions<Dna5, TScore> const& msaOpt,
+ TLCBs &LCBs,
+ TVMap &vertexOrientMap,
+ TStream &strmmaf,
+ TIMap &aintervals){
+ typedef Dna5 TAlphabet;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ //Perform consistency scoring
+ //Do not add any edges to the graph
+ //Simple score existing match edges for consistency
+ //extension preserves directionality of matches
+ //Edge weight > 0 same strand
+ //Edge weight < 0 opposite strand
+ std::cerr << "Performing consistency scoring for connected edges only" << std::endl;
+#ifdef NDEBUG
+ ;
+ unsigned int nEdges = numEdges(g);
+ //tripletLibraryExtensionCond(g,false);
+ //tripletLibraryExtension(g, genomeguideTree, threshold / 2);
+ graphBasedTripletLibraryExtension(g,false);
+ std::cerr << "Num edges after consistency scoring " << numEdges(g) << std::endl;
+ //Consistency scoring must only re-weight edges, never add or remove them
+ assert(nEdges==numEdges(g));
+ std::cerr << "Consistency scoring done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ double vm, rss;
+ process_mem_usage(vm, rss);
+ cout << "VM: " << vm << "; RSS: " << rss << endl;
+ /*
+ if(msaOpt.filter){
+ std::cerr << "Filtering segment graph" << std::endl;
+ filterSegmentGraph(g,seqSet,genomeNames,genomeguideTree);
+ std::cerr << "Filtering segment graph done: "
+ << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ }
+ */
+ //*******
+ //Generate LCBs
+ //
+ //std::vector<std::vector<TVertexDescriptor> > LCBs;
+ //std::map<TVertexDescriptor,char> vertexOrientMap;
+ #ifdef TIMING
+ time(&now);
+ std::cerr << "TIME ALIGNMENT_GRAPH:" << lasttime << " " << now << " " << now-lasttime << std::endl;
+ lasttime=now;
+ #endif
+ generateLCBs(g,
+ LCBs,
+ seqSet,
+ sequenceNames,
+ genomeNames,
+ vertexOrientMap,
+ aintervals,
+ msaOpt);
+ std::cerr << "Generating " << LCBs.size() << " alignments " << std::endl;
+ process_mem_usage(vm, rss);
+ cout << "VM: " << vm << "; RSS: " << rss << endl;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef typename EdgeType<TGraph>::Type TEdgeStump;
+ typename std::vector<std::vector<unsigned int> >::const_iterator lit;
+ typedef Fragment<> TFragment;
+ typedef String<TAlphabet> TSequence;
+ //Retrieve LCBs from the complete alignment graph $g
+ std::vector<s_score> allscores = alignLCBs(g,
+ LCBs,
+ seqSet,
+ genomeSeqSet,
+ sequenceNames,
+ genomeNames,
+ vertexOrientMap,
+ strmmaf,
+ aintervals,
+ msaOpt);
+ #ifdef TIMING
+ time(&now);
+ std::cerr << "TIME ALIGN_LCB:" << lasttime << " " << now << " " << now-lasttime << std::endl;
+ lasttime=now;
+ #endif
+ std::cerr << std::endl
+ << "Finished aligning LCBs: "
+ << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+#ifdef SCORING
+ //Accumulate the per-LCB scores into totals and write the summary report
+ std::cerr << "Calculating scores "<< std::endl;
+ typedef unsigned int TSize;
+ typedef typename Value<TScore>::Type TScoreValue;
+ TSize numGapEx = 0;
+ TSize numGap = 0;
+ TSize numPairs = 0;
+ TSize numIdents = 0;
+ TSize alignLen = 0;
+ TSize totalLen = 0;
+ TSize lcbCount = 0;
+ unsigned int minLen = std::numeric_limits<unsigned int>::max();
+ unsigned int maxLen = 0;
+ //colCount[k] / seqCount[k] are indexed by a sequence count k in 1..nSeq
+ String<TSize> colCount;
+ String<TSize> seqCount;
+ TSize alignScore=0;
+ TSize nSeq = length(seqSet);
+ TSize nGen = length(genomeNames);
+ fill(colCount,nSeq+1,0);
+ fill(seqCount,nSeq+1,0);
+ for(TSize i=0;i<nSeq;i++){
+ assert(colCount[i]==0);
+ assert(seqCount[i]==0);
+ }
+ for(std::vector<s_score>::iterator sit=allscores.begin();sit!=allscores.end();++sit){
+ TSize nSeqn = sit->seqCount;
+ seqCount[nSeqn]++;
+ lcbCount++;
+ alignScore += sit->alignScore;
+ numGap += sit->numGap;
+ numGapEx += sit->numGapEx;
+ numPairs += sit->numPairs;
+ numIdents += sit->numIdents;
+ minLen = (sit->alignLen < minLen) ? sit->alignLen : minLen;
+ maxLen = (sit->alignLen > maxLen) ? sit->alignLen : maxLen;
+ alignLen += sit->alignLen;
+ totalLen += sit->totalLen;
+ for(TSize i=0;i<nSeqn;++i){
+ colCount[i+1] += sit->colCount[i+1];
+ }
+ }
+ //With no LCBs minLen still holds its sentinel value; report an empty range instead
+ if(lcbCount==0){
+ minLen = 0;
+ }
+ std::string outfile(msaOpt.outfile);
+ std::fstream strmscore;
+ strmscore.open(std::string(outfile+".scores").c_str(), std::ios_base::out | std::ios_base::trunc);
+ TScoreValue gop = msaOpt.sc.data_gap_open;
+ TScoreValue gex = msaOpt.sc.data_gap_extend;
+ strmscore << "Num LCBs: " << lcbCount << std::endl;
+ //Guard the average against division by zero when there are no LCBs
+ strmscore << "Avg length: " << (lcbCount ? (float)alignLen/(float)lcbCount : 0.0f) << "bp Range:" << minLen << "-" << maxLen << "bp " << std::endl;
+ strmscore << "Total scoring summary over all LCBs" << std::endl;
+ strmscore << "SP alignment Score: " << alignScore << std::endl;
+ strmscore << "Alignment Length: " << alignLen << std::endl;
+ strmscore << "Sum of sequence length: " << totalLen << std::endl;
+ strmscore << "#Match-Mismatch pairs: " << numPairs << std::endl;
+ strmscore << "#Match pairs: " << numIdents << std::endl;
+ strmscore << "Score contribution by match-mismatch pairs: " << (alignScore - (((TScoreValue) numGap * gop) + ((TScoreValue) numGapEx * gex))) << std::endl;
+ strmscore << "#Gap extensions: " << numGapEx << std::endl;
+ strmscore << "Score contribution by gap extensions: " << ((TScoreValue) numGapEx * gex) << std::endl;
+ strmscore << "#Gap openings: " << numGap << std::endl;
+ strmscore << "Score contribution by gap openings: " << ((TScoreValue) numGap * gop) << std::endl;
+ //strmscore << "Average percent identity: " << << std::endl;
+ //strmscore << "Average percent aligned: " << << std::endl;
+ if(nSeq!=nGen){
+ strmscore << "WARNING: Some of the following stats are not calculated correctly for incomplete genomes" << std::endl;
+ }
+ strmscore << "Count of columns with identical characters " << std::endl;
+ for(TSize i=0;i<=nSeq;++i){
+ if(i!=0)
+ strmscore << " " << i << ":" << colCount[i];
+ }
+ strmscore << std::endl;
+ strmscore << "Counts of seqs per LCB " << std::endl;
+ for(TSize i=1;i<=nSeq;++i){
+ strmscore << " " << i << ":" << seqCount[i];
+ }
+ strmscore << std::endl;
+ strmscore << "Lengths per seq" << std::endl;
+ //Unsigned min/max so the sentinel and the comparisons share one type
+ TSize minseq = std::numeric_limits<TSize>::max();
+ TSize maxseq = 0;
+ int totalunaligned=0;
+ for(TSize i=0;i<nSeq;++i){
+ TSize seqlen = length(seqSet[i]);
+ minseq = (minseq < seqlen) ? minseq : seqlen;
+ maxseq = (maxseq > seqlen) ? maxseq : seqlen;
+ strmscore << sequenceNames[i] << " len: " << length(seqSet[i]) << " unaligned lcb,bp,%: " << " aligned lcb,bp,%: " << endl;
+ //totalunaligned+=
+ }
+ //TODO these stats only work for completed genomes
+ //At least two sequences are needed for a pairwise estimate
+ if(length(sequenceNames)==length(genomeNames) && nSeq>=2){
+ //Binomial coefficient C(nSeq,2) = nSeq*(nSeq-1)/2, the number of
+ //unordered sequence pairs. Computed directly in floating point:
+ //going through nSeq! overflows int once nSeq exceeds 12.
+ double npairs = ((double)nSeq * (double)(nSeq-1)) / 2.0;
+ double possiblematchpairsmin = npairs*(double)minseq;
+ double possiblematchpairsaln = npairs*(double)alignLen;
+ assert(possiblematchpairsmin>0);
+ assert(possiblematchpairsaln>0);
+ strmscore << "Estimate of average %id (using min seq len) " << (float)numIdents / (float)possiblematchpairsmin << std::endl;
+ strmscore << "Estimate of average %id (using aln len) " << (float)numIdents / (float)possiblematchpairsaln << std::endl;
+ strmscore << "Estimate of average %aln (using min seq len) " << (float)numPairs / (float)possiblematchpairsmin << std::endl;
+ strmscore << "Estimate of average %aln (using aln len) " << (float)numPairs / (float)possiblematchpairsaln << std::endl;
+ strmscore << "Estimate of overall %id (maxseq/alignLen+unaligned) " << (float)maxseq / (float)(alignLen+totalunaligned) << std::endl;
+ strmscore << std::endl;
+ }
+ strmscore.close();
+ std::cout << std::endl
+ << "Finished calculating scores "
+ << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+// printUniques
+//
+// Writes to strmmaf, for every sequence in seqSet, the stretches that lie
+// between aligned intervals as single-sequence MAF blocks
+// ("a score=0 label=u<N> mult=1" followed by one "s" line). A sequence with
+// no entry in aintervals is written in full under "label=0".
+// Trace output (OPEN:/CLOSE:/...) goes to std::cout.
+//
+// seqSet         sequences, indexed like sequenceNames
+// sequenceNames  names used as keys into aintervals
+// aintervals     per-sequence list of markers; each list is sorted in place
+//                with poscmp<TLoc>. Each marker has a coordinate (.first),
+//                a type code (.second) and a .blocknum. Type codes:
+//                  -1 duplication end
+//                   0 align end
+//                   1 duplication start
+//                   2 align start
+// strmmaf        output stream
+//
+// NOTE(review): every fully unaligned sequence gets the same "label=0",
+// unlike the u<N> labels used elsewhere - confirm whether that is intended.
+template<typename TStringSet,
+ typename TNames,
+ typename TName,
+ typename TLoc,
+ typename TStream>
+void printUniques(TStringSet &seqSet,
+ TNames &sequenceNames,
+ std::map<TName,std::vector<TLoc> >&aintervals,
+ TStream &strmmaf){
+ //Running number for the u<N> labels, shared across all sequences
+ int icount=0;
+ for(int i=0;i<(int)length(seqSet);i++){
+ typename std::map<TName,std::vector<TLoc> >::iterator ait=aintervals.find(sequenceNames[i]);
+ if(ait!=aintervals.end()){
+ sort(ait->second.begin(),ait->second.end(),poscmp<TLoc>());
+ //last: coordinate of the most recent end marker
+ //open: number of alignments currently open
+ //indup: number of duplications currently open
+ int last=0;
+ int open=0;
+ int indup=0;
+ //Block numbers of the duplications currently open
+ std::vector<int> currdups;
+ //pair<coord,start_end>
+ strmmaf << std::endl;
+ //An empty marker list has no last element to check
+ assert(ait->second.empty() || ait->second.back().first<=(int)length(seqSet[i]));
+ for(typename std::vector<TLoc>::iterator pit = ait->second.begin();pit!=ait->second.end();pit++){
+ //Start of alignment
+ assert(pit->first>=0);
+ assert((unsigned)pit->first<=length(seqSet[i]));
+ if(pit->second>0){
+ if(pit->second==1){//duplication start
+ indup++;
+ currdups.push_back(pit->blocknum);
+ }
+ else{
+ std::cout << "OPEN:" << open << " type:" << pit->second << " seq: " << ait->first << " coord:" << pit->first << " last_close:" << last
+ << " spanlen: " << length(infix(seqSet[i],last,pit->first))
+ << " == " << pit->first - last
+ << " indup:" << indup <<std::endl;
+ //Only emit the gap when no other alignment is still open over it
+ if(open==0){
+ if(pit->first-last>0){
+ if(indup){
+ //print as dup
+ //std::cout << "DUP" << std::endl;
+ strmmaf << "a score=0 label=u" << icount++ << " mult=1 dup=";
+ for(std::vector<int>::iterator it =currdups.begin();it!=currdups.end();++it){
+ strmmaf << "d" << *it;
+ if(it+1!=currdups.end()){
+ strmmaf << ",";
+ }
+ }
+ strmmaf << std::endl;
+ }
+ else{
+ strmmaf << "a score=0 label=u" << icount++ << " mult=1" << std::endl;
+ }
+ assert((int)length(infix(seqSet[i],last,pit->first)) == pit->first - last);
+ strmmaf << "s " << ait->first << " " << last << " " << pit->first - last << " + "
+ << length(seqSet[i]) << " " << infix(seqSet[i],last,pit->first)
+ << std::endl
+ << std::endl;
+ }
+ }
+ if(pit->second==2){
+ open++;
+ }
+ }
+ }
+ else{
+ if(pit->second==-1){//duplication stop
+ //Guard against an end marker without a matching start:
+ //pop_back() on an empty vector is undefined and a negative
+ //indup would read as "inside a duplication" below.
+ //NOTE(review): this assumes duplications close in LIFO order - confirm
+ if(indup>0){
+ indup--;
+ }
+ if(!currdups.empty()){
+ currdups.pop_back();
+ }
+ }
+ if(pit->second==0){//End of alignment
+ open--;
+ }
+ last=pit->first;
+ std:: cout << "CLOSE: " << open << " type:" << pit->second << " coord:" << last << std::endl;
+ }
+ if(last<0){
+ last=0;
+ }
+ assert(open>=0);
+ assert(last<=(int)length(seqSet[i]));
+ }
+ //assert(open==0);
+ //Trailing stretch after the last end marker up to the end of the sequence
+ if(last && length(seqSet[i])-last>0 && open==0){
+ std::cout << "Printing to end of sequence " << last << "-" << length(seqSet[i]) << std::endl;
+ if(indup){
+ //print as dup
+ //std::cout << "DUP" << std::endl;
+ strmmaf << "a score=0 label=u" << icount++ << " mult=1 dup=";
+ for(std::vector<int>::iterator it =currdups.begin();it!=currdups.end();++it){
+ strmmaf << "d" << *it;
+ if(it+1!=currdups.end()){
+ strmmaf << ",";
+ }
+ }
+ strmmaf << std::endl;
+ }
+ else{
+ strmmaf << "a score=0 label=u" << icount++ << " mult=1" << std::endl;
+ }
+ assert(length(infix(seqSet[i],last,length(seqSet[i]))) == length(seqSet[i]) - last);
+ strmmaf << "s " << ait->first << " " << last << " " << length(seqSet[i]) - last << " + "
+ << length(seqSet[i]) << " " << infix(seqSet[i],last,length(seqSet[i]))
+ << std::endl
+ << std::endl;
+ }
+ }
+ else{
+ std::cout << "No alignment on sequence " << sequenceNames[i] << std::endl;
+ std::cout << "Printing entire sequence" << std::endl;
+ strmmaf << "a score=0 label=0 mult=1" << std::endl;
+ strmmaf << "s " << sequenceNames[i] << " 0 " << length(seqSet[i]) << " + "
+ << length(seqSet[i]) << " " << seqSet[i]
+ << std::endl
+ << std::endl;
+ }
+ }
+// singlepass_wholeGenomeAlignment
+//
+// Top-level driver: reads segment matches from the first file in
+// msaOpt.alnfiles, builds the alignment graph, logs coverage and distance
+// statistics to std::cerr, dumps the graph to "origrefinegraph.out" in the
+// working directory, then calls wholeGenomeAlignment() with output going to
+// "<msaOpt.outfile>.maf". If msaOpt.unique == "true" the remaining unaligned
+// sequence is appended through printUniques().
+//
+// gAlign         cleared and re-assigned to sequenceSet; its string set
+//                is the sequence set used throughout
+// sequenceSet    input sequences (also passed to the alignment reader)
+// sequenceNames  one name per sequence
+// genomeNames    genomeNames[i] is the genome index of sequence i; the
+//                number of genomes is taken as the largest index + 1
+// aintervals     forwarded to wholeGenomeAlignment() and printUniques()
+// msaOpt         options; distance, minlength, alnfiles, rescore, sc,
+//                outfile and unique are read here
+template<typename TStringSet,
+ typename TCargo,
+ typename TSpec,
+ typename TStringSet1,
+ typename TNames,
+ typename TGenomeNames,
+ typename TIntervals,
+ typename TScore>
+singlepass_wholeGenomeAlignment(Graph<Alignment<TStringSet, TCargo, TSpec> >& gAlign,
+ TStringSet1& sequenceSet,
+ TNames& sequenceNames,
+ TGenomeNames& genomeNames,
+ TIntervals &aintervals,
+ MsaOptions<Dna5 , TScore> const& msaOpt)
+ typedef Dna5 TAlphabet;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef typename Size<TStringSet>::Type TSize;
+ typedef typename Value<TStringSet1>::Type TString;
+ typedef typename Value<TNames>::Type TName;
+ //typedef Graph<Alignment<TStringSet, TSize> > TGraph;
+ //Using int to support negative edge scores
+ typedef Graph<Alignment<TStringSet, int> > TGraph;
+ typedef typename Id<TGraph>::Type TId;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ typedef double TDistanceValue;
+ std::cerr << "Mugsy WGA" << std::endl;
+ std::cerr << "Reading sequences and alignments " << std::endl;
+ std::cerr << "--distance=" << msaOpt.distance << std::endl;
+ std::cerr << "--minlength=" << msaOpt.minlength << std::endl;
+ // Initialize alignment object
+ clear(gAlign);
+ assignStringSet(gAlign, sequenceSet);
+ // Some alignment constants
+ TStringSet& seqSet = stringSet(gAlign);
+ TSize nSeq = length(seqSet);
+ //Number of genomes = largest genome index + 1
+ TSize nGenomes=0;
+ for(TSize i=0;i<length(genomeNames);i++){
+ nGenomes = (genomeNames[i] > nGenomes) ? genomeNames[i] : nGenomes;
+ }
+ nGenomes = nGenomes+1;
+ std::cerr << "Number of genomes:" << nGenomes << std::endl;
+ std::cerr << "Number of sequences:" << nSeq << std::endl;
+ // Containers for segment matches and corresponding scores
+ typedef String<Fragment<> > TFragmentString;
+ TFragmentString matches;
+ typedef String<TScoreValue> TScoreValues;
+ TScoreValues scores;
+ // Include segment matches from subalignments
+ if (!empty(msaOpt.alnfiles)) {
+ typedef typename Iterator<String<std::string>, Standard>::Type TIter;
+ TIter begIt = begin(msaOpt.alnfiles, Standard() );
+ //TIter begItEnd = end(msaOpt.alnfiles, Standard() );
+ //Only read first alignment file
+ //for(;begIt != begItEnd; goNext(begIt)) {
+ std::cerr << "*Alignment file XMFA format: " << (*begIt).c_str() << std::endl;
+ std::ifstream strm_lib;
+ strm_lib.open((*begIt).c_str(), ::std::ios_base::in | ::std::ios_base::binary);
+ read(strm_lib, matches, scores, sequenceSet, sequenceNames, MultiFastaAlign());
+ strm_lib.close();
+ // }
+ }
+ /*
+ //TODO, read mummer for defining MUMi
+ // Include MUMmer segment matches
+ if (!empty(msaOpt.mummerfiles)){
+ std::cout << "Parsing MUMmer segment matches:" << std::endl;
+ String<char> mummerFiles = value(msaOpt.mummerfiles);
+ String<char> currentMumFile;
+ for(TSize i = 0; i<=length(mummerFiles); ++i) {
+ if ((i == length(mummerFiles) || (value(mummerFiles, i) == ','))) {
+ std::cout << "*MUMmer file: " << currentMumFile << std::endl;
+ std::stringstream input;
+ input << currentMumFile;
+ std::ifstream strm_lib;
+ strm_lib.open(input.str().c_str(), std::ios_base::in | std::ios_base::binary);
+ read(strm_lib, matches, scores, seqSet, sequenceNames, MummerLib());
+ strm_lib.close();
+ clear(currentMumFile);
+ } else {
+ if ((value(mummerFiles, i) != ' ') && (value(mummerFiles, i) != '\t')) appendValue(currentMumFile, value(mummerFiles, i));
+ }
+ }
+ std::cout << "Parsing done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ }
+ */
+ std::cerr << "Reading alignments done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ double vm, rss;
+ process_mem_usage(vm, rss);
+ cout << "VM: " << vm << "; RSS: " << rss << endl;
+ //Build StringSet for each genome
+ TStringSet1 genomeSeqSet;
+ TSize seqSetLen = length(seqSet);
+ //genomeMap[genome] lists the indices of the sequences belonging to it
+ std::vector<std::vector<TSize> > genomeMap;
+ std::map<TSize,TSize> genomeLenMap;
+ genomeMap.resize(nGenomes);
+ for(TSize i=0;i<seqSetLen;++i){
+ genomeMap[genomeNames[i]].push_back(i);
+ }
+ TSize mapLen=genomeMap.size();
+ resize(genomeSeqSet,mapLen,Exact());
+ for(TSize i=0;i<mapLen;++i){
+ //Concatenate all sequences for the genome
+ for(typename std::vector<TSize>::iterator sit = genomeMap[i].begin();sit != genomeMap[i].end();++sit){
+ append(genomeSeqSet[i],seqSet[*sit]);
+ }
+ }
+ //Save sum of genome lengths for later
+ for(unsigned int i=0;i<length(genomeSeqSet);++i){
+ genomeLenMap[i]=length(genomeSeqSet[i]);
+ }
+ std::cerr << "Building guide trees" << std::endl;
+ /*
+ // Set-up a distance matrix
+ typedef String<TDistanceValue> TDistanceMatrix;
+ TDistanceMatrix distanceMatrix;
+ clear(distanceMatrix);
+ //Calculate initial
+ //Guide tree over all genomes
+ typedef Graph<Tree<TDistanceValue> > TGuideTree;
+ TGuideTree genomeguideTree;
+ TSize ktup=3; //3mers
+ getKmerSimilarityMatrix(genomeSeqSet, distanceMatrix, ktup, TAlphabet());
+ // Similarity to distance conversion
+ typedef typename Value<TDistanceMatrix>::Type TValue;
+ typedef typename Iterator<TDistanceMatrix, Standard>::Type TMatrixIterator;
+ TMatrixIterator matIt = begin(distanceMatrix, Standard());
+ TMatrixIterator endMatIt = end(distanceMatrix, Standard());
+ for(;matIt != endMatIt;++matIt)
+ *matIt = SEQAN_DISTANCE_UNITY - (*matIt);
+ if (msaOpt.build == 0) njTree(distanceMatrix, genomeguideTree);
+ else if (msaOpt.build == 1) upgmaTree(distanceMatrix, genomeguideTree, UpgmaMin());
+ else if (msaOpt.build == 2) upgmaTree(distanceMatrix, genomeguideTree, UpgmaMax());
+ else if (msaOpt.build == 3) upgmaTree(distanceMatrix, genomeguideTree, UpgmaAvg());
+ else if (msaOpt.build == 4) upgmaTree(distanceMatrix, genomeguideTree, UpgmaWeightAvg());
+ clear(distanceMatrix);
+ */
+ std::cerr << "Building guide trees done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ //*******
+ //Build alignment graph
+ //
+ // Use these segment matches for the initial alignment graph
+ std::cerr << "Building alignment graph from " << length(matches) << " matches" << std::endl;
+ TGraph g(seqSet);
+ if (!msaOpt.rescore) buildAlignmentGraph(matches, scores, g, FractionalScore() );
+ else buildAlignmentGraph(matches, scores, g, msaOpt.sc, ReScore() );
+ //clear these here to save memory
+ clear(matches);
+ clear(scores);
+ std::cerr << "Building alignment graph done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ process_mem_usage(vm, rss);
+ cout << "VM: " << vm << "; RSS: " << rss << endl;
+ std::cerr << std::endl << "Refined alignment graph built. E: " << numEdges(g) << " V:" << numVertices(g) << std::endl;
+ //Stats
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator itV(g);
+ unsigned totalmatchingbp=0;
+ unsigned totalseqlen=0;
+ //Only vertices with at least one edge count as matching bp
+ for(;!atEnd(itV);goNext(itV)){
+ if(degree(g,*itV)>0){
+ totalmatchingbp+=fragmentLength(g,*itV);
+ }
+ }
+ for(unsigned int i=0;i<seqSetLen;i++){
+ totalseqlen+=length(seqSet[i]);
+ }
+ //Divide in floating point (the integer quotient was being truncated
+ //before the cast) and avoid dividing by zero on an empty graph
+ std::cerr << "Average fragment length: "
+ << (numVertices(g)>0 ? (float)totalmatchingbp/(float)numVertices(g) : 0.0f)
+ << "bp" << std::endl;
+ std::cerr << "Percentage matching bp:"
+ << totalmatchingbp << "/" << totalseqlen
+ << "=" << (float)totalmatchingbp/totalseqlen
+ << std::endl;
+ //Calculate a distance measure similar to MUMi
+ //Print range
+ //flengths maps an ordered genome pair (smaller index first) to the
+ //summed fragment length of the edges connecting the two genomes
+ std::map<std::pair<int,int>,int > flengths;
+ std::map<std::pair<int,int>,int>::iterator pos;
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ TEdgeIterator itE(g);
+ TVertexDescriptor source,target;
+ TEdgeDescriptor ed;
+ bool inserted=false;
+ TSize sgen,tgen;
+ for(;!atEnd(itE);goNext(itE)){
+ ed = *itE;
+ source = getSource(ed);
+ target = getTarget(ed);
+ sgen = genomeNames[sequenceId(g,source)];
+ tgen = genomeNames[sequenceId(g,target)];
+ if(sgen>tgen){
+ TSize tmp=sgen;
+ sgen=tgen;
+ tgen=tmp;
+ }
+ assert(fragmentLength(g,source)==fragmentLength(g,target));
+ //TODO, fix may count same vertex more than once
+ boost::tie(pos, inserted) = flengths.insert(std::make_pair(std::make_pair(sgen,tgen),fragmentLength(g,source)));
+ if(!inserted){
+ pos->second+=fragmentLength(g,source);
+ }
+ }
+ float minnia=2;
+ float maxnia=0;
+ float minnim=2;
+ float maxnim=0;
+ for(std::map<std::pair<int,int>,int>::iterator mit = flengths.begin();mit!=flengths.end();++mit){
+ //Average in floating point so an odd length sum is not truncated
+ float avgsize = (genomeLenMap[mit->first.first]+genomeLenMap[mit->first.second])/2.0f;
+ float minsize = genomeLenMap[mit->first.first] < genomeLenMap[mit->first.second] ? genomeLenMap[mit->first.first] : genomeLenMap[mit->first.second];
+ assert(avgsize>0);
+ float nia = 1 - (mit->second/avgsize);
+ float nim = 1 - (mit->second/minsize);
+ minnia = (minnia < nia) ? minnia : nia;
+ maxnia = (maxnia > nia) ? maxnia : nia;
+ minnim = (minnim < nim) ? minnim : nim;
+ maxnim = (maxnim > nim) ? maxnim : nim;
+ }
+ flengths.clear();
+ //clear(genomeSeqSet);
+ std::cerr << "D=1-Lseq/Lavg min-max: " << minnia << "-" << maxnia << std::endl;
+ std::cerr << "D=1-Lseq/Lmin min-max: " << minnim << "-" << maxnim << std::endl;
+ //
+ //Dump the graph in raw format to a fixed file name in the working directory
+ std::fstream rawstrm;
+ rawstrm.open("origrefinegraph.out", std::ios_base::out | std::ios_base::trunc);
+ write(rawstrm,g,sequenceSet,Raw());
+ rawstrm.close();
+ //*******
+ //
+ //TODO
+ //testing partitioning
+ //This partitioning is just for testing.
+ //The refined graph is already partitioned.
+ //Ideally matches could be filtered and partitioned prior to building the large alignment graph
+ //Convert graph back to matches
+ /*
+ TFragmentString filtmatches;
+ std::vector<TFragmentString> matchSets;
+ if(doPartitioning){
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ TEdgeIterator itE(g);
+ for(;!atEnd(itE);goNext(itE)){
+ TEdgeDescriptor ed = *itE;
+ TVertexDescriptor vd1 = getSource(ed);
+ TVertexDescriptor vd2 = getTarget(ed);
+ appendValue(filtmatches, Fragment<>(sequenceId(g,vd1),
+ fragmentBegin(g,vd1),
+ sequenceId(g,vd2),
+ fragmentBegin(g,vd2),
+ fragmentLength(g,vd1),
+ ((int)(cargo(ed)<0)) ? true : false));
+ }
+ partitionSegments(seqSet,filtmatches,matchSets,msaOpt.partition);
+ for(typename std::vector<TFragmentString>::iterator it = matchSets.begin();
+ it!=matchSets.end();it++){
+ String<Fragment<> > matchset = *it;
+ std::cout << "Length of matchset " << length(matchset) << std::endl;
+ }
+ }
+ else{
+ //Using full graph
+ }
+ */
+ std::fstream strmmaf;
+ std::string outfile(msaOpt.outfile);
+ strmmaf.open(std::string(outfile+".maf").c_str(), std::ios_base::out | std::ios_base::trunc);
+ //Report an unwritable output file; writes to a failed stream are silently dropped
+ if(!strmmaf.is_open()){
+ std::cerr << "ERROR: unable to open " << outfile << ".maf for writing" << std::endl;
+ }
+ _streamWrite(strmmaf,"##maf version=1 scoring=mugsy");
+ typedef String<unsigned int> TComponentMap;
+ typedef typename Value<TComponentMap>::Type TComponent;
+ typedef typename Position<TGraph>::Type TPos;
+ typedef SVABlock<TComponent,unsigned,TVertexDescriptor,unsigned> TBlock;
+ std::vector<std::vector<TVertexDescriptor> > lcbs;
+ std::map<TVertexDescriptor,char> vertexOrientMap;
+ //LCBs are saved in lcbsp
+ //Optionally, can also store profiles and write directly to strmmaf
+ wholeGenomeAlignment(g,
+ seqSet,
+ genomeSeqSet,
+ sequenceNames,
+ genomeNames,
+ msaOpt,
+ lcbs,
+ vertexOrientMap,
+ strmmaf,
+ aintervals);
+ //Close out MAF
+ strmmaf << std::endl;
+ //loop over all profiles and print
+ //Print all remaining unaligned sequences
+ //TODO broken in refactor, fix
+ if(msaOpt.unique == "true"){
+ printUniques(seqSet,sequenceNames,aintervals,strmmaf);
+ }
+ //Close output streams
+ strmmaf.close();
+ std::cerr << "Mugsy all done" << std::endl;
+//Input all pairwise matches in duplicated regions
+//Build refinement graph to reduce into non-overlapping segments
+//Sort over each sequence and build runs of regions < DUP_ADJ
+//Save runs of length > DUP_CMB
+template<typename TStringSet,
+ typename TCargo,
+ typename TSpec,
+ typename TStringSet1,
+ typename TNames,
+ typename TGenomeNames,
+ typename TScore,
+ typename TIntervals>
+inline void
+findDuplications(Graph<Alignment<TStringSet, TCargo, TSpec> >& gAlign,
+ TStringSet1& sequenceSet,
+ TNames& sequenceNames,
+ TGenomeNames& genomeNames,
+ TIntervals& dupintervals,
+ MsaOptions<Dna5 , TScore> const& msaOpt)
+ typedef Dna5 TAlphabet;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef typename Size<TStringSet>::Type TSize;
+ typedef typename Value<TStringSet1>::Type TString;
+ typedef typename Value<TNames>::Type TName;
+ //Using int to support negative edge scores
+ typedef Graph<Alignment<TStringSet, int> > TGraph;
+ typedef typename Id<TGraph>::Type TId;
+ typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
+ typedef typename EdgeDescriptor<TGraph>::Type TEdgeDescriptor;
+ //
+ typedef std::map<unsigned int, unsigned int> TComponentLength;
+ // Strongly Connected Components, topological sort, and length of each component
+ typedef String<unsigned int> TComponentMap;
+ typedef typename Value<TComponentMap>::Type TComponent;
+ typedef typename Position<TGraph>::Type TPos;
+ typedef SVABlock<TComponent,TSize,TVertexDescriptor,TPos> TBlock;
+ typedef typename Value<TComponentMap>::Type TComponent;
+ typedef std::pair<TId, TSize> TKey;
+ typedef std::map<TKey, TVertexDescriptor> TPosToVertexMap;
+ typedef FragmentInfo<TId, TSize> TFragmentInfo;
+ typedef double TDistanceValue;
+ std::cerr << "Detecting duplications " << std::endl;
+ // Initialize alignment object
+ clear(gAlign);
+ assignStringSet(gAlign, sequenceSet);
+ // Some alignment constants
+ TStringSet& seqSet = stringSet(gAlign);
+ TSize nSeq = length(seqSet);
+ TSize nGenomes=0;
+ for(TSize i=0;i<length(genomeNames);i++){
+ nGenomes = (genomeNames[i] > nGenomes) ? genomeNames[i] : nGenomes;
+ }
+ nGenomes = nGenomes+1;
+ std::cerr << "Number of genomes:" << nGenomes << std::endl;
+ std::cerr << "Number of sequences:" << nSeq << std::endl;
+ // Set-up a distance matrix
+ typedef String<TDistanceValue> TDistanceMatrix;
+ TDistanceMatrix distanceMatrix;
+ // Containers for segment matches and corresponding scores
+ typedef String<Fragment<> > TFragmentString;
+ TFragmentString matches;
+ typedef String<TScoreValue> TScoreValues;
+ TScoreValues scores;
+ // Include segment matches from subalignments
+ if (!empty(msaOpt.alnfiles)) {
+ typedef typename Iterator<String<std::string>, Standard>::Type TIter;
+ TIter begIt = begin(msaOpt.alnfiles, Standard() );
+ TIter begItEnd = end(msaOpt.alnfiles, Standard() );
+ goNext(begIt);//alignment XMFA is second alignment file passed
+ for(;begIt != begItEnd; goNext(begIt)) {
+ std::cerr << "*Alignment file XMFA format: " << (*begIt).c_str() << std::endl;
+ std::ifstream strm_lib;
+ strm_lib.open((*begIt).c_str(), ::std::ios_base::in | ::std::ios_base::binary);
+ //defined in graph_align_tcoffee_io.h
+ read(strm_lib, matches, scores, sequenceSet, sequenceNames, MultiFastaAlign());
+ strm_lib.close();
+ //clear(alignmentFile);read(strm_lib, matches, scores, sequenceNames, FastaAlign());
+ }
+ }
+ else{
+ assert(false);
+ }
+ std::cerr << "Building alignment graph" << std::endl;
+ TGraph g(seqSet);
+ //defined in graph_align_tcoffee_base.h
+ if (!msaOpt.rescore) buildAlignmentGraph(matches, scores, g, FractionalScore() );
+ else buildAlignmentGraph(matches, scores, g, msaOpt.sc, ReScore() );
+ //clear(matches);
+ //clear(scores);
+ std::cerr << "Building alignment graph done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cerr << std::endl << "Refined alignment graph built. E: " << numEdges(g) << " V:" << numVertices(g) << std::endl;
+ //Stats
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator itV(g);
+ unsigned totalmatchingbp=0;
+ unsigned totalseqlen=0;
+ TSize seqSetLen = length(seqSet);
+ for(;!atEnd(itV);goNext(itV)){
+ if(degree(g,*itV)>0){
+ totalmatchingbp+=fragmentLength(g,*itV);
+ }
+ }
+ for(unsigned int i=0;i<seqSetLen;i++){
+ totalseqlen+=length(seqSet[i]);
+ }
+ std::cerr << "Average fragment length: "
+ << (float)(totalmatchingbp/numVertices(g))
+ << "bp" << std::endl;
+ std::cerr << "Percentage matching bp:"
+ << totalmatchingbp << "/" << totalseqlen
+ << "=" << (float)totalmatchingbp/totalseqlen
+ << std::endl;
+ std::fstream rawstrm;
+ rawstrm.open("origrefinegraph.out", std::ios_base::out | std::ios_base::trunc);
+ write(rawstrm,g,sequenceSet,Raw());
+ rawstrm.close();
+ std::cout << "Finding connected components " << std::endl;
+ std::cerr << "Determining duplicated regions" << std::endl;
+ //Sequence set that will capture each copy of the duplication
+ TStringSet1 runSeqSet;
+ //StringSet<TString, TSpec> runSeqSet;
+ Graph<Directed<> > runG;
+ // Connected Components
+ // Each CC represents an UNGAPPED set of aligned fragments/segments across sequences
+ // A CC is an ungapped block
+ // A CC is also an LCB at this point of the algorithm but may be extended
+ TComponentMap componentall;
+ std::map<std::pair<TComponent,TComponent>,TBlock *> componentVertexMap;
+ std::vector<std::vector<TBlock> > blocksbycomponent;
+ //TSize numComponents = connected_components(g, componentall);
+ //TSize numComponents = connected_components_by_genome_ranked(g, componentall, genomeNames,std::numeric_limits<unsigned int>::max());
+ TSize numComponents = connected_components_ranked(g, componentall);
+ std::cerr << "Determined " << numComponents << " component segments in graph of size " << numVertices(g) << std::endl;
+ assert(numComponents>0);
+ //std::cerr << "Calculating positional scores" <<std::endl;
+ //scorePosCons(g,componentall,numComponents,posScores,POS_ADJ);
+ //std::cerr << "Set positional scores" << std::endl;
+ //Identify runs
+ int POS_CMB = 100;
+ std::map<TSize,std::vector<TVertexDescriptor> > componentSeqMap;
+ std::map<TComponent,std::vector<TVertexDescriptor> > componentMap;
+ typename TPosToVertexMap::const_iterator it2 = g.data_pvMap.begin();
+ typename TPosToVertexMap::const_iterator it2End = g.data_pvMap.end();
+ for(it2 = g.data_pvMap.begin();it2!=it2End;++it2) {
+ TVertexDescriptor currV = it2->second;
+ assert(getProperty(componentall,currV)==componentall[currV]);
+ TSize currentSeq = sequenceId(g,currV);
+ typename std::map<TSize,std::vector<TVertexDescriptor> >::iterator fit = componentSeqMap.find(currentSeq);
+ if(fit==componentSeqMap.end()){
+ componentSeqMap[currentSeq] = std::vector<TVertexDescriptor>();
+ }
+ componentSeqMap[currentSeq].push_back(currV);
+ TComponent c = getProperty(componentall, currV);
+ if(componentMap.find(c)==componentMap.end()){
+ componentMap[c] = std::vector<TVertexDescriptor>();
+ }
+ componentMap[c].push_back(currV);
+ }
+ std::map<TSize,TSize> seqIdxMap;
+ std::map<TVertexDescriptor,TSize> runmap;
+ std::map<TSize,std::vector<TVertexDescriptor> > vrunmap;
+ int runcount=0;
+ for(typename std::map<TSize,std::vector<TVertexDescriptor> >::iterator it = componentSeqMap.begin();it!=componentSeqMap.end();++it){
+ std::set<std::pair<int,int> > runs;
+ TSize currentSeq = it->first;
+ //std::cout << "Examining sequence " << currentSeq
+ // << " with num vertices:" << it->second.size() << std::endl;
+ //Sort vertices in G on sequence currentSeq
+ std::set<TSize> repeatCC;
+ std::vector<TVertexDescriptor> vlist;
+ int lastcoord=0;
+ int runstart=0;
+ int runend=0;
+ sort(it->second.begin(),it->second.end(),vertexposcmp<TGraph>(g));
+ for(typename std::vector<TVertexDescriptor>::iterator vit = it->second.begin();vit!=it->second.end();++vit){
+ TVertexDescriptor currV = *vit;
+ TComponent c = getProperty(componentall, *vit);
+ //Only consider segments that are part of matches
+ if(componentMap[c].size()>1){
+ if(lastcoord>0){
+ int dist = fragmentBegin(g,*vit)-lastcoord;
+ //it->first.first is CC label
+ if(dist>(int)POS_CMB
+ || repeatCC.find(c)!=repeatCC.end()){
+ runend = lastcoord;
+ if(runend - runstart > POS_CMB){
+ //std::cout << runstart << " runstart " << runstart << " runend: " << runend << " len:" << runend - runstart << std::endl;
+ runs.insert(std::make_pair(runstart,runend));
+ for(unsigned i=0;i<vlist.size();++i){
+ runmap[vlist[i]] = runcount;
+ assert(sequenceId(g,vlist[i])==currentSeq);
+ }
+ if(vrunmap.find(runcount)==vrunmap.end()){
+ vrunmap[runcount] = std::vector<TVertexDescriptor>();
+ }
+ vrunmap[runcount].insert(vrunmap[runcount].end(),vlist.begin(),vlist.end());
+ addVertex(runG);
+ seqIdxMap[runcount]=currentSeq;
+ runcount++;
+ }
+ repeatCC.clear();
+ vlist.clear();
+ runstart = fragmentBegin(g,*vit);
+ }
+ }
+ lastcoord = fragmentBegin(g,*vit)+fragmentLength(g,*vit);
+ //it->first.first is CC labe
+ repeatCC.insert(c);
+ vlist.push_back(currV);
+ //std::cout << "Last coord:" << lastcoord << " component:" << c << " size:" << componentMap[c].size() << std::endl;
+ }
+ }
+ //assert(runcount+1==runs.size());
+ //A run is a list of CC
+ //Create a new seqSet that contains all the runs
+ //for(typename std::set<std::pair<int,int> >::iterator rit = runs.begin();rit != runs.end();++rit){
+ //std::cout << "Seq: " << currentSeq
+ // << rit->first << "-" << rit->second << " "
+ // << rit->second - rit->first << std::endl;
+ //TString newseq=seqSet[i];
+ //For each run, create a new seqset
+ //appendValue(runSeqSet,seqSet[currentSeq]);
+ //addVertex(runG);
+ //}
+ //std::cout << "Current seq run count " << runcount << " " << " . Total runs " << runs.size() << std::endl;
+ }
+ //
+ //Each run represents a copy of a duplicated region
+ //We will determine the copies that need to be aligned and store them in an LCB.
+ //Also stated, an LCB is a set of runs, where each run is a set of vertices in G(0).
+ //
+ //Determining the LCBs that represent duplications
+ //
+ //After the runs have been defined
+ //an LCB is simply a list of all vertices in G(0) reachable in the run; ie. all members of the CCs in that run
+ //
+ //
+ //To obtain the list of runs that comprise an LCB:
+ //Build a graph G(l) where each node is a runidx and an edge connects any two runidx that share a ccidx from G(0)
+ //To find the LCBs, simply determine the connected components in this graph G(l). Each component corresponds to an LCB
+ //and the list of vertices is obtained from G(0) using ccidx at each node
+ //foreach run in runs
+ // addvertex(run,g.l)
+ //
+ //foreach v1 in cc
+ // foreach v2 in cc
+ // if(runmap[v1]!=runmap[v2])
+ // addedge(runv1,runv2,g.l)
+ //
+ //numcc = connected_components(rccmap,g.l)
+ //lcbs.resize(numcc)
+ //
+ //foreach runv (g.l)
+ // //save all vertices associated with
+ // lcbidx <- rccmap[runv]
+ // lcbs[lcbidx].push_back(runv)
+ //Next, the vertices need to be updated to map to unique sequence ids for each run.
+ //foreach lcb (lcbs)
+ // numruns <- lcbs[lcb].size()
+ // Built new set set
+ // Build a new graph
+ // foreach run (lcbs[lcb])
+ // foreach v (vmap[run])
+ // seqIdMap[v] = runmap[v]
+ // lcbv.push_back(v)
+ //At this point, runSeqSet should be populated
+ //graph G(0) should be updated to reference sequence ids in runSeqSet
+ //seqIdMap should be an identity map
+ //retrieveLCBSegments()
+ //Probably not necessary
+ //foreach match (currmatches)
+ // fragment(match,0).seq = sequenceid[fragment(match,0).seq]
+ // fragment(match,1).seq = sequenceid[fragment(match,1).seq]
+ //buildAlignmentGraph()
+ TComponentMap runccmap;
+ //Add edges between any 2 runs that are connected in G(0)
+ //Edge iterator
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ TEdgeIterator itE1(g);
+ for(;!atEnd(itE1);++itE1){
+ TEdgeDescriptor ed = *itE1;
+ TVertexDescriptor source = getSource(ed);
+ TVertexDescriptor target = getTarget(ed);
+ if(runmap.find(source)!=runmap.end()
+ && runmap.find(target)!=runmap.end()){
+ if(runmap[source]!=runmap[target]){
+ addEdge(runG,runmap[source],runmap[target]);
+ }
+ else{
+ //std::cout << runmap[source] << " " << runmap[target] << " s:" << sequenceId(g,source) << " " << sequenceId(g,target) <<std::endl;
+ assert(runmap[source]==runmap[target]);
+ assert(sequenceId(g,source)==sequenceId(g,target));
+ }
+ }
+ }
+ int numcc = connected_components(runG,runccmap);
+ std::cerr << "Determined " << numcc << " LCBs from a graph of runs: " << numVertices(runG) << std::endl;
+ typedef std::vector<TVertexDescriptor> TLCB;
+ std::vector<std::vector<TVertexDescriptor> > runs;
+ std::vector<std::vector<TVertexDescriptor> > LCBs;
+ //List of runs in an LCB
+ runs.resize(numcc);
+ //List of vertices in an LCB
+ LCBs.resize(numcc);
+ for(unsigned i=0;i<numVertices(runG);i++){
+ int lcbidx = runccmap[i];
+ //std::cout << "LCB " << lcbidx << " contains run " << i << std::endl;
+ runs[lcbidx].push_back(i);
+ }
+ std::map<TSize,TSize> seqIdMap;
+ //std::cout << "LCBs " << runs.size() << std::endl;
+ for(unsigned j=0;j<runs.size();j++){
+ int lcbidx = j;
+ int numruns = runs[j].size();
+ //std::cout << "LCB " << j << " runs " << numruns << std::endl;
+ for(int k=0;k<numruns;k++){
+ int runidx = runs[j][k];
+ LCBs[lcbidx].insert(LCBs[lcbidx].end(),vrunmap[runidx].begin(),vrunmap[runidx].end());
+ for(unsigned i=0;i<vrunmap[runidx].size();i++){
+ TVertexDescriptor currV = vrunmap[runidx][i];
+ //std::cout << "CurrV: " << currV << " runidx: " << runidx << " " << " seq:" << sequenceId(g,currV) << std::endl;
+ g.data_fragment[currV].data_seq_id = runidx;
+ //std::cout << "CurrV: " << currV << " runidx: " << runidx << " " << " seq:" << sequenceId(g,currV) << std::endl;
+ assert(sequenceId(g,currV)==(unsigned)runidx);
+ }
+ seqIdMap[runidx] = runidx;
+ }
+ }
+ //Set vertex orientation
+ std::map<TVertexDescriptor,char> vertexOrientMap;
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator itV2(g);
+ for(;!atEnd(itV2);goNext(itV2)){
+ vertexOrientMap[*itV2] = '+';
+ }
+ //Trim graph and remove vertices that are not part of runs
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator itv(g);
+ std::vector<TVertexDescriptor> removeV;
+ for(;!atEnd(itv);goNext(itv)) {
+ TVertexDescriptor currV = *itv;
+ if(runmap.find(currV)==runmap.end()){
+ removeV.push_back(currV);
+ }
+ }
+ for(typename std::vector<TVertexDescriptor>::iterator vit = removeV.begin();vit!=removeV.end();++vit){
+ removeVertex(g,*vit);
+ }
+ //At this point, runSeqSet should be populated
+ //graph G(0) should be updated to reference sequence ids in runSeqSet
+ //seqIdMap should be an identity map
+ resize(runSeqSet,numVertices(runG));
+ TNames sequenceRunsNames;
+ //resize(sequenceRunsNames,numVertices(runG));
+ for(unsigned int i=0;i<numVertices(runG);i++){
+ std::string name(toCString(sequenceNames[seqIdxMap[i]]));
+ std::string count(boost::lexical_cast<std::string>(i));
+ appendValue(sequenceRunsNames,name+"_"+count);
+ }
+ //For tracking substrings
+ //std::map<TSize,unsigned int> offsets;
+ //std::map<TSize,unsigned int> spanlens;
+ //std::map<TSize,unsigned int> seqlens;
+ //std::map<TSize,char> orients;
+ typedef unsigned TSize2;
+ std::map<TSize2,s_offset> offsets;
+ //Copy links between set of vertices in LCB $lit
+ //from Graph $g and store in $currmatches,$currscores,$currseqs
+ blocksbycomponent.resize(numComponents);
+ convertCC2Blocks(g,
+ componentall,
+ componentVertexMap,
+ blocksbycomponent,
+ dupintervals,
+ sequenceNames);
+ std::cerr << "Determining duplicated regions done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+ std::cerr << "Aligning duplicated regions" << std::endl;
+ unsigned int lcbid=0;
+ std::fstream strmmaf;
+ std::string outfile(msaOpt.outfile);
+ strmmaf.open(std::string(outfile+".dups.maf").c_str(), std::ios_base::out | std::ios_base::trunc);
+ _streamWrite(strmmaf,"##maf version=1 scoring=mugsy");
+ //std::cout << "Iterating over LCBs" << std::endl;
+ for(typename std::vector<std::vector<TVertexDescriptor> >::iterator lit = LCBs.begin();lit!=LCBs.end();lit++){
+ //Matches, scores, seqs, ids for current LCB
+ typedef String<Fragment<> > TFragmentString;
+ typedef typename Value<TScore>::Type TScoreValue;
+ typedef String<TScoreValue> TScoreValues;
+ typedef String<TAlphabet> TSequence;
+ TFragmentString currmatches;
+ TScoreValues currscores;
+ StringSet<TSequence, Owner<> > currseqs;
+ std::set<unsigned int> curridset;
+ TNames currnameSet;
+ std::set<TVertexDescriptor> coveredSet;
+ std::vector<std::vector<TVertexDescriptor> > vseqs;
+ retrieveLCBSegments(g,
+ runSeqSet,
+ seqSet,
+ seqIdxMap,
+ vertexOrientMap,
+ lit,
+ ++lcbid,
+ sequenceRunsNames,
+ currseqs,
+ currmatches,
+ currscores,
+ currnameSet,
+ offsets,
+ coveredSet,
+ vseqs,
+ boost::lexical_cast<unsigned int>(msaOpt.minlength));
+ //std::cout << "Retrieving LCB segments for LCB " << lcbid << std::endl;
+ if(length(currseqs)>1 && length(currmatches)>0){
+ TGraph currG(currseqs);
+ buildAlignmentGraph(currmatches, currscores, currG, FractionalScore());
+ //Double check edge weights
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ TEdgeIterator itE(currG);
+ //Undo Hack that stores reverse complement matches using
+ //negative edge weights
+ for(;!atEnd(itE);goNext(itE)){
+ if(cargo(value(itE))<0){
+ cargo(value(itE)) = cargo(value(itE))*-1;
+ }
+ }
+ typedef double TDistanceValue;
+ Graph<Tree<TDistanceValue> > currguideTree;
+ Graph<Tree<TDistanceValue> > seqguideTree;
+ std::map<std::string, Graph<Tree<TDistanceValue> > > seqguideTrees;
+ //Build guide tree using current list of seqs
+ for(unsigned int i=0;i<length(currseqs);i++){
+ //std::cout << i << " " << length(currseqs[i]) << " " << ((curridset.find(i)!=curridset.end()) ? 1 : 0) << " " << currnameSet[i]<< std::endl;
+ curridset.insert(i);//force inclusion of this seq in building the guide tree
+ }
+ getGuideTree(currseqs,curridset,seqguideTrees,currguideTree);
+ typedef Fragment<> TFragment;
+ typedef String<TAlphabet> TSequence;
+ std::cout << "Aligning LCB " << lcbid << " with " << length(currseqs) << std::endl;
+ assert(curridset.size()>0);
+ TGraph currgOut(currseqs);
+ s_score sscores = alignSingleLCB(currG,
+ currgOut,
+ lcbid,
+ currseqs,
+ currguideTree,
+ msaOpt);
+ //Write MAF format
+ std::vector<unsigned int> curroffsets;
+ std::vector<unsigned int> currspanlens;
+ std::vector<unsigned int> currseqlens;
+ std::vector<char> currorients;
+ assert(length(currseqs)==length(currnameSet));
+ currorients.resize(length(currnameSet));
+ currseqlens.resize(length(currnameSet));
+ currspanlens.resize(length(currnameSet));
+ curroffsets.resize(length(currnameSet));
+ //TODO, refactor using a id map
+ for(TSize currrow = 0; currrow<length(currnameSet); ++currrow) {
+ for(TSize row = 0; row<length(sequenceRunsNames); ++row) {
+ if(currnameSet[currrow]==sequenceRunsNames[row]){
+ curroffsets[currrow] = offsets[row].offset;
+ currspanlens[currrow] = offsets[row].spanlen;
+ currseqlens[currrow] = offsets[row].seqlen;
+ currorients[currrow] = offsets[row].orient;
+ //reset name
+ currnameSet[currrow] = sequenceNames[seqIdxMap[row]];
+ }
+ }
+ }
+ saveInterval(dupintervals,
+ currnameSet,
+ curroffsets,
+ currspanlens,
+ currseqlens,
+ currorients,
+ lcbid,
+ true);
+ //mafformat defined in refinement/graph_impl_align.h
+ write(strmmaf,currgOut,currnameSet,MafFormat(),curroffsets,currspanlens,currseqlens,currorients,"label=d"+boost::lexical_cast<std::string>(lcbid));
+ strmmaf.flush();
+ }
+ }
+ strmmaf.close();
+ //Alignments of many duplicated regions tend to be fragmented on first pass
+ //Consider running refinement by default
+ //refineMSA(std::string(outfile+".dups.maf").c_str(),msaOpt);
+ //TODO
+ //Resolve repetitive clusters here
+ //(1)break edges with weak support from adjacent matches
+ //(2)determine mincut on repetitive clusters
+ typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
+ std::fstream dotstrm;
+ dotstrm.open("refinegraphpos.dot", std::ios_base::out | std::ios_base::trunc);
+ dotstrm << "graph g{" << std::endl;
+ typedef typename Iterator<TGraph, VertexIterator>::Type TVertexIterator;
+ TVertexIterator it(g);
+ for(;!atEnd(it);goNext(it)) {
+ dotstrm << *it << " [label=\"" << *it << " S" << sequenceId(g,*it) << ","<<fragmentBegin(g,*it) << ","<<fragmentLength(g,*it) << "\"];" << std::endl;
+ }
+ std::cerr << "Aligning duplicated regions done: " << SEQAN_PROTIMEUPDATE(__myProfileTime) << " seconds" << std::endl;
+// Add the version line to the command line parser.
+// The revision number is sliced out of the "$Revision: ... $" keyword string
+// at a fixed position: substr(11, 4) takes the four characters that follow
+// "$Revision: ", so a revision with more or fewer than four digits would be
+// cut incorrectly.
+inline void
+_addVersion(CommandLineParser& parser) {
+ ::std::string rev = "$Revision: 4637 $";
+ addVersionLine(parser, "Version 1.00 (10 Oct 2009) Revision: " + rev.substr(11, 4) + "");
+// Load all records of a FASTA file into `sequences`, `fastaIDs` and `genomes`.
+//
+// The header of each record is split on whitespace. When it contains at least
+// two tokens ("seqname genomename ..."), the first becomes the sequence id and
+// the second the genome name; otherwise the id produced by assignSeqId() is
+// stored in both. Records shorter than 3 characters are skipped; if any were
+// found, the three containers are rebuilt in a second pass without them.
+//
+// sequences, fastaIDs, genomes: output containers, resized to the number of
+//   records kept. `genomes` must be empty on entry (asserted).
+// fileName: path of the FASTA file.
+// Returns true when at least one record was kept, false when the file cannot
+// be opened or nothing usable remains.
+template <typename TSeqSet, typename TNameSet>
+bool _loadSequences(TSeqSet& sequences,
+                    TNameSet& fastaIDs,
+                    TNameSet& genomes,
+                    const char *fileName)
+{
+    assert(length(genomes)==0);
+    MultiFasta multiFasta;
+    if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false;
+    AutoSeqFormat format;
+    guessFormat(multiFasta.concat, format);
+    split(multiFasta, format);
+    unsigned seqCount = length(multiFasta);
+    resize(sequences, seqCount, Exact());
+    resize(fastaIDs, seqCount, Exact());
+    resize(genomes, seqCount, Exact());
+    unsigned skippedseqCount = 0;
+    for(unsigned i = 0; i < seqCount; ++i)
+    {
+        // Tokenise the header with std::string so that arbitrarily long
+        // tokens cannot overrun a fixed-size buffer.
+        std::string idline, seqname, genomename;
+        assignSeqId(idline, multiFasta[i], format);
+        std::istringstream header(idline);
+        if(header >> seqname >> genomename){
+            fastaIDs[i]=seqname.c_str();
+            genomes[i]=genomename.c_str();
+        }
+        else{
+            assignSeqId(fastaIDs[i], multiFasta[i], format);
+            assignSeqId(genomes[i], multiFasta[i],format);
+        }
+        assignSeq(sequences[i], multiFasta[i], format);
+        //SVA check for bad inputs here < kmer size
+        if(length(sequences[i])<3){
+            skippedseqCount++;
+        }
+    }
+    if(skippedseqCount>0){
+        // At least one record is too short: start over and keep only the
+        // records of length >= 3, packed contiguously.
+        clear(sequences);
+        clear(fastaIDs);
+        clear(genomes);
+        seqCount = length(multiFasta)-skippedseqCount;
+        //std::cerr << "Updated seqCount " << seqCount << ". Skipping" << skippedseqCount << std::endl;
+        resize(sequences, seqCount, Exact());
+        resize(fastaIDs, seqCount, Exact());
+        resize(genomes, seqCount, Exact());
+        unsigned sidx=0;
+        String<char> testseq;
+        unsigned oseqCount = length(multiFasta);
+        for(unsigned i = 0; i < oseqCount; ++i)
+        {
+            assignSeq(testseq, multiFasta[i], format);
+            //SVA check for bad inputs here < kmer size
+            if(length(testseq)>=3){
+                std::string idline, seqname, genomename;
+                assignSeqId(idline, multiFasta[i], format);
+                std::istringstream header(idline);
+                if(header >> seqname >> genomename){
+                    fastaIDs[sidx]=seqname.c_str();
+                    genomes[sidx]=genomename.c_str();
+                }
+                else{
+                    assignSeqId(fastaIDs[sidx], multiFasta[i], format);
+                    assignSeqId(genomes[sidx], multiFasta[i],format);
+                }
+                assignSeq(sequences[sidx], multiFasta[i], format);
+                sidx++;
+            }
+            else{
+                std::cerr << "Skipping sequence of length " << length(testseq) << std::endl;
+            }
+        }
+        assert(sidx==seqCount);
+    }
+    return (seqCount > 0);
+}
+// Alignment mode entry point: load the input sequences, optionally detect
+// duplications, run the single-pass whole genome alignment and, for output
+// formats 0 and 1, write the resulting alignment graph to msaOpt.outfile.
+//
+// msaOpt: fully initialised options (seqfile, outfile, duplications,
+//         outputFormat, ...).
+// Terminates the process with exit(1) when the sequences cannot be loaded or
+// the output file cannot be opened.
+template<typename TAlphabet, typename TScore>
+inline void
+customizedMsaAlignment(MsaOptions<TAlphabet, TScore> const& msaOpt) {
+    typedef String<TAlphabet> TSequence;
+    StringSet<TSequence, Owner<> > sequenceSet;
+    StringSet<String<char> > sequenceNames;
+    StringSet<String<char> > genomeNames;
+    // Without any usable input there is nothing to align; stop here instead
+    // of running the aligner on empty containers.
+    if (!_loadSequences(sequenceSet, sequenceNames, genomeNames, msaOpt.seqfile.c_str())) {
+        std::cerr << "ERROR: unable to load any sequences from " << msaOpt.seqfile << std::endl;
+        exit(1);
+    }
+    assert(length(sequenceNames)==length(sequenceSet));
+    for(unsigned int j = 0; j<length(sequenceNames); ++j) {
+        std::cout << j << " " << sequenceNames[j] << std::endl;
+        assert(value(sequenceNames,j)==sequenceNames[j]);
+    }
+    // Alignment of the sequences
+    Graph<Alignment<StringSet<TSequence, Dependent<> >, void, WithoutEdgeId> > gAlign;
+    typedef unsigned int TSize;
+    TSize gidx=0;
+    std::map<String<char>,TSize> genomeIdx;
+    String<TSize> genomeIndices;
+    //Convert names to indices: each distinct genome name gets the next free
+    //index, in order of first appearance
+    for(TSize i=0;i<length(genomeNames);++i){
+        TSize cidx;
+        if(genomeIdx.find(genomeNames[i])==genomeIdx.end()){
+            genomeIdx[genomeNames[i]]=gidx;
+            cidx=gidx;
+            gidx++;
+        }
+        else{
+            cidx=genomeIdx[genomeNames[i]];
+        }
+        appendValue(genomeIndices,cidx);
+    }
+    // Calc MSA
+    //Aligned intervals, used to determine remaining segments unaligned
+    typedef iloc TLoc;
+    std::map<String<char>,std::vector<TLoc> > aintervals;
+    if(msaOpt.duplications == "true"){
+        findDuplications(gAlign, sequenceSet, sequenceNames, genomeIndices, aintervals, msaOpt);
+    }
+    //if(msaOpt.partition >= 2){
+    //Testing new code
+    //Prototype only
+    //multipassprog_wholeGenomeAlignment(gAlign, sequenceSet, sequenceNames, genomeIndices, msaOpt);
+    //std::cerr << "Bad partition parameter " << msaOpt.partition << std::endl;
+    //exit(1);
+    //}
+    //else{
+    singlepass_wholeGenomeAlignment(gAlign, sequenceSet, sequenceNames, genomeIndices, aintervals, msaOpt);
+    //}
+    // Alignment output
+    if (msaOpt.outputFormat == 0 || msaOpt.outputFormat == 1) {
+        FILE* strmWrite = fopen(msaOpt.outfile.c_str(), "w");
+        if (strmWrite == NULL) {
+            // write()/fclose() on a NULL stream would crash; report instead.
+            std::cerr << "ERROR: unable to open output file " << msaOpt.outfile << " for writing" << std::endl;
+            exit(1);
+        }
+        if (msaOpt.outputFormat == 0) {
+            write(strmWrite, gAlign, sequenceNames, FastaFormat());
+        } else {
+            write(strmWrite, gAlign, sequenceNames, MsfFormat());
+        }
+        fclose(strmWrite);
+    }
+}
+// Generic overload: accepts options with any score type plus a match score,
+// ignores both and does nothing. The overload of _setMatchScore taking
+// MsaOptions<TAlphabet, Score<int, Simple> > is the one that stores the value.
+template<typename TAlphabet, typename TScore, typename TSc>
+inline void
+_setMatchScore(MsaOptions<TAlphabet, TScore>&, TSc) {
+ // No operation
+// Generic overload: accepts options with any score type plus a mismatch
+// score, ignores both and does nothing. The overload of _setMismatchScore
+// taking MsaOptions<TAlphabet, Score<int, Simple> > is the one that stores
+// the value.
+template<typename TAlphabet, typename TScore, typename TSc>
+inline void
+_setMismatchScore(MsaOptions<TAlphabet, TScore>&, TSc) {
+ // No operation
+// Overload for options using Score<int, Simple>: stores the match score `msc`
+// in msaOpt.sc.data_match.
+template<typename TAlphabet, typename TSc>
+inline void
+_setMatchScore(MsaOptions<TAlphabet, Score<int, Simple> >& msaOpt, TSc msc) {
+ msaOpt.sc.data_match = msc;
+// Overload for options using Score<int, Simple>: stores the mismatch score
+// `mmsc` in msaOpt.sc.data_mismatch.
+template<typename TAlphabet, typename TSc>
+inline void
+_setMismatchScore(MsaOptions<TAlphabet, Score<int, Simple> >& msaOpt, TSc mmsc) {
+ msaOpt.sc.data_mismatch = mmsc;
+// Evaluation mode: score every alignment block of the MAF file cfgOpt.infile.
+//
+// Each block is written to a temporary aligned-FASTA file under /tmp, read
+// back once as sequences and once as fragment matches, turned into an
+// alignment graph and scored with alignmentEvaluation(). The scoring
+// parameters, per-block statistics and the totals over all blocks are printed
+// to std::cout.
+//
+// cfgOpt: options object; only cfgOpt.infile is read here.
+// scType: scoring scheme (gap open/extend and substitution scores).
+// Dna5:   unnamed tag argument; the body works on Dna5 sequences.
+template<typename TConfigOptions, typename TScore>
+inline void
+evaluateAlignment(TConfigOptions const& cfgOpt, TScore const& scType, Dna5) {
+    struct mafFile *mf;
+    mf = mafOpen(cfgOpt.infile.c_str(), 0);
+    struct mafAli *a, *A, *last_a;
+    struct mafComp *c;
+    A = last_a = NULL;
+    // Read all blocks up front and chain them into a singly linked list
+    // headed by A.
+    while ((a = mafNext(mf)) != NULL) {
+        if ((c = a->components) == NULL)
+            assert(false);//fatal("empty maf entry");
+        if (last_a == NULL)
+            A = a;
+        else
+            last_a->next = a;
+        last_a = a;
+    }
+    if(A==NULL){
+        std::cout << "can't find any alignments" << std::endl;
+        // Release the reader on this path as well.
+        mafFileFree(&mf);
+    }
+    else{
+        int lcbid=0;
+        char chrName[200], species_name[200];
+        int COL_WIDTH=60;
+        // Block scores are signed (gap and mismatch scores are negative), so
+        // the running total must be signed too; an unsigned accumulator
+        // prints a negative total as a huge positive number.
+        long int totalscore=0;
+        long unsigned int totallen=0;
+        for (a = A; a != NULL; a = a->next) {
+            int ncol = a->textSize;
+            // Temporary file name: /tmp/MUGTMP<pid>_<block number>.eval.fsa
+            std::ostringstream tmpgraph;
+            tmpgraph << "MUGTMP" << getpid() << "_" << ++lcbid;
+            std::fstream strmfsa;
+            std::string fname(tmpgraph.str());
+            fname = "/tmp/"+fname + ".eval.fsa";
+            strmfsa.open(fname.c_str(), std::ios_base::out | std::ios_base::trunc);
+            for(c=a->components; c!=NULL; c=c->next) {
+                parseSrcName(c->src, species_name, chrName);
+                //Write FASTA, wrapping the aligned text at COL_WIDTH columns
+                strmfsa << ">" << c->src << std::endl ;
+                int col=0;
+                int j=0;
+                for (col = j = 0; j < ncol; ++j) {
+                    strmfsa << c->text[j];
+                    ++col;
+                    if (col == COL_WIDTH) {
+                        strmfsa << std::endl;
+                        col = 0;
+                    }
+                }
+                if (col != 0){
+                    strmfsa << std::endl;
+                }
+            }
+            strmfsa.close();
+            typedef typename Value<TScore>::Type TScoreValue;
+            typedef String<Dna5> TSequence;
+            typedef typename Size<TSequence>::Type TSize;
+            typedef String<char> TName;
+            StringSet<TSequence, Owner<> > origStrSet;
+            StringSet<TName> names;
+            // Read the sequences
+            std::fstream strm;
+            strm.open(fname.c_str(), std::ios_base::in | std::ios_base::binary);
+            read(strm,origStrSet,names,FastaAlign());
+            strm.close();
+            // Make a dependent StringSet
+            typedef StringSet<TSequence, Dependent<> > TDepSequenceSet;
+            TDepSequenceSet strSet(origStrSet);
+            // Read the alignment
+            typedef String<Fragment<> > TFragmentString;
+            String<TScoreValue> scores;
+            TFragmentString matches;
+            std::fstream strm_lib;
+            strm_lib.open(fname.c_str(), std::ios_base::in | std::ios_base::binary);
+            read(strm_lib,matches, scores, names, FastaAlign());
+            strm_lib.close();
+            // The temporary file has been read twice and is no longer needed.
+            unlink(fname.c_str());
+            // Build the alignment graph
+            typedef Graph<Alignment<TDepSequenceSet, TSize> > TGraph;
+            TGraph g(strSet);
+            buildAlignmentGraph(matches, g, FrequencyCounting() );
+            // Print the scoring information
+            TScoreValue gop = scType.data_gap_open;
+            TScoreValue gex = scType.data_gap_extend;
+            std::cout << "Scoring parameters:" << std::endl;
+            std::cout << "*Gap opening: " << gop << std::endl;
+            std::cout << "*Gap extension: " << gex << std::endl;
+            std::cout << "*Scoring matrix: " << std::endl;
+            TSize alphSize = ValueSize<Dna5>::VALUE;
+            std::cout << " ";
+            for(TSize col = 0; col<alphSize; ++col) std::cout << Dna5(col) << ',';
+            std::cout << std::endl;
+            for(TSize row = 0; row<alphSize; ++row) {
+                for(TSize col = 0; col<alphSize; ++col) {
+                    if (col == 0) std::cout << Dna5(row) << ": ";
+                    std::cout << score(scType, Dna5(row), Dna5(col));
+                    if (col < alphSize - 1) std::cout << ',';
+                }
+                std::cout << std::endl;
+            }
+            std::cout << std::endl;
+            // Print the alignment information
+            TSize numGapEx = 0;
+            TSize numGap = 0;
+            TSize numPairs = 0;
+            TSize alignLen = 0;
+            String<TSize> pairCount;
+            TScoreValue alignScore = alignmentEvaluation(g, scType, numGapEx, numGap, numPairs, pairCount, alignLen);
+            totalscore+=alignScore;
+            totallen+=alignLen;
+            std::cout << "Alignment Score: " << alignScore << std::endl;
+            std::cout << "Alignment Length: " << alignLen << std::endl;
+            std::cout << "#Match-Mismatch pairs: " << numPairs << std::endl;
+            std::cout << "Score contribution by match-mismatch pairs: " << (alignScore - (((TScoreValue) numGap * gop) + ((TScoreValue) numGapEx * gex))) << std::endl;
+            std::cout << "#Gap extensions: " << numGapEx << std::endl;
+            std::cout << "Score contribution by gap extensions: " << ((TScoreValue) numGapEx * gex) << std::endl;
+            std::cout << "#Gap openings: " << numGap << std::endl;
+            std::cout << "Score contribution by gap openings: " << ((TScoreValue) numGap * gop) << std::endl;
+            std::cout << std::endl;
+            std::cout << "#Pairs: " << std::endl;
+            std::cout << " ";
+            for(TSize col = 0; col<alphSize; ++col) std::cout << Dna5(col) << ',';
+            std::cout << std::endl;
+            // pairCount is indexed as row * alphSize + col.
+            for(TSize row = 0; row<alphSize; ++row) {
+                for(TSize col = 0; col<alphSize; ++col) {
+                    if (col == 0) std::cout << Dna5(row) << ": ";
+                    std::cout << value(pairCount, row * alphSize + col);
+                    if (col < alphSize - 1) std::cout << ',';
+                }
+                std::cout << std::endl;
+            }
+            /*
+            struct mafAli *nexta;
+            for (a = A; a != NULL; a = nexta) {
+            nexta=a->next;
+            mafAliFree(&a);
+            }
+            */
+        }
+        mafFileFree(&mf);
+        std::cout << "Total alignment score: " << totalscore << std::endl;
+        std::cout << "Total alignment length: " << totallen << std::endl;
+    }
+}
+// Build an MsaOptions object from the parsed command line and dispatch to the
+// selected mode: refinement or evaluation of an existing alignment when
+// --infile is set, whole genome alignment otherwise.
+//
+// parser: command line parser that has already parsed argv.
+// scMat:  scoring scheme copied into msaOpt.sc; its gap scores (and, for
+//         simple scores, match/mismatch scores) are then overwritten from
+//         the command line.
+// Prints the short help and calls exit(0) when neither --infile nor --seq is
+// given.
+template<typename TAlphabet, typename TScore>
+inline void
+_initMsaParams(CommandLineParser& parser, TScore& scMat) {
+    // Msa configuration
+    MsaOptions<TAlphabet, TScore> msaOpt;
+    // Set main options
+    getOptionValueLong(parser, "seq", msaOpt.seqfile);
+    getOptionValueLong(parser, "outfile", msaOpt.outfile);
+    // MUGSY specific options
+    getOptionValueLong(parser, "distance", msaOpt.distance);
+    getOptionValueLong(parser, "minlength", msaOpt.minlength);
+    getOptionValueLong(parser, "refine", msaOpt.refine);
+    getOptionValueLong(parser, "duplications", msaOpt.duplications);
+    getOptionValueLong(parser, "unique", msaOpt.unique);
+    getOptionValueLong(parser, "allownestedlcbs", msaOpt.allownestedlcbs);
+    getOptionValueLong(parser, "anchorwin", msaOpt.anchorwin);
+    getOptionValueLong(parser, "blockfile", msaOpt.blockfile);
+    getOptionValueLong(parser, "segmentation", msaOpt.segmentation);
+    // Any value other than none/enredo/mercator selects the built-in method.
+    if(msaOpt.segmentation != "none" && msaOpt.segmentation != "enredo" && msaOpt.segmentation != "mercator"){
+        msaOpt.segmentation = "mugsy";
+    }
+    String<char> optionVal;
+    getOptionValueLong(parser, "format", optionVal);
+    if (optionVal == "maf") msaOpt.outputFormat = 0;
+    else if (optionVal == "msf") msaOpt.outputFormat = 1;
+    // Split the comma separated --aln value into individual file names.
+    ::std::string tmpVal;
+    getOptionValueLong(parser, "aln", tmpVal);
+    unsigned int beg = 0;
+    for(unsigned int i = 0; i<tmpVal.length(); ++i) {
+        if (tmpVal[i] == ',') {
+            appendValue(msaOpt.alnfiles, tmpVal.substr(beg, i - beg));
+            beg = i + 1;
+        }
+    }
+    if (beg != tmpVal.length())
+        appendValue(msaOpt.alnfiles, tmpVal.substr(beg, tmpVal.length() - beg));
+    // Set scoring options
+    msaOpt.sc = scMat;
+    getOptionValueLong(parser, "gop", msaOpt.sc.data_gap_open);
+    getOptionValueLong(parser, "gex", msaOpt.sc.data_gap_extend);
+    int msc = 0;
+    getOptionValueLong(parser, "msc", msc);
+    _setMatchScore(msaOpt, msc);
+    int mmsc = 0;
+    getOptionValueLong(parser, "mmsc", mmsc);
+    _setMismatchScore(msaOpt, mmsc);
+    // Set guide tree options. The value has to be fetched from --build
+    // first; optionVal otherwise still holds the --format value and none of
+    // the comparisons below can ever match.
+    getOptionValueLong(parser, "build", optionVal);
+    if (optionVal == "nj") msaOpt.build = 0;
+    else if (optionVal == "min") msaOpt.build = 1;
+    else if (optionVal == "max") msaOpt.build = 2;
+    else if (optionVal == "avg") msaOpt.build = 3;
+    else if (optionVal == "wavg") msaOpt.build = 4;
+    // Set alignment evaluation options
+    getOptionValueLong(parser, "infile", msaOpt.infile);
+    // Check if any segment-match generation procedure is selected, otherwise set the default
+    if ((empty(msaOpt.alnfiles)) && (empty(msaOpt.method))) {
+        appendValue(msaOpt.method, 0);
+        appendValue(msaOpt.method, 1);
+    }
+    // Evaluation mode?
+    if (isSetLong(parser, "infile")) {
+        if(length(msaOpt.refine) > 0 && msaOpt.refine != "colinear"){ //Refinement mode
+            refineMSA(msaOpt.infile.c_str(),msaOpt);
+        }
+        else {
+            //typedef typename Value<TScore>::Type TScoreValue;
+            //TScore scType(boost::lexical_cast<int>(value(msaOpt, "msc")),
+            // boost::lexical_cast<int>(value(msaOpt, "mmsc")),-1 * boost::lexical_cast<int>(value(msaOpt, "gex")),-1 * boost::lexical_cast<int>(value(msaOpt, "gop")));
+            evaluateAlignment(msaOpt, msaOpt.sc, Dna5() );
+            //evaluateAlignment(msaOpt);
+        }
+    } else { // or alignment mode?
+        if (!isSetLong(parser, "seq")) {
+            shortHelp(parser, std::cerr); // print short help and exit
+            exit(0);
+        }
+        customizedMsaAlignment(msaOpt);
+    }
+}
+// Choose the scoring scheme for Dna5 input and hand over to _initMsaParams.
+// When the "matrix" option is set on the command line, its value is passed to
+// loadScoreMatrix() to fill a Score<int, ScoreMatrix<> >; otherwise a
+// default-constructed Score<int> is used. The second parameter is an unnamed
+// Dna5 tag that is not read in the body.
+inline void
+_initScoreMatrix(CommandLineParser& parser, Dna5 const) {
+ String<char> matrix;
+ getOptionValueLong(parser, "matrix", matrix);
+ if (isSetLong(parser, "matrix")) {
+ Score<int, ScoreMatrix<> > sc;
+ loadScoreMatrix(sc, matrix);
+ _initMsaParams<Dna5>(parser, sc);
+ } else {
+ Score<int> sc;
+ _initMsaParams<Dna5>(parser, sc);
+ }
+// Program entry point: declare the command line options, parse argv, verify
+// the installation (MUGSY_INSTALL and the synchain-mugsy executable) and start
+// the run through _initScoreMatrix().
+//
+// Returns 0 on success and when only help/version output was requested,
+// 1 when the command line cannot be parsed or synchain-mugsy is missing.
+// Calls exit(1) when MUGSY_INSTALL is unset or empty.
+int main(int argc, const char *argv[]){
+#ifdef TIMING
+    time(&now);
+    lasttime=now;
+#endif
+    //////////////////////////////////////////////////////////////////////////////
+    // Command line parsing
+    //////////////////////////////////////////////////////////////////////////////
+    std::string versionstring = std::string("1.3");
+    // Set the keys
+    CommandLineParser parser;
+    _addVersion(parser);
+    addTitleLine(parser, "*************************************************");
+    addTitleLine(parser, "* mugsyWGA *");
+    addTitleLine(parser, "* v"+versionstring+" *");
+    addTitleLine(parser, "* Multiple whole genome aligner *");
+    addTitleLine(parser, "* using graph based LCB identification *");
+    addTitleLine(parser, "* and Seqan::TCoffee *");
+    addTitleLine(parser, "*************************************************");
+    addUsageLine(parser, "-seq <multi-FASTA sequence file> -aln <Aligned pairwise FASTA library> [-distance <LCB chaining distance>] [-minlength <LCB minimum length>] [Other options]");
+    //Many config options lifted from seqan::tcoffee
+    addSection(parser, "Main Options:");
+    addOption(parser, addArgumentText(CommandLineOption("s", "seq", "multi-FASTA file with all input sequences. For draft genomes, FASTA headers should be in the form >seqname genomename.", OptionType::String), "<FASTA Sequence File>"));
+    addOption(parser, addArgumentText(CommandLineOption("al", "aln", "Library of pairwise alignments. Aligned multi-FASTA format (XMFA)", OptionType::String), "<File1>,<File2>,..."));
+    addOption(parser, addArgumentText(CommandLineOption("o", "outfile", "output filename prefix", (int)OptionType::String, "outfile"), "<Filename>"));
+    addOption(parser, addArgumentText(CommandLineOption("distance", "distance", "LCB chaining distance", (int)OptionType::String,"1000"), "<String>"));
+    addOption(parser, addArgumentText(CommandLineOption("minlength", "minlength", "Minimum LCB segment span", (int)OptionType::String,"100"), "<Int>"));
+    addOption(parser, addArgumentText(CommandLineOption("unique", "unique", "Report unique regions", OptionType::String,"true"), "[true|false]"));
+    addOption(parser, addArgumentText(CommandLineOption("duplications", "duplications", "Report duplications. Requires a second alignment file of pairwise duplications is passed to --aln. ", OptionType::String,"false"), "[true|false]"));
+    addSection(parser, "Other Options:");
+    addOption(parser, addArgumentText(CommandLineOption("f", "format", "output format", (int)OptionType::String, "maf"), "[maf | msf]"));
+    addOption(parser, addArgumentText(CommandLineOption("anchorwin", "anchorwin", "bp window to consider for collapsing anchors", (int)OptionType::Int,0), "<Int>"));
+    //synchain-mugsy can return overlapping and nested synteny blocks with the extent determined by --distance
+    //allownestedlcbs=false ensures each multi-genome anchor contributes to exactly one LCB; the longest LCB spanning the anchor
+    //The LCBs are sorted by length in descending order. Each anchor is
+    //removed from the anchor graph as soon as it is aligned in an LCB.
+    addOption(parser, addArgumentText(CommandLineOption("allownestedlcbs", "allownestedlcbs", "allow anchors to contribute to multiple LCBs. Default=false", OptionType::String,"false"), "[true|false]"));
+    addOption(parser, addArgumentText(CommandLineOption("refine", "refine", "refinement method: mugsy,fsa,pecan,mlagan", OptionType::String), "<String>"));
+    //addOption(parser, addArgumentText(CommandLineOption("poscorewindow", "psw", "posscorewindow", (int)OptionType::Int,1000), "<Int>"));
+    //addOption(parser, addArgumentText(CommandLineOption("possharedcutoff", "pscut", "possharedcutoff", (int)OptionType::Double,(double)0.1), "<Int>"));
+    addOption(parser, addArgumentText(CommandLineOption("segmentation", "segmentation", "Segmentation method. mugsy,enredo,mercator", OptionType::String), "<String>"));
+    addOption(parser, addArgumentText(CommandLineOption("blockfile", "blockfile", "Bypass segmentation and use this output file from synchain-mugsy", OptionType::String), "<String>"));
+    addSection(parser, "Scoring Options:");
+    addOption(parser, addArgumentText(CommandLineOption("g", "gop", "gap open penalty", (int)OptionType::Int, -13), "<Int>"));
+    addOption(parser, addArgumentText(CommandLineOption("e", "gex", "gap extension penalty", (int)OptionType::Int, -1), "<Int>"));
+    addOption(parser, addArgumentText(CommandLineOption("ma", "matrix", "score matrix", (int)OptionType::String, "Blosum62"), "<Matrix file>. Ignored."));
+    addOption(parser, addArgumentText(CommandLineOption("ms", "msc", "match score", (int)OptionType::Int, 5), "<Int>"));
+    addOption(parser, addArgumentText(CommandLineOption("mm", "mmsc", "mismatch penalty", (int)OptionType::Int, -4), "<Int>"));
+    addSection(parser, "Guide Tree Options:");
+    //addOption(parser, addArgumentText(CommandLineOption("u", "usetree", "tree filename", OptionType::String), "<Newick guide tree>"));
+    addOption(parser, addArgumentText(CommandLineOption("b", "build", "tree building method for progressive aln", (int)OptionType::String, "nj"), "[nj, min, max, avg, wavg]"));
+    addHelpLine(parser, "nj = Neighbor-joining");
+    addHelpLine(parser, "min = UPGMA single linkage");
+    addHelpLine(parser, "max = UPGMA complete linkage");
+    addHelpLine(parser, "avg = UPGMA average linkage");
+    addHelpLine(parser, "wavg = UPGMA weighted average linkage");
+    addHelpLine(parser, "Neighbor-joining creates an");
+    addHelpLine(parser, " unrooted tree. We root that tree");
+    addHelpLine(parser, " at the last joined pair.");
+    // Alignment evaluation
+    addSection(parser, "Alignment Evaluation Options:");
+    addOption(parser, addArgumentText(CommandLineOption("i", "infile", "alignment file", OptionType::String), "<FASTA alignment file>"));
+    if (argc == 1)
+    {
+        shortHelp(parser, std::cerr); // print short help and exit
+        return 0;
+    }
+    bool exitrun=false;
+    if (!parse(parser, argc, argv, ::std::cerr)) exitrun=true;
+    // Help or version was requested: stop here. This must not depend on
+    // MUGSY_INSTALL being set and must not fall through into an actual run.
+    // NOTE(review): assumes parse() has already emitted the help/version text.
+    if (isSetLong(parser, "help") || isSetLong(parser, "version")) return 0;
+    char * mugsyinstallstr = std::getenv("MUGSY_INSTALL");
+    if(mugsyinstallstr==NULL || strlen(mugsyinstallstr)==0){
+        std::cerr << "ERROR: Environment variable MUGSY_INSTALL must be set to the installation directory for mugsy" << std::endl;
+        exit(1);
+    }
+    assert(mugsyinstallstr != NULL);
+    std::string mugsyinstall = std::string(mugsyinstallstr);
+    assert(mugsyinstall.length()>0);
+    std::cerr << "Using MUGSY_INSTALL=" << mugsyinstall << std::endl;
+    //Check for chaining executable
+    struct stat st;
+    if(stat(std::string(mugsyinstall+"/synchain-mugsy").c_str(),&st) != 0){
+        std::cerr << "ERROR: MUGSY_INSTALL/synchain-mugsy not found. check installation at MUGSY_INSTALL=" << mugsyinstall << std::endl;
+        exitrun=true;
+    }
+    // Either the command line was invalid or the installation is incomplete.
+    if(exitrun){
+        return 1;
+    }
+    // Initialize scoring matrices
+    _initScoreMatrix(parser, Dna5());
+    return 0;
+}
diff --git a/mugsy-seqan/projects/library/apps/mugsy/rna_alphabet.h b/mugsy-seqan/projects/library/apps/mugsy/rna_alphabet.h
new file mode 100644
index 0000000..0f42883
--- /dev/null
+++ b/mugsy-seqan/projects/library/apps/mugsy/rna_alphabet.h
@@ -0,0 +1,305 @@
+ SeqAn - The Library for Sequence Analysis
+ http://www.seqan.de
+Copyright (C) 2007
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+// RNA5 Alphabet
+template <typename T = void>
+struct _Translate_Table_Rna5_2_Ascii
+ static char const VALUE[5];
+template <typename T>
+char const _Translate_Table_Rna5_2_Ascii<T>::VALUE[5] = {'A', 'C', 'G', 'U', 'N'};
+template <typename T = void>
+struct _Translate_Table_Rna_2_Ascii
+ static char const VALUE[4];
+template <typename T>
+char const _Translate_Table_Rna_2_Ascii<T>::VALUE[4] = {'A', 'C', 'G', 'U'};
+template <typename T = void>
+struct _Translate_Table_Byte_2_Rna5
+ static char const VALUE[256];
+template <typename T>
+char const _Translate_Table_Byte_2_Rna5<T>::VALUE[256] =
+ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //2
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //3
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //4
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //5
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //6
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //7
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //8
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //9
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //10
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //11
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //12
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //13
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //14
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //15
+template <typename T = void>
+struct _Translate_Table_Byte_2_Rna
+ static char const VALUE[256];
+template <typename T>
+char const _Translate_Table_Byte_2_Rna<T>::VALUE[256] =
+ 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //2
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //3
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //4
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //5
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //6
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //7
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //8
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //9
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //10
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //11
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //12
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //13
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //14
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //15
+template <typename T = void>
+struct _Translate_Table_Ascii_2_Rna5
+ static char const VALUE[256];
+template <typename T>
+char const _Translate_Table_Ascii_2_Rna5<T>::VALUE[256] =
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //0
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //1
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //2
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //3
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, //4
+// , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
+ 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //5
+// P, Q, R, S, T, U, V, W, X, Y, Z, , , , ,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, //6
+// , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
+ 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //7
+// p, q, r, s, t, u, v, w, x, y, z, , , , ,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //8
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //9
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //10
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //11
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //12
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //13
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //14
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 //15
+template <typename T = void>
+struct _Translate_Table_Ascii_2_Rna
+ static char const VALUE[256];
+template <typename T>
+char const _Translate_Table_Ascii_2_Rna<T>::VALUE[256] =
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //2
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //3
+ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, //4
+// , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
+ 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //5
+// P, Q, R, S, T, U, V, W, X, Y, Z, , , , ,
+ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, //6
+// , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
+ 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //7
+// p, q, r, s, t, u, v, w, x, y, z, , , , ,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //8
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //9
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //10
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //11
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //12
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //13
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //14
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //15
+struct _Rna {};
+typedef SimpleType<unsigned char,_Rna> Rna;
+template <> struct ValueSize< Rna > { enum { VALUE = 4 }; };
+template <> struct BitsPerValue< Rna > { enum { VALUE = 2 }; };
+struct _Rna5 {};
+typedef SimpleType<unsigned char, _Rna5> Rna5;
+template <> struct ValueSize< Rna5 > { enum { VALUE = 5 }; };
+template <> struct BitsPerValue< Rna5 > { enum { VALUE = 3 }; };
+//Rna assignment
+inline void
+assign(Ascii& target,
+ Rna const & source)
+ target = _Translate_Table_Rna_2_Ascii<>::VALUE[source.value];
+template <>
+struct CompareType<Rna, Byte> { typedef Rna Type; };
+inline void assign(Rna & target, Byte c_source)
+ target.value = _Translate_Table_Byte_2_Rna<>::VALUE[c_source];
+template <>
+struct CompareType<Rna, Ascii> { typedef Rna Type; };
+inline void assign(Rna & target, Ascii c_source)
+ target.value = _Translate_Table_Ascii_2_Rna<>::VALUE[(unsigned char)c_source];
+template <>
+struct CompareType<Rna, Unicode> { typedef Rna Type; };
+inline void assign(Rna & target, Unicode c_source)
+ target.value = _Translate_Table_Ascii_2_Rna<>::VALUE[(unsigned char) c_source];
+template <>
+struct CompareType<Rna, Rna5> { typedef Rna Type; };
+inline void assign(Rna & target, Rna5 const & c_source)
+ target.value = c_source.value & 0x03;
+//Rna5 assignment
+inline void
+assign(Ascii& target,
+ Rna5 const & source)
+ target = _Translate_Table_Rna5_2_Ascii<>::VALUE[source.value];
+template <>
+struct CompareType<Rna5, Byte> { typedef Rna5 Type; };
+inline void assign(Rna5 & target, Byte c_source)
+ target.value = _Translate_Table_Byte_2_Rna5<>::VALUE[c_source];
+template <>
+struct CompareType<Rna5, Ascii> { typedef Rna5 Type; };
+inline void assign(Rna5 & target, Ascii c_source)
+ target.value = _Translate_Table_Ascii_2_Rna5<>::VALUE[(unsigned char)c_source];
+template <>
+struct CompareType<Rna5, Unicode> { typedef Rna5 Type; };
+inline void assign(Rna5 & target, Unicode c_source)
+ target.value = _Translate_Table_Ascii_2_Rna5<>::VALUE[(unsigned char) c_source];
+// Comparisons between Rna5 and Rna are carried out in the Rna alphabet,
+// matching the CompareType<Rna, Rna5> specialization so that the result does
+// not depend on operand order. The previous typedef named Dna, presumably a
+// copy-paste leftover from the DNA alphabet definitions.
+template <>
+struct CompareType<Rna5, Rna> { typedef Rna Type; };
+inline void assign(Rna5 & target, Rna const & c_source)
+ target.value = c_source.value;
+#endif //#ifndef SEQAN_HEADER_...
diff --git a/mugsy-seqan/projects/library/apps/mugsy/transformcoords.h b/mugsy-seqan/projects/library/apps/mugsy/transformcoords.h
new file mode 100644
index 0000000..fc42939
--- /dev/null
+++ b/mugsy-seqan/projects/library/apps/mugsy/transformcoords.h
@@ -0,0 +1,36 @@
+struct mafAli
+/* A multiple alignment. */
+ struct mafAli *next;
+ double score;
+ struct mafComp *components; /* List of components of alignment */
+ int textSize; /* Size of text in each component. */
+ int chain_len;
+ int label;
+ char orient; /* Relative orientation of the reference */
+struct mafComp
+/* A component of a multiple alignment. */
+ {
+ struct mafComp *next;
+ char *name; /* Common name of sequence source. */
+ char *src; /* Name of sequence source. */
+ char *text; /* The sequence including dashes. */
+ char* contig;
+ int* mafPosMap;
+ int srcSize; /* Size of sequence source. */
+ int start; /* Start within sequence. Zero based. If strand is - is relative to src end. */
+ int size; /* Size in sequence (does not include dashes). */
+ short nameID;
+ char strand; /* Strand of sequence. Either + or -*/
+ char paralog;
+extern "C" void parseSrcName(char* srcName, char* name, char* src);
+extern "C" struct mafFile *mafOpen(const char *fileName, int verbose);
+extern "C" struct mafAli *mafNext(struct mafFile *mafFile);
+extern "C" void mafWrite(FILE *f, struct mafAli *maf);
+extern "C" void mafWriteStart(FILE *f, char *scoring);
+extern "C" void mafFileFree(struct mafFile **pObj);
+extern "C" void mafAliFree(struct mafAli **pObj);
diff --git a/mugsyWGA b/mugsyWGA
new file mode 120000
index 0000000..6b72076
--- /dev/null
+++ b/mugsyWGA
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/mugsyenv.sh b/mugsyenv.sh
new file mode 100644
index 0000000..4a4dcb4
--- /dev/null
+++ b/mugsyenv.sh
@@ -0,0 +1,8 @@
+export MUGSY_INSTALL=/usr/local/projects/angiuoli/mugsy_trunk
+export PERL5LIB=$MUGSY_INSTALL/perllibs
+#For testing TBA
+#export PATH=$PATH:$MUGSY_INSTALL/../../multiz-tba/trunk/
diff --git a/mumi.sh b/mumi.sh
new file mode 100755
index 0000000..0a43ee5
--- /dev/null
+++ b/mumi.sh
@@ -0,0 +1,140 @@
+# Compute the MUMi similarity value between two given complete genome sequences. If a genome sequence is
+# contained within a directory, all chromosomes of the genome sequences are merged before the genomes are
+# compared with each other.
+# seq1 GenBank file or directory containing GenBank files for the same genome
+# seq2 GenBank file or directory containing GenBank files for the same genome
+# -p optional prefix used for creation of temporary file names (default: "MUMI")
+# -t optional directory for storage of temporary files (default: "/tmp")
+# syntax: mumi [-p prefix] [-t tmp_dir] seq1 seq2
+# process command line options
+while getopts 'p:t:' option
+ case ${option} in
+ p) prefix=${OPTARG};;
+ t) tmp_dir=`echo "${OPTARG}" | sed -e 's/\/*$//'`;;
+ ?) echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1;;
+ esac
+let "numoptions = ${OPTIND}-1"
+shift ${numoptions}
+# process command line arguments
+if [ $# -ne 2 ]
+ echo "$0: invalid number of arguments" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+if [ ! -f $1 -a ! -d $1 ]
+ echo "$0: illigal argument" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+if [ ! -f $2 -a ! -d $2 ]
+ echo "$0: illigal argument" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+seq1_file=`echo "$1" | sed -e 's/\/*$//'` # remove final slash from directory name
+seq2_file=`echo "$2" | sed -e 's/\/*$//'`
+seq1_name=`echo "${seq1_file}" | awk -F'/' '{print $NF}'` # extract name as last part of file or directory name
+seq2_name=`echo "${seq2_file}" | awk -F'/' '{print $NF}'`
+seq1_fasta="${tmp_dir}/${prefix}_${seq1_name}.fasta" # construct temporary FASTA file names
+mumfile="${tmp_dir}/${prefix}_${seq1_name}_${seq2_name}.mummer" # construct temporary file name to output results
+# convert GenBank files to (concatenated) files in FASTA format
+# echo "converting GenBank files to (concatenated) FASTA files ..."
+echo ">${seq1_name}" > ${seq1_fasta}
+if [ -d ${seq1_file} ]
+ for seqfile in `grep -H '^DEFINITION' ${seq1_file}/*.gbk | grep -v 'plasmid' | sort | cut -d':' -f1`
+ do
+ seqret -sequence ${seqfile} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq1_fasta}
+ while [ $? -ne 0 ]
+ do
+ seqret -sequence ${seqfile} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq1_fasta}
+ done
+ done
+ seqret -sequence ${seq1_file} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq1_fasta}
+ while [ $? -ne 0 ]
+ do
+ seqret -sequence ${seq1_file} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq1_fasta}
+ done
+echo ">${seq2_name}" > ${seq2_fasta}
+if [ -d ${seq2_file} ]
+ for seqfile in `grep -H '^DEFINITION' ${seq2_file}/*.gbk | grep -v 'plasmid' | sort | cut -d':' -f1`
+ do
+ seqret -sequence ${seqfile} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq2_fasta}
+ while [ $? -ne 0 ]
+ do
+ seqret -sequence ${seqfile} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq2_fasta}
+ done
+ done
+ seqret -sequence ${seq2_file} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq2_fasta}
+ while [ $? -ne 0 ]
+ do
+ seqret -sequence ${seq2_file} -sformat gb -osf fasta -stdout -auto 2> /dev/null | tail -n +2 >> ${seq2_fasta}
+ done
+# process sequences by mummer
+mummer -mum -b -c -l 19 ${seq1_fasta} ${seq2_fasta} > ${mumfile} 2> /dev/null
+# get sequence length
+seq1_len=`tail -n +2 ${seq1_fasta} | tr -d '\n\r ' | wc -c`
+seq2_len=`tail -n +2 ${seq2_fasta} | tr -d '\n\r ' | wc -c`
+# process mummer output
+# echo "processing mummer output ..."
+awk -v seq1_len=${seq1_len} -v seq2_len=${seq2_len} -v seq1_name=${seq1_name} -v seq2_name=${seq2_name} '
+# forward or reverse hit for second sequence
+/^>/ { if ($0 ~ /Reverse/) reverse=1; next }
+# mark positions covered by MUMs
+ len+=$3
+ for(i=$1;i<$1+$3;++i) seq1[i-1]=1
+ if (reverse==1)
+ for(i=$2-$3+1;i<=$2;++i) seq2[i-1]=1
+ else
+ for(i=$2;i<$2+$3;++i) seq2[i-1]=1
+# determine MUM-index
+END {
+ # compute MUM-coverages of both genomes
+ for(i=0;i<seq1_len;++i) seq1_cov+=seq1[i]
+ for(i=0;i<seq2_len;++i) seq2_cov+=seq2[i]
+ # compute different versions of MUMi similarity value
+ sim1=seq1_cov/seq1_len
+ sim2=seq2_cov/seq2_len
+ sim3=(seq1_cov + seq2_cov)/(seq1_len + seq2_len)
+ sim4=0.5*(sim1 + sim2)
+ # output results
+ printf("%s\t%s\t%d\t%d\t%d\t%d\t%10.8f\t%10.8f\t%10.8f\t%10.8f\n",seq1_name,seq2_name,seq1_len,seq2_len,seq1_cov,seq2_cov,sim1,sim2,sim3,sim4)
+' ${mumfile}
+# remove temporary files
+rm -f ${seq1_fasta}
+rm -f ${seq2_fasta}
+rm -f ${mumfile}
diff --git a/mumi_fasta.sh b/mumi_fasta.sh
new file mode 100755
index 0000000..f99036b
--- /dev/null
+++ b/mumi_fasta.sh
@@ -0,0 +1,95 @@
+# Compute the MUMi similarity value between two given complete genome sequences.
+# seq1 FASTA file for seq1.
+# seq2 FASTA file for seq2.
+# -p optional prefix used for creation of temporary file names (default: "MUMI")
+# -t optional directory for storage of temporary files (default: "/tmp")
+# syntax: mumi [-p prefix] [-t tmp_dir] seq1 seq2
+# process command line options
+while getopts 'p:t:' option
+ case ${option} in
+ p) prefix=${OPTARG};;
+ t) tmp_dir=`echo "${OPTARG}" | sed -e 's/\/*$//'`;;
+ ?) echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1;;
+ esac
+let "numoptions = ${OPTIND}-1"
+shift ${numoptions}
+# process command line arguments
+if [ $# -ne 2 ]
+ echo "$0: invalid number of arguments" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+if [ ! -f $1 -a ! -d $1 ]
+ echo "$0: illigal argument" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+if [ ! -f $2 -a ! -d $2 ]
+ echo "$0: illigal argument" >&2
+ echo "Usage: mumi [-p prefix] [-t tmp_dir] seq1 seq2" >&2
+ exit 1
+seq1_fasta=`echo "$1" | sed -e 's/\/*$//'` # remove final slash from directory name
+seq2_fasta=`echo "$2" | sed -e 's/\/*$//'`
+seq1_name=`echo "${seq1_fasta}" | awk -F'/' '{print $NF}'` # extract name as last part of file or directory name
+seq2_name=`echo "${seq2_fasta}" | awk -F'/' '{print $NF}'`
+mumfile="${tmp_dir}/${prefix}_${seq1_name}_${seq2_name}.mummer" # construct temporary file name to output results
+# process sequences by mummer
+mummer -mum -b -c -l 19 ${seq1_fasta} ${seq2_fasta} > ${mumfile} 2> /dev/null
+# get sequence length
+seq1_len=`tail -n +2 ${seq1_fasta} | tr -d '\n\r ' | wc -c`
+seq2_len=`tail -n +2 ${seq2_fasta} | tr -d '\n\r ' | wc -c`
+# process mummer output
+# echo "processing mummer output ..."
+awk -v seq1_len=${seq1_len} -v seq2_len=${seq2_len} -v seq1_name=${seq1_name} -v seq2_name=${seq2_name} '
+# forward or reverse hit for second sequence
+/^>/ { if ($0 ~ /Reverse/) reverse=1; next }
+# mark positions covered by MUMs
+ len+=$3
+ for(i=$1;i<$1+$3;++i) seq1[i-1]=1
+ if (reverse==1)
+ for(i=$2-$3+1;i<=$2;++i) seq2[i-1]=1
+ else
+ for(i=$2;i<$2+$3;++i) seq2[i-1]=1
+# determine MUM-index
+END {
+ # compute MUM-coverages of both genomes
+ for(i=0;i<seq1_len;++i) seq1_cov+=seq1[i]
+ for(i=0;i<seq2_len;++i) seq2_cov+=seq2[i]
+ # compute different versions of MUMi similarity value
+ sim1=seq1_cov/seq1_len
+ sim2=seq2_cov/seq2_len
+ sim3=(seq1_cov + seq2_cov)/(seq1_len + seq2_len)
+ sim4=0.5*(sim1 + sim2)
+ # output results
+ printf("%s\t%s\t%d\t%d\t%d\t%d\t%10.8f\t%10.8f\t%10.8f\t%10.8f\n",seq1_name,seq2_name,seq1_len,seq2_len,seq1_cov,seq2_cov,sim1,sim2,sim3,sim4)
+' ${mumfile}
+# remove temporary files
+rm -f ${mumfile}
diff --git a/plot.pl b/plot.pl
new file mode 100755
index 0000000..bd5d5d0
--- /dev/null
+++ b/plot.pl
@@ -0,0 +1,403 @@
+#Usage: ./plot.pl outputprefix reforganismname synblocksfile [varblocksfile [ps|x11]] < input.maf > out.gp
+#Eg. cat /tmp/testfilter.maf | ./plot.pl /tmp/testfilter genome2 mugsy.out > out.gp
+#cat /tmp/plasmidfilter.maf | ./plot.pl /tmp/plasmidfilter AF401292 mugsy.out > out.gp
+#gnuplot out.gp
+#Requires delta files output by mugsy in outputprefix
+use strict;
+my $terminal = ($ARGV[4] =~ /ps/) ? 'postscript' : 'X11';
+my($refname) = ($ARGV[1] =~ /^([^:.]+)/);
+my $delta = "$ARGV[0].$refname.filt.delta";
+print STDERR "Parsing $delta\n";
+die "Can't find delta file" if(!-e $delta);
+#Need to add -R -Q support for specifying the order of draft sequences
+my $mummerplotcmd = "/usr/local/projects/angiuoli/developer/sangiuoli/mummer/trunk/MUMmer3.20/mummerplot -p $ARGV[0].$refname \"$delta\"";
+`$mummerplotcmd 1> /dev/null 2> /dev/null`;
+my $idlenlookup={};
+open FILE, "$ARGV[0].$refname.gp" or die "Can't open file $ARGV[0].$refname.gp";
+my $savelen;
+my @xseqs;
+my @yseqs;
+while (my $line=<FILE>){
+ if($line =~ /^set ytics/){
+ $savelen=2;
+ }
+ if($line =~ /^set xtics/){
+ $savelen=1;
+ }
+ if($line =~ /^set [xy]label "([^\"]+)"/){
+ if($1 eq "QRY" || $1 eq "REF"){
+ }
+ else{
+ my $id = $1;
+ $id =~ /([^\:\.]+)[\:\.]([^\:]+)/;
+ if($1 eq $2){
+ $id = $1;
+ }
+ else{
+ $id = "$1.$2";
+ }
+ if($line =~ /xlabel/){
+ push @xseqs,[$id,0];
+ }
+ elsif($line =~ /ylabel/){
+ push @yseqs,[$id,0];
+ }
+ }
+ }
+ if($savelen){
+ #Parse a quoted sequence name (optionally prefixed with *) followed by a number and a comma
+ my($id,$len) = ($line =~ /\"\*?([^\"]+)\"\s+(\d+)\,/);
+ if(defined $id && defined $len){
+ #Collapse name.name to name and normalize a ':' separator to '.'.
+ #$id is only rewritten when the pattern matched: after a failed match
+ #$1/$2 still hold the captures of the previous regex (name and number),
+ #which used to turn a separator-less name into "name.number".
+ if($id =~ /([^\:\.]+)[\:\.]([^\:]+)/){
+ $id = ($1 eq $2) ? $1 : "$1.$2";
+ }
+ #$savelen==1 means the last tics header seen was xtics, 2 means ytics
+ if($savelen==1){
+ push @xseqs,[$id,$len];
+ }
+ elsif($savelen==2){
+ push @yseqs,[$id,$len];
+ }
+ }
+ }
+#All sequences from both axes, x first, as [id,len] pairs
+my @seqs = (@xseqs,@yseqs);
+for(my $i=0;$i<@seqs;$i++){
+ my($id,$len) = ($seqs[$i]->[0],$seqs[$i]->[1]);
+ $idlenlookup->{$id} = $len;
+close FILE;
+open FILE1,"+>$ARGV[0].$refname.maf.fplot" or die "Can't open plot $ARGV[0].$refname.maf.fplot";
+open FILE2,"+>$ARGV[0].$refname.maf.rplot" or die "Can't open plot $ARGV[0].$refname.maf.rplot";
+print FILE1 "0 0 0\n";
+print FILE1 "0 0 0\n";
+print FILE1 "\n\n";
+print FILE2 "0 0 0\n";
+print FILE2 "0 0 0\n";
+print FILE2 "\n\n";
+my @accs = `grep ">" $delta`;
+my $synfile = "$ARGV[0].$refname.syn.plot";
+my $reportgraphs = {};
+my @graphs;
+my $varreportgraphs = {};
+my @vargraphs;
+#Synteny blocks
+ open FILE,$ARGV[2] or die "Can't open output file $ARGV[2]";
+ my $currgraph;
+ my $currchain;
+ my $name;
+ while(my $line=<FILE>){
+ chomp $line;
+ if($line !~ /^[\s\#]/){
+ my @elts = split(/\s+/,$line);
+ if($name ne $elts[0]){
+ $name = "$elts[0]";
+ }
+ my $seq = $elts[1];
+ my $start = $elts[3];
+ my $end = $elts[4];
+ $reportgraphs->{$name}->{$name}->{'seqs'}->{$seq}->{'start'} = $start;
+ $reportgraphs->{$name}->{$name}->{'seqs'}->{$seq}->{'end'} = $end;
+ }
+ }
+ close FILE;
+ @graphs = keys %$reportgraphs;
+if(defined $ARGV[3] && -e $ARGV[3]){
+ open FILE,$ARGV[3] or die "Can't open output file $ARGV[3]";
+ my $currgraph;
+ my $currchain;
+ my $name;
+ while(my $line=<FILE>){
+ chomp $line;
+ if($line !~ /^[\s\#]/){
+ my @elts = split(/\s+/,$line);
+ if($name ne $elts[0]){
+ $name = "$elts[0]";
+ }
+ my $seq = $elts[1];
+ my $start = $elts[3];
+ my $end = $elts[4];
+ $varreportgraphs->{$name}->{$name}->{'seqs'}->{$seq}->{'start'} = $start;
+ $varreportgraphs->{$name}->{$name}->{'seqs'}->{$seq}->{'end'} = $end;
+ }
+ }
+ close FILE;
+ @vargraphs = keys %$varreportgraphs;
+my $first=1;
+my @outlabels;
+open FILE, "+>$ARGV[0].$refname.syn.plot";
+foreach my $graphfile (@graphs){
+ chomp $graphfile;
+ foreach my $chainname (keys %{$reportgraphs->{$graphfile}}){
+ my @labels = keys %{$reportgraphs->{$graphfile}->{'seqs'}};
+ foreach my $x (@xseqs){
+ my $xacc = $x->[0];
+ $xacc =~ s/[\.|]/_/g;
+ if(exists $reportgraphs->{$graphfile}->{$chainname}->{'seqs'}->{$xacc}){
+ foreach my $y (@yseqs){
+ my $yacc = $y->[0];
+ $yacc =~ s/[\.|]/_/g;
+ if(exists $reportgraphs->{$graphfile}->{$chainname}->{'seqs'}->{$yacc}){
+ die "Can't find length for $x->[0]" if(! exists $idlenlookup->{$x->[0]});
+ die "Can't find length for $y->[0]" if(! exists $idlenlookup->{$y->[0]});
+ #Shift the block coordinates by the value stored in $idlenlookup for each sequence.
+ #Plain + is used instead of += so the stored start/end stay untouched; with +=
+ #the shift was applied again every time the same sequence was paired with another one.
+ my $xcoords = $reportgraphs->{$graphfile}->{$chainname}->{'seqs'}->{$xacc};
+ my $ycoords = $reportgraphs->{$graphfile}->{$chainname}->{'seqs'}->{$yacc};
+ my $min0 = $xcoords->{'start'} + $idlenlookup->{$x->[0]};
+ my $min1 = $ycoords->{'start'} + $idlenlookup->{$y->[0]};
+ my $max0 = $xcoords->{'end'} + $idlenlookup->{$x->[0]};
+ my $max1 = $ycoords->{'end'} + $idlenlookup->{$y->[0]};
+ #Write the four edges of the rectangle (min0,min1)-(max0,max1), one edge per data block
+ printf FILE ("%d %d %d #$graphfile\n",$min0,$min1,100);
+ printf FILE ("%d %d %d\n",$min0,$max1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$min0,$max1,100);
+ printf FILE ("%d %d %d\n",$max0,$max1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$max0,$max1,100);
+ printf FILE ("%d %d %d\n",$max0,$min1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$max0,$min1,100);
+ printf FILE ("%d %d %d\n",$min0,$min1,100);
+ printf FILE ("\n\n");
+ #Label the block at the middle of its lower edge (alternative placements kept for reference)
+# push @outlabels,"set label \"$chainname\" at $min0,",$min1+(($max1-$min1)/2),"\n";
+# push @outlabels,"set label \"$chainname\" at ",$min0+(($max0-$min0)/2),",",$min1+(($max1-$min1)/2),"\n";
+ push @outlabels,"set label \"$chainname\" at ",$min0+(($max0-$min0)/2),",",$min1,"\n";
+# push @outlabels,"set label \"$chainname\" at $min0,",$max1,"\n";
+ }
+ }
+ }
+ }
+ }
+close FILE;
+open FILE, "+>$ARGV[0].$refname.var.plot";
+foreach my $vargraphfile (@vargraphs){
+ chomp $vargraphfile;
+ foreach my $chainname (keys %{$varreportgraphs->{$vargraphfile}}){
+ my @labels = keys %{$varreportgraphs->{$vargraphfile}->{'seqs'}};
+ foreach my $x (@xseqs){
+ my $xacc = $x->[0];
+ $xacc =~ s/[\.|]/_/g;
+ if(exists $varreportgraphs->{$vargraphfile}->{$chainname}->{'seqs'}->{$xacc}){
+ foreach my $y (@yseqs){
+ my $yacc = $y->[0];
+ $yacc =~ s/[\.|]/_/g;
+ if(exists $varreportgraphs->{$vargraphfile}->{$chainname}->{'seqs'}->{$yacc}){
+ die "Can't find length for $x->[0]" if(! exists $idlenlookup->{$x->[0]});
+ die "Can't find length for $y->[0]" if(! exists $idlenlookup->{$y->[0]});
+ #Shift the block coordinates by the value stored in $idlenlookup for each sequence.
+ #Plain + is used instead of += so the stored start/end stay untouched; with +=
+ #the shift was applied again every time the same sequence was paired with another one.
+ my $xcoords = $varreportgraphs->{$vargraphfile}->{$chainname}->{'seqs'}->{$xacc};
+ my $ycoords = $varreportgraphs->{$vargraphfile}->{$chainname}->{'seqs'}->{$yacc};
+ my $min0 = $xcoords->{'start'} + $idlenlookup->{$x->[0]};
+ my $min1 = $ycoords->{'start'} + $idlenlookup->{$y->[0]};
+ my $max0 = $xcoords->{'end'} + $idlenlookup->{$x->[0]};
+ my $max1 = $ycoords->{'end'} + $idlenlookup->{$y->[0]};
+ #Write the four edges of the rectangle (min0,min1)-(max0,max1), one edge per data block
+ printf FILE ("%d %d %d #$vargraphfile\n",$min0,$min1,100);
+ printf FILE ("%d %d %d\n",$min0,$max1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$min0,$max1,100);
+ printf FILE ("%d %d %d\n",$max0,$max1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$max0,$max1,100);
+ printf FILE ("%d %d %d\n",$max0,$min1,100);
+ printf FILE ("\n");
+ printf FILE ("%d %d %d\n",$max0,$min1,100);
+ printf FILE ("%d %d %d\n",$min0,$min1,100);
+ printf FILE ("\n\n");
+ #Label the block at the middle of its lower edge (alternative placements kept for reference)
+# push @outlabels,"set label \"$chainname\" at $min0,",$min1+(($max1-$min1)/2),"\n";
+# push @outlabels,"set label \"$chainname\" at ",$min0+(($max0-$min0)/2),",",$min1+(($max1-$min1)/2),"\n";
+ push @outlabels,"set label \"$chainname\" at ",$min0+(($max0-$min0)/2),",",$min1,"\n";
+# push @outlabels,"set label \"$chainname\" at $min0,",$max1,"\n";
+ }
+ }
+ }
+ }
+ }
+close FILE;
+open FILE, "$ARGV[0].$refname.gp" or die "Can't open file $ARGV[0].$refname.gp";
+my $inplot=0;
+while (my $line=<FILE>){
+ if($line =~ /^plot/){
+ #Emit every collected "set label" command ahead of the plot command itself
+ print join('',@outlabels);
+ $inplot++;
+ }
+ elsif($inplot>0){
+ #A line ending in "ls 2" gets a continuation appended so that further
+ #plot entries can be printed after it
+ if($line =~ /ls\s+2\s+$/){
+ chomp $line;
+ $line .= ", \\\n";
+ }
+ $inplot++;
+ }
+ print $line;
+ if($inplot==3){
+ print " \"$ARGV[0].$refname.maf.fplot\" title \"MAFFWD\" w lp ls 3, \\\n";
+ print " \"$ARGV[0].$refname.maf.rplot\" title \"MAFREV\" w lp ls 4, \\\n";
+ print " \"$ARGV[0].$refname.syn.plot\" title \"SYNBLOCKS\" w lp ls 5";
+ if(defined $ARGV[3]){
+ print ", \\\n";
+ print " \"$ARGV[0].$refname.var.plot\" title \"VARBLOCKS\" w lp ls 6 \n";
+ }
+ else{
+ print "\n";
+ }
+ $inplot=0;
+ }
+sub maf2gp{
+ my($fh1,$fh2,$refacc)=@_;
+ my $refline;
+ my $x = [];
+ my $lcbnum=0;
+ $refacc =~ /([^\:\.]+)[\:\.]([^\:]+)/;
+ if($1 && $2 && $1 ne $2){
+ $refacc = "$1.$2";
+ }
+ else{
+ $refacc = $1;
+ }
+ print STDERR "Using accession: $refacc\n";
+ while(my $line=<STDIN>){
+ if($line =~ /^a\s+/){
+ if(scalar(@$x)>0){
+ &printblock($fh1,$fh2,$x,$refacc,$lcbnum);
+ $lcbnum++;
+ $x = [];
+ }
+ }
+ else{
+ if($line =~ /^(s.+)\s+\S+/){
+ push @$x,$1;
+ }
+ }
+ }
+ &printblock($fh1,$fh2,$x,$refacc,$lcbnum);
+sub printblock{
+ my($fh1, $fh2, $scores,$ref,$lcbnum) = @_;
+ my($refa,$refb,$refe,$refo,$reflen);
+ my $hasref=0;
+ my $refacc;
+ foreach my $line (@$scores){
+ my($qry) = ($line =~ /s\s+(\S+)/);
+ $qry =~ /([^\:\.]+)[\:\.]([^\:]+)/;
+ if($1 && $2 && $1 ne $2){
+ $qry = "$1.$2";
+ }
+ else{
+ $qry = $1;
+ }
+ #Prefix match of the reference accession; \Q..\E keeps characters such as
+ #'.' or '|' in the accession from being treated as regex metacharacters
+ if($qry =~ /^\Q$ref\E/){
+ $refacc = $qry;
+ my $refoffset = $idlenlookup->{$refacc};
+ #Fields of the s line: name, start, size, strand, source size
+ ($refa,$refb,$refe,$refo,$reflen) = ($line =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)/);
+ #Convert size to an end coordinate, then shift both ends by the plot offset
+ $refe = $refb + $refe;
+ $refe += $refoffset;
+ $refb += $refoffset;
+ $hasref=1;
+ }
+ }
+ if($hasref==1){
+ foreach my $line (@$scores){
+ my($qry) = ($line =~ /s\s+(\S+)/);
+ $qry =~ /([^\:\.]+)[\:\.]([^\:]+)/;
+ if($1 && $2 && $1 ne $2){
+ $qry = "$1.$2";
+ }
+ else{
+ $qry = $1;
+ }
+ if($qry ne $refacc){
+ my $qryoffset = $idlenlookup->{$qry};
+ #print STDERR "$qry $qryoffset\n";
+ if(defined $qryoffset){
+ my($qrya,$qryb,$qrye,$qryo,$qrylen) = ($line =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)/);
+ $qrye = $qryb + $qrye;
+ $qryb = $qryb;
+ if($refo eq '+' && $qryo eq '+'){
+ #print STDERR "$refa\t$refb\t$refe\t$refo\t$qrya\t$qryb\t$qrye\t$qryo\n";
+ $qrye += $qryoffset;
+ $qryb += $qryoffset;
+ print $fh1 "$refb $qryb 100\n";
+ print $fh1 "$refe $qrye 100\n\n\n";
+ push @outlabels,"set label \"$lcbnum\" at ",$refe+100,",",$qrye,"\n";
+ }
+ elsif($refo eq '+' && $qryo eq '-'){
+ $qrye = ($qrylen - $qrye);
+ $qryb = ($qrylen - $qryb);
+ #print STDERR "$refa\t$refb\t$refe\t$refo\t$qry\t$qryb\t$qrye\t$qryo\n";
+ $qrye += $qryoffset;
+ $qryb += $qryoffset;
+ print $fh2 "$refe $qrye 100\n";
+ print $fh2 "$refb $qryb 100\n\n\n";
+ push @outlabels,"set label \"$lcbnum\" at ",$refb+100,",",$qryb,"\n";
+ }
+ elsif($refo eq '-' && $qryo eq '+'){
+ my $refec = $reflen - $refe;
+ my $refbc = $reflen - $refb;
+ #print STDERR "$refa\t$refbc\t$refec\t$refo\t$qry\t$qryb\t$qrye\t$qryo\n";
+ $qrye += $qryoffset;
+ $qryb += $qryoffset;
+ print $fh2 "$refec $qrye 100\n";
+ print $fh2 "$refbc $qryb 100\n\n\n";
+ push @outlabels,"set label \"$lcbnum\" at ",$refbc+100,",",$qryb,"\n";
+ }
+ elsif($refo eq '-' && $qryo eq '-'){
+ my $refec = $reflen - $refe;
+ my $refbc = $reflen - $refb;
+ #print STDERR "$refa\t$refbc\t$refec\t$refo\t$qry\t$qryb\t$qrye\t$qryo\n";
+ $qrye = $qryoffset + ($qrylen - $qrye);
+ $qryb = $qryoffset + ($qrylen - $qryb);
+ print $fh1 "$refec $qrye 100\n";
+ print $fh1 "$refbc $qryb 100\n\n\n";
+ push @outlabels,"set label \"$lcbnum\" at ",$refbc+100,",",$qryb,"\n";
+ }
+ else{
+ die;
+ }
+ #print STDERR "\n";
+ }
+ }
+ }
+ }
diff --git a/splitmaf.pl b/splitmaf.pl
new file mode 100755
index 0000000..25a8524
--- /dev/null
+++ b/splitmaf.pl
@@ -0,0 +1,48 @@
+#Accepts pairwise maf only
+#./splitmaf.pl outputprefix < input.maf
+my $qfiles = {};
+my @seqs;
+my @buffer;
+my $currscoreline;
+my $header = "##maf version=1 scoring=maf_project_simple\n";
+while(my $line=<STDIN>){
+ if($line =~ /^a/){
+ die "Only pairwise seqs accepted" if(scalar(@seqs)>2);
+ if(scalar(@seqs)>0){
+ &writemaf(\@seqs,\@buffer);
+ }
+ $currscoreline=$line;
+ @seqs = ();
+ @buffer = ();
+ }
+ elsif($line =~ /^s\s+([^.\s]+)/){
+ push @seqs,$1;
+ }
+ push @buffer,$line;
+ &writemaf(\@seqs,\@buffer);
+sub writemaf{
+ my($seqs,$buffer) = @_;
+ die "Invalid seqs ids $seqs->[0] $seqs->[1]" if(!defined $seqs->[0] || !defined $seqs->[1]);
+ my $fh;
+ if(! exists $qfiles->{$seqs->[0]}->{$seqs->[1]}){
+ #First block seen for this sequence pair: create its file and write the MAF header
+ my $mafname = "$ARGV[0]$seqs->[0].$seqs->[1].maf";
+ #3-arg open so that no part of the prefix or sequence ids can be read as an open mode
+ open $fh, '+>', $mafname or die "Can't open file $mafname: $!";
+ print $fh $header;
+ #Cache the handle so later blocks of the same pair are appended to the same file
+ $qfiles->{$seqs->[0]}->{$seqs->[1]} = $fh;
+ #Report the name of each newly created file on STDOUT
+ print "$mafname\n";
+ }
+ $fh = $qfiles->{$seqs->[0]}->{$seqs->[1]};
+ print $fh @$buffer;
diff --git a/synchain-mugsy b/synchain-mugsy
new file mode 120000
index 0000000..8a89de6
--- /dev/null
+++ b/synchain-mugsy
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/util/mafgrep.pl b/util/mafgrep.pl
new file mode 100755
index 0000000..398274b
--- /dev/null
+++ b/util/mafgrep.pl
@@ -0,0 +1,55 @@
+#Returns list of blocks that contain all sequences in the set seqid1...seqidn
+#./mafgrep.pl seqid1 seqid2 ... seqidn < out.maf
+use strict;
+my $format='maf';#or tab
+my %grepids = map { $_, 1 } @ARGV;
+print STDERR "Looking for ",scalar(keys %grepids),"\n";
+my $currscore;
+my $currorient;
+my $blockorient;
+my @allblocks;
+my $block = [];
+while(my $line=<STDIN>){
+ if($line =~ /^a\s+score=(\S+)/){
+ $currscore=$1;
+ push @allblocks,$block;
+ $block=[];
+ }
+ elsif($line =~ /^s/){
+ my @elts = split(/\s+/,$line);
+ #0-score,1-blockorient,2-accession,3-start,4-end
+ push @$block,[$currscore,$currorient,$elts[1],$elts[2],$elts[2]+$elts[3],$elts[3],$elts[4],$line];
+ }
+print STDERR "Parsed ",scalar(@allblocks)," blocks\n";
+print "##maf version=12\n";
+push @allblocks,$block;
+foreach my $blocks (@allblocks){
+ #Lookup of all seqs in the block
+ my %seqs = map {$_->[2], 1} @$blocks;
+ #
+ my %results = map { $_, $grepids{$_} } grep { not exists $seqs{$_} } keys %grepids;
+ #print STDERR "Seqs ",join(',',sort keys %seqs)," ",scalar(@$block),"\n";
+ #print STDERR "Results ",join(',',sort keys %results),"\n";
+ #print STDERR "Grep ",join(',',sort keys %grepids),"\n"; #join(' ',keys %seqs)," | ",join(' ',keys %grepids),"\n";
+ if(scalar(keys %results)==0){
+ if($format eq 'maf'){
+ print "a score=$blocks->[0]->[0]\n";
+ }
+ foreach my $bl (@$blocks){
+ if($format eq 'maf'){
+ if(exists $grepids{$bl->[2]}){
+ print "$bl->[7]";
+ }
+ }
+ else{
+ print "$bl->[2]\t$bl->[3]\t$bl->[4]\t$bl->[5]\t$bl->[6]\n";
+ }
+ }
+ print "\n";
+ }
diff --git a/util/mafstats.pl b/util/mafstats.pl
new file mode 100755
index 0000000..89cd308
--- /dev/null
+++ b/util/mafstats.pl
@@ -0,0 +1,600 @@
+#Reports coverage
+#Unique DNA should be sum of blocks with one seq and runs aligned to all gaps
+#
+#Parsing stage: reads MAF from STDIN and collects every non-empty block in
+#@allblocks. Each block is an array ref of rows [score,raw 's' line,isdup].
+use strict;
+my $found=0;
+my $currscore;
+my $currorient;
+my $blockorient;
+my @allblocks;
+my $block = [];
+my $isdup=0;
+my $multiplealnblkcount=0;
+while(my $line=<STDIN>){
+    if($line =~ /^a/){
+        #A new 'a' line closes the block collected so far
+        if(scalar(@$block)>0){
+            push @allblocks,$block;
+            $multiplealnblkcount++ if(scalar(@$block)>1);
+        }
+        $block=[];
+        #Plain list assignment; using =~ here would never store the score
+        ($currscore) = ($line =~ /score=(\S+)/);
+        #Assign the file-level flag (no 'my') so the rows pushed below see it
+        $isdup = ($line =~ /dup=/) ? 1 : 0;
+    }
+    elsif($line =~ /^s/){
+        #0-score,1-raw 's' line,2-dup flag of the enclosing block
+        chomp $line;
+        push @$block,[$currscore,$line,$isdup];
+    }
+}
+#The final block has no following 'a' line; store and count it here
+if(scalar(@$block)>0){
+    push @allblocks,$block;
+    $multiplealnblkcount++ if(scalar(@$block)>1);
+}
+#Accumulators filled by the per-block loop below and printed in the summary
+#Number of lcbs with N genomes
+my $lcbseqcount = [];
+#Frequency of alignment columns with N identical rows
+my $numIdentCols = [];
+#Freq of columns with no gaps
+my $numUngappedCols = [];
+#Freq columns with one seq and all gaps
+my $numGappedCols = [];
+#Number of bps in blocks containing N genomes
+my $lcbbpcount = [];
+#For each N, list of the minimum row length of every block with N rows
+my $lcbbpdistro = [];
+#Handed to scorealn, which stores per-row lists of gap run lengths in it
+my $gapdistro = [];
+#Handed to scorealn
+my $alnbpseqs = {};
+#Per sequence name, list of [fmin,fmax,orient] intervals from multi-row blocks
+my $lcbseqs = {};
+my $totalscore=0;
+my $numgaps=0;
+my $numblocks=scalar(@allblocks);
+my $totallen=0;
+my $totalseqlen=0;
+my $smallestblks=0;
+my $smallerblks=0;
+my $smallerlen=0;
+my $smallestlen=0;
+my $nummaf=0;
+my $uniqcount=0;
+my $dupcount=0;
+my %minseq;
+my %maxseq;
+#Sequence name => total sequence length as given on its 's' lines
+my %allseqs;
+print "Num_blocks:$numblocks\n";
+print "Num_multi_blocks:$multiplealnblkcount\n";
+my $lcbid=0;
+#Every alignment column of every multi-row block is written to this file
+#NOTE(review): the open is not checked for failure
+open AFILE,"+>aln.$ARGV[0].dat";
+foreach my $block (@allblocks){
+ my $issmaller=0;
+ my $issmallest=0;
+ #Min and max len of seqs in the LCB
+ my $minlen=-1;
+ my $maxlen=0;
+ die if(scalar(@$block) ==0);
+ my $nseq = scalar(@$block);
+ die if($nseq <= 0);
+ #Counted with the original row number, before all-gap rows are discounted below
+ $lcbseqcount->[$nseq]++;
+ my @alntext;
+ my $isdup=0;
+ if($nseq>1){
+ foreach my $maf (@$block){
+ #Third field of a parsed row is the dup flag of its block
+ if($maf->[2]){
+ $isdup=1;
+ }
+ #Fields of the 's' line: name, start, size, strand, source length, aligned text
+ my($seq,$beg,$len,$orient,$seqlen,$text) = ($maf->[1] =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)\s+(\S+)/);
+ die if($len<0);
+ $text =~ s/\s+//g;
+ #Only rows with at least one non-gap character take part in the statistics
+ if($text =~ /[^-]/){
+ push @alntext,$text;
+ #The source length of a sequence must be the same on all of its rows
+ if(exists $allseqs{$seq}){
+ die if($seqlen != $allseqs{$seq});
+ }
+ else{
+ $allseqs{$seq} = $seqlen;
+ }
+ if($minlen==-1){
+ $minlen=$len;
+ }
+ else{
+ $minlen = ($len<$minlen) ? $len:$minlen;
+ die if($minlen<0);
+ }
+ $maxlen = ($len>$maxlen) ? $len:$maxlen;
+ #tr returns the number of '-' characters in the row
+ my $cgaps = ($text =~ tr/\-/-/);
+ die if($cgaps<0);
+ $numgaps += $cgaps;
+ #Convert the row coordinates to an interval on the forward strand
+ my($fmin,$fmax);
+ if($orient eq '-'){
+ $fmin = $seqlen-$beg-$len;
+ $fmax = $seqlen-$beg;
+ }
+ else{
+ $fmin = $beg;
+ $fmax = $beg+$len;
+ }
+ die "$maf->[1]" if($fmin < 0 || $fmin > $seqlen);
+ die "$maf->[1]" if($fmax < 0 || $fmax > $seqlen);
+ #NOTE(review): %minseq keeps the larger and %maxseq the smaller value, which
+ #looks inverted for these names; neither hash is read in the visible code
+ $minseq{$seq} = ($minseq{$seq} < $fmin) ? $fmin : $minseq{$seq};
+ $maxseq{$seq} = ($maxseq{$seq} > $fmax) ? $fmax : $maxseq{$seq};
+ $lcbseqs->{$seq} = [] if(!ref $lcbseqs->{$seq});
+ push @{$lcbseqs->{$seq}},[$fmin,$fmax,$orient];
+ die "$maf->[1]" if($len<=0);
+ $totallen += $len;
+ $nummaf++;
+ #A block is flagged small when any of its rows is shorter than the threshold
+ if($len < 100){
+ $issmallest=1;
+ $smallestlen+=$len;
+ }
+ if($len < 1000){
+ $issmaller=1;
+ $smallerlen+=$len;
+ }
+ }
+ else{
+ #All-gap row: not counted as a sequence of this block
+ print STDERR "All gap encountered but length $len > 0 $text\n" if($len != 0);
+ $nseq--;
+ }
+ }
+ }
+ else{
+ #Single-row block: only its length is used; it adds nothing to %allseqs or $lcbseqs
+ my($seq,$beg,$len,$orient,$seqlen,$text) = ($block->[0]->[1] =~ /s\s+(\S+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\d+)\s+(\S+)/);
+ $minlen=$len;
+ }
+ die if($minlen<0);
+ $smallerblks++ if($issmaller);
+ $smallestblks++ if($issmallest);
+ $lcbbpcount->[$nseq]+=$minlen;
+ #NOTE(review): $isdup is only set inside the multi-row branch above, so it is
+ #always 0 for blocks that start with one row; also confirm that dup blocks
+ #are meant to add to $uniqcount and non-dup blocks to $dupcount
+ if($nseq ==1){
+ if($isdup){
+ $uniqcount +=$minlen;
+ }
+ else{
+ $dupcount +=$minlen;
+ }
+ }
+ $lcbbpdistro->[$nseq] = [] if(!ref $lcbbpdistro->[$nseq]);
+ push @{$lcbbpdistro->[$nseq]},$minlen;
+ print STDERR "LCB: $lcbid maxlen:$maxlen\n";
+ $lcbid++;
+ if($nseq>1){
+ #Score the block and update the column statistics
+ my $alnmatrix = &maf2matrix(\@alntext);
+ my($lcbtotalscore,$blklen) = &scorealn($alnmatrix,
+ $numIdentCols,
+ $numUngappedCols,
+ $numGappedCols,
+ $gapdistro,
+ $alnbpseqs);
+ die "$lcbtotalscore,$blklen" if($blklen ==0);
+ $totalscore+=$lcbtotalscore;
+ my $alnlen = scalar(@{$alnmatrix->[0]});
+ #Shadows the outer $nseq with the number of rows in the matrix
+ my $nseq = scalar(@$alnmatrix);
+ #One output line per alignment column, rows concatenated
+ for(my $k=0;$k<$alnlen;$k++){
+ for(my $i=0;$i<$nseq;$i++){
+ print AFILE $alnmatrix->[$i]->[$k];
+ }
+ print AFILE "\n";
+ }
+ }
+#NOTE(review): no closing brace for the foreach loop is present in this copy
+close AFILE;
+#Per-position coverage: $seqmatrix->{seq}->[i] is the number of recorded
+#intervals that cover position i of seq
+my $seqmatrix = &getCovered($lcbseqs);
+my $uniqbptotal=0;
+my $alignedlentotal=0;
+my $doublecovtotal=0;
+#One line per position: sequence name, position, coverage count
+#NOTE(review): the open is not checked for failure
+open MFILE,"+>bps.$ARGV[0].dat";
+foreach my $seq (sort {$a cmp $b} keys %allseqs){
+ $totalseqlen+=$allseqs{$seq};
+ my $alignedlen=0;
+ my $doublecov=0;
+ my $uniqbp=0;
+ for(my $i=0;$i<$allseqs{$seq};$i++){
+ if($seqmatrix->{$seq}->[$i]){
+ #Position is covered by at least one interval
+ $alignedlen++;
+ $alignedlentotal++;
+ if($seqmatrix->{$seq}->[$i]>1){
+ #Every cover beyond the first counts towards the double coverage total
+ $doublecovtotal+=$seqmatrix->{$seq}->[$i]-1;
+ $doublecov++;
+ }
+ }
+ else{
+ #Position is not covered by any interval
+ die if($seqmatrix->{$seq}->[$i]>0);
+ $uniqbptotal++;
+ $uniqbp++;
+ }
+ print MFILE "$seq $i $seqmatrix->{$seq}->[$i]\n";
+ }
+ print "$seq len:$allseqs{$seq} aln_cov:$alignedlen aln_cov_pct:",$alignedlen/$allseqs{$seq}," uniq:$uniqbp doublecov:$doublecov \n";
+#NOTE(review): no closing brace for the foreach loop is present in this copy
+close MFILE;
+#Summary section: prints the totals gathered above to STDOUT.
+#NOTE(review): none of the for/foreach loops in this section has its closing
+#brace in this copy; each loop header is followed by a single statement
+#Count of bases that are aligned to only gaps
+my $uniqaln=0;
+for(my $i=0;$i<scalar(@$numGappedCols);$i++){
+ $uniqaln+=$numGappedCols->[$i];
+print "\n";
+#Summary #genomes,total len, avg block size
+print "max_genomes_aln:",scalar(@$numIdentCols)-1,"\n";
+print STDERR "Num ident cols size=",scalar(@$numIdentCols),"!= Numbpdistro=",scalar(@$lcbbpdistro),"\n" if(scalar(@$numIdentCols)!=scalar(@$lcbbpdistro));
+print "total_seq_len:",$totalseqlen,"\n";
+#NOTE(review): the divisions below are not guarded against $nummaf or
+#$totalseqlen being 0
+print "avg_block_len:",$totallen/$nummaf,"\n";
+print "num_lcbs:",$nummaf,"\n";
+print "double_covered:",$doublecovtotal,"\n";
+#Avg/total coverage, #bps aligned
+print "aln_cov:",$alignedlentotal," ",$totallen-$doublecovtotal,"\n";
+print "aln_cov_pct:",$alignedlentotal/$totalseqlen,"\n";
+print "not_cov:",$uniqbptotal,"\n";
+print "not_cov_pct:",$uniqbptotal/$totalseqlen,"\n";
+print "aln_bps:",($totalseqlen-$uniqbptotal-$uniqaln),"\n";
+print "aln_pct:",($totalseqlen-$uniqbptotal-$uniqaln)/$totalseqlen,"\n";
+#Last slot of $numUngappedCols, i.e. gap-free columns of the blocks with the most rows
+print "core_bps:",$numUngappedCols->[scalar(@$numUngappedCols)-1],"\n";
+print "core_pct:",$numUngappedCols->[scalar(@$numUngappedCols)-1]/$totalseqlen,"\n";
+print "uniq_bps:",$uniqbptotal+$uniqaln,"\n";
+print "uniq_pct:",($uniqbptotal+$uniqaln)/$totalseqlen,"\n";
+print "\n";
+#Cross-check: total length of single-row blocks against the uncovered positions
+print "MISMATCH between uniqLCB len and calculated len\n" if( $lcbbpcount->[1] != $uniqbptotal);
+print "uniq_LCBlen:",$lcbbpcount->[1],"\n";
+print "uniq_cov:",$uniqbptotal,"\n";
+print "uniq_aln:",$uniqaln,"\n";
+print "uniq_dup:",$uniqcount,"\n";
+print "dup_bps:",$dupcount,"\n";
+print "blklt100bp:",$smallestblks,"\n";
+print "blklen:",$smallestlen,"\n";
+print "blklt1000bp:",$smallerblks,"\n";
+print "blklen:",$smallerlen,"\n";
+print "num_gaps:",$numgaps,"\n";
+print "score:",$totalscore,"\n";
+#Each table below is two tab separated rows: the indices, then the values
+print "LCB seq count\n";
+for(my $i=0;$i<scalar(@$lcbseqcount);$i++){
+ print "$i\t";
+print "\n";
+for(my $i=0;$i<scalar(@$lcbseqcount);$i++){
+ print $lcbseqcount->[$i],"\t";
+print "\n";
+print "LCB coverage bp count\n";
+for(my $i=0;$i<scalar(@$lcbbpcount);$i++){
+ print "$i\t";
+print "\n";
+for(my $i=0;$i<scalar(@$lcbbpcount);$i++){
+ print $lcbbpcount->[$i],"\t";
+print "\n";
+print "Ident.Freq of identical alignment columns\n";
+for(my $i=0;$i<scalar(@$numIdentCols);$i++){
+ print "$i\t";
+print "\n";
+for(my $i=0;$i<scalar(@$numIdentCols);$i++){
+ print "$numIdentCols->[$i]\t";
+print "\n";
+print "NoGaps.Freq of alignment columns with no gaps\n";
+for(my $i=0;$i<scalar(@$numUngappedCols);$i++){
+ print "$i\t";
+print "\n";
+for(my $i=0;$i<scalar(@$numUngappedCols);$i++){
+ print "$numUngappedCols->[$i]\t";
+print "\n";
+print "AllGaps.Freq of alignment cols with one seq and all gaps\n";
+for(my $i=0;$i<scalar(@$numGappedCols);$i++){
+ print "$i\t";
+print "\n";
+for(my $i=0;$i<scalar(@$numGappedCols);$i++){
+ print "$numGappedCols->[$i]\t";
+print "\n";
+#Sorted lengths of all blocks with two or more rows
+print "LCBs:";
+my @lcblens;
+for(my $i=2;$i<@$lcbbpdistro;++$i){
+ push @lcblens,@{$lcbbpdistro->[$i]} if(ref $lcbbpdistro->[$i]);
+print join(',',sort {$a <=> $b} @lcblens);
+print "\n";
+#Sorted lengths of the blocks stored in the last slot of $lcbbpdistro
+print "LCBs core:";
+print join(',',sort {$a <=> $b} @{$lcbbpdistro->[scalar(@$lcbbpdistro)-1]});
+print "\n";
+#Sorted lengths of all gap runs recorded by scorealn
+print "Gaps:";
+my @gaplens;
+foreach my $seq (@$gapdistro){
+ push @gaplens,@$seq if(ref $seq);
+print "\n";
+print join(',',sort {$a <=> $b} @gaplens);
+print "\n";
+print STDERR "Writing .dat files for R\n";
+#Data for R
+#Each .dat file holds one sorted value per line and has no header line.
+#Lengths of the blocks in the last slot of $lcbbpdistro; empty list when
+#no block was recorded at all
+my $corelcblens = $lcbbpdistro->[-1];
+$corelcblens = [] if(!ref $corelcblens);
+foreach my $dat (["lcbs.$ARGV[0].dat",\@lcblens],
+                 ["corelcbs.$ARGV[0].dat",$corelcblens],
+                 ["gaps.$ARGV[0].dat",\@gaplens]){
+    my($datfile,$values) = @$dat;
+    open(my $datfh,'>',$datfile) or die "Can't open $datfile for writing: $!";
+    print $datfh join("\n",sort {$a <=> $b} @$values);
+    close($datfh) or die "Can't close $datfile: $!";
+}
+#R script that draws one histogram per .dat file.
+#The files carry no header, so read.csv is told so explicitly and the single
+#column is named X1; with the default header=TRUE the first value would be
+#consumed as the column name and \$X1 would not exist.
+my $rfile = "mafstats.$ARGV[0].r";
+open(my $rfh,'>',$rfile) or die "Can't open $rfile for writing: $!";
+print $rfh "lcbs <- read.csv(file=\"lcbs.$ARGV[0].dat\", header=FALSE, col.names=c(\"X1\"));\n";
+print $rfh "corelcbs <- read.csv(file=\"corelcbs.$ARGV[0].dat\", header=FALSE, col.names=c(\"X1\"));\n";
+print $rfh "gaps <- read.csv(file=\"gaps.$ARGV[0].dat\", header=FALSE, col.names=c(\"X1\"));\n";
+print $rfh "hist(lcbs\$X1, col=\"green\", main=\"LCBs\", xlab=\"LCB length (bp)\");\n";
+print $rfh "dev.print(device=postscript, \"lcbs.$ARGV[0].eps\", onefile=FALSE, horizontal=FALSE);\n";
+print $rfh "hist(corelcbs\$X1, col=\"blue\", main=\"Core LCBs\", xlab=\"LCB length (bp)\");\n";
+print $rfh "dev.print(device=postscript, \"corelcbs.$ARGV[0].eps\", onefile=FALSE, horizontal=FALSE);\n";
+print $rfh "hist(gaps\$X1, col=\"red\", main=\"Gaps\", xlab=\"Gap length (bp)\");\n";
+print $rfh "dev.print(device=postscript, \"gaps.$ARGV[0].eps\", onefile=FALSE, horizontal=FALSE);\n";
+close($rfh) or die "Can't close $rfile: $!";
+#Scores one alignment matrix and updates the column and gap statistics.
+#Arguments:
+# $matrix          - array ref of rows, each row an array ref of single characters
+# $numIdentCols    - array ref; per column the slot [number of non-gap rows that
+#                    all share one base] is incremented, slot 0 on a mismatch
+# $numUngappedCols - array ref; per column slot [number of rows] is incremented
+#                    when no row has a gap, slot 0 otherwise
+# $numGappedCols   - array ref; slot [nseq-1] is incremented for columns where
+#                    all rows but one are gaps
+# $gapaln          - array ref; per row index a list of gap run lengths
+# $alnbpseqs       - accepted but not used in this sub
+#Returns ($totalscore,$alnlen): the sum-of-pairs score and the number of columns
+sub scorealn{
+ my($matrix,$numIdentCols,$numUngappedCols,$numGappedCols,$gapaln,$alnbpseqs) = @_;
+ #Pair scoring: +1 for two non-gap characters, $gapopen for the first column
+ #of a gap and $gapext for each further column
+ my $gapext = -1;
+ my $gapopen = -2;
+ #Set while a gap is open in row i / row j of the pair being compared;
+ #they are only cleared when both rows have a non-gap character
+ my $gapopeni=0;
+ my $gapopenj=0;
+ #Counted but not returned
+ my $gapexcount=0;
+ my $gapcount=0;
+ my $totalscore=0;
+ my $alnlen = 0;
+ my $nseq = scalar(@$matrix);
+ #print "Scoring $nseq\n";
+ #Loop over each sequence/row
+ for(my $i=0;$i<$nseq;$i++){
+ #All rows must have the same number of columns
+ if($alnlen!=0){
+ die if($alnlen != scalar(@{$matrix->[$i]}));
+ }
+ else{
+ $alnlen = scalar(@{$matrix->[$i]});
+ }
+ #Compare row i with every later row j
+ for(my $j=$i+1;$j<$nseq;$j++){
+ #print "$i $alnlen\n";
+ #Loop over each column
+ for(my $k=0;$k<$alnlen;$k++){
+ if($matrix->[$i]->[$k] ne '-'){
+ if($matrix->[$j]->[$k] ne '-'){
+ #Both rows have a character; identity of the characters is not checked
+ $gapopeni=0;
+ $gapopenj=0;
+ $totalscore+=1;#$scorematrix[$matrix[$i][$k]][$matrix[$j][$k]];
+ }
+ else{
+ #Gap in row j only
+ if($gapopenj){
+ $gapexcount++;
+ $totalscore+=$gapext;
+ }
+ else{
+ $gapopenj=1;
+ $gapcount++;
+ $totalscore+=$gapopen;
+ }
+ }
+ }
+ else{
+ #Gap in row i only; columns where both rows are gaps are not scored
+ if($matrix->[$j]->[$k] ne '-'){
+ if($gapopeni){
+ $gapexcount++;
+ $totalscore+=$gapext;
+ }
+ else{
+ $gapopeni=1;
+ $gapcount++;
+ $totalscore+=$gapopen;
+ }
+ }
+ }
+ }
+ }
+ }
+ #Get number of identical columns, allowing for gaps but not mismatches
+ # S2 TT---TAAAA-A
+ # 332223223020
+ # $numIdentCols[0]=2 //at least one mismatch
+ # $numIdentCols[2]=6
+ # $numIdentCols[3]=3
+ my $c; #bp
+ #Runs of consecutive columns in which one and the same row is the only
+ #non-gap row are collected as [row,first column,last column];
+ #@uniqruns is filled but not returned
+ my @uniqruns;
+ my $uniqrow;
+ my $runopen;
+ my $startrun=-1;
+ my $runpos=-1;
+ for(my $k=0;$k<$alnlen;$k++){
+ my $numIdents=0;
+ #Set below but not read anywhere in this sub
+ my $mismatch;
+ for(my $j=0;$j<$nseq;$j++){
+ if($matrix->[$j]->[$k] ne '-'){
+ if($numIdents==0){
+ #First non-gap row of the column defines the reference base
+ $c = lc($matrix->[$j]->[$k]);
+ $numIdents++;
+ $uniqrow=$j;
+ }
+ else{
+ if(lc($matrix->[$j]->[$k]) eq $c){
+ $numIdents++;
+ }
+ else{
+ #Mismatch: the column is counted in slot 0
+ $numIdents=0;
+ last;
+ }
+ }
+ }
+ else{
+ $mismatch=1;
+ }
+ }
+ if($numIdents==1){
+ #Extend the open run when the same row is alone again, else start a new run
+ if($runopen eq $uniqrow){
+ $runpos=$k;
+ }
+ else{
+ push @uniqruns,[$runopen,$startrun,$runpos] if($runopen ne "");
+ $runopen=$uniqrow;
+ $startrun=$k;
+ $runpos=$k;
+ }
+ }
+ else{
+ #Any other column closes the open run
+ push @uniqruns,[$runopen,$startrun,$runpos] if($runopen ne "");
+ $runopen = "";
+ $startrun=-1;
+ $runpos=-1;
+ }
+ $numIdentCols->[$numIdents]++;
+ $mismatch =1 if($numIdents<$nseq);
+ #push @$mismatches,$k if($mismatch);
+ }
+ #Get number of ungapped columns
+ # S2 TT---TAAAA-A
+ # 330003003303
+ # $numUngapped[0]=6
+ # $numUngapped[3]=6
+ #NOTE(review): second 'my $c' in the same scope; it is not used below
+ my $c; #bp
+ for(my $k=0;$k<$alnlen;$k++){
+ my $numUngaps=0;
+ for(my $j=0;$j<$nseq;$j++){
+ if($matrix->[$j]->[$k] ne '-'){
+ $numUngaps++;
+ }
+ else{
+ #A single gap puts the column into slot 0
+ $numUngaps=0;
+ last;
+ }
+ }
+ $numUngappedCols->[$numUngaps]++;
+ }
+ #Get number columns with one sequence and all gaps
+ for(my $k=0;$k<$alnlen;$k++){
+ my $numGaps=0;
+ for(my $j=0;$j<$nseq;$j++){
+ if($matrix->[$j]->[$k] eq '-'){
+ $numGaps++;
+ }
+ }
+ if($numGaps==$nseq){
+ #A column made of gaps only: dump the whole matrix to STDERR and report it
+ for(my $j=0;$j<$nseq;$j++){
+ for(my $k=0;$k<$alnlen;$k++){
+ print STDERR "$matrix->[$j]->[$k]";
+ }
+ print STDERR "\n";
+ }
+ print STDERR "Column $k has all gaps\n";
+ }
+ if($numGaps>0 && $numGaps==$nseq-1){
+ $numGappedCols->[$numGaps]++;
+ }
+ }
+ #
+ # Save lengths of all runs of gaps
+ # Eg.
+ # S2 TT---TAAAT-T
+ #
+ #Results
+ #$gapaln[1]=[3,1]
+ #$gapaln[2]=[2]
+ #From here on $gapopen is reused as the length of the current gap run
+ $gapopen=0;
+ for(my $j=0;$j<$nseq;$j++){
+ #NOTE(review): this reset drops a run that reaches the end of the previous
+ #row; only the last row's trailing run is stored after the loops
+ $gapopen=0;
+ for(my $k=0;$k<$alnlen;$k++){
+ if($matrix->[$j]->[$k] eq '-'){
+ $gapopen++;
+ }
+ else{
+ if($gapopen){ #end of a run of gaps
+ $gapaln->[$j] = [] if(!ref $gapaln->[$j]);
+ push @{$gapaln->[$j]},$gapopen;
+ #if($gapopen>1000){
+ #print STDERR "Long gap $gapopen in seq $j\n";
+ #}
+ }
+ #start of a run of gaps
+ $gapopen=0;
+ }
+ }
+ }
+ #Trailing gap run of the last row
+ if($gapopen){
+ $gapaln->[$nseq-1] = [] if(!ref $gapaln->[$nseq-1]);
+ push @{$gapaln->[$nseq-1]},$gapopen;
+ }
+ #NOTE(review): no closing brace for this sub is present in this copy
+ return ($totalscore,$alnlen);
+#Converts aligned text rows into a character matrix.
+# $mafs - array ref of strings, one aligned row per element
+#Returns an array ref with one array ref of single characters per row;
+#an empty input list gives an empty matrix.
+#Side effect: reports the number of rows on STDERR.
+sub maf2matrix{
+    my($mafs) = @_;
+    print STDERR " with ",scalar(@$mafs)," seqs\n";
+    #split with an empty pattern breaks the row into its characters
+    my $matrix = [ map { [ split(//,$_) ] } @$mafs ];
+    return $matrix;
+}
+#Builds per-position coverage counts from lists of intervals.
+# $blocksbyseq - hash ref: sequence name => array ref of intervals, where
+#                each interval is an array ref starting with [fmin,fmax,...]
+#                and covers the positions fmin .. fmax-1
+#Returns a hash ref: sequence name => array ref in which element j is the
+#number of intervals covering position j (undef when never covered).
+#Side effect: every position covered more than once is reported on STDERR.
+sub getCovered{
+    my($blocksbyseq) = @_;
+    my $seqmatrix = {};
+    foreach my $seq (sort {$a cmp $b} keys %$blocksbyseq){
+        #The loop variable is not called $b so it cannot shadow sort's $b
+        foreach my $interval (@{$blocksbyseq->{$seq}}){
+            my($fmin,$fmax) = @$interval;
+            for(my $j=$fmin;$j<$fmax;$j++){
+                #Positions never seen before are undef, treat them as 0
+                my $seen = $seqmatrix->{$seq}->[$j] || 0;
+                if($seen>0){
+                    print STDERR " $seq $j doublecov $seen $fmin $fmax\n";
+                }
+                $seqmatrix->{$seq}->[$j]++;
+            }
+        }
+    }
+    return $seqmatrix;
+}
diff --git a/util/reportvariants.pl b/util/reportvariants.pl
new file mode 100755
index 0000000..4d662b9
--- /dev/null
+++ b/util/reportvariants.pl
@@ -0,0 +1,118 @@
+#./reportvariants.pl index fasta
+#Reports alignment columns in which a sequence differs from the reference
+#sequence. After the index and fasta arguments the next argument is the
+#reference name; all remaining arguments are sequence names for which
+#pairwise output is written to the $$.pwvariants.out and $$.snpvariants.out files.
+use strict;
+use Bio::Perl;
+use Bio::DB::Fasta;
+use Bio::Seq;
+#NOTE(review): hard-coded, site specific library path
+use lib '/usr/local/projects/angiuoli/developer/sangiuoli/mugsy/trunk/mapping/';
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use AlignmentTree;
+my %options;
+#NOTE(review): pod2usage is called but no 'use Pod::Usage' is visible here,
+#and no 'help' option is declared, so $options{'help'} is never set by GetOptions
+my $results = GetOptions (\%options,
+ 'gap_window|g=s',
+ 'display_window|d=s',
+ 'gaps_allowed|a=s') || pod2usage(-verbose => 1);
+pod2usage(-verbose=>1) if($options{'help'});
+my $atree = AlignmentTree::deserialize($ARGV[0]);
+my $db = Bio::DB::Fasta->new($ARGV[1],'-reindex'=>1);
+#NOTE(review): the three option blocks below have no closing brace in this copy
+#Maximum number of gap characters tolerated in the window around a variant
+my $gapthreshold=0;
+if(exists $options{'gaps_allowed'}){
+ $gapthreshold = $options{'gaps_allowed'};
+#Number of columns on each side of a variant that are checked for gaps
+my $gap_window=5;
+if(exists $options{'gap_window'}){
+ $gap_window = $options{'gap_window'};
+#Number of columns on each side of a variant that are printed
+my $display_window=5;
+if(exists $options{'display_window'}){
+ $display_window = $options{'display_window'};
+#Drop the index and fasta arguments
+shift @ARGV;
+shift @ARGV;
+my $pwseqs = {};
+my $refname = shift @ARGV;
+#NOTE(review): no closing brace for this loop is present in this copy
+foreach my $seq (@ARGV){
+ $pwseqs->{$seq}++;
+open VFILE,"+>$$.pwvariants.out" or die "Can't open file pwvariants.out";
+open SFILE,"+>$$.snpvariants.out" or die "Can't open file snpvariants.out";
+foreach my $alnname (sort {$a cmp $b} keys %{$atree->{_alignments}}){
+ my($alnobj,$aln_bv,$align_width) = @{$atree->{_alignments}->{$alnname}};
+ my ($mmatrix,$seqmatrix,$names) = $atree->getAlignmentMatrix($alnname,1,$align_width,$db);
+ #Only alignments with more than one row can contain variants
+ if(@$seqmatrix > 1){
+ #print STDERR "Checking alignment $alnname $align_width ",scalar(@$seqmatrix),"\n";
+ my $ngaps;
+ my $nmismatches;
+ #Column => number of rows that differ from the reference in that column
+ my $variants = {};
+ #Row index => column => set when that row differs from the reference
+ my $seqvariants = {};
+ #Row index of the reference; stays undef when $refname is not in this alignment
+ my $refidx;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ if($names->[$i] eq $refname){
+ $refidx=$i;
+ }
+ }
+#Matrix cols start at 0
+ for(my $j=0;$j<$align_width;$j++){
+ my $b;
+ my $refbp = lc(substr($seqmatrix->[$refidx],$j,1));
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ if($i ne $refidx){
+ my $currbp = lc(substr($seqmatrix->[$i],$j,1));
+ #Characters y,s,k,r,m,w,n are never reported as variants
+ if($currbp ne $refbp && $currbp !~ /[yskrmwnw]/){
+ $variants->{$j}++;
+ $seqvariants->{$i}->{$j}++;
+ }
+ }
+ #print "$b=$currbp " if($b ne '-' && $currbp ne '-');
+ }
+ }
+ #print STDERR "variants ",scalar(keys %$variants),"\n";
+ foreach my $col (sort {$a <=> $b} keys %$variants){
+ #Count the gap characters of all rows in the window around the variant
+ my $gaps=0;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ my $start = $col - $gap_window;
+ $start = 0 if($start < 0);
+ my $end = $col + $gap_window;
+ $end = $align_width if($end > $align_width);
+ $gaps+= (substr($seqmatrix->[$i],$start,$end-$start+1) =~ tr/\-/\-/);
+ }
+ #Report the variant only when the window has few enough gaps
+ if($gaps<=$gapthreshold){
+ my $refc;
+ for(my $i=0;$i<@$seqmatrix;$i++){
+ my $start = $col - $display_window;
+ $start = 0 if($start < 0);
+ my $end = $col + $display_window;
+ $end = $align_width if($end > $align_width);
+ my($alni) = $atree->getAlignedInterval($alnname,$names->[$i]);
+ my $colstart = 1+$start;
+ my $colend = $colstart;
+ #columntocoords is called with 1-based columns
+ my($startc,$endc) = AlignmentTree::columntocoords($alni,$col+1,$col+1);
+ $refc = $startc if($names->[$i] eq "$refname");
+ #AlignmentTree::printAlignmentDebug($alnobj);
+ printf("%10s %s\tcoords:%d-%d\n",$names->[$i],lc(substr($seqmatrix->[$i],$start,$end-$start+1)),$startc,$endc);
+#, substr($seqmatrix->[$i],$start,$end-$start),"\n";
+ #File output is only written when the reference is the first row of the
+ #alignment and this row was requested on the command line
+ if($names->[0] eq "$refname" && exists $pwseqs->{$names->[$i]} && $seqvariants->{$i}->{$col}){
+ print SFILE "$names->[$i]\t$refname\t$refc\t",$refc+1,"\t",uc(substr($seqmatrix->[0],$col,1)),"\n";
+ print VFILE "$names->[$i]\t$refc\t",$refc+1,"\t",substr($seqmatrix->[0],$col,1),"/",substr($seqmatrix->[$i],$col,1),"\t$names->[$i]\t$startc-$endc\n";
+ }
+ }
+ #NOTE(review): the format has a %10s conversion but no argument is passed
+ printf("%10s ^ \n");
+ print "\n";
+ }
+ }
+ }
+#NOTE(review): no closing brace for the alignment loop is present in this copy
+close VFILE;
+close SFILE;
diff --git a/xmfa2maf.pl b/xmfa2maf.pl
new file mode 100755
index 0000000..b20389e
--- /dev/null
+++ b/xmfa2maf.pl
@@ -0,0 +1,116 @@
+#Utility for converting output of Mauve XMFA to MAF format
+#USAGE: ./xmfa2maf seqs.len < aln.xmfa > aln.maf
+#The seqs.len file has one sequence per line: name, length and an optional
+#new name, separated by whitespace. When a second argument is given the
+#sequences are keyed by their 1-based line number in seqs.len instead of by name.
+use strict;
+#State of the sequence entry that is currently being read
+my $seqname;
+my $start;
+my $end;
+my $orient;
+#Lines of aligned text of the current entry
+my $seqinfo = [];
+#Key (name or line number) => { len => length, name => optional new name }
+my %lens;
+#Entries [name,start-1,end,orient,text lines] of the current alignment block
+my $blocks = [];
+my $usenum = $ARGV[1];
+my $idx=0;
+ open(FILE,$ARGV[0]) or die "Can't open file $ARGV[0] needed for sequence lengths";
+ while(my $line=<FILE>){
+ chomp $line;
+ my($name,$len,$newname) = split(/\s+/,$line);
+ if($usenum){
+ $lens{++$idx}->{'len'} = $len;
+ if(length($newname)>0){
+ $lens{$idx}->{'name'} = $newname;
+ }
+ }
+ elsif($name){
+ $lens{$name}->{'len'} = $len;
+ if(length($newname)>0){
+ $lens{$name}->{'name'} = $newname;
+ }
+ }
+ }
+ close FILE;
+print "##maf version=1 scoring=mauve\n";
+while(my $line=<STDIN>){
+ #A line starting with '=' ends the current alignment block
+ if($line =~ /^\s*=/){
+ #Store the entry that was still being read
+ if(defined $seqname && $start>0){
+ push @$blocks,[$seqname,$start-1,$end,$orient,$seqinfo];
+ }
+ if(scalar(@$blocks)>0){
+ #Convert alignment to zero start, interbase coordinates
+ print "a score=1\n";
+ foreach my $l (@$blocks){
+ &printMAF(@$l);
+ }
+ print "\n";
+ }
+ $seqname=undef;
+ $start=0;
+ $seqinfo=[];
+ $blocks = [];
+ }
+ #Format >id1:start-end orient id2
+ #The second alternative accepts whitespace separated fields: >id start end orient id2
+ elsif(($line =~ /^>\s+\S+\:/ &&
+ $line =~ /^>\s*(\S+)\:(\d+)-(\d+)\s+([\+\-])\s+(\S+)/)
+ ||
+ $line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\S+)/){
+ chomp $line;
+ #A new header closes the previous entry of the same block
+ if(defined $seqname && $start>0){
+ push @$blocks,[$seqname,$start-1,$end,$orient,$seqinfo];
+ }
+ #Prefer the first header field as key into %lens, then the fifth
+ my $seqid;
+ if(exists $lens{$1}){
+ $seqid = $1;
+ }else{
+ if(exists $lens{$5}){
+ $seqid = $5;
+ }
+ else{
+ #NOTE(review): neither field is known, so the fifth field is stored as the
+ #length of the sequence; presumably the header carries a length there - confirm
+ $seqid = $1;
+ $lens{$1}->{'len'} = $5;
+ }
+ }
+ $start = $2;
+ #Entries with start 0 are not recorded: $seqname keeps its previous value
+ #and the pushes above are skipped for them because they test $start>0
+ if($start>0){
+ $end = $3;
+ #XMFA format start always < end
+ die "Invalid coordinates $start-$end" if($start>$end);
+ #Relative orientation of the alignment
+ $orient = $4;
+ my $file = $5;
+ $seqname = $seqid;
+ #Strip a leading absolute directory path from the name
+ $seqname =~ s/^\/.*\/(\S+)/$1/;
+ }
+ $seqinfo=[];
+ }
+ else{
+ #Any other line without a '#' is aligned text of the current entry
+ if($line !~ /\#/){
+ if(defined $seqname){
+ chomp $line;
+ push @$seqinfo,$line if($line =~ /\S+/);
+ }
+ }
+ }
+#NOTE(review): no closing brace for the while loop is present in this copy
+#Prints one MAF 's' line for an aligned entry.
+# $id  - key of the sequence in the file-level %lens hash
+# $s   - zero based start of the entry on the forward strand
+# $e   - end of the entry (exclusive), must be greater than $s
+# $o   - orientation, '+' or '-'
+# $str - array ref with the lines of aligned text, printed concatenated
+#For '-' entries the start is converted to the reverse strand as required
+#by MAF. When %lens holds a 'name' for the sequence it replaces $id in the
+#output. Dies on unknown sequences, bad orientation or bad coordinates.
+sub printMAF{
+    my($id,$s,$e,$o,$str) = @_;
+    die "No length specified for seq $id in $ARGV[0]" if(!exists $lens{$id});
+    die "Invalid coordinates for seq $id: end $e <= start $s" if($e<=$s);
+    die "Invalid orientation '$o' for seq $id" if($o ne '+' && $o ne '-');
+    my $seqlen = $lens{$id}->{'len'};
+    my $len = $e-$s;
+    #MAF starts of '-' rows are relative to the reverse complemented sequence
+    $s = $seqlen-$e if($o eq '-');
+    die "Bad coords $s $e $seqlen" if($s<0);
+    my $name = (exists $lens{$id}->{'name'}) ? $lens{$id}->{'name'} : $id;
+    print "s $name $s ",$len," $o $seqlen ",join('',@$str),"\n";
+}
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mugsy.git
