[med-svn] [harvest-tools] 01/06: New upstream version 1.3
Andreas Tille
tille at debian.org
Fri Oct 21 09:15:06 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository harvest-tools.
commit a2f141c9bfc4321d46b05bea8b3c44bbfa0f0efd
Author: Andreas Tille <tille at debian.org>
Date: Fri Oct 21 11:02:01 2016 +0200
New upstream version 1.3
---
INSTALL.txt | 38 +++++--
Makefile.in | 6 +-
README.md | 18 ++--
configure.ac | 2 +-
src/harvest/AnnotationList.cpp | 27 ++++-
src/harvest/AnnotationList.h | 7 +-
src/harvest/HarvestIO.cpp | 42 +++++++-
src/harvest/HarvestIO.h | 3 +-
src/harvest/LcbList.cpp | 121 ++++++++++++++++++++++
src/harvest/LcbList.h | 1 +
src/harvest/PhylogenyTree.cpp | 43 ++++++++
src/harvest/PhylogenyTree.h | 1 +
src/harvest/PhylogenyTreeNode.cpp | 5 +-
src/harvest/PhylogenyTreeNode.h | 5 +-
src/harvest/ReferenceList.cpp | 14 +--
src/harvest/ReferenceList.h | 12 +--
src/harvest/VariantList.cpp | 210 +++++++++++++++++++++++++++++++++-----
src/harvest/VariantList.h | 139 ++++++++++++++++++++++++-
src/harvest/harvest.cpp | 141 +++++++++++++++++++++++--
19 files changed, 746 insertions(+), 89 deletions(-)
diff --git a/INSTALL.txt b/INSTALL.txt
index 91049c6..63ab922 100644
--- a/INSTALL.txt
+++ b/INSTALL.txt
@@ -4,13 +4,14 @@ Dependencies:
-------------
- Autoconf ( http://www.gnu.org/software/autoconf/ )
- Protocol Buffers ( https://code.google.com/p/protobuf/ )
- - Zlib ( http://www.zlib.net/ )
+ - Cap'n Proto ( https://capnproto.org/ )
+ - Zlib ( http://www.zlib.net/, included with OS X and most Linuxes )
Steps:
------
./bootstrap.sh
- ./configure [--prefix=...] [--with-protobuf=...]
+ ./configure [--prefix=...] [--with-protobuf=...] [--with-capnp=...]
make
[sudo] make install
@@ -19,15 +20,32 @@ Products:
---------
- command line tool ( <prefix>/bin/harvest )
- static library ( <prefix>/lib/libharvest.a )
- - includes
- - Harvest, Protocol Buffer message class ( <prefix>/include/harvest.pb.h )
- - HarvestIO, Harvest wrapper with file IO ( <prefix>/include/HarvestIO.h )
+ - includes ( <prefix>/include/harvest/ )
+
+
+Deployment:
+-----------
+OSX
+ HarvestTools can be built on any OSX version starting with 10.7 (using
+ XCode >= 4.6), and the binary should work across the same range of versions,
+ regardless of the particular version it was built with.
+Linux
+ HarvestTools must be built with at least GCC 4.8 due to its c++11 dependency.
+ However, the resulting binary can run on Linux with older GCC libraries if
+ libstdc++ is linked statically. To do this, ensure that you have a statically
+ built library (libstdc++.a) and add "-static-libstdc++" to $CPPFLAGS before
+ running "make". You may also need to use "-L <dir>"in $CPPFLAGS to tell GCC
+ where to find libstdc++.a.
Notes:
------
-When running ./configure, use --prefix to install somewhere other than
-/usr/local. Use --with-protobuf if the Protocol Buffer libraries are not
-in their default location (/usr/local). Zlib should be installed in a standard
-system location. Sudo will be necessary for 'make install' if write permission
-is not available in --prefix.
+* When running ./configure, use --prefix to install somewhere other than
+ /usr/local.
+* Sudo will be necessary for 'make install' if write permission
+ is not available in --prefix (e.g. by default).
+* Use --with-protobuf=... or --with-capnp=... if the Protocol Buffer or Cap'n
+ Proto libraries are not in the default location (/usr/local). These should be
+ absolute paths and should not include "bin" or "lib".
+* If Zlib is not installed in a standard system location (it usually is),
+ CXXFLAGS and LDFLAGS will have to be modified before making.
diff --git a/Makefile.in b/Makefile.in
index f9c362c..fdb2ce0 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -5,7 +5,7 @@ UNAME_S=$(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CXXFLAGS += -mmacosx-version-min=10.7 -stdlib=libc++
else
- CXXFLAGS += -include src/harvest/memcpyLink.h -L. -static-libstdc++ -Wl,--wrap=memcpy
+ CXXFLAGS += -include src/harvest/memcpyLink.h -Wl,--wrap=memcpy
CFLAGS += -include src/harvest/memcpyLink.h
endif
@@ -44,13 +44,13 @@ libharvest.a : $(OBJECTS)
$(CXX) -c $(CXXFLAGS) $(CPPFLAGS) -o $@ $<
src/harvest/memcpyWrap.o : src/harvest/memcpyWrap.c
- $(CC) -c -o $@ $<
+ $(CC) $(CFLAGS) -c -o $@ $<
src/harvest/pb/harvest.pb.cc src/harvest/pb/harvest.pb.h : src/harvest/pb/harvest.proto
cd src; @protobuf@/bin/protoc --cpp_out . harvest/pb/harvest.proto
src/harvest/capnp/harvest.capnp.c++ src/harvest/capnp/harvest.capnp.h : src/harvest/capnp/harvest.capnp
- cd src/harvest/capnp;@capnp@/bin/capnp compile -oc++ harvest.capnp
+ cd src/harvest/capnp;export PATH=@capnp@/bin/:${PATH};capnp compile -I @capnp@/include -oc++ harvest.capnp
install : libharvest.a
mkdir -p @prefix@/bin/
diff --git a/README.md b/README.md
index 5c5e3a0..a3b9e40 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,11 @@
-![Travis-CI build status]
-(https://travis-ci.org/marbl/harvest-tools.svg?branch=master)
-
HarvestTools is a part of the Harvest software suite and provides file
-conversion between Gingr files and various standard text formats. HarvestTools
-is normally distributed as a binary for OS X or Linux:
-
-OSX: https://github.com/marbl/harvest-tools/releases/download/v1.0/harvesttools-OSX64.gz
-Linux: https://github.com/marbl/harvest-tools/releases/download/v1.0/harvesttools-Linux64.gz
-
-However, the source is available for unique build environments or for development.
+conversion between Gingr files and various standard text formats (see
+[Harvest](http://harvest.readthedocs.org)).
-Harvest information: https://github.com/marbl/harvest
+HarvestTools is normally distributed as a binary for OS X or Linux (see Harvest
+link above). However, the source is provided here for unique build environments
+or for development.
-CITATION provides details on how to cite harvest-tools.
+CITATION provides details on how to cite HarvestTools.
INSTALL.txt provides instructions for building from source and installing.
LICENSE.txt provides licensing information.
diff --git a/configure.ac b/configure.ac
index f0b918f..fc8e61d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
AC_INIT(src/harvest/harvest.cpp)
AC_ARG_WITH(protobuf, [ --with-protobuf=<path/to/protobuf> Protocol Buffers install dir (default: /usr/local/)])
-AC_ARG_WITH(flatbuffers, [ --with-capnp=<path/to/capnp> Cap'n Proto install dir (default: /usr/local/)])
+AC_ARG_WITH(capnp, [ --with-capnp=<path/to/capnp> Cap'n Proto install dir (default: /usr/local/)])
if test "$with_protobuf" == ""
then
diff --git a/src/harvest/AnnotationList.cpp b/src/harvest/AnnotationList.cpp
index ca10e5d..b6b9a58 100644
--- a/src/harvest/AnnotationList.cpp
+++ b/src/harvest/AnnotationList.cpp
@@ -7,9 +7,15 @@
#include "AnnotationList.h"
#include <fstream>
#include "parse.h"
+#include <algorithm>
using namespace std;
+bool annotationLessThan(const Annotation & a, const Annotation & b)
+{
+ return a.start < b.start;
+}
+
void AnnotationList::clear()
{
annotations.clear();
@@ -60,6 +66,10 @@ void AnnotationList::initFromCapnp(const capnp::Harvest::Reader & harvestReader,
//printf("%s\t%d\t%d\t%d\t%c\t%s\t%s\n", annotation.locus.c_str(), msgAnn.sequence(), annotation.start, annotation.end, annotation.reverse ? '-' : '+', annotation.name.c_str(), annotation.description.c_str());
}
+
+ // older capnp files might not be sorted
+ //
+ sort(annotations.begin(), annotations.end(), annotationLessThan);
}
void AnnotationList::initFromGenbank(const char * file, ReferenceList & referenceList, bool useSeq)
@@ -114,12 +124,17 @@ void AnnotationList::initFromGenbank(const char * file, ReferenceList & referenc
}
while ( *token == ' ' );
}
- else if ( ! useSeq && removePrefix(line, "VERSION") )
+ else if ( ! useSeq && (token = removePrefix(line, "VERSION")) )
{
- if ( const char * giToken = strstr(line, " GI:") )
+ while ( *token == ' ' )
{
- long int gi = atol(giToken + 4);
- int sequence = referenceList.getReferenceSequenceFromGi(gi);
+ token++;
+ }
+
+ if ( *token )
+ {
+ char * acc = strtok(token, " \t\n");
+ int sequence = referenceList.getReferenceSequenceFromAcc(acc);
offset = 0;
@@ -130,7 +145,7 @@ void AnnotationList::initFromGenbank(const char * file, ReferenceList & referenc
}
else
{
- throw NoGiException(file);
+ throw NoAccException(file);
}
}
else if ( removePrefix(line, "FEATURES") )
@@ -314,6 +329,8 @@ void AnnotationList::initFromGenbank(const char * file, ReferenceList & referenc
}
}
+ sort(annotations.begin(), annotations.end(), annotationLessThan);
+
for ( int i = 0; false && i < annotations.size(); i++ )
{
annotation = &annotations.at(i);
diff --git a/src/harvest/AnnotationList.h b/src/harvest/AnnotationList.h
index 3ee2859..609a5c1 100644
--- a/src/harvest/AnnotationList.h
+++ b/src/harvest/AnnotationList.h
@@ -9,6 +9,7 @@
#include <string>
#include <vector>
+#include <map>
#include <stdexcept>
#include "harvest/capnp/harvest.capnp.h"
@@ -44,16 +45,16 @@ public:
std::string file;
};
- class NoGiException : public std::exception
+ class NoAccException : public std::exception
{
public:
- NoGiException(const std::string & fileNew)
+ NoAccException(const std::string & fileNew)
{
file = fileNew;
}
- virtual ~NoGiException() throw() {}
+ virtual ~NoAccException() throw() {}
std::string file;
};
diff --git a/src/harvest/HarvestIO.cpp b/src/harvest/HarvestIO.cpp
old mode 100644
new mode 100755
index a9226f2..c165dce
--- a/src/harvest/HarvestIO.cpp
+++ b/src/harvest/HarvestIO.cpp
@@ -21,6 +21,7 @@
#include <zlib.h>
#include <stdio.h>
#include <assert.h>
+#include <algorithm>
#define SET_BINARY_MODE(file)
@@ -409,6 +410,11 @@ void HarvestIO::writeMfa(std::ostream &out) const
lcbList.writeToMfa(out, referenceList, trackList, variantList);
}
+void HarvestIO::writeFilteredMfa(std::ostream &out, std::ostream &out2) const
+{
+ lcbList.writeFilteredToMfa(out, out2, referenceList, trackList, variantList);
+}
+
void HarvestIO::writeNewick(std::ostream &out, bool useMult) const
{
if ( ! phylogenyTree.getRoot() )
@@ -483,9 +489,39 @@ void HarvestIO::writeSnp(std::ostream &out, bool indels) const
variantList.writeToMfa(out, indels, trackList);
}
-void HarvestIO::writeVcf(std::ostream &out, bool indels) const
+void HarvestIO::writeVcf(std::ostream &out, const vector<string> * trackNames, const PhylogenyTreeNode * node, bool indels, bool signature) const
{
- variantList.writeToVcf(out, indels, referenceList, trackList);
+ vector<int> tracks;
+
+ if ( trackNames )
+ {
+ // specific tracks
+
+ for ( int i = 0; i < trackNames->size(); i++ )
+ {
+ tracks.push_back(trackList.getTrackIndexByFile(trackNames->at(i)));
+ }
+
+ sort(tracks.begin(), tracks.end());
+ }
+ else if ( node )
+ {
+ // clade variants; only use leaves of node
+
+ node->getLeafIds(tracks);
+ sort(tracks.begin(), tracks.end());
+ }
+ else
+ {
+ // use all tracks
+
+ for ( int i = 0; i < trackList.getTrackCount(); i++ )
+ {
+ tracks.push_back(i);
+ }
+ }
+
+ variantList.writeToVcf(out, indels, referenceList, annotationList, trackList, tracks, signature);
}
@@ -635,4 +671,4 @@ void zerr(int ret)
case Z_VERSION_ERROR:
fputs("zlib version mismatch!\n", stderr);
}
-}
\ No newline at end of file
+}
diff --git a/src/harvest/HarvestIO.h b/src/harvest/HarvestIO.h
index fae8ce0..a573023 100644
--- a/src/harvest/HarvestIO.h
+++ b/src/harvest/HarvestIO.h
@@ -44,9 +44,10 @@ public:
void writeFasta(std::ostream &out) const;
void writeHarvest(const char * file);
void writeMfa(std::ostream &out) const;
+ void writeFilteredMfa(std::ostream &out, std::ostream &out2) const;
void writeNewick(std::ostream &out, bool useMult = false) const;
void writeSnp(std::ostream &out, bool indels = false) const;
- void writeVcf(std::ostream &out, bool indels = false) const;
+ void writeVcf(std::ostream &out, const std::vector<std::string> * trackNames = 0, const PhylogenyTreeNode * node = 0, bool indels = false, bool signature = false) const;
void writeXmfa(std::ostream &out, bool split = false) const;
void writeBackbone(std::ostream &out) const;
diff --git a/src/harvest/LcbList.cpp b/src/harvest/LcbList.cpp
old mode 100644
new mode 100755
index a7fc05f..a99b037
--- a/src/harvest/LcbList.cpp
+++ b/src/harvest/LcbList.cpp
@@ -1144,6 +1144,127 @@ void LcbList::writeToMfa(ostream & out, const ReferenceList & referenceList, con
out << endl;
}
}
+void LcbList::writeFilteredToMfa(ostream & out, ostream & out2, const ReferenceList & referenceList, const TrackList & trackList, const VariantList & variantList) const
+{
+ // now iterate over alignments
+
+ int totrefgaps = 0;
+
+ for ( int i = 0; i < trackList.getTrackCount(); i++)
+ {
+ int refend = 0;
+ int currvar = 0;
+ int width = 80;
+ int col = 0;
+
+ out << '>' << trackList.getTrack(i).file << endl;
+
+ for ( int j = 0; j < lcbs.size(); j++ )
+ {
+ const LcbList::Lcb & lcb = lcbs.at(j);
+ int refIndex = lcb.sequence;
+ int refstart = lcb.position;
+ const LcbList::Region & region = lcb.regions.at(i);
+
+ int currpos = refstart;
+ int variantsSize = variantList.getVariantCount();
+ const VariantList::Variant * currvarref;
+
+ if ( currvar < variantsSize )
+ {
+ currvarref = &variantList.getVariant(currvar);
+
+ if ( currvarref->alleles[0] == '-' )
+ {
+ currpos--;
+ }
+ }
+
+ while
+ (
+ currpos - refstart < lcb.regions.at(0).length ||
+ (
+ currvar < variantsSize &&
+ currvarref->sequence == refIndex &&
+ currvarref->position - refstart < lcb.regions.at(0).length
+ )
+ )
+ {
+ if ( currpos == referenceList.getReference(refIndex).sequence.size() )
+ {
+ printf("ERROR: LCB %d extends beyond reference (position %d)\n", j,
+ currpos);
+ return;
+ }
+
+ if ( col == width )
+ {
+ out << endl;
+ col = 0;
+ }
+
+ if
+ (
+ currvar == variantsSize ||
+ (currpos != currvarref->position && currpos >= refstart) ||
+ (
+ currvarref->reference == '-' &&
+ currvar > 0 &&
+ variantList.getVariant(currvar - 1).position != currpos &&
+ currpos >= refstart
+ )
+ )
+ {
+ // ALB -- do not output if this SNP has been filtered for some reason
+ //if ( currvarref->filters == 0 ) {
+ out << referenceList.getReference(refIndex).sequence.at(currpos);
+ if(i == 0) {
+ // believe our internal genome sequence index is 0-based, so add one here to be compatible with GenBank 1-based system
+ out2 << currpos + 1 << ",";
+ }
+ //}
+ col++;
+
+ if ( col == width )
+ {
+ out << endl;
+ col = 0;
+ }
+ }
+
+ if ( currvar < variantsSize && currpos == currvarref->position )
+ {
+ // ALB -- do not output if this SNP has been filtered for some reason
+ if ( currvarref->filters == 0 ) {
+ out << currvarref->alleles[i];
+ if(i == 0) {
+ out2 << currpos + 1 << ",";
+ }
+ }
+ currvar++;
+
+ if ( currvar < variantsSize )
+ {
+ currvarref = &variantList.getVariant(currvar);
+ }
+
+ col++;
+ }
+
+ if ( currvar == variantsSize || currvarref->position > currpos || currvarref->sequence != refIndex )
+ {
+ currpos++;
+ }
+ }
+
+ }
+
+ out << endl;
+ if(i == 0) {
+ out2 << endl;
+ }
+ }
+}
void LcbList::writeToProtocolBuffer(Harvest * msg) const
{
Harvest::Alignment * msgAlignment = msg->mutable_alignment();
diff --git a/src/harvest/LcbList.h b/src/harvest/LcbList.h
index f07bad5..a389078 100644
--- a/src/harvest/LcbList.h
+++ b/src/harvest/LcbList.h
@@ -75,6 +75,7 @@ public:
void initWithSingleLcb(const ReferenceList & referenceList, const TrackList & trackList);
void writeToCapnp(capnp::Harvest::Builder & harvestBuilder) const;
void writeToMfa(std::ostream & out, const ReferenceList & referenceList, const TrackList & trackList, const VariantList & variantList) const;
+ void writeFilteredToMfa(std::ostream & out, std::ostream & out2, const ReferenceList & referenceList, const TrackList & trackList, const VariantList & variantList) const;
void writeToProtocolBuffer(Harvest * msg) const;
void writeToXmfa(std::ostream & out, const ReferenceList & referenceList, const TrackList & trackList, const VariantList & variantList) const;
diff --git a/src/harvest/PhylogenyTree.cpp b/src/harvest/PhylogenyTree.cpp
index f0fb1cb..f27b48a 100644
--- a/src/harvest/PhylogenyTree.cpp
+++ b/src/harvest/PhylogenyTree.cpp
@@ -35,6 +35,49 @@ void PhylogenyTree::clear()
}
}
+const PhylogenyTreeNode * PhylogenyTree::getLca(int track1, int track2) const
+{
+ const PhylogenyTreeNode * node1;
+ const PhylogenyTreeNode * node2;
+
+ for ( int i = 0; i < leaves.size(); i++ )
+ {
+ if ( leaves[i]->getTrackId() == track1 )
+ {
+ node1 = leaves[i];
+ }
+
+ if ( leaves[i]->getTrackId() == track2 )
+ {
+ node2 = leaves[i];
+ }
+ }
+
+ while ( node1 && node1->getAncestors() > node2->getAncestors() )
+ {
+ node1 = node1->getParent();
+ }
+
+ while ( node2 && node2->getAncestors() > node1->getAncestors() )
+ {
+ node2 = node2->getParent();
+ }
+
+ while ( node1 && node2 && node1 != node2 )
+ {
+ node1 = node1->getParent();
+ node2 = node2->getParent();
+ }
+
+ if ( ! node1 || ! node2 )
+ {
+ cout << "ERROR: could not get LCA for tracks " << track1 << " and " << track2 << "." << endl;
+ exit(1);
+ }
+
+ return node1;
+}
+
void PhylogenyTree::getLeafIds(vector<int> & ids) const
{
ids.resize(0);
diff --git a/src/harvest/PhylogenyTree.h b/src/harvest/PhylogenyTree.h
index c268858..0f42088 100644
--- a/src/harvest/PhylogenyTree.h
+++ b/src/harvest/PhylogenyTree.h
@@ -23,6 +23,7 @@ public:
~PhylogenyTree();
void clear();
+ const PhylogenyTreeNode * getLca(int track1, int track2) const;
const PhylogenyTreeNode * getLeaf(int id) const;
void getLeafIds(std::vector<int> & ids) const;
double getMult() const;
diff --git a/src/harvest/PhylogenyTreeNode.cpp b/src/harvest/PhylogenyTreeNode.cpp
index d19c3db..b04e259 100644
--- a/src/harvest/PhylogenyTreeNode.cpp
+++ b/src/harvest/PhylogenyTreeNode.cpp
@@ -266,16 +266,17 @@ void PhylogenyTreeNode::getPairwiseDistances(float ** matrix, int size)
}
}
-void PhylogenyTreeNode::initialize(int & newId, int &leaf, float depthParent)
+void PhylogenyTreeNode::initialize(int & newId, int &leaf, float depthParent, int ancestorsNew)
{
id = newId;
newId++;
leafMin = leaf;
depth = depthParent + distance;
+ ancestors = ancestorsNew;
for ( int i = 0; i < children.size(); i++ )
{
- children[i]->initialize(newId, leaf, depth);
+ children[i]->initialize(newId, leaf, depth, ancestors + 1);
}
if ( children.size() == 0 )
diff --git a/src/harvest/PhylogenyTreeNode.h b/src/harvest/PhylogenyTreeNode.h
index ca2d0ea..620fff3 100644
--- a/src/harvest/PhylogenyTreeNode.h
+++ b/src/harvest/PhylogenyTreeNode.h
@@ -25,6 +25,7 @@ public:
PhylogenyTreeNode * bisectEdge(float distanceLower);
PhylogenyTreeNode * collapse();
+ int getAncestors() const;
float getBootstrap() const;
PhylogenyTreeNode * getChild(unsigned int index) const;
int getChildrenCount() const;
@@ -39,7 +40,7 @@ public:
void getLeafIds(std::vector<int> & ids) const;
void getPairwiseDistances(float ** matrix, int size);
const PhylogenyTreeNode * getParent() const;
- void initialize(int & newId, int & leaf, float depthParent = 0);
+ void initialize(int & newId, int & leaf, float depthParent = 0, int ancestorsNew = 0);
void invert(PhylogenyTreeNode * fromChild = 0);
void setAlignDist(float dist, float dep);
void setParent(PhylogenyTreeNode * parentNew, float distanceNew);
@@ -65,6 +66,7 @@ private:
PhylogenyTreeNode * parent;
int id;
int trackId;
+ int ancestors;
double distance;
float depth;
int leafMin;
@@ -72,6 +74,7 @@ private:
float bootstrap;
};
+inline int PhylogenyTreeNode::getAncestors() const {return ancestors;}
inline float PhylogenyTreeNode::getBootstrap() const {return bootstrap;}
inline PhylogenyTreeNode * PhylogenyTreeNode::getChild(unsigned int index) const {return children[index];};
inline int PhylogenyTreeNode::getChildrenCount() const {return children.size();}
diff --git a/src/harvest/ReferenceList.cpp b/src/harvest/ReferenceList.cpp
index f2f7467..e912ee3 100644
--- a/src/harvest/ReferenceList.cpp
+++ b/src/harvest/ReferenceList.cpp
@@ -56,25 +56,19 @@ int ReferenceList::getReferenceSequenceFromConcatenated(long int position) const
return undef;
}
-int ReferenceList::getReferenceSequenceFromGi(long int gi) const
+int ReferenceList::getReferenceSequenceFromAcc(const string & acc) const
{
for ( int i = 0; i < references.size(); i++ )
{
- size_t giToken = references.at(i).name.find("gi|");
+ size_t giToken = references.at(i).name.find(acc);
if ( giToken != string::npos )
{
- if ( gi == atol(references.at(i).name.c_str() + giToken + 3))
- {
- return i;
- }
+ return i;
}
}
- char giString[1024];
- sprintf(giString, "%ld", gi);
-
- throw GiNotFoundException(giString);
+ throw AccNotFoundException(acc);
return undef;
}
diff --git a/src/harvest/ReferenceList.h b/src/harvest/ReferenceList.h
index d9bd61a..ab230ba 100644
--- a/src/harvest/ReferenceList.h
+++ b/src/harvest/ReferenceList.h
@@ -28,18 +28,18 @@ class ReferenceList
{
public:
- class GiNotFoundException : public std::exception
+ class AccNotFoundException : public std::exception
{
public:
- GiNotFoundException(const std::string & giNew)
+ AccNotFoundException(const std::string & accNew)
{
- gi = giNew;
+ acc = accNew;
}
- virtual ~GiNotFoundException() throw() {}
+ virtual ~AccNotFoundException() throw() {}
- std::string gi;
+ std::string acc;
};
class NameNotFoundException : public std::exception
@@ -63,7 +63,7 @@ public:
const Reference & getReference(int index) const;
int getReferenceCount() const;
int getReferenceSequenceFromConcatenated(long int position) const;
- int getReferenceSequenceFromGi(long int gi) const;
+ int getReferenceSequenceFromAcc(const std::string & acc) const;
int getReferenceSequenceFromName(std::string name) const;
void initFromCapnp(const capnp::Harvest::Reader & harvestReader);
void initFromFasta(const char * file);
diff --git a/src/harvest/VariantList.cpp b/src/harvest/VariantList.cpp
index 040798d..794b2ed 100644
--- a/src/harvest/VariantList.cpp
+++ b/src/harvest/VariantList.cpp
@@ -274,7 +274,14 @@ void VariantList::addVariantsFromAlignment(const vector<string> & seqs, const Re
if ( referenceList.getReferenceCount() )
{
- varNew->reference = referenceList.getReference(sequence).sequence[position];
+ if ( offset > 0 )
+ {
+ varNew->reference = '-';
+ }
+ else
+ {
+ varNew->reference = referenceList.getReference(sequence).sequence[position];
+ }
}
else
{
@@ -973,16 +980,24 @@ void VariantList::writeToProtocolBuffer(Harvest * msg) const
}
}
-void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList & referenceList, const TrackList & trackList) const
+void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList & referenceList, const AnnotationList & annotationList, const TrackList & trackList, const vector<int> & tracksFocus, bool signature) const
{
//tjt: Currently outputs SNPs, no indels
//tjt: next pass will add standard VCF output for indels, plus an attempt at qual vals
//tjt: also filters need to be added to findVariants to populate FILTer column
-
+
+ int annCur = -1; // current annotation
+ int annNext = 0;
+
//indel char, to skip columns with indels (for now)
char indl = '-';
//the VCF output file
+ out << "##INFO=<ID=CDS,Number=1,Type=String,Description=\"Coding sequence locus\">" << endl;
+ out << "##INFO=<ID=SYN,Number=0,Type=Flag,Description=\"All alternative alleles are synonymous in coding sequence\">" << endl;
+ out << "##INFO=<ID=AAR,Number=1,Type=String,Description=\"Reference amino acid in coding sequence\">" << endl;
+ out << "##INFO=<ID=AAA,Number=.,Type=String,Description=\"Alternate amino acid in coding sequence, one per alternate allele\">" << endl;
+
for ( int i = 0; i < filters.size(); i++ )
{
const Filter & filter = filters.at(i);
@@ -994,11 +1009,31 @@ void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList
//#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT AA1
out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
-
+ vector<int> tracks;
+
+ if ( signature )
+ {
+ // output all tracks
+
+ for ( int i = 0; i < trackList.getTrackCount(); i++ )
+ {
+ tracks.push_back(i);
+ }
+ }
+ else
+ {
+ // only output tracks of interest (will be all if not differential)
+
+ for ( int i = 0; i < tracksFocus.size(); i++ )
+ {
+ tracks.push_back(tracksFocus[i]);
+ }
+ }
+
//output the file name for each column
- for ( int i = 0; i < trackList.getTrackCount(); i++ )
+ for ( int i = 0; i < tracks.size(); i++ )
{
- const TrackList::Track & track = trackList.getTrack(i);
+ const TrackList::Track & track = trackList.getTrack(tracks[i]);
//out << '\t' << (msgTrack.has_name() ? msgTrack.name() : msgTrack.file());
out << '\t' << track.file;
}
@@ -1010,16 +1045,96 @@ void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList
{
const Variant & variant = variants.at(j);
- //no indels for now..
- if (variant.alleles[0] == indl)
+ //no indels for now.. TODO: should this check outside the clade also?
+ bool indel = false;
+ //
+ for ( int i = 0; i < tracks.size(); i++ )
+ {
+ if ( variant.alleles[tracks[i]] == indl )
+ {
+ indel = true;
+ break;
+ }
+ }
+
+ if ( indel )
+ {
continue;
+ }
+
+ if ( tracks.size() != trackList.getTrackCount() )
+ {
+ // differential
- if (find(variant.alleles.begin(), variant.alleles.end(),indl) != variant.alleles.end())
- continue;
-
+ bool same = true;
+
+ for ( int i = 1; i < tracks.size(); i++ )
+ {
+ if ( variant.alleles[tracks[i]] != variant.alleles[tracks[0]] )
+ {
+ same = false;
+ break;
+ }
+ }
+
+ if ( same )
+ {
+ continue;
+ }
+ }
+ else if ( signature )
+ {
+ bool pass[tracks.size()];
+
+ for ( int i = 0; i < tracks.size(); i++ )
+ {
+ pass[i] = variant.alleles[i] != variant.alleles[tracksFocus[0]];
+ }
+
+ for ( int i = 0; i < tracksFocus.size(); i++ )
+ {
+ pass[tracksFocus[i]] = variant.alleles[tracksFocus[i]] == variant.alleles[tracksFocus[0]];
+ }
+
+ bool isSignature = true;
+
+ for ( int i = 0; i < tracks.size(); i++ )
+ {
+ if ( ! pass[i] )
+ {
+ isSignature = false;
+ break;
+ }
+ }
+
+ if ( ! isSignature )
+ {
+ continue;
+ }
+ }
+
//capture the reference position of variant
int pos = variant.position;
-
+
+ // annotations use concatenated coords; sum previous ref lengths to translate
+ //
+ int offset = 0;
+ //
+ for ( int i = 0; i < variant.sequence; i++ )
+ {
+ offset += referenceList.getReference(i).sequence.length();
+ }
+
+ while ( annNext < annotationList.getAnnotationCount() && annotationList.getAnnotation(annNext).start <= pos + offset )
+ {
+ if ( annotationList.getAnnotation(annNext).feature == "CDS" )
+ {
+ annCur = annNext;
+ }
+
+ annNext++;
+ }
+
//output first few columns, including context (+/- 7bp for now)
int ws = 10;
int lend = pos-ws;
@@ -1042,20 +1157,22 @@ void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList
out << "\t" << variant.reference << "\t";
allele_list.push_back(variant.reference);
bool prev_var = false;
- for ( int i = 0; i < trackList.getTrackCount(); i++ )
+ for ( int i = 0; i < tracks.size(); i++ )
{
- if (find(allele_list.begin(), allele_list.end(), variant.alleles[i]) == allele_list.end())
+ char allele = variant.alleles[tracks[i]];
+
+ if (find(allele_list.begin(), allele_list.end(), allele) == allele_list.end())
{
- if (variant.alleles[i] == indl)
- continue;
+ if (allele == indl)
+ continue; // should never happen
//to know if we need to output a preceding comma
- if (!prev_var)
- out << variant.alleles[i];
- else
- out << "," << variant.alleles[i];
+ if (prev_var)
+ out << ",";
+
+ out << allele;
- allele_list.push_back(variant.alleles[i]);
+ allele_list.push_back(allele);
prev_var = true;
}
}
@@ -1098,7 +1215,52 @@ void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList
}
//INFO
- out << "\tNA";
+ //
+ out << '\t';
+ //
+ if ( annCur != -1 && annotationList.getAnnotation(annCur).end >= pos + offset )
+ {
+ out << "CDS=" << annotationList.getAnnotation(annCur).locus << ';';
+
+ string codonRef = refseq.substr(annotationList.getAnnotation(annCur).start - offset + (pos + offset - annotationList.getAnnotation(annCur).start) / 3 * 3, 3);
+ int codonPos = (pos + offset - annotationList.getAnnotation(annCur).start) % 3;
+
+ bool rc = annotationList.getAnnotation(annCur).reverse;
+
+ string aaRef = translations.count(codonRef) ? rc ? translationsRc.at(codonRef) : translations.at(codonRef) : ".";
+ out << "AAR=" << aaRef << ";AAA=";
+
+ bool syn = true;
+
+ for ( int i = 1; i < allele_list.size(); i++ )
+ {
+ if ( i > 1 )
+ {
+ out << ',';
+ }
+
+ string codonAlt = codonRef;
+ codonAlt[codonPos] = allele_list.at(i);
+ string aaAlt = translations.count(codonAlt) ? rc ? translationsRc.at(codonAlt) : translations.at(codonAlt) : ".";
+
+ if ( aaRef != aaAlt )
+ {
+ syn = false;
+ }
+
+ out << aaAlt;
+ }
+
+ if ( syn )
+ {
+ out << ";SYN";
+ }
+ }
+ else
+ {
+ out << "NA";
+ }
+
//FORMAT
out << "\tGT";
@@ -1112,9 +1274,9 @@ void VariantList::writeToVcf(std::ostream &out, bool indels, const ReferenceList
indexByAllele[allele_list[i]] = i;
}
- for (i = 0; i < trackList.getTrackCount(); i++ )
+ for (i = 0; i < tracks.size(); i++ )
{
- out << "\t" << indexByAllele[variant.alleles[i]];
+ out << "\t" << indexByAllele[variant.alleles[tracks[i]]];
}
out << "\n";
diff --git a/src/harvest/VariantList.h b/src/harvest/VariantList.h
index 9e686bf..0763436 100644
--- a/src/harvest/VariantList.h
+++ b/src/harvest/VariantList.h
@@ -14,9 +14,146 @@
#include "harvest/PhylogenyTree.h"
#include "harvest/ReferenceList.h"
#include "harvest/TrackList.h"
+#include "harvest/AnnotationList.h"
typedef long long unsigned int uint64;
+static const std::map<std::string, std::string> translations =
+{
+ {"TTT", "F"},
+ {"TCT", "S"},
+ {"TAT", "Y"},
+ {"TGT", "C"},
+ {"TTC", "F"},
+ {"TCC", "S"},
+ {"TAC", "Y"},
+ {"TGC", "C"},
+ {"TTA", "L"},
+ {"TCA", "S"},
+ {"TAA", "*"},
+ {"TGA", "*"},
+ {"TTG", "L"},
+ {"TCG", "S"},
+ {"TAG", "*"},
+ {"TGG", "W"},
+ {"CTT", "L"},
+ {"CCT", "P"},
+ {"CAT", "H"},
+ {"CGT", "R"},
+ {"CTC", "L"},
+ {"CCC", "P"},
+ {"CAC", "H"},
+ {"CGC", "R"},
+ {"CTA", "L"},
+ {"CCA", "P"},
+ {"CAA", "Q"},
+ {"CGA", "R"},
+ {"CTG", "L"},
+ {"CCG", "P"},
+ {"CAG", "Q"},
+ {"CGG", "R"},
+ {"ATT", "I"},
+ {"ACT", "T"},
+ {"AAT", "N"},
+ {"AGT", "S"},
+ {"ATC", "I"},
+ {"ACC", "T"},
+ {"AAC", "N"},
+ {"AGC", "S"},
+ {"ATA", "I"},
+ {"ACA", "T"},
+ {"AAA", "K"},
+ {"AGA", "R"},
+ {"ATG", "M"},
+ {"ACG", "T"},
+ {"AAG", "K"},
+ {"AGG", "R"},
+ {"GTT", "V"},
+ {"GCT", "A"},
+ {"GAT", "D"},
+ {"GGT", "G"},
+ {"GTC", "V"},
+ {"GCC", "A"},
+ {"GAC", "D"},
+ {"GGC", "G"},
+ {"GTA", "V"},
+ {"GCA", "A"},
+ {"GAA", "E"},
+ {"GGA", "G"},
+ {"GTG", "V"},
+ {"GCG", "A"},
+ {"GAG", "E"},
+ {"GGG", "G"},
+};
+
+static const std::map<std::string, std::string> translationsRc =
+{
+ {"TTT", "K"},
+ {"GTT", "N"},
+ {"CTT", "K"},
+ {"ATT", "N"},
+ {"TGT", "T"},
+ {"GGT", "T"},
+ {"CGT", "T"},
+ {"AGT", "T"},
+ {"TCT", "R"},
+ {"GCT", "S"},
+ {"CCT", "R"},
+ {"ACT", "S"},
+ {"TAT", "I"},
+ {"GAT", "I"},
+ {"CAT", "M"},
+ {"AAT", "I"},
+ {"TTG", "Q"},
+ {"GTG", "H"},
+ {"CTG", "Q"},
+ {"ATG", "H"},
+ {"TGG", "P"},
+ {"GGG", "P"},
+ {"CGG", "P"},
+ {"AGG", "P"},
+ {"TCG", "R"},
+ {"GCG", "R"},
+ {"CCG", "R"},
+ {"ACG", "R"},
+ {"TAG", "L"},
+ {"GAG", "L"},
+ {"CAG", "L"},
+ {"AAG", "L"},
+ {"TTC", "E"},
+ {"GTC", "D"},
+ {"CTC", "E"},
+ {"ATC", "D"},
+ {"TGC", "A"},
+ {"GGC", "A"},
+ {"CGC", "A"},
+ {"AGC", "A"},
+ {"TCC", "G"},
+ {"GCC", "G"},
+ {"CCC", "G"},
+ {"ACC", "G"},
+ {"TAC", "V"},
+ {"GAC", "V"},
+ {"CAC", "V"},
+ {"AAC", "V"},
+ {"TTA", "*"},
+ {"GTA", "Y"},
+ {"CTA", "*"},
+ {"ATA", "Y"},
+ {"TGA", "S"},
+ {"GGA", "S"},
+ {"CGA", "S"},
+ {"AGA", "S"},
+ {"TCA", "*"},
+ {"GCA", "C"},
+ {"CCA", "W"},
+ {"ACA", "C"},
+ {"TAA", "L"},
+ {"GAA", "F"},
+ {"CAA", "L"},
+ {"AAA", "F"},
+};
+
class VariantList
{
public:
@@ -98,7 +235,7 @@ public:
void writeToMfa(std::ostream &out, bool indels, const TrackList & trackList) const;
void writeToProtocolBuffer(Harvest * harvest) const;
void writeToCapnp(capnp::Harvest::Builder & harvestBuilder) const;
- void writeToVcf(std::ostream &out, bool indels, const ReferenceList & referenceList, const TrackList & trackList) const;
+ void writeToVcf(std::ostream &out, bool indels, const ReferenceList & referenceList, const AnnotationList & annotationList, const TrackList & trackList, const std::vector<int> & tracks, bool signature = false) const;
static bool variantLessThan(const Variant & a, const Variant & b)
{
diff --git a/src/harvest/harvest.cpp b/src/harvest/harvest.cpp
old mode 100644
new mode 100755
index 7a356b4..9a44407
--- a/src/harvest/harvest.cpp
+++ b/src/harvest/harvest.cpp
@@ -12,7 +12,51 @@
using namespace::std;
-int main(int argc, const char * argv[])
+void parseTracks(char * arg, vector<string> & tracks, bool & lca)
+{
+ lca = false;
+ bool list = false;
+
+ char * i = arg;
+
+ while ( *i != 0 )
+ {
+ if ( *i == ':' )
+ {
+ if ( lca )
+ {
+ cerr << "ERROR: LCA must only have 2 tracks (\"" << arg << "\")." << endl;
+ exit(1);
+ }
+
+ lca = true;
+ }
+ else if ( *i == ',' )
+ {
+ list = true;
+ }
+
+ if ( lca && list )
+ {
+ cerr << "ERROR: Cannot use ':' and ',' when specifying tracks (\"" << arg << "\")." << endl;
+ exit(1);
+ }
+
+ i++;
+ }
+
+ char * token = strtok(arg, ":,");
+
+ while ( token )
+ {
+ tracks.push_back(token);
+ token = strtok(0, ":,");
+ }
+}
+
+static const char * version = "1.3";
+
+int main(int argc, char * argv[])
{
const char * input = 0;
const char * output = 0;
@@ -26,9 +70,14 @@ int main(int argc, const char * argv[])
const char * xmfa = 0;
const char * outFasta = 0;
const char * outMfa = 0;
+ const char * outMfaFiltered = 0;
+ const char * outMfaFilteredPositions = 0;
const char * outNewick = 0;
const char * outSnp = 0;
const char * outVcf = 0;
+ vector<string> tracks;
+ bool lca = false;
+ bool signature = false;
const char * outBB = 0;
const char * outXmfa = 0;
bool help = false;
@@ -47,10 +96,29 @@ int main(int argc, const char * argv[])
switch ( argv[i][1] )
{
case '-':
- if ( strcmp(argv[i], "--midpoint-reroot") == 0 )
+ if ( strcmp(argv[i], "--version") == 0 )
+ {
+ cout << version << endl;
+ return 0;
+ }
+ else if ( strcmp(argv[i], "--midpoint-reroot") == 0 )
{
midpointReroot = true;
}
+ else if ( strcmp(argv[i], "--internal") == 0 )
+ {
+ parseTracks(argv[++i], tracks, lca);
+ }
+ else if ( strcmp(argv[i], "--signature") == 0 )
+ {
+ signature = true;
+ parseTracks(argv[++i], tracks, lca);
+ }
+ else
+ {
+ printf("ERROR: Unrecognized option ('%s').\n", argv[i]);
+ help = true;
+ }
break;
case 'a': maf = argv[++i]; break;
case 'b': bed.push_back(argv[++i]); break;
@@ -60,6 +128,7 @@ int main(int argc, const char * argv[])
case 'g': genbank.push_back(argv[++i]); break;
case 'h': help = true; break;
case 'i': input = argv[++i]; break;
+ case 'I': outMfaFiltered = argv[++i]; outMfaFilteredPositions = "reference_positions.txt"; break;
case 'm': mfa = argv[++i]; break;
case 'M': outMfa = argv[++i]; break;
case 'n': newick = argv[++i]; break;
@@ -92,7 +161,7 @@ int main(int argc, const char * argv[])
if (help || argc < 2)
{
- cout << "harvesttools options:" << endl;
+ cout << "harvesttools version " << version << " options:" << endl;
cout << " -i <Gingr input>" << endl;
cout << " -b <bed filter intervals>,<filter name>,\"<description>\"" << endl;
cout << " -B <output backbone intervals>" << endl;
@@ -102,6 +171,7 @@ int main(int argc, const char * argv[])
cout << " -a <MAF alignment input>" << endl;
cout << " -m <multi-fasta alignment input>" << endl;
cout << " -M <multi-fasta alignment output (concatenated LCBs)>" << endl;
+ cout << " -I <multi-fasta alignment output (concatenated LCBs minus filtered SNPs)>" << endl;
cout << " -n <Newick tree input>" << endl;
cout << " -N <Newick tree output>" << endl;
cout << " --midpoint-reroot (reroot the tree at its midpoint after loading)" << endl;
@@ -110,6 +180,13 @@ int main(int argc, const char * argv[])
cout << " -u 0/1 (update the branch values to reflect genome length)" << endl;
cout << " -v <VCF input>" << endl;
cout << " -V <VCF output>" << endl;
+ cout << " --internal <track1>,<track2>,... #only variants that differ among tracks" << endl;
+ cout << " listed" << endl;
+ cout << " --internal <track1>:<track2> #only variants that differ within LCA" << endl;
+ cout << " clade of <track1> and <track2>" << endl;
+ cout << " --signature <track1>,<track2>,... #only signature variants of tracks listed" << endl;
+ cout << " --signature <track1>:<track2> #only signature variants of LCA clade of" << endl;
+ cout << " <track1> and <track2>" << endl;
cout << " -x <xmfa alignment file>" << endl;
cout << " -X <output xmfa alignment file>" << endl;
cout << " -h (show this help)" << endl;
@@ -179,9 +256,14 @@ int main(int argc, const char * argv[])
cerr << " ERROR: No sequence in Genbank file (" << e.file << ") and no other reference loaded.\n";
return 1;
}
- catch ( const AnnotationList::NoGiException & e )
+ catch ( const AnnotationList::NoAccException & e )
+ {
+ cerr << " ERROR: Genbank file (" << e.file << ") does not contain accession; cannot be matched to existing reference.\n";
+ return 1;
+ }
+ catch ( const ReferenceList::AccNotFoundException & e )
{
- cerr << " ERROR: Genbank file (" << e.file << ") does not contain GI; cannot be matched to existing reference.\n";
+ cerr << " ERROR: Could not find a loaded reference with accession \"" << e.acc << "\"\n";
return 1;
}
@@ -316,6 +398,27 @@ int main(int argc, const char * argv[])
hio.writeMfa(*fp);
}
+ if ( outMfaFiltered )
+ {
+ if (!quiet) cerr << "Writing " << outMfaFiltered << " and " << outMfaFilteredPositions << " ...\n";
+
+ std::ostream* fp = &cout;
+ std::ostream* fp2 = &cout;
+ std::ofstream fout;
+ std::ofstream fout2;
+
+ if (out1.compare(outMfaFiltered) != 0)
+ {
+
+ fout.open(outMfaFiltered);
+ fout2.open(outMfaFilteredPositions);
+ fp = &fout;
+ fp2 = &fout2;
+ }
+
+ hio.writeFilteredMfa(*fp, *fp2);
+ }
+
if ( outNewick )
{
if (!quiet) cerr << "Writing " << outNewick << "...\n";
@@ -381,6 +484,12 @@ int main(int argc, const char * argv[])
if ( outVcf )
{
+ if ( lca && ! hio.phylogenyTree.getRoot() )
+ {
+ cerr << "ERROR: No tree loaded for LCA\n";
+ return 1;
+ }
+
if (!quiet) cerr << "Writing " << outVcf << "...\n";
std::ostream* fp = &cout;
@@ -392,8 +501,26 @@ int main(int argc, const char * argv[])
fp = &fout;
}
- hio.writeVcf(*fp, true);
-
+ try
+ {
+ hio.writeVcf
+ (
+ *fp,
+ tracks.size() > 0 && ! lca ? &tracks : 0,
+ lca ? hio.phylogenyTree.getLca
+ (
+ hio.trackList.getTrackIndexByFile(tracks[0]),
+ hio.trackList.getTrackIndexByFile(tracks[1])
+ ) : 0,
+ true,
+ signature
+ );
+ }
+ catch ( const TrackList::TrackNotFoundException & e )
+ {
+ cerr << "ERROR: No track named \"" << e.name << "\"" << endl;
+ return 1;
+ }
}
return 0;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/harvest-tools.git
More information about the debian-med-commit
mailing list