[med-svn] [freebayes] 01/01: New upstream version 1.1.0
Andreas Tille
tille at debian.org
Sat Sep 2 05:46:07 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to annotated tag upstream/1.1.0
in repository freebayes.
commit a0624d4044e8e6fb8525d247fa8af778f90a8e81
Author: Andreas Tille <tille at debian.org>
Date: Sun Jun 18 14:47:31 2017 +0200
New upstream version 1.1.0
---
.gitmodules | 9 +-
README.md | 8 +-
python/allelebayes.py | 2 +
python/multiset.py | 2 +-
scripts/bgziptabix | 13 +
scripts/fasta_generate_regions.py | 2 +-
scripts/freebayes-parallel | 4 +-
scripts/sam_add_rg.pl | 2 +-
src/Allele.h | 7 +-
src/AlleleParser.cpp | 633 ++++++++++++++++++++-------------
src/AlleleParser.h | 63 ++--
src/Fasta.cpp | 24 +-
src/Fasta.h | 3 +
src/Genotype.cpp | 2 +-
src/LeftAlign.cpp | 115 +++---
src/LeftAlign.h | 94 ++++-
src/Makefile | 40 ++-
src/NonCall.cpp | 15 +-
src/NonCall.h | 2 +
src/Parameters.cpp | 8 +
src/Parameters.h | 1 +
src/ResultData.cpp | 56 ++-
src/ResultData.h | 8 +-
src/bamleftalign.cpp | 69 ++--
src/freebayes.cpp | 23 +-
test/Makefile | 2 +-
test/region-and-target-handling.t | 104 ------
test/t/00_region_and_target_handling.t | 145 ++++++++
test/t/01_call_variants.t | 12 +-
test/t/02_multi_bam.t | 88 +++++
test/t/03_reference_bases.t | 56 +++
31 files changed, 1066 insertions(+), 546 deletions(-)
diff --git a/.gitmodules b/.gitmodules
index 6341f4d..7f973b4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "vcflib"]
- path = vcflib
- url = https://github.com/ekg/vcflib.git
[submodule "bamtools"]
path = bamtools
url = https://github.com/ekg/bamtools.git
@@ -13,3 +10,9 @@
[submodule "bash-tap"]
path = test/bash-tap
url = https://github.com/illusori/bash-tap.git
+[submodule "vcflib"]
+ path = vcflib
+ url = https://github.com/vcflib/vcflib.git
+[submodule "SeqLib"]
+ path = SeqLib
+ url = https://github.com/walaj/SeqLib.git
diff --git a/README.md b/README.md
index f233549..30dfe62 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ If possible, please also refer to the version number provided by freebayes when
it is run without arguments or with the `--help` option. For example, you
should see something like this:
- version: v0.9.10-3-g47a713e-dirty
+ version: v0.9.10-3-g47a713e
This provides both a point release number and a git commit id, which will
ensure precise reproducibility of results.
@@ -74,12 +74,6 @@ tree. Currently, the tree is hosted on github, and can be obtained via:
Note the use of --recursive. This is required in order to download all
nested git submodules for external repositories.
-After you've already done the above to clone the most recent development
-version, if you wish to compile a specific version of FreeBayes from, you
-can then do something like the following:
-
- git checkout v0.9.20 && git submodule update --recursive
-
### Resolving proxy issues with git
Depending on your local network configuration, you may have problems obtaining
diff --git a/python/allelebayes.py b/python/allelebayes.py
index 6bf6ba1..af49ab6 100644
--- a/python/allelebayes.py
+++ b/python/allelebayes.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
# calculates data likelihoods for sets of alleles
import multiset
diff --git a/python/multiset.py b/python/multiset.py
index 4a4942b..5ed41f3 100644
--- a/python/multiset.py
+++ b/python/multiset.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
multiset.py -- non-recursive n multichoose k and
diff --git a/scripts/bgziptabix b/scripts/bgziptabix
new file mode 100755
index 0000000..ae6c4a0
--- /dev/null
+++ b/scripts/bgziptabix
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+if [ $# -ne 1 ];
+then
+echo usage: $0 '[file name]'
+echo runs bgzip on the input and tabix indexes the result
+echo "== bgzip >[file] && tabix -fp vcf [file]"
+exit
+fi
+
+file=$1
+
+bgzip >$file && tabix -fp vcf $file
diff --git a/scripts/fasta_generate_regions.py b/scripts/fasta_generate_regions.py
index 0433410..5e28744 100755
--- a/scripts/fasta_generate_regions.py
+++ b/scripts/fasta_generate_regions.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import sys
diff --git a/scripts/freebayes-parallel b/scripts/freebayes-parallel
index 63bdc62..94becbb 100755
--- a/scripts/freebayes-parallel
+++ b/scripts/freebayes-parallel
@@ -36,5 +36,5 @@ command=("freebayes" "$@")
#$command | head -100 | grep "^#" # generate header
# iterate over regions using gnu parallel to dispatch jobs
cat "$regionsfile" | parallel -k -j "$ncpus" "${command[@]}" --region {}
-) | vcffirstheader \
- | vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges
+) | ../vcflib/scripts/vcffirstheader \
+ | ../vcflib/bin/vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges
diff --git a/scripts/sam_add_rg.pl b/scripts/sam_add_rg.pl
index cf5892a..108d9d1 100644
--- a/scripts/sam_add_rg.pl
+++ b/scripts/sam_add_rg.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
#
diff --git a/src/Allele.h b/src/Allele.h
index 6122f96..3f0125f 100644
--- a/src/Allele.h
+++ b/src/Allele.h
@@ -13,10 +13,13 @@
#include <assert.h>
#include "Utility.h"
#include "convert.h"
-#include "api/BamAlignment.h"
+
+//#ifdef HAVE_BAMTOOLS
+//#include "api/BamAlignment.h"
+//using namespace BamTools;
+//#endif
using namespace std;
-using namespace BamTools;
class Allele;
diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp
index 444034e..dcf7ec8 100644
--- a/src/AlleleParser.cpp
+++ b/src/AlleleParser.cpp
@@ -27,21 +27,32 @@
using namespace std;
+namespace { // anonymous namespace
+
+// Convert a std::string into an integer, ignoring any commas.
+int stringToInt(string str) {
+ str.erase(remove(str.begin(), str.end(), ','), str.end());
+ return atoi(str.c_str());
+}
+
+} // anonymous namespace
+
// open BAM input file
void AlleleParser::openBams(void) {
// report differently if we have one or many bam files
if (parameters.bams.size() == 1) {
- DEBUG("Opening BAM fomat alignment input file: " << parameters.bams.front() << " ...");
+ DEBUG("Opening BAM format alignment input file: " << parameters.bams.front() << " ...");
} else if (parameters.bams.size() > 1) {
- DEBUG("Opening " << parameters.bams.size() << " BAM fomat alignment input files");
- for (vector<string>::const_iterator b = parameters.bams.begin();
+ DEBUG("Opening " << parameters.bams.size() << " BAM format alignment input files");
+ for (vector<string>::const_iterator b = parameters.bams.begin();
b != parameters.bams.end(); ++b) {
DEBUG2(*b);
}
}
-
+
+#ifdef HAVE_BAMTOOLS
if (parameters.useStdin) {
if (!bamMultiReader.Open(parameters.bams)) {
ERROR("Could not read BAM data from stdin");
@@ -74,13 +85,65 @@ void AlleleParser::openBams(void) {
}
}
+#else
+ if (parameters.useStdin) {
+ if (!bamMultiReader.Open("-")) {
+ ERROR("Could not read BAM data from stdin");
+ exit(1);
+ }
+ } else {
+ for (std::vector<std::string>::const_iterator i = parameters.bams.begin();
+ i != parameters.bams.end(); ++i)
+ if (!bamMultiReader.Open(*i)) {
+ ERROR("Could not open input BAM file: " + *i);
+ exit(1);
+ }
+ else {
+ /*if (!bamMultiReader.LocateIndexes()) {
+ ERROR("Opened BAM reader without index file, jumping is disabled.");
+ cerr << bamMultiReader.GetErrorString() << endl;
+ if (!targets.empty()) {
+ ERROR("Targets specified but no BAM index file provided.");
+ ERROR("FreeBayes cannot jump through targets in BAM files without BAM index files, exiting.");
+ ERROR("Please generate a BAM index file eithe, e.g.:");
+ ERROR(" \% bamtools index -in <bam_file>");
+ ERROR(" \% samtools index <bam_file>");
+ exit(1);
+ }
+ }*/
+ }
+ /*if (!bamMultiReader.SetExplicitMergeOrder(bamMultiReader.MergeByCoordinate)) {
+ ERROR("could not set sort order to coordinate");
+ cerr << bamMultiReader.GetErrorString() << endl;
+ exit(1);
+ }*/
+ }
+#endif
+
+ // from PR 319 below
+#ifdef HAVE_BAMTOOLS
+ if (!parameters.useStdin) {
+ BamReader reader;
+ for (vector<string>::const_iterator b = parameters.bams.begin();
+ b != parameters.bams.end(); ++b) {
+ reader.Open(*b);
+ string bamHeader = reader.GetHeaderText();
+ vector<string> headerLines = split(bamHeader, '\n');
+ bamHeaderLines.insert(bamHeaderLines.end(), headerLines.begin(), headerLines.end());
+ reader.Close();
+ }
+ } else {
+ bamHeaderLines = split(bamMultiReader.GetHeaderText(), '\n');
+ }
+#else
// retrieve header information
- bamHeader = bamMultiReader.GetHeaderText();
+ string bamHeader = bamMultiReader.GETHEADERTEXT;
bamHeaderLines = split(bamHeader, '\n');
- DEBUG(" done");
+#endif
+ DEBUG(" done");
}
void AlleleParser::openOutputFile(void) {
@@ -131,6 +194,26 @@ void AlleleParser::getSequencingTechnologies(void) {
cerr << "no sequencing technology specified in @RG tag (no PL: in @RG tag) " << endl << headerLine << endl;
}
} else {
+ map<string, string>::iterator s = readGroupToTechnology.find(readGroupID);
+ if (s != readGroupToTechnology.end()) {
+ if (s->second != tech) {
+ ERROR("multiple technologies (PL) map to the same read group (RG)" << endl
+ << endl
+ << "technologies " << tech << " and " << s->second << " map to " << readGroupID << endl
+ << endl
+ << "As freebayes operates on a virtually merged stream of its input files," << endl
+ << "it will not be possible to determine what technology an alignment belongs to" << endl
+ << "at runtime." << endl
+ << endl
+ << "To resolve the issue, ensure that RG ids are unique to one technology" << endl
+ << "across all the input files to freebayes." << endl
+ << endl
+ << "See bamaddrg (https://github.com/ekg/bamaddrg) for a method which can" << endl
+ << "add RG tags to alignments." << endl);
+ exit(1);
+ }
+ // if it's the same technology and RG combo, no worries
+ }
readGroupToTechnology[readGroupID] = tech;
technologies[tech] = true;
}
@@ -257,11 +340,11 @@ void AlleleParser::getSampleNames(void) {
map<string, string>::iterator s = readGroupToSampleNames.find(readGroupID);
if (s != readGroupToSampleNames.end()) {
if (s->second != name) {
- ERROR("ERROR: multiple samples (SM) map to the same read group (RG)" << endl
+ ERROR("multiple samples (SM) map to the same read group (RG)" << endl
<< endl
<< "samples " << name << " and " << s->second << " map to " << readGroupID << endl
<< endl
- << "As freebayes operates on a virtually merged stream of its input files," << endl
+ << "As freebayes operates on a virtually merged stream of its input files," << endl
<< "it will not be possible to determine what sample an alignment belongs to" << endl
<< "at runtime." << endl
<< endl
@@ -323,7 +406,7 @@ void AlleleParser::getSampleNames(void) {
//--------------------------------------------------------------------------------
<< "Warning: No sample file given, and no @RG tags found in BAM header." << endl
<< "All alignments from all input files will be assumed to come from the same" << endl
- << "individual. To group alignments by sample, you must add read groups and sample" << endl
+ << "individual. To group alignments by sample, you must add read groups and sample" << endl
<< "names to your alignments. You can do this using ./scripts/sam_add_rg.pl in the" << endl
<< "freebayes source tree, or by specifying read groups and sample names when you" << endl
<< "prepare your sequencing data for alignment." << endl
@@ -340,10 +423,16 @@ string AlleleParser::vcfHeader() {
stringstream headerss;
headerss
- << "##fileformat=VCFv4.1" << endl
+ << "##fileformat=VCFv4.2" << endl
<< "##fileDate=" << dateStr() << endl
<< "##source=freeBayes " << VERSION_GIT << endl
- << "##reference=" << reference.filename << endl
+ << "##reference=" << reference.filename << endl;
+
+ for (REFVEC::const_iterator it = referenceSequences.begin();
+ it != referenceSequences.end(); ++it)
+ headerss << "##contig=<ID=" << it->REFNAME << ",length=" << it->REFLEN << ">" << endl;
+
+ headerss
<< "##phasing=none" << endl
<< "##commandline=\"" << parameters.commandline << "\"" << endl
<< "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">" << endl
@@ -356,8 +445,8 @@ string AlleleParser::vcfHeader() {
<< "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">" << endl
// observation counts
- << "##INFO=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count, with partial observations recorded fractionally\">" << endl
- << "##INFO=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observations, with partial observations recorded fractionally\">" << endl
+ << "##INFO=<ID=RO,Number=1,Type=Integer,Description=\"Count of full observations of the reference haplotype.\">" << endl
+ << "##INFO=<ID=AO,Number=A,Type=Integer,Description=\"Count of full observations of this alternate haplotype.\">" << endl
<< "##INFO=<ID=PRO,Number=1,Type=Float,Description=\"Reference allele observation count, with partial observations recorded fractionally\">" << endl
<< "##INFO=<ID=PAO,Number=A,Type=Float,Description=\"Alternate allele observations, with partial observations recorded fractionally\">" << endl
@@ -435,7 +524,7 @@ string AlleleParser::vcfHeader() {
<< "##INFO=<ID=MQMR,Number=1,Type=Float,Description=\"Mean mapping quality of observed reference alleles\">" << endl
<< "##INFO=<ID=PAIRED,Number=A,Type=Float,Description=\"Proportion of observed alternate alleles which are supported by properly paired read fragments\">" << endl
<< "##INFO=<ID=PAIREDR,Number=1,Type=Float,Description=\"Proportion of observed reference alleles which are supported by properly paired read fragments\">" << endl
- << "##INFO=<ID=MIN,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
+ << "##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
<< "##INFO=<ID=END,Number=1,Type=Integer,Description=\"Last position (inclusive) in gVCF output record.\">" << endl;
// sequencing technology tags, which vary according to input data
@@ -448,18 +537,23 @@ string AlleleParser::vcfHeader() {
headerss << "##INFO=<ID=REPEAT,Number=1,Type=String,Description=\"Description of the local repeat structures flanking the current position\">" << endl;
}
+ string gqType = "Float";
+ if (parameters.strictVCF)
+ gqType = "Integer";
+
// format fields for genotypes
headerss << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
- << "##FORMAT=<ID=GQ,Number=1,Type=Float,Description=\"Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype\">" << endl
+ << "##FORMAT=<ID=GQ,Number=1,Type=" << gqType << ",Description=\"Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype\">" << endl
// this can be regenerated with RA, AA, QR, QA
<< "##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy\">" << endl
//<< "##FORMAT=<ID=GLE,Number=1,Type=String,Description=\"Genotype Likelihood Explicit, same as GL, but with tags to indicate the specific genotype. For instance, 0^-75.22|1^-223.42|0/0^-323.03|1/0^-99.29|1/1^-802.53 represents both haploid and diploid genotype likilehoods in a biallelic context\">" << endl
<< "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">" << endl
+ << "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Number of observation for each allele\">" << endl
<< "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count\">" << endl
<< "##FORMAT=<ID=QR,Number=1,Type=Integer,Description=\"Sum of quality of the reference observations\">" << endl
<< "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">" << endl
<< "##FORMAT=<ID=QA,Number=A,Type=Integer,Description=\"Sum of quality of the alternate observations\">" << endl
- << "##FORMAT=<ID=MIN,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
+ << "##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
//<< "##FORMAT=<ID=SRF,Number=1,Type=Integer,Description=\"Number of reference observations on the forward strand\">" << endl
//<< "##FORMAT=<ID=SRR,Number=1,Type=Integer,Description=\"Number of reference observations on the reverse strand\">" << endl
//<< "##FORMAT=<ID=SAF,Number=1,Type=Integer,Description=\"Number of alternate observations on the forward strand\">" << endl
@@ -485,7 +579,7 @@ void AlleleParser::setupVCFInput(void) {
// variant input for analysis and targeting
if (!parameters.variantPriorsFile.empty()) {
variantCallInputFile.open(parameters.variantPriorsFile);
- currentVariant = new vcf::Variant(variantCallInputFile);
+ currentVariant = new vcflib::Variant(variantCallInputFile);
usingVariantInputAlleles = true;
// get sample names from VCF input file
@@ -517,15 +611,14 @@ void AlleleParser::loadBamReferenceSequenceNames(void) {
//--------------------------------------------------------------------------
// store the names of all the reference sequences in the BAM file
- referenceSequences = bamMultiReader.GetReferenceData();
+ referenceSequences = bamMultiReader.GETREFDATA;
int i = 0;
- for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
- referenceIDToName[i] = r->RefName;
+ for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
+ referenceIDToName[i] = r->REFNAME;
++i;
}
- DEBUG("Number of ref seqs: " << bamMultiReader.GetReferenceCount());
-
+ DEBUG("Number of ref seqs: " << bamMultiReader.GETREFNUM);
}
@@ -548,58 +641,58 @@ bool AlleleParser::hasMoreInputVariants(void) {
return next.first != -1;
}
-bool AlleleParser::loadNextPositionWithAlignmentOrInputVariant(BamAlignment& alignment) {
+bool AlleleParser::loadNextPositionWithAlignmentOrInputVariant(BAMALIGN& alignment) {
+
pair<int, long> next = nextInputVariantPosition();
if (next.first != -1) {
int varRefID = next.first;
- //cerr << varRefID << " " << alignment.RefID << " " << next.second << " " << alignment.Position << endl;
- if (!hasMoreAlignments || varRefID < alignment.RefID || varRefID == alignment.RefID && next.second < alignment.Position) {
- return loadNextPositionWithInputVariant();
+ if (!hasMoreAlignments || varRefID < alignment.REFID || (varRefID == alignment.REFID && next.second < alignment.POSITION)) {
+ return loadNextPositionWithInputVariant();
} else {
- loadReferenceSequence(alignment);
+ loadReferenceSequence(alignment);
}
} else {
- loadReferenceSequence(alignment);
+ loadReferenceSequence(alignment);
}
return true;
}
bool AlleleParser::loadNextPositionWithInputVariant(void) {
- pair<int, long> next = nextInputVariantPosition();
- if (next.first != -1) {
- //cerr << "Next is " << next.first << ":" << next.second << endl;
- loadReferenceSequence(referenceIDToName[next.first]);
- currentPosition = next.second;
- rightmostHaplotypeBasisAllelePosition = currentPosition;
- return true;
- } else {
- return false;
- }
+ pair<int, long> next = nextInputVariantPosition();
+ if (next.first != -1) {
+ //cerr << "Next is " << next.first << ":" << next.second << endl;
+ loadReferenceSequence(referenceIDToName[next.first]);
+ currentPosition = next.second;
+ rightmostHaplotypeBasisAllelePosition = currentPosition;
+ return true;
+ } else {
+ return false;
+ }
}
// alignment-based method for loading the first bit of our reference sequence
-void AlleleParser::loadReferenceSequence(BamAlignment& alignment) {
- loadReferenceSequence(referenceIDToName[alignment.RefID]);
- currentPosition = alignment.Position;
+void AlleleParser::loadReferenceSequence(BAMALIGN& alignment) {
+ loadReferenceSequence(referenceIDToName[alignment.REFID]);
+ currentPosition = alignment.POSITION;
}
void AlleleParser::loadReferenceSequence(string& seqname) {
if (currentSequenceName != seqname) {
currentSequenceName = seqname;
currentSequenceStart = 0;
- currentRefID = bamMultiReader.GetReferenceID(currentSequenceName);
- currentSequence = uppercase(reference.getSequence(currentSequenceName));
- int i = 0; // check the first few characters and verify they are not garbage
- for (string::iterator citr = currentSequence.begin();
- i < 100 && citr != currentSequence.end(); ++citr, ++i) {
- char c = *citr;
- if (c != 'A' && c != 'T' && c != 'G' && c != 'C' && c != 'N') {
- ERROR("Found non-DNA character " << c << " at position " << i << " in " << seqname << endl
- << ". Is your reference compressed or corrupted? "
- << "freebayes requires an uncompressed reference sequence.");
- exit(1);
- }
+ currentRefID = bamMultiReader.GETREFID(currentSequenceName);
+ currentSequence = uppercase(reference.getRawSequence(currentSequenceName));
+ // check the first few characters and verify they are not garbage
+ string validBases = "ACGTURYKMSWBDHVN-";
+ size_t found = currentSequence.substr(0, 100).find_first_not_of(validBases);
+ if (found != string::npos) {
+ ERROR("Found non-DNA character " << currentSequence.at(found)
+ << " at position " << found << " in " << seqname << endl
+ << "Is your reference compressed or corrupted? "
+ << "freebayes requires an uncompressed reference sequence.");
+ exit(1);
}
+ currentSequence = reference.getSequence(currentSequenceName);
}
}
@@ -653,18 +746,18 @@ void AlleleParser::loadTargets(void) {
size_t foundRangeSep = region.find(sep, foundFirstColon);
if (foundRangeSep == string::npos) {
sep = "-";
- foundRangeSep = region.find("-", foundFirstColon);
+ foundRangeSep = region.find(sep, foundFirstColon);
}
if (foundRangeSep == string::npos) {
- startPos = atoi(region.substr(foundFirstColon + 1).c_str());
+ startPos = stringToInt(region.substr(foundFirstColon + 1));
// differ from bamtools in this regard, in that we process only
// the specified position if a range isn't given
stopPos = startPos + 1;
} else {
- startPos = atoi(region.substr(foundFirstColon + 1, foundRangeSep - foundFirstColon).c_str());
+ startPos = stringToInt(region.substr(foundFirstColon + 1, foundRangeSep - foundFirstColon).c_str());
// if we have range sep specified, but no second number, read to the end of sequence
if (foundRangeSep + sep.size() != region.size()) {
- stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
+ stopPos = stringToInt(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
} else {
stopPos = -1;
}
@@ -712,12 +805,12 @@ void AlleleParser::loadTargetsFromBams(void) {
// otherwise, if we weren't given a region string or targets file, analyze
// all reference sequences from BAM file
DEBUG2("no targets specified, using all targets from BAM files");
- RefVector::iterator refIter = referenceSequences.begin();
- RefVector::iterator refEnd = referenceSequences.end();
+ REFVEC::iterator refIter = referenceSequences.begin();
+ REFVEC::iterator refEnd = referenceSequences.end();
for( ; refIter != refEnd; ++refIter) {
- RefData refData = *refIter;
- string refName = refData.RefName;
- BedTarget bd(refName, 0, refData.RefLength); // 0-based inclusive internally
+ REFDATA refData = *refIter;
+ string refName = refData.REFNAME;
+ BedTarget bd(refName, 0, refData.REFLEN); // 0-based inclusive internally
DEBUG2("will process reference sequence " << refName << ":" << bd.left << ".." << bd.right + 1);
targets.push_back(bd);
}
@@ -740,11 +833,11 @@ void AlleleParser::loadSampleCNVMap(void) {
// in the sampleCNV map. note that the reference "sample" is named after
// the current reference sequence.
if (!parameters.diploidReference) {
- for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
- sampleCNV.setPloidy(referenceSampleName, r->RefName, 0, r->RefLength, 1);
+ for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
+ sampleCNV.setPloidy(referenceSampleName, r->REFNAME, 0, r->REFLEN, 1);
}
}
-
+
}
int AlleleParser::currentSamplePloidy(string const& sample) {
@@ -851,8 +944,8 @@ AlleleParser::~AlleleParser(void) {
}
// position of alignment relative to current sequence
-int AlleleParser::currentSequencePosition(const BamAlignment& alignment) {
- return alignment.Position - currentSequenceStart;
+int AlleleParser::currentSequencePosition(const BAMALIGN& alignment) {
+ return alignment.POSITION - currentSequenceStart;
}
// relative current position within the cached currentSequence
@@ -900,14 +993,14 @@ bool AlleleParser::isCpG(string& altbase) {
}
}
-void capBaseQuality(BamAlignment& alignment, int baseQualityCap) {
- string& rQual = alignment.Qualities;
- char qualcap = qualityInt2Char(baseQualityCap);
- for (string::iterator c = rQual.begin(); c != rQual.end(); ++c) {
- if (qualityChar2ShortInt(*c) > baseQualityCap) {
- *c = qualcap;
- }
+void capBaseQuality(BAMALIGN& alignment, int baseQualityCap) {
+ string rQual = alignment.QUALITIES;
+ char qualcap = qualityInt2Char(baseQualityCap);
+ for (string::iterator c = rQual.begin(); c != rQual.end(); ++c) {
+ if (qualityChar2ShortInt(*c) > baseQualityCap) {
+ *c = qualcap;
}
+ }
}
void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int maxComplexGap, bool boundIndels) {
@@ -946,7 +1039,7 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
Allele& lastAllele = alleles.back();
if (isEmptyAllele(newAllele) ||
- newAllele.isReference() && newAllele.referenceLength == 0) {
+ (newAllele.isReference() && newAllele.referenceLength == 0)) {
// do nothing
} else if (newAllele.isReference() && isUnflankedIndel(lastAllele)) {
// add flanking base to indel, ensuring haplotype length of 2 for all indels
@@ -1074,7 +1167,7 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
} else {
AlleleType atype = ALLELE_COMPLEX;
if (lastAllele.isSNP() || lastAllele.isMNP()) {
- if (lastCigar.back().second == "X" && newAllele.isSNP() || newAllele.isMNP()) {
+ if ((lastCigar.back().second == "X" && newAllele.isSNP()) || newAllele.isMNP()) {
atype = ALLELE_MNP;
}
}
@@ -1108,10 +1201,10 @@ void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength
pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1)) {
//cerr << "the vcf line " << haplotypeVariantInputFile.line << endl;
// get the variants in the target region
- vcf::Variant var(haplotypeVariantInputFile);
+ vcflib::Variant var(haplotypeVariantInputFile);
while (haplotypeVariantInputFile.getNextVariant(var)) {
//cerr << "input variant: " << var << endl;
-
+
// the following stanza is for parsed
// alternates. instead use whole haplotype calls, as
// alternates can be parsed prior to providing the
@@ -1122,9 +1215,9 @@ void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength
}
*/
- map<string, vector<vcf::VariantAllele> > variants = var.parsedAlternates();
- for (map<string, vector<vcf::VariantAllele> >::iterator a = variants.begin(); a != variants.end(); ++a) {
- for (vector<vcf::VariantAllele>::iterator v = a->second.begin(); v != a->second.end(); ++v) {
+ map<string, vector<vcflib::VariantAllele> > variants = var.parsedAlternates();
+ for (map<string, vector<vcflib::VariantAllele> >::iterator a = variants.begin(); a != variants.end(); ++a) {
+ for (vector<vcflib::VariantAllele>::iterator v = a->second.begin(); v != a->second.end(); ++v) {
//cerr << v->ref << "/" << v->alt << endl;
if (v->ref != v->alt) {
//cerr << "basis allele " << v->position << " " << v->ref << "/" << v->alt << endl;
@@ -1176,13 +1269,12 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
int basesRight,
string& readSequence,
string& sampleName,
- BamAlignment& alignment,
+ BAMALIGN& alignment,
string& sequencingTech,
long double qual,
string& qualstr
) {
-
string cigar;
int reflen = length;
@@ -1220,7 +1312,7 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
// NB, if we are using haplotype basis alleles the algorithm forces
// alleles that aren't in the haplotype basis set into the reference space
if (type != ALLELE_REFERENCE
- && type != ALLELE_NULL
+ && type != ALLELE_NULL
&& !allowedHaplotypeBasisAllele(pos + 1,
refSequence,
readSequence)) {
@@ -1284,7 +1376,7 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
while (minEntropy > 0 && // ignore if turned off
repeatRightBoundary - currentSequenceStart < currentSequence.size() && //guard
entropy(currentSequence.substr(start, repeatRightBoundary - pos)) < minEntropy) {
- //cerr << "entropy of " << entropy(currentSequence.substr(start, repeatRightBoundary - pos)) << " is too low, ";
+ //cerr << "entropy of " << entropy(currentSequence.substr(start, repeatRightBoundary - pos)) << " is too low, ";
//cerr << "increasing rought boundary to ";
++repeatRightBoundary;
//cerr << repeatRightBoundary << endl;
@@ -1299,6 +1391,8 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
}
}
+ string qnamer = alignment.QNAME;
+
return Allele(type,
currentSequenceName,
pos,
@@ -1310,58 +1404,58 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
basesRight,
readSequence,
sampleName,
- alignment.Name,
+ qnamer,
ra.readgroup,
sequencingTech,
- !alignment.IsReverseStrand(),
+ !alignment.ISREVERSESTRAND,
max(qual, (long double) 0), // ensure qual is at least 0
qualstr,
- alignment.MapQuality,
- alignment.IsPaired(),
- alignment.IsMateMapped(),
- alignment.IsProperPair(),
+ alignment.MAPPINGQUALITY,
+ alignment.ISPAIRED,
+ alignment.ISMATEMAPPED,
+ alignment.ISPROPERPAIR,
cigar,
&ra.alleles,
- alignment.Position,
- alignment.GetEndPosition());
+ alignment.POSITION,
+ alignment.ENDPOSITION);
}
-RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech) {
+RegisteredAlignment& AlleleParser::registerAlignment(BAMALIGN& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech) {
- string rDna = alignment.QueryBases;
- string rQual = alignment.Qualities;
+ string rDna = alignment.QUERYBASES;
+ string rQual = alignment.QUALITIES;
int rp = 0; // read position, 0-based relative to read
int csp = currentSequencePosition(alignment); // current sequence position, 0-based relative to currentSequence
- int sp = alignment.Position; // sequence position
-
+ int sp = alignment.POSITION; // sequence position
if (usingHaplotypeBasisAlleles) {
- updateHaplotypeBasisAlleles(sp, alignment.AlignedBases.size());
+ updateHaplotypeBasisAlleles(sp, alignment.ALIGNEDBASES);
}
#ifdef VERBOSE_DEBUG
if (parameters.debug2) {
DEBUG2("registering alignment " << rp << " " << csp << " " << sp << endl <<
- "alignment readName " << alignment.Name << endl <<
- "alignment isPaired " << alignment.IsPaired() << endl <<
- "alignment isMateMapped " << alignment.IsMateMapped() << endl <<
- "alignment isProperPair " << alignment.IsProperPair() << endl <<
- "alignment mapQual " << alignment.MapQuality << endl <<
- "alignment sampleID " << sampleName << endl <<
- "alignment position " << alignment.Position << endl <<
- "alignment length " << alignment.Length << endl <<
- "alignment AlignedBases.size() " << alignment.AlignedBases.size() << endl <<
- "alignment GetEndPosition() " << alignment.GetEndPosition() << endl <<
- "alignment end position " << alignment.Position + alignment.AlignedBases.size());
+ "alignment readName " << alignment.QNAME << endl <<
+ "alignment isPaired " << alignment.ISPAIRED << endl <<
+ "alignment isMateMapped " << alignment.ISMATEMAPPED << endl <<
+ "alignment isProperPair " << alignment.ISPROPERPAIR << endl <<
+ "alignment mapQual " << alignment.MAPPINGQUALITY << endl <<
+ "alignment sampleID " << sampleName << endl <<
+ "alignment position " << alignment.POSITION << endl <<
+ "alignment length " << alignment.ALIGNMENTLENGTH << endl <<
+ "alignment AlignedBases.size() " << alignment.ALIGNEDBASES << endl <<
+ "alignment GetEndPosition() " << alignment.ENDPOSITION << endl <<
+ "alignment end position " << alignment.POSITION + alignment.ALIGNEDBASES);
stringstream cigarss;
int alignedLength = 0;
- for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin(); c != alignment.CigarData.end(); ++c) {
- cigarss << c->Type << c->Length;
- if (c->Type == 'D')
- alignedLength += c->Length;
- if (c->Type == 'M')
- alignedLength += c->Length;
+ CIGAR cig = alignment.GETCIGAR;
+ for (CIGAR::const_iterator c = cig.begin(); c != cig.end(); ++c) {
+ cigarss << c->CIGTYPE << c->CIGLEN;
+ if (c->CIGTYPE == 'D')
+ alignedLength += c->CIGLEN;
+ if (c->CIGTYPE == 'M')
+ alignedLength += c->CIGLEN;
}
DEBUG2("alignment cigar " << cigarss.str());
@@ -1369,9 +1463,9 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
DEBUG2("current sequence pointer: " << csp);
DEBUG2("read: " << rDna);
- DEBUG2("aligned bases: " << alignment.AlignedBases);
- DEBUG2("qualities: " << alignment.Qualities);
- DEBUG2("reference seq: " << currentSequence.substr(csp, alignment.AlignedBases.size()));
+ DEBUG2("aligned bases: " << alignment.QUERYBASES);
+ DEBUG2("qualities: " << alignment.QUALITIES);
+ DEBUG2("reference seq: " << currentSequence.substr(csp, alignment.ALIGNEDBASES));
}
#endif
@@ -1381,7 +1475,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
*
* Also, we don't store the entire undelying sequence; just the subsequence
* that matches our current target region.
- *
+ *
* As we step through a match sequence, we look for mismatches. When we
* see one we set a positional flag indicating the location, and we emit a
* 'Reference' allele that stretches from the the base after the last
@@ -1394,14 +1488,24 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
*
*/
- vector<bool> indelMask (alignment.AlignedBases.size(), false);
+ /* std::cerr << "********" << std::endl
+ << alignment.QueryBases << std::endl
+ << alignment.AlignedBases << std::endl;
+ vector<CigarOp>::const_iterator cigarIter2 = alignment.CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd2 = alignment.CigarData.end();
+ for (; cigarIter2 != cigarEnd2; ++cigarIter2)
+ std::cerr << cigarIter2->Length << cigarIter2->Type;
+ std::cerr << std::endl;
+ */
+ vector<bool> indelMask (alignment.ALIGNEDBASES, false);
- vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = alignment.CigarData.end();
+ CIGAR cigar = alignment.GETCIGAR;
+ CIGAR::const_iterator cigarIter = cigar.begin();
+ CIGAR::const_iterator cigarEnd = cigar.end();
for ( ; cigarIter != cigarEnd; ++cigarIter ) {
- int l = cigarIter->Length;
- char t = cigarIter->Type;
- DEBUG2("cigar item: " << t << l);
+ int l = cigarIter->CIGLEN;
+ char t = cigarIter->CIGTYPE;
+ DEBUG2("cigar item: " << t << l);
if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
int firstMatch = csp; // track the first match after a mismatch, for recording 'reference' alleles
@@ -1421,10 +1525,11 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
b = rDna.at(rp);
} catch (std::out_of_range outOfRange) {
cerr << "Exception: Cannot read past the end of the alignment's sequence." << endl
- << alignment.Name << endl
+ << alignment.QNAME << endl
<< currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl
- << alignment.AlignedBases << endl
- << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl;
+ //<< alignment.AlignedBases << endl
+ << currentSequence.substr(csp, alignment.ALIGNEDBASES) << endl;
+ cerr << " RP " << rp << " " << rDna <<" len " << rDna.length() << std::endl;
abort();
}
@@ -1436,13 +1541,13 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
try {
sb = currentSequence.at(csp);
} catch (std::out_of_range outOfRange) {
- cerr << "Exception: Unable to read reference sequence base past end of current cached sequence." << endl
+ cerr << "Exception: Unable to read reference sequence base past end of current cached sequence." << endl
<< currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl
- << alignment.Position << "-" << alignment.GetEndPosition() << endl
- << "alignment: " << alignment.AlignedBases << endl
- << "currentSequence: " << currentSequence << endl
- << "currentSequence matching: " << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl;
- //abort();
+ << alignment.POSITION << "-" << alignment.ENDPOSITION << endl
+ //<< "alignment: " << alignment.AlignedBases << endl
+ << "currentSequence: " << currentSequence << endl
+ << "currentSequence matching: " << currentSequence.substr(csp, alignment.ALIGNEDBASES) << endl;
+ //abort();
break;
}
@@ -1460,12 +1565,12 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length,
length,
rp, // bases left (for first base in ref allele)
- alignment.QueryBases.size() - rp, // bases right (for first base in ref allele)
+ alignment.SEQLEN - rp, // bases right (for first base in ref allele)
readSequence,
sampleName,
alignment,
sequencingTech,
- alignment.MapQuality, // reference allele quality == mapquality
+ alignment.MAPPINGQUALITY, // reference allele quality == mapquality
qualstr),
parameters.allowComplex, parameters.maxComplexGap);
}
@@ -1503,7 +1608,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length + j,
1,
rp - length - j, // bases left
- alignment.QueryBases.size() - rp + j, // bases right
+ alignment.SEQLEN - rp + j, // bases right
rs,
sampleName,
alignment,
@@ -1511,7 +1616,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
lqual,
qualp),
parameters.allowComplex, parameters.maxComplexGap);
-
+
} else {
ra.addAllele(
makeAllele(ra,
@@ -1519,7 +1624,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length + j,
1,
rp - length - j, // bases left
- alignment.QueryBases.size() - rp + j, // bases right
+ alignment.SEQLEN - rp + j, // bases right
rs,
sampleName,
alignment,
@@ -1553,7 +1658,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length + j,
1,
rp - length - j, // bases left
- alignment.QueryBases.size() - rp + j, // bases right
+ alignment.SEQLEN - rp + j, // bases right
rs,
sampleName,
alignment,
@@ -1561,7 +1666,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
lqual,
qualp),
parameters.allowComplex, parameters.maxComplexGap);
-
+
} else {
ra.addAllele(
makeAllele(ra,
@@ -1569,7 +1674,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length + j,
1,
rp - length - j, // bases left
- alignment.QueryBases.size() - rp + j, // bases right
+ alignment.SEQLEN - rp + j, // bases right
rs,
sampleName,
alignment,
@@ -1592,12 +1697,12 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - length,
length,
rp, // bases left (for first base in ref allele)
- alignment.QueryBases.size() - rp, // bases right (for first base in ref allele)
+ alignment.SEQLEN - rp, // bases right (for first base in ref allele)
readSequence,
sampleName,
alignment,
sequencingTech,
- alignment.MapQuality, // ... hmm
+ alignment.MAPPINGQUALITY, // ... hmm
qualstr),
parameters.allowComplex, parameters.maxComplexGap);
}
@@ -1655,7 +1760,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
if (qual >= parameters.BQL2) {
//ra.mismatches += l;
for (int i=0; i<l; i++) {
- indelMask[sp - alignment.Position + i] = true;
+ indelMask[sp - alignment.POSITION + i] = true;
}
}
@@ -1663,8 +1768,9 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
// some aligners like to report deletions at the beginnings and ends of reads.
// without any sequence in the read to support this, it is hard to believe
// that these deletions are real, so we ignore them here.
- if (cigarIter != alignment.CigarData.begin() // guard against deletion at beginning
- && (cigarIter+1) != alignment.CigarData.end() // and against deletion at end
+ CIGAR cigar = alignment.GETCIGAR;
+ if (cigarIter != cigar.begin() // guard against deletion at beginning
+ && (cigarIter+1) != cigar.end() // and against deletion at end
&& allATGC(refseq)) {
string nullstr;
ra.addAllele(
@@ -1673,7 +1779,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp,
l,
rp, // bases left (for first base in ref allele)
- alignment.QueryBases.size() - rp, // bases right (for first base in ref allele)
+ alignment.SEQLEN - rp, // bases right (for first base in ref allele)
nullstr, // no read sequence for deletions
sampleName,
alignment,
@@ -1735,7 +1841,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
if (qual >= parameters.BQL2) {
//ra.mismatches += l;
- indelMask[sp - alignment.Position] = true;
+ indelMask[sp - alignment.POSITION] = true;
}
string readseq = rDna.substr(rp, l);
@@ -1747,7 +1853,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp,
l,
rp - l, // bases left (for first base in ref allele)
- alignment.QueryBases.size() - rp, // bases right (for first base in ref allele)
+ alignment.SEQLEN - rp, // bases right (for first base in ref allele)
readseq,
sampleName,
alignment,
@@ -1766,7 +1872,7 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
// nothing to do, soft clip is beyond the beginning of the reference
} else {
string qualstr = rQual.substr(rp, l);
- string readseq = alignment.QueryBases.substr(rp, l);
+ string readseq = alignment.QUERYBASES.substr(rp, l);
// skip these bases in the read
ra.addAllele(
makeAllele(ra,
@@ -1774,12 +1880,12 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
sp - l,
l,
rp - l, // bases left
- alignment.QueryBases.size() - rp, // bases right
+ alignment.SEQLEN - rp, // bases right
readseq,
sampleName,
alignment,
sequencingTech,
- alignment.MapQuality,
+ alignment.MAPPINGQUALITY,
qualstr),
parameters.allowComplex, parameters.maxComplexGap);
}
@@ -1906,8 +2012,8 @@ void AlleleParser::updateAlignmentQueue(long int position,
bool gettingPartials) {
DEBUG2("updating alignment queue");
- DEBUG2("currentPosition = " << position
- << "; currentSequenceStart = " << currentSequenceStart
+ DEBUG2("currentPosition = " << position
+ << "; currentSequenceStart = " << currentSequenceStart
<< "; currentSequence end = " << currentSequence.size() + currentSequenceStart);
// make sure we have sequence for the *first* alignment
@@ -1916,25 +2022,30 @@ void AlleleParser::updateAlignmentQueue(long int position,
// push to the front until we get to an alignment that doesn't overlap our
// current position or we reach the end of available alignments
// filter input reads; only allow mapped reads with a certain quality
- DEBUG2("currentAlignment.Position == " << currentAlignment.Position
- << ", currentAlignment.AlignedBases.size() == " << currentAlignment.AlignedBases.size()
+ DEBUG2("currentAlignment.Position == " << currentAlignment.POSITION
+ << ", currentAlignment.AlignedBases.size() == " << currentAlignment.ALIGNEDBASES
<< ", currentPosition == " << position
<< ", currentSequenceStart == " << currentSequenceStart
<< " .. + currentSequence.size() == " << currentSequenceStart + currentSequence.size()
);
if (hasMoreAlignments
- && currentAlignment.Position <= position
- && currentAlignment.RefID == currentRefID) {
+ && currentAlignment.POSITION <= position
+ && currentAlignment.REFID == currentRefID) {
do {
DEBUG2("top of alignment parsing loop");
- DEBUG("alignment: " << currentAlignment.Name);
+ DEBUG("alignment: " << currentAlignment.QNAME);
// get read group, and map back to a sample name
string readGroup;
+#ifdef HAVE_BAMTOOLS
if (!currentAlignment.GetTag("RG", readGroup)) {
+#else
+ readGroup = currentAlignment.GetZTag("RG");
+ if (readGroup.empty()) {
+#endif
if (!oneSampleAnalysis) {
ERROR("Couldn't find read group id (@RG tag) for BAM Alignment " <<
- currentAlignment.Name << " at position " << position
+ currentAlignment.QNAME << " at position " << position
<< " in sequence " << currentSequence << " EXITING!");
exit(1);
} else {
@@ -1943,7 +2054,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
} else {
if (oneSampleAnalysis) {
ERROR("No read groups specified in BAM header, but alignment " <<
- currentAlignment.Name << " at position " << position
+ currentAlignment.QNAME << " at position " << position
<< " in sequence " << currentSequence << " has a read group.");
exit(1);
}
@@ -1956,33 +2067,37 @@ void AlleleParser::updateAlignmentQueue(long int position,
}
// skip this alignment if we are not using duplicate reads (we remove them by default)
- if (currentAlignment.IsDuplicate() && !parameters.useDuplicateReads) {
- //DEBUG("skipping alignment " << currentAlignment.Name << " because it is a duplicate read");
+ if (currentAlignment.ISDUPLICATE && !parameters.useDuplicateReads) {
+ DEBUG("skipping alignment " << currentAlignment.QNAME << " because it is a duplicate read");
continue;
}
// skip unmapped alignments, as they cannot be used in the algorithm
- if (!currentAlignment.IsMapped()) {
- //DEBUG("skipping alignment " << currentAlignment.Name << " because it is not mapped");
+ if (!currentAlignment.ISMAPPED) {
+ DEBUG("skipping alignment " << currentAlignment.QNAME << " because it is not mapped");
continue;
}
// skip alignments which have no aligned bases
- if (currentAlignment.AlignedBases.size() == 0) {
- //DEBUG("skipping alignment " << currentAlignment.Name << " because it has no aligned bases");
+ if (currentAlignment.ALIGNEDBASES == 0) {
+ DEBUG("skipping alignment " << currentAlignment.QNAME << " because it has no aligned bases");
continue;
}
// skip alignments which are non-primary
+#ifdef HAVE_BAMTOOLS
if (!currentAlignment.IsPrimaryAlignment()) {
+#else
+ if (currentAlignment.SecondaryFlag()) {
+#endif
//DEBUG("skipping alignment " << currentAlignment.Name << " because it is not marked primary");
continue;
}
- if (!gettingPartials && currentAlignment.GetEndPosition() < position) {
- cerr << currentAlignment.Name << " at " << currentSequenceName << ":" << currentAlignment.Position << " is out of order!"
- << " expected after " << position << endl;
- continue;
+ if (!gettingPartials && currentAlignment.ENDPOSITION < position) {
+ cerr << currentAlignment.QNAME << " at " << currentSequenceName << ":" << currentAlignment.POSITION << " is out of order!"
+ << " expected after " << position << endl;
+ continue;
}
// otherwise, get the sample name and register the alignment to generate a sequence of alleles
@@ -1990,12 +2105,12 @@ void AlleleParser::updateAlignmentQueue(long int position,
// such as mismatches
// initially skip reads with low mapping quality (what happens if MapQuality is not in the file)
- if (currentAlignment.MapQuality >= parameters.MQL0) {
+ if (currentAlignment.MAPPINGQUALITY >= parameters.MQL0) {
// extend our cached reference sequence to allow processing of this alignment
//extendReferenceSequence(currentAlignment);
// left realign indels
if (parameters.leftAlignIndels) {
- int length = currentAlignment.GetEndPosition() - currentAlignment.Position + 1;
+ int length = currentAlignment.ENDPOSITION - currentAlignment.POSITION + 1;
stablyLeftAlign(currentAlignment,
currentSequence.substr(currentSequencePosition(currentAlignment), length));
}
@@ -2012,7 +2127,8 @@ void AlleleParser::updateAlignmentQueue(long int position,
}
// decomposes alignment into a set of alleles
// here we get the deque of alignments ending at this alignment's end position
- deque<RegisteredAlignment>& rq = registeredAlignments[currentAlignment.GetEndPosition()];
+ deque<RegisteredAlignment>& rq = registeredAlignments[currentAlignment.ENDPOSITION];
+
// and insert the registered alignment into that deque
rq.push_front(RegisteredAlignment(currentAlignment, parameters));
RegisteredAlignment& ra = rq.front();
@@ -2020,7 +2136,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
// backtracking if we have too many mismatches
// or if there are no recorded alleles
if (ra.alleles.empty()
- || ((float) ra.mismatches / (float) currentAlignment.QueryBases.size()) > parameters.readMaxMismatchFraction
+ || ((float) ra.mismatches / (float) currentAlignment.SEQLEN) > parameters.readMaxMismatchFraction
|| ra.mismatches > parameters.RMU
|| ra.snpCount > parameters.readSnpLimit
|| ra.indelCount > parameters.readIndelLimit) {
@@ -2031,10 +2147,10 @@ void AlleleParser::updateAlignmentQueue(long int position,
newAlleles.push_back(&*allele);
}
}
- }
- } while ((hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment))
- && currentAlignment.Position <= position
- && currentAlignment.RefID == currentRefID);
+ }
+ } while ((hasMoreAlignments = GETNEXT(bamMultiReader, currentAlignment))
+ && currentAlignment.POSITION <= position
+ && currentAlignment.REFID == currentRefID);
}
DEBUG2("... finished pushing new alignments");
@@ -2086,7 +2202,8 @@ pair<int, long int> AlleleParser::nextInputVariantPosition(void) {
return make_pair(currentRefID, ic->first);
} else {
// find next chrom with input alleles
- map<int, map<long, vector<Allele> > >::iterator nc = inputVariantAlleles.upper_bound(currentRefID);
+ map<int, map<long, vector<Allele> > >::iterator nc
+ = inputVariantAlleles.upper_bound(currentRefID);
if (nc != inputVariantAlleles.end()) {
return make_pair(nc->first, nc->second.begin()->first);
} else {
@@ -2107,35 +2224,39 @@ void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {
if (!usingVariantInputAlleles) return;
// get the variants in the target region
- vcf::Variant var(variantCallInputFile);
+ vcflib::Variant var(variantCallInputFile);
if (!seq.empty()) {
variantCallInputFile.setRegion(seq, start, end);
}
bool ok;
- while (ok = variantCallInputFile.getNextVariant(*currentVariant)) {
+ while ((ok = variantCallInputFile.getNextVariant(*currentVariant))) {
long int pos = currentVariant->position - 1;
// get alternate alleles
bool includePreviousBaseForIndels = true;
- map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
+ map<string, vector<vcflib::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
// TODO this would be a nice option: why does it not work?
- //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
- vector< vector<vcf::VariantAllele> > orderedVariantAlleles;
- for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {
+ //map<string, vector<vcflib::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
+ vector< vector<vcflib::VariantAllele> > orderedVariantAlleles;
+ for (vector<string>::iterator a = currentVariant->alt.begin();
+ a != currentVariant->alt.end(); ++a) {
orderedVariantAlleles.push_back(variantAlleles[*a]);
}
vector<Allele> genotypeAlleles;
set<long int> alternatePositions;
- for (vector< vector<vcf::VariantAllele> >::iterator g = orderedVariantAlleles.begin(); g != orderedVariantAlleles.end(); ++g) {
+ for (vector< vector<vcflib::VariantAllele> >::iterator
+ g = orderedVariantAlleles.begin();
+ g != orderedVariantAlleles.end(); ++g) {
- vector<vcf::VariantAllele>& altAllele = *g;
+ vector<vcflib::VariantAllele>& altAllele = *g;
vector<Allele> alleles;
- for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
- vcf::VariantAllele& variant = *v;
+ for (vector<vcflib::VariantAllele>::iterator v = altAllele.begin();
+ v != altAllele.end(); ++v) {
+ vcflib::VariantAllele& variant = *v;
long int allelePos = variant.position - 1;
AlleleType type;
string alleleSequence = variant.alt;
@@ -2172,9 +2293,9 @@ void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {
allelePos -= 1;
reflen = len + 2;
alleleSequence =
- uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
+ reference.getSubSequence(currentVariant->sequenceName, allelePos, 1)
+ alleleSequence
- + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1+len, 1));
+ + reference.getSubSequence(currentVariant->sequenceName, allelePos+1+len, 1);
cigar = "1M" + convert(len) + "D" + "1M";
} else {
// we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
@@ -2182,9 +2303,9 @@ void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {
// add previous base and post base to match format typically used for calling
allelePos -= 1;
alleleSequence =
- uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
+ reference.getSubSequence(currentVariant->sequenceName, allelePos, 1)
+ alleleSequence
- + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1, 1));
+ + reference.getSubSequence(currentVariant->sequenceName, allelePos+1, 1);
len = variant.alt.size() - var.ref.size();
cigar = "1M" + convert(len) + "I" + "1M";
reflen = 2;
@@ -2199,7 +2320,7 @@ void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {
genotypeAlleles.push_back(allele);
if (allele.type != ALLELE_REFERENCE) {
- inputVariantAlleles[bamMultiReader.GetReferenceID(currentVariant->sequenceName)][allele.position].push_back(allele);
+ inputVariantAlleles[bamMultiReader.GETREFID(currentVariant->sequenceName)][allele.position].push_back(allele);
alternatePositions.insert(allele.position);
}
}
@@ -2240,18 +2361,18 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
if (gotRegion) {
// get the variants in the target region
- vcf::Variant var(variantCallInputFile);
+ vcflib::Variant var(variantCallInputFile);
bool ok;
- while (ok = variantCallInputFile.getNextVariant(*currentVariant)) {
+ while ((ok = variantCallInputFile.getNextVariant(*currentVariant))) {
DEBUG("getting input alleles from input VCF at position " << currentVariant->sequenceName << ":" << currentVariant->position);
long int pos = currentVariant->position - 1;
// get alternate alleles
bool includePreviousBaseForIndels = true;
- map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
+ map<string, vector<vcflib::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
// TODO this would be a nice option: why does it not work?
- //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
- vector< vector<vcf::VariantAllele> > orderedVariantAlleles;
+ //map<string, vector<vcflib::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
+ vector< vector<vcflib::VariantAllele> > orderedVariantAlleles;
for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {
orderedVariantAlleles.push_back(variantAlleles[*a]);
}
@@ -2259,14 +2380,14 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
vector<Allele> genotypeAlleles;
set<long int> alternatePositions;
- for (vector< vector<vcf::VariantAllele> >::iterator g = orderedVariantAlleles.begin(); g != orderedVariantAlleles.end(); ++g) {
+ for (vector< vector<vcflib::VariantAllele> >::iterator g = orderedVariantAlleles.begin(); g != orderedVariantAlleles.end(); ++g) {
- vector<vcf::VariantAllele>& altAllele = *g;
+ vector<vcflib::VariantAllele>& altAllele = *g;
vector<Allele> alleles;
- for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
- vcf::VariantAllele& variant = *v;
+ for (vector<vcflib::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
+ vcflib::VariantAllele& variant = *v;
long int allelePos = variant.position - 1;
AlleleType type;
string alleleSequence = variant.alt;
@@ -2303,9 +2424,9 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
allelePos -= 1;
reflen = len + 2;
alleleSequence =
- uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
+ reference.getSubSequence(currentSequenceName, allelePos, 1)
+ alleleSequence
- + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1+len, 1));
+ + reference.getSubSequence(currentSequenceName, allelePos+1+len, 1);
cigar = "1M" + convert(len) + "D" + "1M";
} else {
// we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
@@ -2313,9 +2434,9 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
// add previous base and post base to match format typically used for calling
allelePos -= 1;
alleleSequence =
- uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
+ reference.getSubSequence(currentSequenceName, allelePos, 1)
+ alleleSequence
- + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1, 1));
+ + reference.getSubSequence(currentSequenceName, allelePos+1, 1);
len = variant.alt.size() - var.ref.size();
cigar = "1M" + convert(len) + "I" + "1M";
reflen = 2;
@@ -2329,7 +2450,7 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
genotypeAlleles.push_back(allele);
if (allele.type != ALLELE_REFERENCE) {
- inputVariantAlleles[bamMultiReader.GetReferenceID(allele.referenceName)][allele.position].push_back(allele);
+ inputVariantAlleles[bamMultiReader.GETREFID(allele.referenceName)][allele.position].push_back(allele);
alternatePositions.insert(allele.position);
}
@@ -2534,7 +2655,7 @@ bool AlleleParser::toNextTarget(void) {
if (!loadTarget(++currentTarget)) {
continue;
}
- if (ok = getFirstAlignment()) {
+ if ((ok = getFirstAlignment())) {
break;
}
}
@@ -2592,11 +2713,18 @@ bool AlleleParser::loadTarget(BedTarget* target) {
currentPosition = currentTarget->left;
rightmostHaplotypeBasisAllelePosition = currentTarget->left;
+#ifdef HAVE_BAMTOOLS
if (!bamMultiReader.SetRegion(currentRefID, currentTarget->left, currentRefID, currentTarget->right + 1)) { // bamtools expects 0-based, half-open
ERROR("Could not SetRegion to " << currentTarget->seq << ":" << currentTarget->left << ".." << currentTarget->right + 1);
cerr << bamMultiReader.GetErrorString() << endl;
return false;
}
+#else
+ if (!bamMultiReader.SetRegion(SeqLib::GenomicRegion(currentRefID, currentTarget->left, currentTarget->right + 1))) { // bamtools expects 0-based, half-open
+ ERROR("Could not SetRegion to " << currentTarget->seq << ":" << currentTarget->left << ".." << currentTarget->right + 1);
+ return false;
+ }
+#endif
if (variantCallInputFile.is_open()) {
stringstream r;
@@ -2609,7 +2737,7 @@ bool AlleleParser::loadTarget(BedTarget* target) {
currentTarget->right + 1);
//return false;
} else {
- DEBUG("set region of variant input file to " <<
+ DEBUG("set region of variant input file to " <<
currentTarget->seq << ":" << currentTarget->left << ".." <<
currentTarget->right + 1);
}
@@ -2627,15 +2755,15 @@ bool AlleleParser::loadTarget(BedTarget* target) {
bool AlleleParser::getFirstAlignment(void) {
bool hasAlignments = true;
- if (!bamMultiReader.GetNextAlignment(currentAlignment)) {
- hasAlignments = false;
+ if (!GETNEXT(bamMultiReader, currentAlignment)) {
+ hasAlignments = false;
} else {
- while (!currentAlignment.IsMapped()) {
- if (!bamMultiReader.GetNextAlignment(currentAlignment)) {
- hasAlignments = false;
- break;
- }
- }
+ while (!currentAlignment.ISMAPPED) {
+ if (!GETNEXT(bamMultiReader, currentAlignment)) {
+ hasAlignments = false;
+ break;
+ }
+ }
}
if (hasAlignments) {
@@ -2718,9 +2846,9 @@ bool AlleleParser::toNextPosition(void) {
if (parameters.useStdin || targets.empty()) {
// here we loop over unaligned reads at the beginning of a target
// we need to get to a mapped read to figure out where we are
- while (hasMoreAlignments && !currentAlignment.IsMapped()) {
- hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment);
- }
+ while (hasMoreAlignments && !currentAlignment.ISMAPPED) {
+ hasMoreAlignments = GETNEXT(bamMultiReader, currentAlignment);
+ }
// determine if we have more alignments or not
if (!hasMoreAlignments) {
if (hasMoreInputVariants()) {
@@ -2739,11 +2867,13 @@ bool AlleleParser::toNextPosition(void) {
}
} else {
// step the position
- ++currentPosition;
+ if (!first_pos) {
+ ++currentPosition;
+ }
// if the current position of this alignment is outside of the reference sequence length
// we need to switch references
if (currentPosition >= reference.sequenceLength(currentSequenceName)
- || registeredAlignments.empty() && currentRefID != currentAlignment.RefID) {
+ || (registeredAlignments.empty() && currentRefID != currentAlignment.REFID)) {
DEBUG("at end of sequence");
clearRegisteredAlignments();
loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
@@ -2752,7 +2882,9 @@ bool AlleleParser::toNextPosition(void) {
}
} else {
// or if it's not we should step to the next position
- ++currentPosition;
+ if (!first_pos) {
+ ++currentPosition;
+ }
// if we've run off the right edge of a target, jump
if (currentPosition > currentTarget->right) {
// time to move to a new target
@@ -2835,8 +2967,7 @@ bool AlleleParser::dummyProcessNextTarget(void) {
return false;
}
- while (bamMultiReader.GetNextAlignment(currentAlignment)) {
- }
+ while (GETNEXT(bamMultiReader, currentAlignment)) { }
return true;
}
@@ -2880,7 +3011,7 @@ bool RegisteredAlignment::fitHaplotype(int haplotypeStart, int haplotypeLength,
vector<Allele> newAlleles;
int haplotypeEnd = haplotypeStart + haplotypeLength;
-
+
//if (containedAlleleTypes == ALLELE_REFERENCE) {
// return false;
//}
@@ -3086,8 +3217,8 @@ void AlleleParser::buildHaplotypeAlleles(
deque<RegisteredAlignment>& ras = registeredAlignments[i];
for (deque<RegisteredAlignment>::iterator r = ras.begin(); r != ras.end(); ++r) {
RegisteredAlignment& ra = *r;
- if (ra.start > currentPosition && ra.start < currentPosition + haplotypeLength
- || ra.end > currentPosition && ra.end < currentPosition + haplotypeLength) {
+ if ((ra.start > currentPosition && ra.start < currentPosition + haplotypeLength)
+ || (ra.end > currentPosition && ra.end < currentPosition + haplotypeLength)) {
Allele* aptr;
bool allowPartials = true;
ra.fitHaplotype(currentPosition, haplotypeLength, aptr, allowPartials);
@@ -3138,7 +3269,7 @@ void AlleleParser::buildHaplotypeAlleles(
// for each non-reference allele within the haplotype length of this
// position, adjust the length and reference sequences of the adjacent
- // alleles
+ // alleles
DEBUG("fitting haplotype block " << currentPosition << " to " << currentPosition + haplotypeLength << ", " << haplotypeLength << "bp");
lastHaplotypeLength = haplotypeLength;
@@ -3224,7 +3355,7 @@ void AlleleParser::buildHaplotypeAlleles(
*/
Allele refAllele = genotypeAllele(ALLELE_REFERENCE,
- uppercase(reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength)),
+ reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength),
haplotypeLength,
convert(haplotypeLength)+"M",
haplotypeLength,
@@ -3346,7 +3477,7 @@ void AlleleParser::buildHaplotypeAlleles(
(*p)->setQuality();
(*p)->update(haplotypeLength);
}
-
+
// now add in partial observations collected from partially-overlapping reads
if (!partialHaplotypeObservations.empty()) {
samples.assignPartialSupport(alleles,
@@ -3397,7 +3528,7 @@ void AlleleParser::buildHaplotypeAlleles(
}
-bool AlleleParser::getCompleteObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& haplotypeObservations) {
+void AlleleParser::getCompleteObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& haplotypeObservations) {
for (map<long unsigned int, deque<RegisteredAlignment> >::iterator ras = registeredAlignments.begin(); ras != registeredAlignments.end(); ++ras) {
deque<RegisteredAlignment>& rq = ras->second;
for (deque<RegisteredAlignment>::iterator rai = rq.begin(); rai != rq.end(); ++rai) {
@@ -3439,7 +3570,7 @@ void AlleleParser::unsetAllProcessedFlags(void) {
// process the next length bp of alignments, so as to get allele observations partially overlapping our calling window
-bool AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& partials) {
+void AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& partials) {
//cerr << "getting partial observations of haplotype from " << currentPosition << " to " << currentPosition + haplotypeLength << endl;
vector<Allele*> newAlleles;
@@ -3458,8 +3589,8 @@ bool AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplo
deque<RegisteredAlignment>& ras = registeredAlignments[i];
for (deque<RegisteredAlignment>::iterator r = ras.begin(); r != ras.end(); ++r) {
RegisteredAlignment& ra = *r;
- if (ra.start > currentPosition && ra.start < currentPosition + haplotypeLength
- || ra.end > currentPosition && ra.end < currentPosition + haplotypeLength) {
+ if ((ra.start > currentPosition && ra.start < currentPosition + haplotypeLength)
+ || (ra.end > currentPosition && ra.end < currentPosition + haplotypeLength)) {
Allele* aptr;
bool allowPartials = true;
ra.fitHaplotype(currentPosition, haplotypeLength, aptr, allowPartials);
@@ -3534,9 +3665,9 @@ void AlleleParser::getAlleles(Samples& samples, int allowedAlleleTypes,
if (allowedAlleleTypes & allele.type
&& ((haplotypeLength > 1 &&
((allele.type == ALLELE_REFERENCE
- && allele.position <= currentPosition
+ && allele.position <= currentPosition
&& allele.position + allele.referenceLength >= currentPosition + haplotypeLength)
- ||
+ ||
(allele.position == currentPosition
&& allele.referenceLength == haplotypeLength)
||
@@ -3549,7 +3680,7 @@ void AlleleParser::getAlleles(Samples& samples, int allowedAlleleTypes,
((allele.type == ALLELE_REFERENCE
&& allele.position <= currentPosition
&& allele.position + allele.referenceLength > currentPosition)
- ||
+ ||
(allele.position == currentPosition)))
) ) {
allele.update(haplotypeLength);
@@ -3623,10 +3754,10 @@ Allele* AlleleParser::referenceAllele(int mapQ, int baseQ) {
string sequencingTech = "reference";
string baseQstr = "";
//baseQstr += qualityInt2Char(baseQ);
- Allele* allele = new Allele(ALLELE_REFERENCE,
+ Allele* allele = new Allele(ALLELE_REFERENCE,
currentSequenceName,
currentPosition,
- ¤tPosition,
+ ¤tPosition,
¤tReferenceBase,
1,
currentPosition + 1,
@@ -3699,7 +3830,7 @@ vector<Allele> AlleleParser::genotypeAlleles(
if (haplotypeLength == 1) {
altseq = currentReferenceBase;
} else {
- altseq = uppercase(reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength));
+ altseq = reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength);
}
}
unfilteredAlleles.push_back(make_pair(genotypeAllele(allele.type,
@@ -3715,7 +3846,7 @@ vector<Allele> AlleleParser::genotypeAlleles(
map<Allele, int> filteredAlleles;
- DEBUG("filtering genotype alleles which are not supported by at least " << parameters.minAltCount
+ DEBUG("filtering genotype alleles which are not supported by at least " << parameters.minAltCount
<< " observations comprising at least " << parameters.minAltFraction << " of the observations in a single individual");
for (vector<pair<Allele, int> >::iterator p = unfilteredAlleles.begin();
p != unfilteredAlleles.end(); ++p) {
@@ -3725,7 +3856,7 @@ vector<Allele> AlleleParser::genotypeAlleles(
DEBUG("genotype allele: " << genotypeAllele << " qsum " << qSum);
for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
- Sample& sample = s->second;
+ Sample& sample = s->second;
int alleleCount = 0;
int qsum = 0;
Sample::iterator c = sample.find(genotypeAllele.currentBase);
@@ -3739,10 +3870,10 @@ vector<Allele> AlleleParser::genotypeAlleles(
}
int observationCount = sample.observationCount();
if (qsum >= parameters.minAltQSum
- && alleleCount >= parameters.minAltCount
+ && alleleCount >= parameters.minAltCount
&& ((float) alleleCount / (float) observationCount) >= parameters.minAltFraction) {
- DEBUG(genotypeAllele << " has support of " << alleleCount
- << " in individual " << s->first << " (" << observationCount << " obs)" << " and fraction "
+ DEBUG(genotypeAllele << " has support of " << alleleCount
+ << " in individual " << s->first << " (" << observationCount << " obs)" << " and fraction "
<< (float) alleleCount / (float) observationCount);
filteredAlleles[genotypeAllele] = qSum;
break;
@@ -3915,7 +4046,7 @@ map<string, int> AlleleParser::repeatCounts(long int position, const string& seq
j += i;
++rightsteps;
}
- // if we went left and right a non-zero number of times,
+ // if we went left and right a non-zero number of times,
if (leftsteps + rightsteps > 1) {
counts[seq] = leftsteps + rightsteps;
}
diff --git a/src/AlleleParser.h b/src/AlleleParser.h
index 0eed6be..f7f4a6a 100644
--- a/src/AlleleParser.h
+++ b/src/AlleleParser.h
@@ -15,7 +15,7 @@
#include <cmath>
#include "split.h"
#include "join.h"
-#include "api/BamReader.h"
+
#include "BedReader.h"
#include "Parameters.h"
#include "Utility.h"
@@ -23,7 +23,7 @@
#include "Sample.h"
#include "Fasta.h"
#include "TryCatch.h"
-#include "api/BamMultiReader.h"
+
#include "Genotype.h"
#include "CNV.h"
#include "Result.h"
@@ -39,7 +39,6 @@
#define CACHED_BASIS_HAPLOTYPE_WINDOW 1000
using namespace std;
-using namespace BamTools;
// a structure holding information about our parameters
@@ -60,19 +59,19 @@ public:
int alleleTypes;
Parameters parameters;
- RegisteredAlignment(BamAlignment& alignment, Parameters parameters)
+ RegisteredAlignment(BAMALIGN& alignment, Parameters parameters)
//: alignment(alignment)
- : start(alignment.Position)
- , end(alignment.GetEndPosition())
- , refid(alignment.RefID)
- , name(alignment.Name)
+ : start(alignment.POSITION)
+ , end(alignment.ENDPOSITION)
+ , refid(alignment.REFID)
+ , name(alignment.QNAME)
, mismatches(0)
, snpCount(0)
, indelCount(0)
, alleleTypes(0)
, parameters(parameters)
{
- alignment.GetTag("RG", readgroup);
+ FILLREADGROUP(readgroup, alignment);
}
void addAllele(Allele allele, bool mergeComplex = true,
@@ -88,11 +87,11 @@ public:
AlleleFilter(long unsigned int s, long unsigned int e) : start(s), end(e) {}
// true of the allele is outside of our window
- bool operator()(Allele& a) {
+ bool operator()(Allele& a) {
return !(start >= a.position && end < a.position + a.length);
}
- bool operator()(Allele*& a) {
+ bool operator()(Allele*& a) {
return !(start >= a->position && end < a->position + a->length);
}
@@ -121,17 +120,16 @@ public:
bool operator<(const AllelicPrimitive& a, const AllelicPrimitive& b);
-void capBaseQuality(BamAlignment& alignment, int baseQualityCap);
-
+void capBaseQuality(BAMALIGN& alignment, int baseQualityCap);
class AlleleParser {
public:
Parameters parameters; // holds operational parameters passed at program invocation
-
+
AlleleParser(int argc, char** argv);
- ~AlleleParser(void);
+ ~AlleleParser(void);
vector<string> sampleList; // list of sample names, indexed by sample id
vector<string> sampleListFromBam; // sample names drawn from BAM file
@@ -149,7 +147,7 @@ public:
vector<string> referenceSequenceNames;
map<int, string> referenceIDToName;
string referenceSampleName;
-
+
// target regions
vector<BedTarget> targets;
// returns true if we are within a target
@@ -157,18 +155,18 @@ public:
bool inTarget(void);
// bamreader
- BamMultiReader bamMultiReader;
+ BAMREADER bamMultiReader;
// bed reader
BedReader bedReader;
// VCF
- vcf::VariantCallFile variantCallFile;
- vcf::VariantCallFile variantCallInputFile; // input variant alleles, to target analysis
- vcf::VariantCallFile haplotypeVariantInputFile; // input alleles which will be used to construct haplotype alleles
+ vcflib::VariantCallFile variantCallFile;
+ vcflib::VariantCallFile variantCallInputFile; // input variant alleles, to target analysis
+ vcflib::VariantCallFile haplotypeVariantInputFile; // input alleles which will be used to construct haplotype alleles
// input haplotype alleles
- //
+ //
// as calling progresses, a window of haplotype basis alleles from the flanking sequence
// map from starting position to length->alle
map<long int, vector<AllelicPrimitive> > haplotypeBasisAlleles; // this is in the current reference sequence
@@ -187,7 +185,7 @@ public:
int basesRight,
string& readSequence,
string& sampleName,
- BamAlignment& alignment,
+ BAMALIGN& alignment,
string& sequencingTech,
long double qual,
string& qualstr);
@@ -205,7 +203,7 @@ public:
map<string, map<long int, map<Allele, int> > > inputAlleleCounts; // drawn from input VCF
Sample* nullSample;
- bool loadNextPositionWithAlignmentOrInputVariant(BamAlignment& currentAlignment);
+ bool loadNextPositionWithAlignmentOrInputVariant(BAMALIGN& currentAlignment);
bool loadNextPositionWithInputVariant(void);
bool hasMoreInputVariants(void);
@@ -215,15 +213,14 @@ public:
void getInputAlleleCounts(vector<Allele>& genotypeAlleles, map<string, int>& inputAFs);
// reference names indexed by id
- vector<RefData> referenceSequences;
+ REFVEC referenceSequences;
// ^^ vector of objects containing:
//RefName; //!< Name of reference sequence
//RefLength; //!< Length of reference sequence
//RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence
- string bamHeader;
vector<string> bamHeaderLines;
-
+
void openBams(void);
void openOutputFile(void);
void getSampleNames(void);
@@ -235,7 +232,7 @@ public:
vector<int> currentPloidies(Samples& samples);
void loadBamReferenceSequenceNames(void);
void loadFastaReference(void);
- void loadReferenceSequence(BamAlignment& alignment);
+ void loadReferenceSequence(BAMALIGN& alignment);
void loadReferenceSequence(string& seqname);
string referenceSubstr(long int position, unsigned int length);
void loadTargets(void);
@@ -243,7 +240,7 @@ public:
bool getFirstVariant(void);
void loadTargetsFromBams(void);
void initializeOutputFiles(void);
- RegisteredAlignment& registerAlignment(BamAlignment& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech);
+ RegisteredAlignment& registerAlignment(BAMALIGN& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech);
void clearRegisteredAlignments(void);
void updateAlignmentQueue(long int position, vector<Allele*>& newAlleles, bool gettingPartials = false);
void updateInputVariants(long int pos, int referenceLength);
@@ -264,12 +261,12 @@ public:
bool loadTarget(BedTarget*);
bool toFirstTargetPosition(void);
bool toNextPosition(void);
- bool getCompleteObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& haplotypeObservations);
- bool getPartialObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& partials);
+ void getCompleteObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& haplotypeObservations);
+ void getPartialObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& partials);
bool dummyProcessNextTarget(void);
bool toNextTarget(void);
void setPosition(long unsigned int);
- int currentSequencePosition(const BamAlignment& alignment);
+ int currentSequencePosition(const BAMALIGN& alignment);
int currentSequencePosition();
void unsetAllProcessedFlags(void);
bool getNextAlleles(Samples& allelesBySample, int allowedAlleleTypes);
@@ -348,8 +345,8 @@ private:
int basesAfterCurrentTarget; // ........................................ after ...................
int currentRefID;
- BamAlignment currentAlignment;
- vcf::Variant* currentVariant;
+ BAMALIGN currentAlignment;
+ vcflib::Variant* currentVariant;
};
diff --git a/src/Fasta.cpp b/src/Fasta.cpp
index 6488de1..d883845 100644
--- a/src/Fasta.cpp
+++ b/src/Fasta.cpp
@@ -245,7 +245,17 @@ FastaReference::~FastaReference(void) {
delete index;
}
-string FastaReference::getSequence(string seqname) {
+string removeIupacBases(string& str) {
+ const string validBases = "ATGCN";
+ size_t found = str.find_first_not_of(validBases);
+ while (found != string::npos) {
+ str[found] = 'N';
+ found = str.find_first_not_of(validBases, found + 1);
+ }
+ return str;
+}
+
+string FastaReference::getRawSequence(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
int newlines_in_sequence = entry.length / entry.line_blen;
int seqlen = newlines_in_sequence + entry.length;
@@ -263,6 +273,11 @@ string FastaReference::getSequence(string seqname) {
return s;
}
+string FastaReference::getSequence(string seqname) {
+ string u = uppercase(getRawSequence(seqname));
+ return removeIupacBases(u);
+}
+
// TODO cleanup; odd function. use a map
string FastaReference::sequenceNameStartingWith(string seqnameStart) {
try {
@@ -273,7 +288,7 @@ string FastaReference::sequenceNameStartingWith(string seqnameStart) {
}
}
-string FastaReference::getSubSequence(string seqname, int start, int length) {
+string FastaReference::getRawSubSequence(string seqname, int start, int length) {
FastaIndexEntry entry = index->entry(seqname);
length = min(length, entry.length - start);
if (start < 0 || length < 1) {
@@ -301,6 +316,11 @@ string FastaReference::getSubSequence(string seqname, int start, int length) {
return s;
}
+string FastaReference::getSubSequence(string seqname, int start, int length) {
+ string u = uppercase(getRawSubSequence(seqname, start, length));
+ return removeIupacBases(u);
+}
+
long unsigned int FastaReference::sequenceLength(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
return entry.length;
diff --git a/src/Fasta.h b/src/Fasta.h
index e99cb7e..455cf96 100644
--- a/src/Fasta.h
+++ b/src/Fasta.h
@@ -17,6 +17,7 @@
#include <stdio.h>
#include <algorithm>
#include "LargeFileSupport.h"
+#include "Utility.h"
#include <sys/stat.h>
#include "split.h"
#include <stdlib.h>
@@ -62,9 +63,11 @@ class FastaReference {
FILE* file;
FastaIndex* index;
vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
+ string getRawSequence(string seqname);
string getSequence(string seqname);
// potentially useful for performance, investigate
// void getSequence(string seqname, string& sequence);
+ string getRawSubSequence(string seqname, int start, int length);
string getSubSequence(string seqname, int start, int length);
string sequenceNameStartingWith(string seqnameStart);
long unsigned int sequenceLength(string seqname);
diff --git a/src/Genotype.cpp b/src/Genotype.cpp
index 779a03d..bdde8c6 100644
--- a/src/Genotype.cpp
+++ b/src/Genotype.cpp
@@ -1303,7 +1303,7 @@ void addAllHomozygousCombos(
}
}
if (allSameAndHomozygous) {
- allelesWithHomozygousCombos[genotype->front().allele] == true;
+ allelesWithHomozygousCombos[genotype->front().allele] = true;
}
}
diff --git a/src/LeftAlign.cpp b/src/LeftAlign.cpp
index 78b7046..f937bfe 100644
--- a/src/LeftAlign.cpp
+++ b/src/LeftAlign.cpp
@@ -22,12 +22,13 @@
//
// In practice, we must call this function until the alignment is stabilized.
//
-bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
+bool leftAlign(BAMALIGN& alignment, string& referenceSequence, bool debug) {
+ string alignmentAlignedBases = alignment.QUERYBASES;
+ const string alignmentSequence = alignment.QUERYBASES;
int arsOffset = 0; // pointer to insertion point in aligned reference sequence
string alignedReferenceSequence = referenceSequence;
int aabOffset = 0;
- string alignmentAlignedBases = alignment.QueryBases;
// store information about the indels
vector<FBIndelAllele> indels;
@@ -39,12 +40,13 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
string softEnd;
stringstream cigar_before, cigar_after;
- for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
- c != alignment.CigarData.end(); ++c) {
- unsigned int l = c->Length;
- char t = c->Type;
+ CIGAR cigar = alignment.GETCIGAR;
+ for (CIGAR::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->CIGLEN;
+ char t = c->CIGTYPE;
cigar_before << l << t;
- if (t == 'M') { // match or mismatch
+ if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
sp += l;
rp += l;
} else if (t == 'D') { // deletion
@@ -58,7 +60,7 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
aabOffset += l;
sp += l; // update reference sequence position
} else if (t == 'I') { // insertion
- indels.push_back(FBIndelAllele(true, l, sp, rp, alignment.QueryBases.substr(rp, l), false));
+ indels.push_back(FBIndelAllele(true, l, sp, rp, alignmentSequence.substr(rp, l), false));
alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
arsOffset += l;
rp += l;
@@ -116,7 +118,7 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
if (debug) {
if (steppos >= 0 && readsteppos >= 0) {
cerr << referenceSequence.substr(steppos, indel.length) << endl;
- cerr << alignment.QueryBases.substr(readsteppos, indel.length) << endl;
+ cerr << alignmentSequence.substr(readsteppos, indel.length) << endl;
cerr << indel.sequence << endl;
}
}
@@ -124,7 +126,8 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
while (steppos >= 0 && readsteppos >= 0
&& !indel.splice
&& indel.sequence == referenceSequence.substr(steppos, indel.length)
- && indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
+ && indel.sequence == alignmentSequence.substr(readsteppos, indel.length)
+ //&& indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
&& (id == indels.begin()
|| (previous->insertion && steppos >= previous->position)
|| (!previous->insertion && steppos >= previous->position + previous->length))) {
@@ -156,8 +159,10 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
steppos = indel.position - 1;
readsteppos = indel.readPosition - 1;
while (steppos >= 0 && readsteppos >= 0
- && alignment.QueryBases.at(readsteppos) == referenceSequence.at(steppos)
- && alignment.QueryBases.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
+ //&& alignment.QueryBases.at(readsteppos) == referenceSequence.at(steppos)
+ //&& alignment.QueryBases.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
+ && alignmentSequence.at(readsteppos) == referenceSequence.at(steppos)
+ && alignmentSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
&& (id == indels.begin()
|| (previous->insertion && indel.position - 1 >= previous->position)
|| (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
@@ -196,7 +201,8 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
))) {
if (previous->homopolymer()) {
string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
- string readseq = alignment.QueryBases.substr(prev_end_read, indel.position - prev_end_ref);
+ //string readseq = alignment.QueryBases.substr(prev_end_read, indel.position - prev_end_ref);
+ string readseq = alignmentSequence.substr(prev_end_read, indel.position - prev_end_ref);
LEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
if (previous->sequence.at(0) == seq.at(0)
&& FBhomopolymer(seq)
@@ -237,23 +243,23 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
//
// and simultaneously reconstruct the cigar
- vector<CigarOp> newCigar;
+ CIGAR newCigar;
if (!softBegin.empty()) {
- newCigar.push_back(CigarOp('S', softBegin.size()));
+ newCigar.ADDCIGAR(CIGOP('S', softBegin.size()));
}
vector<FBIndelAllele>::iterator id = indels.begin();
FBIndelAllele last = *id++;
if (last.position > 0) {
- newCigar.push_back(CigarOp('M', last.position));
+ newCigar.ADDCIGAR(CIGOP('M', last.position));
}
if (last.insertion) {
- newCigar.push_back(CigarOp('I', last.length));
+ newCigar.ADDCIGAR(CIGOP('I', last.length));
} else if (last.splice) {
- newCigar.push_back(CigarOp('N', last.length));
+ newCigar.ADDCIGAR(CIGOP('N', last.length));
} else {
- newCigar.push_back(CigarOp('D', last.length));
+ newCigar.ADDCIGAR(CIGOP('D', last.length));
}
int lastend = last.insertion ? last.position : (last.position + last.length);
LEFTALIGN_DEBUG(last << ",");
@@ -262,20 +268,25 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
FBIndelAllele& indel = *id;
LEFTALIGN_DEBUG(indel << ",");
if (indel.position < lastend) {
- cerr << "impossibility?: indel realigned left of another indel" << endl << alignment.Name
- << " " << alignment.Position << endl << alignment.QueryBases << endl;
+ cerr << "impossibility?: indel realigned left of another indel" << endl << alignment.QNAME
+ << " " << alignment.POSITION << endl << alignment.QUERYBASES << endl;
exit(1);
+
} else if (indel.position == lastend && indel.insertion == last.insertion) {
- CigarOp& op = newCigar.back();
+ CIGOP& op = newCigar.back();
+#ifdef HAVE_BAMTOOLS
op.Length += indel.length;
+#else
+ op = SeqLib::CigarField(op.Type(), op.Length() + indel.length);
+#endif
} else if (indel.position >= lastend) { // also catches differential indels, but with the same position
- newCigar.push_back(CigarOp('M', indel.position - lastend));
+ newCigar.ADDCIGAR(CIGOP('M', indel.position - lastend));
if (indel.insertion) {
- newCigar.push_back(CigarOp('I', indel.length));
+ newCigar.ADDCIGAR(CIGOP('I', indel.length));
} else if (indel.splice) {
- newCigar.push_back(CigarOp('N', indel.length));
+ newCigar.ADDCIGAR(CIGOP('N', indel.length));
} else { // deletion
- newCigar.push_back(CigarOp('D', indel.length));
+ newCigar.ADDCIGAR(CIGOP('D', indel.length));
}
}
@@ -284,34 +295,40 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
}
if (lastend < alignedLength) {
- newCigar.push_back(CigarOp('M', alignedLength - lastend));
+ newCigar.ADDCIGAR(CIGOP('M', alignedLength - lastend));
}
if (!softEnd.empty()) {
- newCigar.push_back(CigarOp('S', softEnd.size()));
+ newCigar.ADDCIGAR(CIGOP('S', softEnd.size()));
}
LEFTALIGN_DEBUG(endl);
#ifdef VERBOSE_DEBUG
if (debug) {
- for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
- c != alignment.CigarData.end(); ++c) {
- unsigned int l = c->Length;
- char t = c->Type;
+ CIGAR cigar = alignment.GETCIGAR;
+ for (CIGAR::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->CIGLEN;
+ char t = c->CIGTYPE;
cerr << l << t;
}
cerr << endl;
}
#endif
+#ifdef HAVE_BAMTOOLS
alignment.CigarData = newCigar;
+#else
+ alignment.SetCigar(newCigar);
+#endif
- for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
- c != alignment.CigarData.end(); ++c) {
- unsigned int l = c->Length;
- char t = c->Type;
- cigar_after << l << t;
+ cigar = alignment.GETCIGAR;
+ for (CIGAR::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->CIGLEN;
+ char t = c->CIGTYPE;
+ cigar_after << l << t;
}
LEFTALIGN_DEBUG(cigar_after.str() << endl);
@@ -324,18 +341,22 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
}
-int countMismatches(BamAlignment& alignment, string referenceSequence) {
+int countMismatches(BAMALIGN& alignment, string referenceSequence) {
+ const string alignmentSequence = alignment.QUERYBASES;
int mismatches = 0;
int sp = 0;
int rp = 0;
- for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
- c != alignment.CigarData.end(); ++c) {
- unsigned int l = c->Length;
- char t = c->Type;
- if (t == 'M') { // match or mismatch
+ CIGAR cigar = alignment.GETCIGAR;
+ for (CIGAR::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->CIGLEN;
+ char t = c->CIGTYPE;
+
+ if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
for (int i = 0; i < l; ++i) {
- if (alignment.QueryBases.at(rp) != referenceSequence.at(sp))
+ //if (alignment.QueryBases.at(rp) != referenceSequence.at(sp))
+ if (alignmentSequence.at(rp) != referenceSequence.at(sp))
++mismatches;
++sp;
++rp;
@@ -360,15 +381,13 @@ int countMismatches(BamAlignment& alignment, string referenceSequence) {
// realignment. Returns true on realignment success or non-realignment.
// Returns false if we exceed the maximum number of realignment iterations.
//
-bool stablyLeftAlign(BamAlignment& alignment, string referenceSequence, int maxiterations, bool debug) {
+ bool stablyLeftAlign(BAMALIGN& alignment, string referenceSequence, int maxiterations, bool debug) {
#ifdef VERBOSE_DEBUG
int mismatchesBefore = countMismatches(alignment, referenceSequence);
#endif
if (!leftAlign(alignment, referenceSequence, debug)) {
-
- LEFTALIGN_DEBUG("did not realign" << endl);
return true;
} else {
@@ -381,7 +400,7 @@ bool stablyLeftAlign(BamAlignment& alignment, string referenceSequence, int maxi
int mismatchesAfter = countMismatches(alignment, referenceSequence);
if (mismatchesBefore != mismatchesAfter) {
- cerr << alignment.Name << endl;
+ cerr << alignment.QNAME << endl;
cerr << "ERROR: found " << mismatchesBefore << " mismatches before, but " << mismatchesAfter << " after left realignment!" << endl;
exit(1);
}
diff --git a/src/LeftAlign.h b/src/LeftAlign.h
index 634dfef..def06f7 100644
--- a/src/LeftAlign.h
+++ b/src/LeftAlign.h
@@ -14,9 +14,96 @@
#include <vector>
#include "Fasta.h"
+
+#ifdef HAVE_BAMTOOLS
#include "api/BamAlignment.h"
#include "api/BamReader.h"
+#include "api/BamMultiReader.h"
#include "api/BamWriter.h"
+using namespace BamTools;
+#define GETNEXT(reader, alignment) reader.GetNextAlignment(alignment)
+#define GETERRORSTRING(reader) reader.GetErrorString()
+#define ALIGNMENTLENGTH Length
+#define ISMAPPED IsMapped()
+#define REFID RefID
+#define POSITION Position
+#define REFNAME RefName
+#define REFLEN RefLength
+#define REFVEC RefVector
+#define REFDATA RefData
+#define BAMALIGN BamAlignment
+#define QUALITIES Qualities
+#define QUERYBASES QueryBases
+#define ALIGNEDBASES AlignedBases.size()
+#define QNAME Name
+#define GETREFDATA GetReferenceData()
+#define GETREFNUM GetReferenceCount()
+#define GETREFID(name) GetReferenceID(name)
+#define ENDPOSITION GetEndPosition()
+#define SEQLEN QueryBases.size()
+#define MAPPINGQUALITY MapQuality
+#define CIGAR std::vector<CigarOp>
+#define GETCIGAR CigarData
+#define ISDUPLICATE IsDuplicate()
+#define ISREVERSESTRAND IsReverseStrand()
+#define ISPAIRED IsPaired()
+#define ISMATEMAPPED IsMateMapped()
+#define ISPROPERPAIR IsProperPair()
+#define CIGLEN Length
+#define CIGTYPE Type
+#define BAMREADER BamMultiReader
+#define BAMSINGLEREADER BamReader
+#define FILLREADGROUP(rg, align) (align).GetTag("RG", (rg))
+#define ADDCIGAR push_back
+#define CIGOP CigarOp
+#define GETHEADERTEXT GetHeaderText()
+#define STDIN "stdin"
+#define WRITEALIGNMENT(writer, alignment) writer.SaveAlignment(alignment)
+#else
+
+#define GETNEXT(reader, alignment) reader.GetNextRecord(alignment)
+#define MAPPINGQUALITY MapQuality()
+#define ALIGNMENTLENGTH Length()
+#define ISMAPPED MappedFlag()
+#define ISPAIRED PairedFlag()
+#define ISMATEMAPPED MateMappedFlag()
+#define ISPROPERPAIR ProperPair()
+#define ISREVERSESTRAND ReverseFlag()
+#define SEQLEN Length()
+#define BAMALIGN SeqLib::BamRecord
+#define REFID ChrID()
+#define POSITION Position()
+#define REFVEC std::vector<SeqLib::HeaderSequence>
+#define REFDATA SeqLib::HeaderSequence
+#define REFNAME Name
+#define REFLEN Length
+#define QUALITIES Qualities()
+#define QUERYBASES Sequence()
+#define ALIGNEDBASES NumAlignedBases()
+#define QNAME Qname()
+#define GETREFDATA Header().GetHeaderSequenceVector()
+#define GETREFNUM Header().NumSequences()
+#define ENDPOSITION PositionEnd()
+#define CIGAR SeqLib::Cigar
+#define BAMREADER SeqLib::BamReader
+#define BAMSINGLEREADER SeqLib::BamReader
+#define GETCIGAR GetCigar()
+#define GETREFID(name) Header().Name2ID(name)
+#define ISDUPLICATE DuplicateFlag()
+#define CIGLEN Length()
+#define CIGTYPE Type()
+#define ADDCIGAR add
+#define CIGOP SeqLib::CigarField
+#define FILLREADGROUP(rg, align) (rg) = (align).GetZTag("RG")
+#define GETHEADERTEXT HeaderConcat()
+#include "SeqLib/BamReader.h"
+#include "SeqLib/BamWriter.h"
+#define STDIN "-"
+#define WRITEALIGNMENT(writer, alignment) writer.WriteRecord(alignment)
+#endif
+
+
+
#include "IndelAllele.h"
@@ -28,10 +115,9 @@
#endif
using namespace std;
-using namespace BamTools;
-bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug = false);
-bool stablyLeftAlign(BamAlignment& alignment, string referenceSequence, int maxiterations = 20, bool debug = false);
-int countMismatches(BamAlignment& alignment, string referenceSequence);
+bool leftAlign(BAMALIGN& alignment, string& referenceSequence, bool debug = false);
+bool stablyLeftAlign(BAMALIGN& alignment, string referenceSequence, int maxiterations = 20, bool debug = false);
+int countMismatches(BAMALIGN& alignment, string referenceSequence);
#endif
diff --git a/src/Makefile b/src/Makefile
index 86adbb2..ff1e53d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,10 +13,13 @@ CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -g
#CFLAGS=-O3 -static -D VERBOSE_DEBUG # enables verbose debugging via --debug2
BAMTOOLS_ROOT=../bamtools
+SEQLIB_ROOT=../SeqLib
VCFLIB_ROOT=../vcflib
+TABIX_ROOT=$(VCFLIB_ROOT)/tabixpp
+HTSLIB_ROOT=$(TABIX_ROOT)/htslib
-LIBS = -L./ -L$(VCFLIB_ROOT)/tabixpp/ -L$(BAMTOOLS_ROOT)/lib -ltabix -lz -lm
-INCLUDE = -I$(BAMTOOLS_ROOT)/src -I../ttmath -I$(VCFLIB_ROOT)/src -I$(VCFLIB_ROOT)/
+LIBS = -lz -lm -lpthread
+INCLUDE = -I../ttmath -I$(BAMTOOLS_ROOT)/src/ -I$(VCFLIB_ROOT)/src/ -I$(TABIX_ROOT)/ -I$(VCFLIB_ROOT)/smithwaterman/ -I$(VCFLIB_ROOT)/multichoose/ -I$(VCFLIB_ROOT)/filevercmp/ -I$(HTSLIB_ROOT) -I$(SEQLIB_ROOT) -I$(SEQLIB_ROOT)/htslib
all: autoversion ../bin/freebayes ../bin/bamleftalign
@@ -37,7 +40,11 @@ gprof:
# builds bamtools static lib, and copies into root
$(BAMTOOLS_ROOT)/lib/libbamtools.a:
cd $(BAMTOOLS_ROOT) && mkdir -p build && cd build && cmake .. && $(MAKE)
+$(HTSLIB_ROOT)/libhts.a:
+ cd $(HTSLIB_ROOT) && make
+$(SEQLIB_ROOT)/src/libseqlib.a:
+ cd $(SEQLIB_ROOT) && ./configure && make
OBJECTS=BedReader.o \
CNV.o \
@@ -64,14 +71,18 @@ OBJECTS=BedReader.o \
NonCall.o \
SegfaultHandler.o \
../vcflib/tabixpp/tabix.o \
- ../vcflib/tabixpp/bgzf.o \
+ ../vcflib/tabixpp/htslib/bgzf.o \
../vcflib/smithwaterman/SmithWatermanGotoh.o \
- ../vcflib/smithwaterman/disorder.c \
+ ../vcflib/smithwaterman/disorder.cpp \
../vcflib/smithwaterman/LeftAlign.o \
../vcflib/smithwaterman/Repeats.o \
../vcflib/smithwaterman/IndelAllele.o \
Variant.o \
- $(BAMTOOLS_ROOT)/lib/libbamtools.a
+ $(BAMTOOLS_ROOT)/lib/libbamtools.a \
+ $(SEQLIB_ROOT)/src/libseqlib.a \
+ $(SEQLIB_ROOT)/bwa/libbwa.a \
+ $(SEQLIB_ROOT)/fermi-lite/libfml.a \
+ $(SEQLIB_ROOT)/htslib/libhts.a
HEADERS=multichoose.h version_git.h
@@ -86,16 +97,16 @@ alleles ../bin/alleles: alleles.o $(OBJECTS) $(HEADERS)
dummy ../bin/dummy: dummy.o $(OBJECTS) $(HEADERS)
$(CXX) $(CFLAGS) $(INCLUDE) dummy.o $(OBJECTS) -o ../bin/dummy $(LIBS)
-bamleftalign ../bin/bamleftalign: $(BAMTOOLS_ROOT)/lib/libbamtools.a bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o
- $(CXX) $(CFLAGS) $(INCLUDE) bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o $(BAMTOOLS_ROOT)/lib/libbamtools.a -o ../bin/bamleftalign $(LIBS)
+bamleftalign ../bin/bamleftalign: $(BAMTOOLS_ROOT)/lib/libbamtools.a $(SEQLIB_ROOT)/src/libseqlib.a $(SEQLIB_ROOT)/htslib/libhts.a bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o
+ $(CXX) $(CFLAGS) $(INCLUDE) bamleftalign.o Fasta.o Utility.o LeftAlign.o IndelAllele.o split.o $(BAMTOOLS_ROOT)/lib/libbamtools.a $(SEQLIB_ROOT)/src/libseqlib.a $(SEQLIB_ROOT)/htslib/libhts.a -o ../bin/bamleftalign $(LIBS)
-bamfiltertech ../bin/bamfiltertech: $(BAMTOOLS_ROOT)/lib/libbamtools.a bamfiltertech.o $(OBJECTS) $(HEADERS)
+bamfiltertech ../bin/bamfiltertech: $(BAMTOOLS_ROOT)/lib/libbamtools.a $(SEQLIB_ROOT)/src/libseqlib.a $(SEQLIB_ROOT)/htslib/libhts.a bamfiltertech.o $(OBJECTS) $(HEADERS)
$(CXX) $(CFLAGS) $(INCLUDE) bamfiltertech.o $(OBJECTS) -o ../bin/bamfiltertech $(LIBS)
# objects
-Fasta.o: Fasta.cpp
+Fasta.o: Fasta.cpp Utility.o
$(CXX) $(CFLAGS) $(INCLUDE) -c Fasta.cpp
alleles.o: alleles.cpp AlleleParser.o Allele.o
@@ -104,7 +115,7 @@ alleles.o: alleles.cpp AlleleParser.o Allele.o
dummy.o: dummy.cpp AlleleParser.o Allele.o
$(CXX) $(CFLAGS) $(INCLUDE) -c dummy.cpp
-freebayes.o: freebayes.cpp TryCatch.h $(BAMTOOLS_ROOT)/lib/libbamtools.a
+freebayes.o: freebayes.cpp TryCatch.h $(HTSLIB_ROOT)/libhts.a $(BAMTOOLS_ROOT)/lib/libbamtools.a
$(CXX) $(CFLAGS) $(INCLUDE) -c freebayes.cpp
fastlz.o: fastlz.c fastlz.h
@@ -125,7 +136,7 @@ Genotype.o: Genotype.cpp Genotype.h Allele.h multipermute.h
Ewens.o: Ewens.cpp Ewens.h
$(CXX) $(CFLAGS) $(INCLUDE) -c Ewens.cpp
-AlleleParser.o: AlleleParser.cpp AlleleParser.h multichoose.h Parameters.h $(BAMTOOLS_ROOT)/lib/libbamtools.a
+AlleleParser.o: AlleleParser.cpp AlleleParser.h multichoose.h Parameters.h $(BAMTOOLS_ROOT)/lib/libbamtools.a $(HTSLIB_ROOT)/libhts.a
$(CXX) $(CFLAGS) $(INCLUDE) -c AlleleParser.cpp
Utility.o: Utility.cpp Utility.h Sum.h Product.h
@@ -173,7 +184,7 @@ bamleftalign.o: bamleftalign.cpp LeftAlign.cpp
bamfiltertech.o: bamfiltertech.cpp
$(CXX) $(CFLAGS) $(INCLUDE) -c bamfiltertech.cpp
-LeftAlign.o: LeftAlign.h LeftAlign.cpp $(BAMTOOLS_ROOT)/lib/libbamtools.a
+LeftAlign.o: LeftAlign.h LeftAlign.cpp $(BAMTOOLS_ROOT)/lib/libbamtools.a $(HTSLIB_ROOT)/libhts.a
$(CXX) $(CFLAGS) $(INCLUDE) -c LeftAlign.cpp
IndelAllele.o: IndelAllele.cpp IndelAllele.h
@@ -182,8 +193,9 @@ IndelAllele.o: IndelAllele.cpp IndelAllele.h
Variant.o: $(VCFLIB_ROOT)/src/Variant.h $(VCFLIB_ROOT)/src/Variant.cpp
$(CXX) $(CFLAGS) $(INCLUDE) -c $(VCFLIB_ROOT)/src/Variant.cpp
-../vcflib/tabixpp/tabix.o: ../vcflib/tabixpp/tabix.hpp ../vcflib/tabixpp/tabix.cpp
-../vcflib/tabixpp/bgzf.o: ../vcflib/tabixpp/bgzf.c ../vcflib/tabixpp/bgzf.h
+../vcflib/tabixpp/tabix.o:
+ cd $(TABIX_ROOT)/ && make
+../vcflib/tabixpp/htslib/bgzf.o: ../vcflib/tabixpp/htslib/bgzf.c ../vcflib/tabixpp/htslib/htslib/bgzf.h
cd ../vcflib/tabixpp && $(MAKE)
../vcflib/smithwaterman/SmithWatermanGotoh.o: ../vcflib/smithwaterman/SmithWatermanGotoh.h ../vcflib/smithwaterman/SmithWatermanGotoh.cpp
diff --git a/src/NonCall.cpp b/src/NonCall.cpp
index e9a7fe4..e361752 100644
--- a/src/NonCall.cpp
+++ b/src/NonCall.cpp
@@ -4,8 +4,8 @@ NonCall NonCalls::aggregateAll(void) {
NonCall aggregate;
bool first = true;
for (NonCalls::const_iterator nc = this->begin(); nc != this->end(); ++nc) {
- for (map<long, map<string, NonCall> >::const_iterator p = nc->second.begin();
- p != nc->second.end(); ++p) {
+ for (map<long, map<string, NonCall> >::const_iterator p
+ = nc->second.begin(); p != nc->second.end(); ++p) {
for (map<string, NonCall>::const_iterator s = p->second.begin();
s != p->second.end(); ++s) {
const NonCall& nonCall = s->second;
@@ -13,12 +13,14 @@ NonCall NonCalls::aggregateAll(void) {
aggregate.altCount += nonCall.altCount;
aggregate.reflnQ += nonCall.reflnQ;
aggregate.altlnQ += nonCall.altlnQ;
+ aggregate.nCount += 1;
if (first) {
- aggregate.minDepth = nonCall.refCount + nonCall.altCount;
- first = false;
+ aggregate.minDepth = nonCall.refCount + nonCall.altCount;
+ first = false;
} else {
- aggregate.minDepth = min(aggregate.minDepth, nonCall.refCount + nonCall.altCount);
- }
+ aggregate.minDepth = min(aggregate.minDepth,
+ nonCall.refCount + nonCall.altCount);
+ }
}
}
}
@@ -39,6 +41,7 @@ void NonCalls::aggregatePerSample(map<string, NonCall>& perSample) {
aggregate.altCount += nonCall.altCount;
aggregate.reflnQ += nonCall.reflnQ;
aggregate.altlnQ += nonCall.altlnQ;
+ aggregate.nCount += 1;
if (!seen.count(name)) {
aggregate.minDepth = nonCall.refCount + nonCall.altCount;
seen.insert(name);
diff --git a/src/NonCall.h b/src/NonCall.h
index c1715a1..e169558 100644
--- a/src/NonCall.h
+++ b/src/NonCall.h
@@ -20,6 +20,7 @@ public:
, altCount(0)
, altlnQ(0)
, minDepth(0)
+ , nCount(0)
{ }
NonCall(int rc, long double rq, int ac, long double aq, int mdp)
@@ -32,6 +33,7 @@ public:
int refCount;
int altCount;
int minDepth;
+ int nCount ;
long double reflnQ;
long double altlnQ;
};
diff --git a/src/Parameters.cpp b/src/Parameters.cpp
index 3dfbf51..f2aa2f5 100644
--- a/src/Parameters.cpp
+++ b/src/Parameters.cpp
@@ -161,6 +161,8 @@ void Parameters::usage(char** argv) {
<< " -P --pvar N Report sites if the probability that there is a polymorphism" << endl
<< " at the site is greater than N. default: 0.0. Note that post-" << endl
<< " filtering is generally recommended over the use of this parameter." << endl
+ << " --strict-vcf" << endl
+ << " Generate strict VCF format (FORMAT/GQ will be an int)" << endl
<< endl
<< "population model:" << endl
<< endl
@@ -398,6 +400,7 @@ Parameters::Parameters(int argc, char** argv) {
allowMNPs = true; // -X --no-mnps
allowSNPs = true; // -I --no-snps
allowComplex = true;
+ strictVCF = false;
maxComplexGap = 3;
//maxHaplotypeLength = 100;
minRepeatSize = 5;
@@ -514,6 +517,7 @@ Parameters::Parameters(int argc, char** argv) {
{"indel-exclusion-window", required_argument, 0, 'x'},
{"theta", required_argument, 0, 'T'},
{"pvar", required_argument, 0, 'P'},
+ {"strict-vcf", no_argument, 0, '/'},
{"read-dependence-factor", required_argument, 0, 'D'},
{"binomial-obs-priors-off", no_argument, 0, 'V'},
{"allele-balance-priors-off", no_argument, 0, 'a'},
@@ -703,6 +707,10 @@ Parameters::Parameters(int argc, char** argv) {
}
break;
+ case '/':
+ strictVCF = true;
+ break;
+
case 'u':
allowComplex = false;
break;
diff --git a/src/Parameters.h b/src/Parameters.h
index 80d20ba..ac164b7 100644
--- a/src/Parameters.h
+++ b/src/Parameters.h
@@ -60,6 +60,7 @@ public:
bool leftAlignIndels; // -O --left-align-indels
bool allowMNPs; // -X --allow-mnps
bool allowComplex; // -X --allow-complex
+ bool strictVCF;
int maxComplexGap;
//int maxHaplotypeLength;
int minRepeatSize;
diff --git a/src/ResultData.cpp b/src/ResultData.cpp
index b92dfac..0e8469c 100644
--- a/src/ResultData.cpp
+++ b/src/ResultData.cpp
@@ -5,8 +5,8 @@ using namespace std;
-vcf::Variant& Results::vcf(
- vcf::Variant& var, // variant to update
+vcflib::Variant& Results::vcf(
+ vcflib::Variant& var, // variant to update
BigFloat pHom,
long double bestComboOddsRatio,
//long double alleleSamplingProb,
@@ -53,7 +53,7 @@ vcf::Variant& Results::vcf(
// get the required size of the reference sequence
// strip identical bases from start and/or end of alleles
- // if bases have been stripped from the beginning,
+ // if bases have been stripped from the beginning,
// set up VCF record-wide variables
@@ -76,6 +76,7 @@ vcf::Variant& Results::vcf(
if (parameters.calculateMarginals) var.format.push_back("GQ");
// XXX
var.format.push_back("DP");
+ var.format.push_back("AD");
var.format.push_back("RO");
var.format.push_back("QR");
var.format.push_back("AO");
@@ -296,7 +297,7 @@ vcf::Variant& Results::vcf(
long double altReadMismatchRate = (altObsCount == 0 ? 0 : altReadMismatchSum / altObsCount);
long double altReadSNPRate = (altObsCount == 0 ? 0 : altReadSNPSum / altObsCount);
long double altReadIndelRate = (altObsCount == 0 ? 0 : altReadIndelSum / altObsCount);
-
+
//var.info["XAM"].push_back(convert(altReadMismatchRate));
//var.info["XAS"].push_back(convert(altReadSNPRate));
//var.info["XAI"].push_back(convert(altReadIndelRate));
@@ -420,7 +421,7 @@ vcf::Variant& Results::vcf(
// tally partial observations to get a mean coverage per bp of reference
int haplotypeLength = refbase.size();
int basesInObservations = 0;
-
+
for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
for (vector<Allele*>::iterator a = g->second.begin(); a != g->second.end(); ++a) {
basesInObservations += (*a)->alternateSequence.size();
@@ -430,7 +431,7 @@ vcf::Variant& Results::vcf(
for (map<Allele*, set<Allele*> >::iterator p = partialObservationSupport.begin(); p != partialObservationSupport.end(); ++p) {
basesInObservations += p->first->alternateSequence.size();
}
-
+
double depthPerBase = (double) basesInObservations / (double) haplotypeLength;
var.info["DPB"].push_back(convert(depthPerBase));
@@ -510,10 +511,15 @@ vcf::Variant& Results::vcf(
sampleOutput["GT"].push_back(genotype->relativeGenotype(refbase, altAlleles));
if (parameters.calculateMarginals) {
- sampleOutput["GQ"].push_back(convert(nan2zero(big2phred((BigFloat)1 - big_exp(sampleLikelihoods.front().marginal)))));
+ double val = nan2zero(big2phred((BigFloat)1 - big_exp(sampleLikelihoods.front().marginal)));
+ if (parameters.strictVCF)
+ sampleOutput["GQ"].push_back(convert(int(round(val))));
+ else
+ sampleOutput["GQ"].push_back(convert(val));
}
sampleOutput["DP"].push_back(convert(sample.observationCount()));
+ sampleOutput["AD"].push_back(convert(sample.observationCount(refbase)));
sampleOutput["RO"].push_back(convert(sample.observationCount(refbase)));
sampleOutput["QR"].push_back(convert(sample.qualSum(refbase)));
@@ -521,6 +527,7 @@ vcf::Variant& Results::vcf(
Allele& altAllele = *aa;
string altbase = altAllele.base();
sampleOutput["AO"].push_back(convert(sample.observationCount(altbase)));
+ sampleOutput["AD"].push_back(convert(sample.observationCount(altbase)));
sampleOutput["QA"].push_back(convert(sample.qualSum(altbase)));
}
@@ -630,8 +637,8 @@ vcf::Variant& Results::vcf(
}
-vcf::Variant& Results::gvcf(
- vcf::Variant& var,
+vcflib::Variant& Results::gvcf(
+ vcflib::Variant& var,
NonCalls& nonCalls,
AlleleParser* parser) {
@@ -652,7 +659,7 @@ vcf::Variant& Results::gvcf(
assert(numSites > 0);
// set up site call
- var.ref = parser->currentReferenceBaseString();
+ var.ref = parser->referenceSubstr(startPos, 1);
var.alt.push_back("<*>");
var.sequenceName = parser->currentSequenceName;
var.position = startPos + 1; // output text field is one-based
@@ -664,19 +671,28 @@ vcf::Variant& Results::gvcf(
var.format.clear();
var.format.push_back("GQ");
var.format.push_back("DP");
- var.format.push_back("MIN");
+ var.format.push_back("MIN_DP");
var.format.push_back("QR");
var.format.push_back("QA");
-
+
NonCall total = nonCalls.aggregateAll();
+
+ /* This resets min depth to zero if nonCalls is less than numSites. */
+
+ int minDepth = total.minDepth;
+
+ if(numSites != total.nCount){
+ minDepth = 0;
+ }
+
var.info["DP"].push_back(convert((total.refCount+total.altCount) / numSites));
- var.info["MIN"].push_back(convert(total.minDepth));
+ var.info["MIN_DP"].push_back(convert(minDepth));
// The text END field is one-based, inclusive. We proudly conflate this
// with our zero-based, exclusive endPos.
var.info["END"].push_back(convert(endPos));
// genotype quality is 1- p(polymorphic)
-
+
map<string, NonCall> perSample;
nonCalls.aggregatePerSample(perSample);
@@ -688,8 +704,18 @@ vcf::Variant& Results::gvcf(
map<string, vector<string> >& sampleOutput = var.samples[sampleName];
long double qual = nc.reflnQ - nc.altlnQ;
sampleOutput["GQ"].push_back(convert(ln2phred(qual)));
+
+
+ /* This resets min depth to zero if nonCalls is less than numSites. */
+
+ int minDepth = nc.minDepth;
+
+ if(numSites != nc.nCount){
+ minDepth = 0;
+ }
+
sampleOutput["DP"].push_back(convert((nc.refCount+nc.altCount) / numSites));
- sampleOutput["MIN"].push_back(convert(nc.minDepth));
+ sampleOutput["MIN_DP"].push_back(convert(minDepth));
sampleOutput["QR"].push_back(convert(ln2phred(nc.reflnQ)));
sampleOutput["QA"].push_back(convert(ln2phred(nc.altlnQ)));
}
diff --git a/src/ResultData.h b/src/ResultData.h
index ef02cae..afd9fb4 100644
--- a/src/ResultData.h
+++ b/src/ResultData.h
@@ -41,8 +41,8 @@ public:
}
}
- vcf::Variant& vcf(
- vcf::Variant& var, // variant to update
+ vcflib::Variant& vcf(
+ vcflib::Variant& var, // variant to update
BigFloat pHom,
long double bestComboOddsRatio,
//long double alleleSamplingProb,
@@ -61,8 +61,8 @@ public:
vector<string>& sequencingTechnologies,
AlleleParser* parser);
- vcf::Variant& gvcf(
- vcf::Variant& var,
+ vcflib::Variant& gvcf(
+ vcflib::Variant& var,
NonCalls& noncalls,
AlleleParser* parser);
};
diff --git a/src/bamleftalign.cpp b/src/bamleftalign.cpp
index be8b927..4cc18c0 100644
--- a/src/bamleftalign.cpp
+++ b/src/bamleftalign.cpp
@@ -11,10 +11,7 @@
#include <vector>
#include "Fasta.h"
-#include "api/BamAlignment.h"
-#include "api/BamReader.h"
-#include "api/BamWriter.h"
-//#include "IndelAllele.h"
+
#include "LeftAlign.h"
#ifdef VERBOSE_DEBUG
@@ -25,7 +22,6 @@
#endif
using namespace std;
-using namespace BamTools;
void printUsage(char** argv) {
cerr << "usage: [BAM data stream] | " << argv[0] << " [options]" << endl
@@ -123,12 +119,15 @@ int main(int argc, char** argv) {
exit(1);
}
- BamReader reader;
- if (!reader.Open("stdin")) {
+
+ BAMSINGLEREADER reader;
+ if (!reader.Open(STDIN)) {
cerr << "could not open stdin for reading" << endl;
exit(1);
}
+#ifdef HAVE_BAMTOOLS
+
BamWriter writer;
if (isuncompressed) {
@@ -139,42 +138,57 @@ int main(int argc, char** argv) {
cerr << "could not open stdout for writing" << endl;
exit(1);
}
+#else
+
+ SeqLib::BamWriter writer(isuncompressed ? SeqLib::SAM : SeqLib::BAM);
+ SeqLib::BamHeader hdr = reader.Header();
+ if (hdr.isEmpty()) {
+ cerr << "could not open header for input" << endl;
+ exit(1);
+ }
+ writer.SetHeader(hdr);
+
+ if (!suppress_output && !writer.Open("-")) {
+ cerr << "could not open stdout for writing" << endl;
+ exit(1);
+ }
+#endif
// store the names of all the reference sequences in the BAM file
map<int, string> referenceIDToName;
- vector<RefData> referenceSequences = reader.GetReferenceData();
+ REFVEC referenceSequences = reader.GETREFDATA;
int i = 0;
- for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
- referenceIDToName[i] = r->RefName;
+ for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
+ referenceIDToName[i] = r->REFNAME;
++i;
}
- BamAlignment alignment;
-
- while (reader.GetNextAlignment(alignment)) {
+ BAMALIGN alignment;
+ while (GETNEXT(reader, alignment)) {
+
DEBUG("--------------------------- read --------------------------" << endl);
- DEBUG("| " << referenceIDToName[alignment.RefID] << ":" << alignment.Position << endl);
- DEBUG("| " << alignment.Name << ":" << alignment.GetEndPosition() << endl);
- DEBUG("| " << alignment.Name << ":" << (alignment.IsMapped() ? " mapped" : " unmapped") << endl);
- DEBUG("| " << alignment.Name << ":" << " cigar data size: " << alignment.CigarData.size() << endl);
+ DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl);
+ DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl);
+ DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl);
+ DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl);
DEBUG("--------------------------- realigned --------------------------" << endl);
// skip unmapped alignments, as they cannot be left-realigned without CIGAR data
- if (alignment.IsMapped()) {
+ if (alignment.ISMAPPED) {
- int endpos = alignment.GetEndPosition();
- int length = endpos - alignment.Position + 1;
- if (alignment.Position >= 0 && length > 0) {
+ int endpos = alignment.ENDPOSITION;
+ int length = endpos - alignment.POSITION + 1;
+ if (alignment.POSITION >= 0 && length > 0) {
if (!stablyLeftAlign(alignment,
reference.getSubSequence(
- referenceIDToName[alignment.RefID],
- alignment.Position,
+ referenceIDToName[alignment.REFID],
+ alignment.POSITION,
length),
maxiterations, debug)) {
- cerr << "unstable realignment of " << alignment.Name
- << " at " << referenceIDToName[alignment.RefID] << ":" << alignment.Position << endl
- << alignment.AlignedBases << endl;
+ cerr << "unstable realignment of " << alignment.QNAME
+ << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl
+ << alignment.QUERYBASES << endl;
}
}
@@ -184,7 +198,7 @@ int main(int argc, char** argv) {
DEBUG(endl);
if (!suppress_output)
- writer.SaveAlignment(alignment);
+ WRITEALIGNMENT(writer, alignment);
}
@@ -193,5 +207,4 @@ int main(int argc, char** argv) {
writer.Close();
return 0;
-
}
diff --git a/src/freebayes.cpp b/src/freebayes.cpp
index be412da..cd07873 100644
--- a/src/freebayes.cpp
+++ b/src/freebayes.cpp
@@ -2,7 +2,7 @@
// freebayes
//
// A bayesian genetic variant detector.
-//
+//
// standard includes
//#include <cstdio>
@@ -44,7 +44,7 @@
// local helper debugging macros to improve code readability
#define DEBUG(msg) \
- if (parameters.debug) { cerr << msg << endl; }
+ if (parameters.debug) { cerr << msg << endl; }
// lower-priority messages
#ifdef VERBOSE_DEBUG
@@ -59,7 +59,7 @@
cerr << msg << endl;
-using namespace std;
+using namespace std;
// todo
// generalize the main function to take the parameters as input
@@ -120,7 +120,7 @@ int main (int argc, char *argv[]) {
if (parameters.output == "vcf") {
out << parser->variantCallFile.header << endl;
}
-
+
if (0 < parameters.maxCoverage) {
srand(13);
}
@@ -144,7 +144,7 @@ int main (int argc, char *argv[]) {
|| (parameters.gVCFchunk &&
nonCalls.lastPos().second - nonCalls.firstPos().second
> parameters.gVCFchunk))) {
- vcf::Variant var(parser->variantCallFile);
+ vcflib::Variant var(parser->variantCallFile);
out << results.gvcf(var, nonCalls, parser) << endl;
nonCalls.clear();
}
@@ -177,13 +177,12 @@ int main (int argc, char *argv[]) {
for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
string sampleName = s->first;
Sample& sample = s->second;
- // get the coverage for this sample
+ // get the coverage for this sample
int sampleCoverage = 0;
for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
sampleCoverage += sg->second.size();
}
if (sampleCoverage <= parameters.maxCoverage) {
- skip = true;
continue;
}
@@ -365,7 +364,7 @@ int main (int argc, char *argv[]) {
int estimatedMinorAllelesAtLocus = max(1, (int) ceil((double) numCopiesOfLocus * estimatedMinorFrequency));
//cerr << "estimated minor frequency " << estimatedMinorFrequency << endl;
//cerr << "estimated minor count " << estimatedMinorAllelesAtLocus << endl;
-
+
map<string, vector<vector<SampleDataLikelihood> > > sampleDataLikelihoodsByPopulation;
map<string, vector<vector<SampleDataLikelihood> > > variantSampleDataLikelihoodsByPopulation;
@@ -654,16 +653,16 @@ int main (int argc, char *argv[]) {
// output
- if (!alts.empty() && (1 - pHom.ToDouble()) >= parameters.PVL || parameters.PVL == 0) {
+ if ((!alts.empty() && (1 - pHom.ToDouble()) >= parameters.PVL) || parameters.PVL == 0){
// write the last gVCF record(s)
if (parameters.gVCFout && !nonCalls.empty()) {
- vcf::Variant var(parser->variantCallFile);
+ vcflib::Variant var(parser->variantCallFile);
out << results.gvcf(var, nonCalls, parser) << endl;
nonCalls.clear();
}
- vcf::Variant var(parser->variantCallFile);
+ vcflib::Variant var(parser->variantCallFile);
out << results.vcf(
var,
@@ -696,7 +695,7 @@ int main (int argc, char *argv[]) {
// write the last gVCF record
if (parameters.gVCFout && !nonCalls.empty()) {
Results results;
- vcf::Variant var(parser->variantCallFile);
+ vcflib::Variant var(parser->variantCallFile);
out << results.gvcf(var, nonCalls, parser) << endl;
nonCalls.clear();
}
diff --git a/test/Makefile b/test/Makefile
index 6285c24..bc2c711 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -12,4 +12,4 @@ $(freebayes):
cd .. && $(MAKE)
$(vcfuniq):
- cd ../vcflib && $(MAKE)
+ cd ../vcflib && make clean && make
diff --git a/test/region-and-target-handling.t b/test/region-and-target-handling.t
deleted file mode 100644
index b99b6ee..0000000
--- a/test/region-and-target-handling.t
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-# vi: set ft=sh :
-test=$(dirname $0)
-root=$(dirname $0)/..
-source $test/test-simple-bash/lib/test-simple.bash \
- tests 8
-
-PATH=$root/bin:$PATH
-
-ref=$test/`basename $0`.ref
-alt=$test/`basename $0`.alt
-bam=$test/`basename $0`.bam
-bed=$test/`basename $0`.bed
-vcf=$test/`basename $0`.vcf
-
-trap 'rm -f $ref* $alt $bam* $bed $vcf' EXIT
-
-# 01234567 start
-# ATCGGCTA
-# 12345678 end
-
-cat >$ref <<REF
->ref
-ATCGGCTA
-REF
-samtools faidx $ref
-
-cat >$alt <<ALT
->alt
-GTTAGGTT
-ALT
-
-samtools view -b - >$bam <<SAM
- at HD VN:1.5 SO:coordinate
- at SQ SN:ref LN:8
-alt 0 ref 1 30 1X1=2X1=1X1=1X * 0 0 GTTAGGTT *
-SAM
-samtools index $bam
-
-cat >$bed <<BED
-ref 0 1 first_base
-ref 2 4 third_and_fourth_base
-ref 5 6 sixth_base
-ref 7 8 eigth_base
-BED
-
-cat >$vcf <<VCF
-##fileformat=VCFv4.1
-##INFO=<ID=NAME,Number=0,Type=String,Description="Test name">
-#CHROM POS ID REF ALT QUAL FILTER INFO
-ref 1 . A G 1234 PASS NAME=first base
-ref 2 . T . 1234 PASS NAME=second base
-ref 3 . C T 1234 PASS NAME=third base
-ref 4 . G A 1234 PASS NAME=fourth base
-ref 5 . G . 1234 PASS NAME=fifth base
-ref 6 . C G 1234 PASS NAME=sixth base
-ref 7 . T . 1234 PASS NAME=seventh base
-ref 8 . A T 1234 PASS NAME=eigth base
-VCF
-
-PS4='\n+ '
-
-function run_freebayes() {
- ($root/bin/freebayes "$@" \
- --haplotype-length 0 --min-alternate-count 1 \
- --min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
- --ploidy 1 \
- -f $ref $bam \
- | grep -vE '^#' | cut -f1-5)
-}
-
-if [[ -n $TEST_DEBUG ]]; then
- cat $ref >&2
- cat $bed >&2
- cat $vcf >&2
- vcfannotate --bed $bed --key MATCH $vcf >&2
- vcfintersect --bed $bed $vcf >&2
- bedtools intersect -a $vcf -b $bed >&2
-fi
-
-[[ -z $(run_freebayes --region ref:4-5 --region ref:6-7) ]]; ok $? 'ref:4-5 ref:6-7 are empty'
-[[ -z $(run_freebayes --region ref:4 --region ref:6) ]]; ok $? 'ref:4 ref:6 are empty'
-
-expected=`cat <<END
-ref 6 . C G
-ref 8 . A T
-END
-`
-[[ $(run_freebayes --region ref:5-6 --region ref:7-8) == $expected ]]; ok $? 'ref:5-6 ref:7-8'
-[[ $(run_freebayes --region ref:5 --region ref:7) == $expected ]]; ok $? 'ref:5 ref:7'
-[[ $(run_freebayes --region ref:5-) == $expected ]]; ok $? 'ref:5-'
-
-expected=`cat <<END
-ref 1 . A G
-ref 3 . CG TA
-ref 6 . C G
-ref 8 . A T
-END
-`
-[[ $(run_freebayes --targets $bed) == $expected ]]; ok $? "--targets $bed"
-[[ $(run_freebayes --region ref) == $expected ]]; ok $? "--region ref"
-
-[[ $(run_freebayes --region ref:1-20 2>&1) =~ "Target region coordinates (ref 1 19) outside of reference sequence bounds (ref 8)" ]]
- ok $? 'region outside of bounds error'
diff --git a/test/t/00_region_and_target_handling.t b/test/t/00_region_and_target_handling.t
new file mode 100644
index 0000000..13e7376
--- /dev/null
+++ b/test/t/00_region_and_target_handling.t
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# vi: set ft=sh :
+
+test=$(dirname $0)/..
+root=$(dirname $0)/../..
+
+source $test/test-simple-bash/lib/test-simple.bash \
+ tests 11
+
+PATH=$root/bin:$PATH
+
+ref=$test/$(basename $0).ref
+alt=$test/$(basename $0).alt
+bam=$test/$(basename $0).bam
+bed=$test/$(basename $0).bed
+vcf=$test/$(basename $0).vcf
+
+trap 'rm -f $ref* $alt $bam* $bed $vcf' EXIT
+
+# 01234567890 start
+# ATCGGCTAAAA ref
+# GTTAGGTTAAC alt
+# 12345678901 end
+
+cat >$ref <<REF
+>ref
+ATCGGCTAAAA
+REF
+samtools faidx $ref
+
+cat >$alt <<ALT
+>alt
+GTTAGGTTAAC
+ALT
+
+samtools view -S -b - >$bam <<SAM
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:11
+alt 0 ref 1 30 1X1=2X1=1X1=1X2=1X * 0 0 GTTAGGTTAAC *
+SAM
+samtools index $bam
+
+cat >$bed <<BED
+ref 0 1 first_base
+ref 2 4 third_and_fourth_base
+ref 5 6 sixth_base
+ref 7 8 eighth_base
+ref 10 11 eleventh base
+BED
+
+cat >$vcf <<VCF
+##fileformat=VCFv4.1
+##INFO=<ID=NAME,Number=0,Type=String,Description="Test name">
+#CHROM POS ID REF ALT QUAL FILTER INFO
+ref 1 . A G 1234 PASS NAME=first base
+ref 2 . T . 1234 PASS NAME=second base
+ref 3 . C T 1234 PASS NAME=third base
+ref 4 . G A 1234 PASS NAME=fourth base
+ref 5 . G . 1234 PASS NAME=fifth base
+ref 6 . C G 1234 PASS NAME=sixth base
+ref 7 . T . 1234 PASS NAME=seventh base
+ref 8 . A T 1234 PASS NAME=eighth base
+ref 9 . A . 1234 PASS NAME=ninth base
+ref 10 . A . 1234 PASS NAME=tenth base
+ref 11 . A C 1234 PASS NAME=eleventh base
+VCF
+
+PS4='\n+ '
+
+function run_freebayes() {
+ ($root/bin/freebayes "$@" \
+ --haplotype-length 0 --min-alternate-count 1 \
+ --min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
+ --ploidy 1 \
+ -f $ref $bam \
+ | grep -vE "^#" | cut -f1-5)
+}
+
+if [[ -n $TEST_DEBUG ]]; then
+ cat $ref >&2
+ cat $bed >&2
+ cat $vcf >&2
+ vcfannotate --bed $bed --key MATCH $vcf >&2
+ vcfintersect --bed $bed $vcf >&2
+ bedtools intersect -a $vcf -b $bed >&2
+fi
+
+
+output=$(run_freebayes --region ref:4-5 --region ref:6-7)
+ok [ -z "$output" ] "ref:4-5 ref:6-7 are empty" || echo "$output"
+
+output=$(run_freebayes --region ref:4 --region ref:6)
+ok [ -z "$output" ] "ref:4 ref:6 are empty" || echo "$output"
+
+expected=$(cat <<END
+ref 6 . C G
+ref 8 . A T
+END
+)
+output=$(run_freebayes --region ref:5-6 --region ref:7-8)
+ok [ "$output" == "$expected" ] "ref:5-6 ref:7-8" || echo "$output"
+
+output=$(run_freebayes --region ref:5 --region ref:7)
+ok [ "$output" == "$expected" ] "ref:5 ref:7" || echo "$output"
+
+expected=$(cat <<END
+ref 6 . C G
+ref 8 . A T
+ref 11 . A C
+END
+)
+output=$(run_freebayes --region ref:5-)
+ok [ "$output" == "$expected" ] "ref:5-" || echo "$output"
+
+expected=$(cat <<END
+ref 1 . A G
+ref 3 . CG TA
+ref 6 . C G
+ref 8 . A T
+ref 11 . A C
+END
+)
+output=$(run_freebayes)
+ok [ "$output" == "$expected" ] "full output" || echo "$output"
+
+output=$(run_freebayes --targets $bed)
+ok [ "$output" == "$expected" ] "--targets $bed" || echo "$output"
+
+output=$(run_freebayes --region ref)
+ok [ "$output" == "$expected" ] "--region ref" || echo "$output"
+
+# commas as separators: should really be 1,000
+expected=$(cat <<END
+ref 11 . A C
+END
+)
+output=$(run_freebayes --region ref:1,0)
+ok [ "$output" == "$expected" ] "ref:1,0-" || echo "$output"
+
+output=$(run_freebayes --region ref:1,0-1,1)
+ok [ "$output" == "$expected" ] "ref:1,0-1,1" || echo "$output"
+
+expected="ERROR(freebayes): Target region coordinates (ref 1 20) outside of reference sequence bounds (ref 11) terminating."
+output=$(run_freebayes --region ref:1-20 2>&1)
+ok [ "$output" == "$expected" ] "region outside of bounds error" || echo "$output"
diff --git a/test/t/01_call_variants.t b/test/t/01_call_variants.t
old mode 100644
new mode 100755
index abc25f1..ac74c76
--- a/test/t/01_call_variants.t
+++ b/test/t/01_call_variants.t
@@ -62,7 +62,7 @@ q 12119 12130
EOF
is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam -t targets.bed | grep -v "^#" | wc -l) $by_region "a targets bed file can be used with the same effect as running by region"
-#rm targets.bed
+rm targets.bed
is $(samtools view -u tiny/NA12878.chr22.tiny.bam | freebayes -f tiny/q.fa --stdin | grep -v "^#" | wc -l) \
@@ -76,27 +76,27 @@ is $(freebayes -f tiny/q.fa -@ tiny/q.vcf.gz tiny/NA12878.chr22.tiny.bam | grep
# ensure that positions at which no variants exist get put in the out vcf
is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes puts required variants in output"
-is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | wc -l) 3 "freebayes limits calls to input variants correctly"
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes limits calls to input variants correctly"
is $(freebayes -f tiny/q.fa -@ tiny/q.vcf.gz -l tiny/1read.bam | grep -v "^#" | wc -l) 20 "freebayes reports all input variants even when there is no input data"
# check variant input with region specified
is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 2 "freebayes handles region and variant input"
-is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | wc -l) 2 "freebayes limits to variant input correctly when region is given"
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t1000$)" | wc -l) 2 "freebayes limits to variant input correctly when region is given"
# check variant input when reading from stdin
is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes handles variant input and reading from stdin"
-is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) 3 "freebayes limits to variant input when reading from stdin"
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes limits to variant input when reading from stdin"
-is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) 2 "freebayes handles region, stdin, and variant input"
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t1000$)" | wc -l) 2 "freebayes handles region, stdin, and variant input"
gzip -c tiny/q.fa >tiny/q.fa.gz
cp tiny/q.fa.fai tiny/q.fa.gz.fai
freebayes -f tiny/q.fa.gz -@ tiny/q_spiked.vcf.gz -r q:1-10000 -l - < tiny/NA12878.chr22.tiny.bam >/dev/null 2>/dev/null
is $? 1 "freebayes bails out when given a gzipped or corrupted reference"
-rm tiny/q.fa.gz.*
+rm tiny/q.fa.gz*
is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) $(freebayes-parallel tiny/q.regions 2 -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) "running in parallel makes no difference"
diff --git a/test/t/02_multi_bam.t b/test/t/02_multi_bam.t
new file mode 100755
index 0000000..44653f7
--- /dev/null
+++ b/test/t/02_multi_bam.t
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=bash-tap
+source ./bash-tap/bash-tap-bootstrap
+
+PATH=../bin:$PATH # for freebayes
+
+plan tests 7
+
+ref=$(basename $0).ref
+
+trap 'rm -f ${ref}* $(basename $0)*.bam*' EXIT
+
+cat >${ref} <<REF
+>ref
+AATCGGCTA
+REF
+samtools faidx ${ref}
+
+function run_freebayes() {
+ freebayes "$@" \
+ --haplotype-length 0 --min-alternate-count 1 \
+ --min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
+ --ploidy 1 \
+ -f $ref $bam \
+ 2>&1 \
+ | grep -vE "^#" | cut -f1-5
+}
+
+function make_bam() {
+ local id=$1 && shift
+ local sm=$1 && shift
+ local pl=$1 && shift
+ local suffix=${1:-} && shift
+ local first_snp=${1:-G} && shift
+
+ local bam="$(basename $0).${id}.${sm}.${pl}${suffix}.bam"
+ samtools view -S -b - >${bam} <<SAM
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:9
+ at RG ID:${id} SM:${sm} PL:${pl}
+alt 0 ref 1 30 1=1X1=2X1=1X1=1X * 0 0 A${first_snp}TTAGGTT * RG:Z:${id}
+SAM
+ samtools index ${bam}
+ echo ${bam}
+}
+
+expected=$(cat <<END
+ref 2 . A G
+ref 4 . CG TA
+ref 7 . C G
+ref 9 . A T
+END
+)
+bam1=$(make_bam "id1" "sample1" "platform1" "a")
+bam2=$(make_bam "id1" "sample1" "platform1" "b")
+is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two identical BAMs"
+
+bam1=$(make_bam "id1" "sample1" "platform1")
+bam2=$(make_bam "id2" "sample2" "platform2")
+is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two BAMs with different samples for different read groups"
+
+bam1=$(make_bam "id1" "sample1" "platform1")
+bam2=$(make_bam "id2" "sample1" "platform2")
+is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two BAMs with different technologies for different read groups"
+
+bam1=$(make_bam "id1" "sample1" "platform1")
+bam2=$(make_bam "id1" "sample2" "platform1")
+expected='ERROR\(freebayes\): multiple samples \(SM\) map to the same read group \(RG\)'
+like "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes rejects two BAMs with different samples for same read groups"
+
+bam1=$(make_bam "id1" "sample1" "platform1")
+bam2=$(make_bam "id1" "sample1" "platform2")
+expected='ERROR\(freebayes\): multiple technologies \(PL\) map to the same read group \(RG\)'
+like "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes rejects two BAMs with different technologies for same read groups"
+
+bam1=$(make_bam "id1" "sample1" "platform1")
+bam2=$(make_bam "id2" "sample2" "platform2" "" "C")
+expected=$(cat <<END
+ref 2 . A C,G
+ref 4 . CG TA
+ref 7 . C G
+ref 9 . A T
+END
+)
+is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls multiple alts from two different BAMs"
+
+is "$(run_freebayes -f ${ref} ${bam1} ${bam1})" "Error: Duplicate bam file '02_multi_bam.t.id1.sample1.platform1.bam'" "freebayes rejects two BAMs with the same name"
diff --git a/test/t/03_reference_bases.t b/test/t/03_reference_bases.t
new file mode 100644
index 0000000..de5dc6b
--- /dev/null
+++ b/test/t/03_reference_bases.t
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=bash-tap
+source ./bash-tap/bash-tap-bootstrap
+
+PATH=../bin:$PATH # for freebayes
+
+plan tests 3
+
+ref=$(basename $0).ref
+bam=$(basename $0).bam
+
+trap 'rm -f ${ref}* ${bam}*' EXIT
+
+function make_ref() {
+ local bases=$1 && shift
+ cat >${ref} <<REF
+>ref
+${bases}
+REF
+ samtools faidx ${ref}
+}
+
+samtools view -S -b - >${bam} <<SAM
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:31
+alt 0 ref 1 30 13=1X1=12X1=1X1=1X * 0 0 CCCCCCCCCCCCAGTTAAAAAAAAAAAGGTT *
+SAM
+samtools index ${bam}
+
+function run_freebayes() {
+ freebayes --haplotype-length 0 --min-alternate-count 1 \
+ --min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
+ --ploidy 1 \
+ -f $ref $bam \
+ 2>&1 \
+ | grep -vE "^#" | cut -f1-5
+}
+
+
+make_ref "AATCGGCTAZ"
+expected='ERROR\(freebayes\): Found non-DNA character Z at position 9'
+like "$(run_freebayes)" "${expected}" "freebayes rejects invalid reference"
+
+expected=$(cat <<END
+ref 14 . A G
+ref 16 . CNNNNNNNNNNN TAAAAAAAAAAA
+ref 29 . C G
+ref 31 . A T
+END
+)
+make_ref "CCCCCCCCCCCCAATCURYKMSWBDHVGCTA"
+is "$(run_freebayes)" "${expected}" "freebayes does not put uppercase IUPAC bases in VCFs"
+
+make_ref "CCCCCCCCCCCCaatcurykmswbdhvgcta"
+is "$(run_freebayes)" "${expected}" "freebayes does not put lowercase IUPAC bases in VCFs"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/freebayes.git
More information about the debian-med-commit
mailing list