[med-svn] [freebayes] 01/05: Imported Upstream version 1.0.2
Andreas Tille
tille at debian.org
Wed Jun 22 12:26:24 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository freebayes.
commit 492c86dd52d2e53f32f7dda452b47e5efc54ac4b
Author: Andreas Tille <tille at debian.org>
Date: Wed Jun 22 14:24:36 2016 +0200
Imported Upstream version 1.0.2
---
.gitmodules | 6 +
.travis.yml | 14 +
Makefile | 7 +-
README.md | 16 +-
paper/main.blg | 2 +-
paper/main.pdf | Bin 846894 -> 0 bytes
scripts/freebayes-parallel | 4 +-
src/AlleleParser.cpp | 752 +++++++++++++++-------------------
src/AlleleParser.h | 23 +-
src/DataLikelihood.cpp | 196 ++++++---
src/DataLikelihood.h | 18 +
src/Genotype.cpp | 18 +-
src/Genotype.h | 4 +-
src/IndelAllele.cpp | 6 +-
src/IndelAllele.h | 5 +-
src/LeftAlign.cpp | 34 +-
src/Makefile | 6 +-
src/Multinomial.cpp | 10 +
src/Multinomial.h | 2 +
src/NonCall.cpp | 86 ++++
src/NonCall.h | 48 +++
src/Parameters.cpp | 105 +++--
src/Parameters.h | 7 +-
src/ResultData.cpp | 68 +++
src/ResultData.h | 8 +-
src/Sample.cpp | 2 +-
src/Sample.h | 2 -
src/freebayes.cpp | 385 ++++++++---------
src/version_release.txt | 6 +-
test/Makefile | 15 +
test/region-and-target-handling.t | 104 +++++
test/splice/1:883884-887618.bam | Bin 0 -> 22589 bytes
test/splice/1:883884-887618.bam.bai | Bin 0 -> 288 bytes
test/splice/1:883884-887618.fa | 64 +++
test/splice/1:883884-887618.fa.fai | 1 +
test/t/01_call_variants.t | 110 +++++
test/tiny/1read.bam | Bin 0 -> 636 bytes
test/tiny/1read.bam.bai | Bin 0 -> 96 bytes
test/tiny/NA12878.chr22.tiny.bam | Bin 0 -> 287213 bytes
test/tiny/NA12878.chr22.tiny.bam.bai | Bin 0 -> 96 bytes
test/tiny/NA12878.chr22.tiny.giab.vcf | 89 ++++
test/tiny/q with spaces.fa | 207 ++++++++++
test/tiny/q with spaces.fa.fai | 1 +
test/tiny/q with spaces.regions | 3 +
test/tiny/q.fa | 207 ++++++++++
test/tiny/q.fa.fai | 1 +
test/tiny/q.regions | 3 +
test/tiny/q.vcf.gz | Bin 0 -> 4305 bytes
test/tiny/q.vcf.gz.tbi | Bin 0 -> 102 bytes
test/tiny/q_spiked.vcf.gz | Bin 0 -> 2378 bytes
test/tiny/q_spiked.vcf.gz.tbi | Bin 0 -> 91 bytes
51 files changed, 1864 insertions(+), 781 deletions(-)
diff --git a/.gitmodules b/.gitmodules
index 89dcd4f..6341f4d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,9 @@
[submodule "intervaltree"]
path = intervaltree
url = https://github.com/ekg/intervaltree.git
+[submodule "test/test-simple-bash"]
+ path = test/test-simple-bash
+ url = https://github.com/ingydotnet/test-simple-bash.git
+[submodule "bash-tap"]
+ path = test/bash-tap
+ url = https://github.com/illusori/bash-tap.git
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8acf75c
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,14 @@
+# Control file for continuous integration testing at http://travis-ci.org/
+
+language: cpp
+compiler: gcc
+before_install:
+ - git submodule update --init --recursive
+ - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+ - sudo apt-get update -qq
+ - sudo apt-get install -qq gcc-4.8 g++-4.8
+ - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.8
+ - gcc --version && g++ --version
+ - sudo apt-get install -qq bc samtools parallel
+#install: make get-deps
+script: make && make test
diff --git a/Makefile b/Makefile
index 7e6339e..6848dcd 100644
--- a/Makefile
+++ b/Makefile
@@ -23,8 +23,11 @@ install:
uninstall:
rm /usr/local/bin/freebayes /usr/local/bin/bamleftalign
+test:
+ cd test && make test
+
clean:
cd src && $(MAKE) clean
- rm -f bin/*
+ rm -fr bin/*
-.PHONY: all install uninstall clean
+.PHONY: all install uninstall clean test
diff --git a/README.md b/README.md
index 9a16f4d..f233549 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
# *freebayes*, a haplotype-based variant detector
## user manual and guide
+
+[![Build Status](https://travis-ci.org/ekg/freebayes.svg)](https://travis-ci.org/ekg/freebayes)
[![Gitter](https://badges.gitter.im/Join Chat.svg)](https://gitter.im/ekg/freebayes?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
--------
@@ -74,9 +76,9 @@ nested git submodules for external repositories.
After you've already done the above to clone the most recent development
version, if you wish to compile a specific version of FreeBayes from, you
-can then do:
+can then do something like the following:
- git checkout v0.9.18 && git submodule update --recursive
+ git checkout v0.9.20 && git submodule update --recursive
### Resolving proxy issues with git
@@ -155,7 +157,8 @@ copies. When running with highh --ploidy settings, it may be required to set
freebayes -f ref.fa -p 32 --use-best-n-alleles 4 --pooled-discrete aln.bam >var.vcf
-Generate frequency-based calls for all variants passing input thresholds:
+Generate frequency-based calls for all variants passing input thresholds. You'd do
+this in the case that you didn't know the number of samples in the pool.
freebayes -f ref.fa -F 0.01 -C 1 --pooled-continuous aln.bam >var.vcf
@@ -173,7 +176,7 @@ Naive variant calling: simply annotate observation counts of SNPs and indels:
freebayes -f ref.fa --haplotype-length 0 --min-alternate-count 1 \
--min-alternate-fraction 0 --pooled-continuous --report-monomorphic >var.vcf
-Parallel operation (use 36 cores in this case cores):
+Parallel operation (use 36 cores in this case):
freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 \
-f ref.fa aln.bam >var.vcf
@@ -211,7 +214,7 @@ observation count (AO).
* (possibly, **Iterate** the variant detection process in response to insight
gained from your interpretation)
-FreeBayes a standard VCF 4.1 outut stream. This format is designed for the
+FreeBayes emits a standard VCF 4.1 output stream. This format is designed for the
probabilistic description of allelic variants within a population of samples,
but it is equally suited to describing the probability of variation in a single
sample.
@@ -486,8 +489,7 @@ Bayesian model. (Our upcoming publication will discuss this in more detail.)
A minimal pre-processing pipeline similar to that described in "Calling
variants" should be sufficient for most uses. For more information, please
refer to a recent post by Brad Chapman [on minimal BAM preprocessing
-methods](http://bcbio.wordpress.com/2013/10/21/updated-comparison-of-variant-det
-ection-methods-ensemble-freebayes-and-minimal-bam-preparation-pipelines/).
+methods](http://bcbio.wordpress.com/2013/10/21/updated-comparison-of-variant-detection-methods-ensemble-freebayes-and-minimal-bam-preparation-pipelines/).
For a push-button solution to variant detection, from reads to variant calls,
look no further than the [gkno genome analysis platform](http://gkno.me/).
diff --git a/paper/main.blg b/paper/main.blg
index 8f3c05a..91a5948 100644
--- a/paper/main.blg
+++ b/paper/main.blg
@@ -1,4 +1,4 @@
-This is BibTeX, Version 0.99d (TeX Live 2013/Debian)
+This is BibTeX, Version 0.99d (TeX Live 2015/dev/Debian)
Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
The top-level auxiliary file: main.aux
The style file: genome_research.bst
diff --git a/paper/main.pdf b/paper/main.pdf
deleted file mode 100644
index 26bfa1e..0000000
Binary files a/paper/main.pdf and /dev/null differ
diff --git a/scripts/freebayes-parallel b/scripts/freebayes-parallel
index a293846..63bdc62 100755
--- a/scripts/freebayes-parallel
+++ b/scripts/freebayes-parallel
@@ -30,11 +30,11 @@ shift
ncpus=$1
shift
-command="freebayes $@"
+command=("freebayes" "$@")
(
#$command | head -100 | grep "^#" # generate header
# iterate over regions using gnu parallel to dispatch jobs
-cat $regionsfile | parallel -k -j $ncpus "$command --region {}"
+cat "$regionsfile" | parallel -k -j "$ncpus" "${command[@]}" --region {}
) | vcffirstheader \
| vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges
diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp
index 5a2b79c..444034e 100644
--- a/src/AlleleParser.cpp
+++ b/src/AlleleParser.cpp
@@ -83,28 +83,6 @@ void AlleleParser::openBams(void) {
}
-void AlleleParser::openTraceFile(void) {
- if (parameters.trace) {
- traceFile.open(parameters.traceFile.c_str(), ios::out);
- DEBUG("Opening trace file: " << parameters.traceFile << " ...");
- if (!traceFile) {
- ERROR(" unable to open trace file: " << parameters.traceFile );
- exit(1);
- }
- }
-}
-
-void AlleleParser::openFailedFile(void) {
- if (!parameters.failedFile.empty()) {
- failedFile.open(parameters.failedFile.c_str(), ios::out);
- DEBUG("Opening failed alleles file: " << parameters.failedFile << " ...");
- if (!failedFile) {
- ERROR(" unable to open failed alleles file: " << parameters.failedFile );
- exit(1);
- }
- }
-}
-
void AlleleParser::openOutputFile(void) {
if (parameters.outputFile != "") {
outputFile.open(parameters.outputFile.c_str(), ios::out);
@@ -456,7 +434,9 @@ string AlleleParser::vcfHeader() {
<< "##INFO=<ID=MQM,Number=A,Type=Float,Description=\"Mean mapping quality of observed alternate alleles\">" << endl
<< "##INFO=<ID=MQMR,Number=1,Type=Float,Description=\"Mean mapping quality of observed reference alleles\">" << endl
<< "##INFO=<ID=PAIRED,Number=A,Type=Float,Description=\"Proportion of observed alternate alleles which are supported by properly paired read fragments\">" << endl
- << "##INFO=<ID=PAIREDR,Number=1,Type=Float,Description=\"Proportion of observed reference alleles which are supported by properly paired read fragments\">" << endl;
+ << "##INFO=<ID=PAIREDR,Number=1,Type=Float,Description=\"Proportion of observed reference alleles which are supported by properly paired read fragments\">" << endl
+ << "##INFO=<ID=MIN,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
+ << "##INFO=<ID=END,Number=1,Type=Integer,Description=\"Last position (inclusive) in gVCF output record.\">" << endl;
// sequencing technology tags, which vary according to input data
for (vector<string>::iterator st = sequencingTechnologies.begin(); st != sequencingTechnologies.end(); ++st) {
@@ -479,6 +459,7 @@ string AlleleParser::vcfHeader() {
<< "##FORMAT=<ID=QR,Number=1,Type=Integer,Description=\"Sum of quality of the reference observations\">" << endl
<< "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">" << endl
<< "##FORMAT=<ID=QA,Number=A,Type=Integer,Description=\"Sum of quality of the alternate observations\">" << endl
+ << "##FORMAT=<ID=MIN,Number=1,Type=Integer,Description=\"Minimum depth in gVCF output block.\">" << endl
//<< "##FORMAT=<ID=SRF,Number=1,Type=Integer,Description=\"Number of reference observations on the forward strand\">" << endl
//<< "##FORMAT=<ID=SRR,Number=1,Type=Integer,Description=\"Number of reference observations on the reverse strand\">" << endl
//<< "##FORMAT=<ID=SAF,Number=1,Type=Integer,Description=\"Number of alternate observations on the forward strand\">" << endl
@@ -562,96 +543,64 @@ void AlleleParser::loadFastaReference(void) {
}
-// alignment-based method for loading the first bit of our reference sequence
-void AlleleParser::loadReferenceSequence(BamAlignment& alignment) {
- DEBUG2("loading reference sequence overlapping first alignment");
- currentPosition = alignment.Position;
- rightmostHaplotypeBasisAllelePosition = currentPosition;
- currentSequenceStart = alignment.Position;
- currentSequenceName = referenceIDToName[alignment.RefID];
- currentRefID = alignment.RefID;
- DEBUG2("reference.getSubSequence("<< currentSequenceName << ", " << currentSequenceStart << ", " << alignment.AlignedBases.length() << ")");
- currentSequence = uppercase(reference.getSubSequence(currentSequenceName, currentSequenceStart, alignment.Length));
-}
-
-// intended to load all the sequence covered by reads which overlap our current target
-// this lets us process the reads fully, checking for suspicious reads, etc.
-// but does not require us to load the whole sequence
-void AlleleParser::loadReferenceSequence(BedTarget* target, int before, int after) {
- basesBeforeCurrentTarget = before;
- basesAfterCurrentTarget = after;
- DEBUG2("loading reference subsequence " << target->seq << " from " << target->left << " - " << before << " to " << target->right + 1 << " + " << after << " + before");
- string name = reference.sequenceNameStartingWith(target->seq);
- currentSequence = uppercase(reference.getSubSequence(name, target->left - before, (target->right + 1 - target->left) + after + before));
- currentReferenceBase = currentReferenceBaseChar();
-}
-
-// used to extend the cached reference subsequence when we encounter a read which extends beyond its right bound
-void AlleleParser::extendReferenceSequence(int rightExtension) {
- currentSequence += uppercase(reference.getSubSequence(reference.sequenceNameStartingWith(currentSequenceName),
- currentTarget->right + basesAfterCurrentTarget,
- rightExtension));
- basesAfterCurrentTarget += rightExtension;
+bool AlleleParser::hasMoreInputVariants(void) {
+ pair<int, long> next = nextInputVariantPosition();
+ return next.first != -1;
}
-// maintain a 10bp window around the curent position
-// to guarantee our ability to process sequence
-void AlleleParser::preserveReferenceSequenceWindow(int bp) {
-
- // establish left difference between ideal window size and current cached sequence
- int leftdiff = currentSequenceStart - (floor(currentPosition) - bp);
- // guard against falling off the left end of our sequence
- //leftdiff = (floor(currentPosition) - leftdiff < 0) ? floor(currentPosition) : leftdiff;
- leftdiff = (currentSequenceStart - leftdiff < 0) ? currentSequenceStart : leftdiff;
- // right guard is not needed due to the fact that we are just attempting to
- // append sequence substr will simply return fewer additional characters if
- // we go past the right end of the ref
- int rightdiff = (floor(currentPosition) + bp) - (currentSequenceStart + currentSequence.size());
-
- if (leftdiff > 0) {
- //cerr << currentSequenceStart << endl;
- string left = reference.getSubSequence(currentSequenceName, currentSequenceStart, leftdiff);
- currentSequence.insert(0, uppercase(left));
- currentSequenceStart -= left.size();
-
- }
- if (rightdiff > 0) {
- currentSequence += uppercase(reference.getSubSequence(
- currentSequenceName,
- (currentSequenceStart + currentSequence.size()),
- rightdiff)); // always go 10bp past the end of what we need for alignment registration
+bool AlleleParser::loadNextPositionWithAlignmentOrInputVariant(BamAlignment& alignment) {
+ pair<int, long> next = nextInputVariantPosition();
+ if (next.first != -1) {
+ int varRefID = next.first;
+ //cerr << varRefID << " " << alignment.RefID << " " << next.second << " " << alignment.Position << endl;
+ if (!hasMoreAlignments || varRefID < alignment.RefID || varRefID == alignment.RefID && next.second < alignment.Position) {
+ return loadNextPositionWithInputVariant();
+ } else {
+ loadReferenceSequence(alignment);
+ }
+ } else {
+ loadReferenceSequence(alignment);
}
+ return true;
}
-// ensure we have cached reference sequence according to the current alignment
-void AlleleParser::extendReferenceSequence(BamAlignment& alignment) {
-
- int leftdiff = currentSequenceStart - alignment.Position;
- leftdiff = (currentSequenceStart - leftdiff < 0) ? currentSequenceStart : leftdiff;
- if (leftdiff > 0) {
- string left = reference.getSubSequence(currentSequenceName,
- currentSequenceStart,
- leftdiff);
- currentSequenceStart -= left.size();
- if (currentSequenceStart < 0) currentSequenceStart = 0;
- currentSequence.insert(0, uppercase(left));
+bool AlleleParser::loadNextPositionWithInputVariant(void) {
+ pair<int, long> next = nextInputVariantPosition();
+ if (next.first != -1) {
+ //cerr << "Next is " << next.first << ":" << next.second << endl;
+ loadReferenceSequence(referenceIDToName[next.first]);
+ currentPosition = next.second;
+ rightmostHaplotypeBasisAllelePosition = currentPosition;
+ return true;
+ } else {
+ return false;
}
+}
- int rightdiff =
- (alignment.Position + alignment.AlignedBases.size())
- - (currentSequenceStart + currentSequence.size());
- if (rightdiff > 0) {
- currentSequence += uppercase(reference.getSubSequence(
- currentSequenceName,
- (currentSequenceStart + currentSequence.size()),
- rightdiff));
- }
+// alignment-based method for loading the first bit of our reference sequence
+void AlleleParser::loadReferenceSequence(BamAlignment& alignment) {
+ loadReferenceSequence(referenceIDToName[alignment.RefID]);
+ currentPosition = alignment.Position;
}
-void AlleleParser::eraseReferenceSequence(int leftErasure) {
- //cerr << "erasing leftmost " << leftErasure << "bp of cached reference sequence" << endl;
- currentSequence.erase(0, leftErasure);
- currentSequenceStart += leftErasure;
+void AlleleParser::loadReferenceSequence(string& seqname) {
+ if (currentSequenceName != seqname) {
+ currentSequenceName = seqname;
+ currentSequenceStart = 0;
+ currentRefID = bamMultiReader.GetReferenceID(currentSequenceName);
+ currentSequence = uppercase(reference.getSequence(currentSequenceName));
+ int i = 0; // check the first few characters and verify they are not garbage
+ for (string::iterator citr = currentSequence.begin();
+ i < 100 && citr != currentSequence.end(); ++citr, ++i) {
+ char c = *citr;
+ if (c != 'A' && c != 'T' && c != 'G' && c != 'C' && c != 'N') {
+ ERROR("Found non-DNA character " << c << " at position " << i << " in " << seqname << endl
+ << ". Is your reference compressed or corrupted? "
+ << "freebayes requires an uncompressed reference sequence.");
+ exit(1);
+ }
+ }
+ }
}
void AlleleParser::loadTargets(void) {
@@ -728,7 +677,7 @@ void AlleleParser::loadTargets(void) {
// REAL BED format is 0 based, half open (end base not included)
BedTarget bd(startSeq,
(startPos == 0) ? 0 : startPos,
- (stopPos == -1) ? reference.sequenceLength(startSeq) : stopPos - 1); // internally, we use 0-base inclusive end
+ ((stopPos == -1) ? reference.sequenceLength(startSeq) : stopPos) - 1); // internally, we use 0-base inclusive end
DEBUG("will process reference sequence " << startSeq << ":" << bd.left << ".." << bd.right + 1);
targets.push_back(bd);
bedReader.targets.push_back(bd);
@@ -739,9 +688,9 @@ void AlleleParser::loadTargets(void) {
for (vector<BedTarget>::iterator e = targets.begin(); e != targets.end(); ++e) {
BedTarget& bd = *e;
// internally, we use 0-base inclusive end
- if (bd.left < 0 || bd.right > reference.sequenceLength(bd.seq)) {
+ if (bd.left < 0 || bd.right + 1 > reference.sequenceLength(bd.seq)) {
ERROR("Target region coordinates (" << bd.seq << " "
- << bd.left << " " << bd.right
+ << bd.left << " " << bd.right + 1
<< ") outside of reference sequence bounds ("
<< bd.seq << " " << reference.sequenceLength(bd.seq) << ") terminating.");
exit(1);
@@ -863,8 +812,6 @@ AlleleParser::AlleleParser(int argc, char** argv) : parameters(Parameters(argc,
referenceSampleName = "reference_sample";
// initialization
- openTraceFile();
- openFailedFile();
openOutputFile();
loadFastaReference();
@@ -1132,7 +1079,7 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
}
}
- DEBUG("addAllele: mergeAllele/4:"
+ DEBUG2("addAllele: mergeAllele/4:"
<< " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
<< " newAllele " << newAllele.typeStr() << "@" << newAllele.position << ":" << newAllele.cigar);
@@ -1392,10 +1339,6 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
updateHaplotypeBasisAlleles(sp, alignment.AlignedBases.size());
}
- if (usingVariantInputAlleles) {
- updateInputVariants(sp, alignment.AlignedBases.size());
- }
-
#ifdef VERBOSE_DEBUG
if (parameters.debug2) {
DEBUG2("registering alignment " << rp << " " << csp << " " << sp << endl <<
@@ -1968,7 +1911,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
<< "; currentSequence end = " << currentSequence.size() + currentSequenceStart);
// make sure we have sequence for the *first* alignment
- extendReferenceSequence(currentAlignment);
+ //extendReferenceSequence(currentAlignment);
// push to the front until we get to an alignment that doesn't overlap our
// current position or we reach the end of available alignments
@@ -1985,7 +1928,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
&& currentAlignment.RefID == currentRefID) {
do {
DEBUG2("top of alignment parsing loop");
- DEBUG2("currentAlignment.Name == " << currentAlignment.Name);
+ DEBUG("alignment: " << currentAlignment.Name);
// get read group, and map back to a sample name
string readGroup;
if (!currentAlignment.GetTag("RG", readGroup)) {
@@ -2049,7 +1992,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
// initially skip reads with low mapping quality (what happens if MapQuality is not in the file)
if (currentAlignment.MapQuality >= parameters.MQL0) {
// extend our cached reference sequence to allow processing of this alignment
- extendReferenceSequence(currentAlignment);
+ //extendReferenceSequence(currentAlignment);
// left realign indels
if (parameters.leftAlignIndels) {
int length = currentAlignment.GetEndPosition() - currentAlignment.Position + 1;
@@ -2129,12 +2072,137 @@ void AlleleParser::updateRegisteredAlleles(void) {
alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
- if (lowestPosition <= currentPosition) {
- int diff = lowestPosition - currentSequenceStart;
- // do we have excess bases beyond the current lowest position - cached_reference_window?
- // if so, erase them
- if (diff > CACHED_REFERENCE_WINDOW) {
- eraseReferenceSequence(diff - CACHED_REFERENCE_WINDOW);
+}
+
+pair<int, long int> AlleleParser::nextInputVariantPosition(void) {
+ // are we past the last one in the sequence?
+ if (usingVariantInputAlleles &&
+ ((inputVariantAlleles.find(currentRefID) != inputVariantAlleles.end()
+ && inputVariantAlleles[currentRefID].upper_bound(currentPosition) != inputVariantAlleles[currentRefID].end())
+ || inputVariantAlleles.upper_bound(currentRefID) != inputVariantAlleles.end())) {
+ map<long, vector<Allele> >& inChrom = inputVariantAlleles[currentRefID];
+ map<long, vector<Allele> >::iterator ic = inChrom.upper_bound(currentPosition);
+ if (ic != inChrom.end()) {
+ return make_pair(currentRefID, ic->first);
+ } else {
+ // find next chrom with input alleles
+ map<int, map<long, vector<Allele> > >::iterator nc = inputVariantAlleles.upper_bound(currentRefID);
+ if (nc != inputVariantAlleles.end()) {
+ return make_pair(nc->first, nc->second.begin()->first);
+ } else {
+ return make_pair(-1, 0);
+ }
+ }
+ }
+ return make_pair(-1, 0);
+}
+
+void AlleleParser::getAllInputVariants(void) {
+ string nullstr;
+ getInputVariantsInRegion(nullstr);
+}
+
+void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {
+
+ if (!usingVariantInputAlleles) return;
+
+ // get the variants in the target region
+ vcf::Variant var(variantCallInputFile);
+ if (!seq.empty()) {
+ variantCallInputFile.setRegion(seq, start, end);
+ }
+ bool ok;
+ while (ok = variantCallInputFile.getNextVariant(*currentVariant)) {
+
+ long int pos = currentVariant->position - 1;
+ // get alternate alleles
+ bool includePreviousBaseForIndels = true;
+ map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
+ // TODO this would be a nice option: why does it not work?
+ //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
+ vector< vector<vcf::VariantAllele> > orderedVariantAlleles;
+ for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {
+ orderedVariantAlleles.push_back(variantAlleles[*a]);
+ }
+
+ vector<Allele> genotypeAlleles;
+ set<long int> alternatePositions;
+
+ for (vector< vector<vcf::VariantAllele> >::iterator g = orderedVariantAlleles.begin(); g != orderedVariantAlleles.end(); ++g) {
+
+ vector<vcf::VariantAllele>& altAllele = *g;
+
+ vector<Allele> alleles;
+
+ for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
+ vcf::VariantAllele& variant = *v;
+ long int allelePos = variant.position - 1;
+ AlleleType type;
+ string alleleSequence = variant.alt;
+
+ int len = 0;
+ int reflen = 0;
+ string cigar;
+
+ // XXX
+ // FAIL
+ // you need to add in the reference bases between the non-reference ones!
+ // to allow for complex events!
+
+ if (variant.ref == variant.alt) {
+ // XXX note that for reference alleles, we only use the first base internally
+ // but this is technically incorrect, so this hack should be noted
+ len = variant.ref.size();
+ reflen = len;
+ //alleleSequence = alleleSequence.at(0); // take only the first base
+ type = ALLELE_REFERENCE;
+ cigar = convert(len) + "M";
+ } else if (variant.ref.size() == variant.alt.size()) {
+ len = variant.ref.size();
+ reflen = len;
+ if (variant.ref.size() == 1) {
+ type = ALLELE_SNP;
+ } else {
+ type = ALLELE_MNP;
+ }
+ cigar = convert(len) + "X";
+ } else if (variant.ref.size() > variant.alt.size()) {
+ type = ALLELE_DELETION;
+ len = variant.ref.size() - variant.alt.size();
+ allelePos -= 1;
+ reflen = len + 2;
+ alleleSequence =
+ uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
+ + alleleSequence
+ + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1+len, 1));
+ cigar = "1M" + convert(len) + "D" + "1M";
+ } else {
+ // we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
+ type = ALLELE_INSERTION;
+ // add previous base and post base to match format typically used for calling
+ allelePos -= 1;
+ alleleSequence =
+ uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
+ + alleleSequence
+ + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1, 1));
+ len = variant.alt.size() - var.ref.size();
+ cigar = "1M" + convert(len) + "I" + "1M";
+ reflen = 2;
+ }
+ // TODO deal woth complex subs
+
+ Allele allele = genotypeAllele(type, alleleSequence, (unsigned int) len, cigar, (unsigned int) reflen, allelePos);
+ DEBUG("input allele: " << allele.referenceName << " " << allele);
+ //cerr << "input allele: " << allele.referenceName << " " << allele << endl;
+
+ //alleles.push_back(allele);
+ genotypeAlleles.push_back(allele);
+
+ if (allele.type != ALLELE_REFERENCE) {
+ inputVariantAlleles[bamMultiReader.GetReferenceID(currentVariant->sequenceName)][allele.position].push_back(allele);
+ alternatePositions.insert(allele.position);
+ }
+ }
}
}
}
@@ -2150,17 +2218,27 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
start = rightmostHaplotypeBasisAllelePosition;
}
- //stringstream r;
- //r << currentSequenceName << ":" << start
- // << "-" << pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
- //cerr << "getting variants in " << r.str() << endl;
+ /*
+ stringstream r;
+ r << currentSequenceName << ":" << start
+ << "-" << pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
+ cerr << "getting variants in " << r.str() << endl;
+ */
// tabix expects 1-based, fully closed regions for ti_parse_region()
// (which is what setRegion() calls eventually)
- if (variantCallInputFile.setRegion(currentSequenceName,
- start + 1,
- pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1)) {
- //cerr << "the vcf line " << haplotypeVariantInputFile.line << endl;
+ bool gotRegion = false;
+ if (referenceLength > 0) {
+ gotRegion = variantCallInputFile.setRegion(currentSequenceName,
+ start + 1,
+ pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1);
+ } else {
+ // whole chromosome
+ gotRegion = variantCallInputFile.setRegion(currentSequenceName);
+ }
+
+ if (gotRegion) {
+
// get the variants in the target region
vcf::Variant var(variantCallInputFile);
bool ok;
@@ -2245,13 +2323,13 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
// TODO deal woth complex subs
Allele allele = genotypeAllele(type, alleleSequence, (unsigned int) len, cigar, (unsigned int) reflen, allelePos);
- DEBUG("input allele: " << allele);
+ DEBUG("input allele: " << allele.referenceName << " " << allele);
//alleles.push_back(allele);
genotypeAlleles.push_back(allele);
if (allele.type != ALLELE_REFERENCE) {
- inputVariantAlleles[allele.position].push_back(allele);
+ inputVariantAlleles[bamMultiReader.GetReferenceID(allele.referenceName)][allele.position].push_back(allele);
alternatePositions.insert(allele.position);
}
@@ -2261,127 +2339,6 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
// store the allele counts, if they are provided
//
- continue;
-
- // xxx wverything here is skipped
- if (currentVariant->info.find("AC") != currentVariant->info.end()
- && currentVariant->info.find("AN") != currentVariant->info.end()) {
- vector<string>& afstrs = currentVariant->info["AC"];
- // XXX is the reference allele included?
- vector<Allele>& altAlleles = inputVariantAlleles[currentVariant->position - 1];
- vector<Allele>::iterator al = altAlleles.begin();
- assert(altAlleles.size() == afstrs.size());
- // XXX TODO include the reference allele--- how???
- // do we add another tag to the VCF, use NS, or some other data?
- map<Allele, int>& afs = inputAlleleCounts[currentVariant->position - 1];
- int altcountsum = 0;
- for (vector<string>::iterator afstr = afstrs.begin(); afstr != afstrs.end(); ++afstr, ++al) {
- int c; convert(*afstr, c);
- altcountsum += c;
- afs[*al] = c;
- }
- int numalleles; convert(currentVariant->info["AN"].front(), numalleles);
- int refcount = numalleles - altcountsum;
- assert(refcount >= 0);
- if (refcount > 0) {
- // make ref allele
- string ref = currentVariant->ref.substr(0, 1);
- Allele refAllele = genotypeAllele(ALLELE_REFERENCE, ref, ref.size(), "1M", ref.size(), currentVariant->position - 1);
- // and also cache the observed count of the reference allele
- afs[refAllele] = refcount;
- }
- }
-
- if (currentVariant->samples.empty()) {
- continue;
- }
-
- if (alternatePositions.size() == 1) {
-
- // there can be only one alternate allele position, per immediately previous check
- long int alternatePosition = *alternatePositions.begin();
-
- map<int, vector<Genotype> > genotypesByPloidy;
-
- // XXX hard-coded to a specific set of ploidys
- for (int p = 1; p <= 2; ++p) {
- vector<Genotype> genotypes = allPossibleGenotypes(p, genotypeAlleles);
- genotypesByPloidy[p] = genotypes;
- }
-
- map<int, map<Genotype*, int> > vcfGenotypeOrder;
- for (map<int, vector<Genotype> >::iterator gtg = genotypesByPloidy.begin(); gtg != genotypesByPloidy.end(); ++gtg) {
- int groupPloidy = gtg->first;
- vector<Genotype>& genotypes = gtg->second;
- for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
- Genotype* genotypePtr = &*g;
- Genotype& genotype = *g;
- vector<int> gtspec;
- genotype.relativeGenotype(gtspec, genotypeAlleles);
- // XXX TODO ... EVIL HACKS
- if (groupPloidy == 2) {
- int j = gtspec.front();
- int k = gtspec.back();
- vcfGenotypeOrder[groupPloidy][genotypePtr] = (k * (k + 1) / 2) + j;
- } else if (groupPloidy == 1) {
- vcfGenotypeOrder[groupPloidy][genotypePtr] = gtspec.front();
- } else {
- // XXX TODO ...
- }
- }
- }
-
- // get the GLs for each sample, and store for use in later computation
- for (map<string, map<string, vector<string> > >::iterator s = currentVariant->samples.begin();
- s != currentVariant->samples.end(); ++s) {
-
- string sampleName = s->first;
- map<string, vector<string> >& sample = s->second;
- string& gt = sample["GT"].front();
- map<int, int> genotype = vcf::decomposeGenotype(gt);
-
- // default case, give a fixed count for each observed allele
- int ploidy = 0;
- for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
- ploidy += g->second;
- }
-
- if (ploidy > 2) {
- cerr << "warning, cannot handle ploidy > 2 for in the input VCF due to limitations "
- << "of the VCF specification's definition of Genotype ordering" << endl;
- continue;
- }
-
- // in the case that we have genotype likelihoods in the VCF
- if (sample.find("GL") != sample.end()) {
- vector<string>& gls = sample["GL"];
- vector<long double> genotypeLikelihoods;
- genotypeLikelihoods.resize(gls.size());
- transform(gls.begin(), gls.end(), genotypeLikelihoods.begin(), log10string2ln);
-
- // now map the gls into genotype space
- map<Genotype*, int>& genotypeOrder = vcfGenotypeOrder[ploidy];
- for (map<Genotype*, int>::iterator gto = genotypeOrder.begin(); gto != genotypeOrder.end(); ++gto) {
- Genotype& genotype = *gto->first;
- int order = gto->second;
- map<string, long double>& sampleGenotypeLikelihoods = inputGenotypeLikelihoods[alternatePosition][sampleName];
- //cerr << sampleName << ":" << convert(genotype) << ":" << genotypeLikelihoods[order] << endl;
- sampleGenotypeLikelihoods[convert(genotype)] = genotypeLikelihoods[order];
- }
- }
- }
-
- } else {
- /*
- cerr << "warning, ambiguous VCF record (multiple mixed variant classes), unable to use for genotype likelihood input:" << endl
- << *currentVariant << endl;
- for (set<long int>::iterator i = alternatePositions.begin(); i != alternatePositions.end(); ++i) {
- cerr << "has position: " << *i << endl;
- }
- //<< currentVariant->sequenceName << ":" << currentVariant->position << endl;
- */
- }
-
}
if (!ok) hasMoreVariants = false;
@@ -2396,11 +2353,12 @@ void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
}
*/
//rightmostHaplotypeBasisAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
- rightmostInputAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
+ //rightmostInputAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
}
}
+/*
void AlleleParser::addCurrentGenotypeLikelihoods(map<int, vector<Genotype> >& genotypesByPloidy,
vector<vector<SampleDataLikelihood> >& sampleDataLikelihoods) {
@@ -2462,55 +2420,6 @@ void AlleleParser::getInputAlleleCounts(vector<Allele>& genotypeAlleles, map<str
}
}
}
-
-/*
-void AlleleParser::__removeNonOverlappingAlleles(vector<Allele*>& alleles, int haplotypeLength, bool getAllAllelesInHaplotype) {
- for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
- Allele* allele = *a;
- if (haplotypeLength > 1) {
- if (allele->type == ALLELE_REFERENCE) {
- // does the reference allele overlap the haplotype
- if (getAllAllelesInHaplotype
- && !(currentPosition <= allele->position && allele->position < currentPosition + haplotypeLength)) {
- *a = NULL;
- } else if (!(allele->position <= currentPosition
- && allele->position + allele->referenceLength >= currentPosition + haplotypeLength)) {
- *a = NULL;
- } else if (currentPosition < allele->position) { // not there yet
- allele->processed = false;
- *a = NULL;
- }
- } else { // snps, insertions, deletions
- if (getAllAllelesInHaplotype
- && !(currentPosition <= allele->position && allele->position < currentPosition + haplotypeLength)) {
- *a = NULL;
- } else if (!(currentPosition == allele->position && allele->referenceLength == haplotypeLength)) {
- *a = NULL;
- } else if (currentPosition + haplotypeLength <= allele->position) {
- allele->processed = false;
- *a = NULL;
- }
- }
- } else {
- if (allele->type == ALLELE_REFERENCE) {
- if (!(allele->position + allele->referenceLength > currentPosition)) {
- *a = NULL;
- } else if (currentPosition < allele->position) {
- allele->processed = false;
- *a = NULL;
- }
- } else { // snps, insertions, deletions
- if (currentPosition >= allele->position) {
- *a = NULL;
- } else if (currentPosition < allele->position) {
- allele->processed = false;
- *a = NULL;
- }
- }
- }
- }
- alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
-}
*/
void AlleleParser::removeAllelesWithoutReadSpan(vector<Allele*>& alleles, int probeLength, int haplotypeLength) {
@@ -2598,16 +2507,17 @@ void AlleleParser::removePreviousAlleles(vector<Allele*>& alleles) {
// false otherwise
bool AlleleParser::toNextTarget(void) {
- DEBUG("seeking to next target with alignments...");
+ DEBUG("to next target");
+
+ clearRegisteredAlignments();
// reset haplotype length; there is no last call in this sequence; it isn't relevant
lastHaplotypeLength = 0;
- // XXX
- // XXX
- // TODO cleanup the logic in this section
- // XXX
- // XXX
+ if (targets.empty() && usingVariantInputAlleles) {
+ // we are processing everything, so load the entire input variant allele set
+ getAllInputVariants();
+ }
// load first target if we have targets and have not loaded the first
if (!parameters.useStdin && !targets.empty()) {
@@ -2629,27 +2539,10 @@ bool AlleleParser::toNextTarget(void) {
}
}
- if (ok) {
-
- // XXX hack
- clearRegisteredAlignments();
- currentSequenceStart = currentAlignment.Position;
- currentSequenceName = referenceIDToName[currentAlignment.RefID];
- currentRefID = currentAlignment.RefID;
- currentPosition = (currentPosition < currentAlignment.Position) ? currentAlignment.Position : currentPosition;
- currentSequence = uppercase(reference.getSubSequence(currentSequenceName, currentSequenceStart, currentAlignment.Length));
- rightmostHaplotypeBasisAllelePosition = currentPosition;
-
- } else {
- if (hasMoreVariants) {
- DEBUG("continuing because we have more variants");
- return true;
- } else {
- return false; // last target, no variants, and couldn't get alignment
- }
+ if (!ok) {
+ return loadNextPositionWithInputVariant();
}
-
// stdin, no targets cases
} else if (!currentTarget && (parameters.useStdin || targets.empty())) {
// if we have a target for limiting the analysis, use it
@@ -2662,23 +2555,26 @@ bool AlleleParser::toNextTarget(void) {
ERROR("Could not get first alignment from target");
return false;
}
- clearRegisteredAlignments();
- loadReferenceSequence(currentAlignment); // this seeds us with new reference sequence
-
+ loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
+ //loadReferenceSequence(currentAlignment); // this seeds us with new reference sequence
+ // however, if we have a target list of variants and we should also respect them
// we've reached the end of file, or stdin
} else if (parameters.useStdin || targets.empty()) {
return false;
}
- DEBUG("getting first variant");
- getFirstVariant();
- DEBUG("got first variant");
+ if (currentTarget && usingVariantInputAlleles) {
+ getInputVariantsInRegion(currentTarget->seq, currentTarget->left, currentTarget->right);
+ }
+
+ loadReferenceSequence(currentSequenceName);
justSwitchedTargets = true;
return true;
}
+
// TODO refactor this to allow reading from stdin or reading the whole file
// without loading each sequence as a target
bool AlleleParser::loadTarget(BedTarget* target) {
@@ -2686,21 +2582,17 @@ bool AlleleParser::loadTarget(BedTarget* target) {
currentTarget = target;
DEBUG("processing target " << currentTarget->desc << " " <<
- currentTarget->seq << " " << currentTarget->left << " " <<
- currentTarget->right + 1);
+ currentTarget->seq << " " << currentTarget->left << " " <<
+ currentTarget->right + 1);
DEBUG2("loading target reference subsequence");
- currentSequenceName = currentTarget->seq;
-
- int refSeqID = bamMultiReader.GetReferenceID(currentSequenceName);
-
- DEBUG2("reference sequence id " << refSeqID);
+ loadReferenceSequence(currentTarget->seq);
DEBUG2("setting new position " << currentTarget->left);
currentPosition = currentTarget->left;
rightmostHaplotypeBasisAllelePosition = currentTarget->left;
- if (!bamMultiReader.SetRegion(refSeqID, currentTarget->left, refSeqID, currentTarget->right + 1)) { // bamtools expects 0-based, half-open
+ if (!bamMultiReader.SetRegion(currentRefID, currentTarget->left, currentRefID, currentTarget->right + 1)) { // bamtools expects 0-based, half-open
ERROR("Could not SetRegion to " << currentTarget->seq << ":" << currentTarget->left << ".." << currentTarget->right + 1);
cerr << bamMultiReader.GetErrorString() << endl;
return false;
@@ -2750,9 +2642,9 @@ bool AlleleParser::getFirstAlignment(void) {
DEBUG2("got first alignment in target region");
} else {
if (currentTarget) {
- WARNING("Could not find any mapped reads in target region " << currentSequenceName << ":" << currentTarget->left << ".." << currentTarget->right + 1);
+ DEBUG("Could not find any mapped reads in target region " << currentSequenceName << ":" << currentTarget->left << ".." << currentTarget->right + 1);
} else {
- WARNING("Could not find any mapped reads in target region " << currentSequenceName);
+ DEBUG("Could not find any mapped reads in target region " << currentSequenceName);
}
return false;
}
@@ -2811,75 +2703,82 @@ void AlleleParser::clearRegisteredAlignments(void) {
// if none exist, return false
//
bool AlleleParser::toNextPosition(void) {
-
- // either bail out
+ // is this our first position? (indicated by empty currentSequenceName)
+ // if so, load it up
+ bool first_pos = false;
if (currentSequenceName.empty()) {
DEBUG("loading first target");
if (!toNextTarget()) {
return false;
}
- }
- // or step to the next position
- else {
- ++currentPosition;
- }
-
- // if we've run off the right edge of a target
- if (!targets.empty() && currentPosition > currentTarget->right) { // time to move to a new target
- DEBUG("next position " << (long int) currentPosition << " outside of current target right bound " << currentTarget->right + 1);
- // try to get to the next one, and if this fails, bail out
- if (!toNextTarget()) {
- DEBUG("no more targets, finishing");
- return false;
- }
+ first_pos = true;
}
- // in the stdin, or no targets case
// here we assume we are processing an entire BAM or one contiguous region
- if ((parameters.useStdin && targets.empty()) || targets.empty()) {
- // implicit step of target sequence
- // XXX this must wait for us to clean out all of our alignments at the end of the target
-
+ if (parameters.useStdin || targets.empty()) {
// here we loop over unaligned reads at the beginning of a target
// we need to get to a mapped read to figure out where we are
while (hasMoreAlignments && !currentAlignment.IsMapped()) {
hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment);
}
- // now, if the current position of this alignment is outside of the reference sequence length, switch references
- if (hasMoreAlignments) {
+ // determine if we have more alignments or not
+ if (!hasMoreAlignments) {
+ if (hasMoreInputVariants()) {
+ // continue as we have more variants
+ DEBUG("continuing because we have more input variants");
+ loadNextPositionWithInputVariant();
+ } else if (registeredAlignments.empty()) {
+ DEBUG("no more alignments in input");
+ return false;
+ } else if (currentPosition >= currentSequence.size() + currentSequenceStart) {
+ DEBUG("no more alignments in input");
+ DEBUG("at end of sequence");
+ return false;
+ } else {
+ ++currentPosition;
+ }
+ } else {
+ // step the position
+ ++currentPosition;
+ // if the current position of this alignment is outside of the reference sequence length
+ // we need to switch references
if (currentPosition >= reference.sequenceLength(currentSequenceName)
|| registeredAlignments.empty() && currentRefID != currentAlignment.RefID) {
DEBUG("at end of sequence");
clearRegisteredAlignments();
- loadReferenceSequence(currentAlignment);
+ loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
justSwitchedTargets = true;
}
- // here, if we have run out of alignments
- } else if (!hasMoreAlignments) {
- if (registeredAlignments.empty()) {
- DEBUG("no more alignments in input");
- return false;
- } else if (currentPosition >= currentSequence.size() + currentSequenceStart) {
- DEBUG("at end of sequence");
+ }
+ } else {
+ // or if it's not we should step to the next position
+ ++currentPosition;
+ // if we've run off the right edge of a target, jump
+ if (currentPosition > currentTarget->right) {
+ // time to move to a new target
+ DEBUG("next position " << (long int) currentPosition
+ << " outside of current target right bound " << currentTarget->right + 1);
+ // try to get to the next one, and if this fails, bail out
+ if (!toNextTarget()) {
+ DEBUG("no more targets, finishing");
return false;
}
+ justSwitchedTargets = true;
}
}
// so we have to make sure it's still there (this matters in low-coverage)
- DEBUG2("updating reference sequence cache");
- preserveReferenceSequenceWindow(CACHED_REFERENCE_WINDOW);
currentReferenceBase = currentReferenceBaseChar();
// handle the case in which we don't have targets but in which we've switched reference sequence
- DEBUG2("processing position " << (long unsigned int) currentPosition + 1 << " in sequence " << currentSequenceName);
+ DEBUG("processing position " << (long unsigned int) currentPosition + 1 << " in sequence " << currentSequenceName);
vector<Allele*> newAlleles;
updateAlignmentQueue(currentPosition, newAlleles);
addToRegisteredAlleles(newAlleles);
DEBUG2("updating variants");
// done typically at each new read, but this handles the case where there is no data for a while
- updateInputVariants(currentPosition, 1);
+ //updateInputVariants(currentPosition, 1);
DEBUG2("updating registered alleles");
updateRegisteredAlleles(); // this removes unused left-flanking sequence
@@ -2902,34 +2801,26 @@ bool AlleleParser::toNextPosition(void) {
registeredAlleles.erase(unique(registeredAlleles.begin(), registeredAlleles.end()), registeredAlleles.end());
// and do the same for the variants from the input VCF
+ /*
DEBUG2("erasing old input variant alleles");
- map<long int, vector<Allele> >::iterator v = inputVariantAlleles.find(currentPosition - 3);
- if (v != inputVariantAlleles.end()) {
- inputVariantAlleles.erase(v);
+ if (inputVariantAlleles.find(currentSequenceName) != inputVariantAlleles.end()) {
+ map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentSequenceName].begin();
+ while (v != inputVariantAlleles[currentSequenceName].end() && v->first < currentPosition) {
+ inputVariantAlleles[currentSequenceName].erase(v++);
+ }
}
+ */
DEBUG2("erasing old input haplotype basis alleles");
- map<long int, vector<AllelicPrimitive> >::iterator z = haplotypeBasisAlleles.find(currentPosition - 3);
- if (z != haplotypeBasisAlleles.end()) {
- haplotypeBasisAlleles.erase(z);
- }
-
- DEBUG2("erasing old genotype likelihoods");
- map<long int, map<string, map<string, long double> > >::iterator l = inputGenotypeLikelihoods.find(currentPosition - 3);
- if (l != inputGenotypeLikelihoods.end()) {
- inputGenotypeLikelihoods.erase(l);
- }
-
- DEBUG2("erasing old allele frequencies");
- map<long int, map<Allele, int> >::iterator af = inputAlleleCounts.find(currentPosition - 3);
- if (af != inputAlleleCounts.end()) {
- inputAlleleCounts.erase(af);
+ map<long int, vector<AllelicPrimitive> >::iterator z = haplotypeBasisAlleles.begin();
+ while (z != haplotypeBasisAlleles.end() && z->first < currentPosition) {
+ haplotypeBasisAlleles.erase(z++);
}
DEBUG2("erasing old cached repeat counts");
- map<long int, map<string, int> >::iterator rc = cachedRepeatCounts.find(currentPosition - 3);
- if (rc != cachedRepeatCounts.end()) {
- cachedRepeatCounts.erase(rc);
+ map<long int, map<string, int> >::iterator rc = cachedRepeatCounts.begin();
+ while (rc != cachedRepeatCounts.end() && rc->first < currentPosition) {
+ cachedRepeatCounts.erase(rc++);
}
return true;
@@ -3258,6 +3149,7 @@ void AlleleParser::buildHaplotypeAlleles(
vector<Allele*> haplotypeObservations;
getCompleteObservationsOfHaplotype(samples, haplotypeLength, haplotypeObservations);
addToRegisteredAlleles(haplotypeObservations);
+ DEBUG("added to registered alleles");
// add partial observations
// first get all the alleles up to the end of the haplotype window
@@ -3265,6 +3157,7 @@ void AlleleParser::buildHaplotypeAlleles(
if (parameters.usePartialObservations && haplotypeLength > 1) {
getPartialObservationsOfHaplotype(samples, haplotypeLength, partialHaplotypeObservations);
}
+ DEBUG("got partial observations of haplotype");
//addToRegisteredAlleles(partialHaplotypeObservations);
// now align the sequences of these alleles to the haplotype alleles
// and put them into the partials bin in each sample
@@ -3285,6 +3178,7 @@ void AlleleParser::buildHaplotypeAlleles(
(*p)->setQuality();
(*p)->update(haplotypeLength);
}
+ DEBUG("done updating");
// debugging
/*
@@ -3382,6 +3276,13 @@ void AlleleParser::buildHaplotypeAlleles(
alleles = alleleUnion(alleles, refAlleleVector);
}
+ // this is where we have established our genotype alleles
+ /*
+ for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+ cerr << "genotype allele " << &*a << " " << *a << endl;
+ }
+ */
+
// pick up observations that are potentially partial (not unambiguous)
// the way to do this is to test the full observations as if they are partial, and if they
// end up partially supporting multiple observations, removing them from the "complete" observations
@@ -3520,6 +3421,7 @@ bool AlleleParser::getCompleteObservationsOfHaplotype(Samples& samples, int hapl
}
}
}
+ DEBUG("got complete observations of haplotype");
}
void AlleleParser::unsetAllProcessedFlags(void) {
@@ -3542,7 +3444,9 @@ bool AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplo
vector<Allele*> newAlleles;
bool gettingPartials = true;
+ DEBUG("in AlleleParser::getPartialObservationsOfHaplotype, updating alignment queue");
updateAlignmentQueue(currentPosition + haplotypeLength, newAlleles, gettingPartials);
+ DEBUG("in AlleleParser::getPartialObservationsOfHaplotype, done updating alignment queue");
vector<Allele*> otherObs;
vector<Allele*> partialObs;
@@ -3550,6 +3454,7 @@ bool AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplo
// get the max alignment end position, iterate to there
long int maxAlignmentEnd = registeredAlignments.rbegin()->first;
for (long int i = currentPosition+1; i < maxAlignmentEnd; ++i) {
+ DEBUG("getting partial observations of haplotype @" << i);
deque<RegisteredAlignment>& ras = registeredAlignments[i];
for (deque<RegisteredAlignment>::iterator r = ras.begin(); r != ras.end(); ++r) {
RegisteredAlignment& ra = *r;
@@ -3587,6 +3492,7 @@ bool AlleleParser::getNextAlleles(Samples& samples, int allowedAlleleTypes) {
if (!toNextPosition()) {
return false;
} else {
+ // triggers cleanup
if (justSwitchedTargets) {
nextPosition = 0;
justSwitchedTargets = false;
@@ -3928,22 +3834,24 @@ vector<Allele> AlleleParser::genotypeAlleles(
// this needs to be fixed in a big way
// the alleles have to be put into the local haplotype structure
- map<long int, vector<Allele> >::iterator v = inputVariantAlleles.find(currentPosition);
- if (v != inputVariantAlleles.end()) {
- vector<Allele>& inputalleles = v->second;
- for (vector<Allele>::iterator a = inputalleles.begin(); a != inputalleles.end(); ++a) {
- DEBUG("evaluating input allele " << *a);
- Allele& allele = *a;
- // check if the allele is already present
- bool alreadyPresent = false;
- for (vector<Allele>::iterator r = resultAlleles.begin(); r != resultAlleles.end(); ++r) {
- if (r->equivalent(allele)) {
- alreadyPresent = true;
- break;
+ if (inputVariantAlleles.find(currentRefID) != inputVariantAlleles.end()) {
+ map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentRefID].find(currentPosition);
+ if (v != inputVariantAlleles[currentRefID].end()) {
+ vector<Allele>& inputalleles = v->second;
+ for (vector<Allele>::iterator a = inputalleles.begin(); a != inputalleles.end(); ++a) {
+ DEBUG("evaluating input allele " << *a);
+ Allele& allele = *a;
+ // check if the allele is already present
+ bool alreadyPresent = false;
+ for (vector<Allele>::iterator r = resultAlleles.begin(); r != resultAlleles.end(); ++r) {
+ if (r->equivalent(allele)) {
+ alreadyPresent = true;
+ break;
+ }
+ }
+ if (!alreadyPresent) {
+ resultAlleles.push_back(allele);
}
- }
- if (!alreadyPresent) {
- resultAlleles.push_back(allele);
}
}
}
@@ -4053,14 +3961,14 @@ bool AlleleParser::isRepeatUnit(const string& seq, const string& unit) {
}
-
bool AlleleParser::hasInputVariantAllelesAtCurrentPosition(void) {
- map<long int, vector<Allele> >::iterator v = inputVariantAlleles.find(currentPosition);
- if (v != inputVariantAlleles.end()) {
- return true;
- } else {
- return false;
+ if (inputVariantAlleles.find(currentRefID) != inputVariantAlleles.end()) {
+ map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentRefID].find(currentPosition);
+ if (v != inputVariantAlleles[currentRefID].end()) {
+ return true;
+ }
}
+ return false;
}
bool operator<(const AllelicPrimitive& a, const AllelicPrimitive& b) {
diff --git a/src/AlleleParser.h b/src/AlleleParser.h
index de65014..0eed6be 100644
--- a/src/AlleleParser.h
+++ b/src/AlleleParser.h
@@ -196,12 +196,19 @@ public:
vector<Allele*> registeredAlleles;
map<long unsigned int, deque<RegisteredAlignment> > registeredAlignments;
- map<long int, vector<Allele> > inputVariantAlleles; // all variants present in the input VCF, as 'genotype' alleles
+ map<int, map<long int, vector<Allele> > > inputVariantAlleles; // all variants present in the input VCF, as 'genotype' alleles
+ pair<int, long int> nextInputVariantPosition(void);
+ void getInputVariantsInRegion(string& seq, long start = 0, long end = 0);
+ void getAllInputVariants(void);
// position sample genotype likelihood
- map<long int, map<string, map<string, long double> > > inputGenotypeLikelihoods; // drawn from input VCF
- map<long int, map<Allele, int> > inputAlleleCounts; // drawn from input VCF
+ map<string, map<long int, map<string, map<string, long double> > > > inputGenotypeLikelihoods; // drawn from input VCF
+ map<string, map<long int, map<Allele, int> > > inputAlleleCounts; // drawn from input VCF
Sample* nullSample;
+ bool loadNextPositionWithAlignmentOrInputVariant(BamAlignment& currentAlignment);
+ bool loadNextPositionWithInputVariant(void);
+ bool hasMoreInputVariants(void);
+
void addCurrentGenotypeLikelihoods(map<int, vector<Genotype> >& genotypesByPloidy,
vector<vector<SampleDataLikelihood> >& sampleDataLikelihoods);
@@ -218,8 +225,6 @@ public:
vector<string> bamHeaderLines;
void openBams(void);
- void openTraceFile(void);
- void openFailedFile(void);
void openOutputFile(void);
void getSampleNames(void);
void getPopulations(void);
@@ -230,12 +235,8 @@ public:
vector<int> currentPloidies(Samples& samples);
void loadBamReferenceSequenceNames(void);
void loadFastaReference(void);
- void loadReferenceSequence(BedTarget*, int, int);
void loadReferenceSequence(BamAlignment& alignment);
- void preserveReferenceSequenceWindow(int bp);
- void extendReferenceSequence(int);
- void extendReferenceSequence(BamAlignment& alignment);
- void eraseReferenceSequence(int leftErasure);
+ void loadReferenceSequence(string& seqname);
string referenceSubstr(long int position, unsigned int length);
void loadTargets(void);
bool getFirstAlignment(void);
@@ -320,7 +321,7 @@ public:
string currentReferenceHaplotype();
// output files
- ofstream logFile, outputFile, traceFile, failedFile;
+ ofstream logFile, outputFile;
ostream* output;
// utility
diff --git a/src/DataLikelihood.cpp b/src/DataLikelihood.cpp
index 8dcb962..1779352 100644
--- a/src/DataLikelihood.cpp
+++ b/src/DataLikelihood.cpp
@@ -16,13 +16,15 @@ probObservedAllelesGivenGenotype(
map<string, double>& freqs
) {
+ //cerr << "P(" << genotype << " given" << endl << sample;
+
int observationCount = sample.observationCount();
vector<long double> alleleProbs = genotype.alleleProbabilities(observationBias);
vector<int> observationCounts = genotype.alleleObservationCounts(sample);
int countOut = 0;
double countIn = 0;
long double prodQout = 0; // the probability that the reads not in the genotype are all wrong
- long double probObsGivenGt = 0;
+ long double prodSample = 0;
if (standardGLs) {
for (Sample::iterator s = sample.begin(); s != sample.end(); ++s) {
@@ -43,10 +45,6 @@ probObservedAllelesGivenGenotype(
}
}
} else {
- // problem.
- // we should do this over all alleles
- // which have partial support or full support.
- // this is only over
vector<Allele*> emptyA;
vector<Allele*> emptyB;
for (set<string>::iterator c = sample.supportedAlleles.begin();
@@ -73,12 +71,14 @@ probObservedAllelesGivenGenotype(
}
}
Allele& obs = **a;
+ //cerr << "observation: " << obs << endl;
long double probi = 0;
ContaminationEstimate& contamination = contaminations.of(obs.readGroupID);
double scale = 1;
// note that this will underflow if we have mapping quality = 0
// we guard against this externally, by ignoring such alignments (quality has to be > MQL0)
- long double qual = (1 - exp(obs.lnquality)) * (1 - exp(obs.lnmapQuality));
+ long double qual = (1.0 - exp(obs.lnquality)) * (1.0 - exp(obs.lnmapQuality));
+
if (onPartials) {
map<Allele*, set<Allele*> >::iterator r = sample.reversePartials.find(*a);
if (r != sample.reversePartials.end()) {
@@ -86,11 +86,12 @@ probObservedAllelesGivenGenotype(
cerr << "partial " << *a << " has empty reverse" << endl;
exit(1);
}
- //cerr << "partial " << *a << " supports potentially " << sample.reversePartials[*a].size() << " alleles : ";
+ //cerr << "partial " << *a << " supports potentially " << sample.reversePartials[*a].size() << " alleles : " << endl;
//for (set<Allele*>::iterator m = sample.reversePartials[*a].begin();
- // m != sample.reversePartials[*a].end(); ++m) cerr << **m << " ";
+ //m != sample.reversePartials[*a].end(); ++m) cerr << **m << " ";
//cerr << endl;
scale = (double)1/(double)sample.reversePartials[*a].size();
+ qual *= scale;
}
}
@@ -98,57 +99,44 @@ probObservedAllelesGivenGenotype(
// how does this work?
// each partial obs is recorded as supporting, but with observation probability scaled by the number of possible haplotypes it supports
bool isInGenotype = false;
+ long double asampl = genotype.alleleSamplingProb(obs);
+ // for each of the unique genotype alleles
for (vector<Allele>::iterator b = genotypeAlleles.begin(); b != genotypeAlleles.end(); ++b) {
Allele& allele = *b;
- const string& base = b->currentBase;
-
- long double q;
- if (obs.currentBase == base
- || (onPartials && sample.observationSupports(*a, &*b))) {
+ const string& base = allele.currentBase;
+ if (genotype.containsAllele(base)
+ && (obs.currentBase == base
+ || (onPartials && sample.observationSupports(*a, &*b)))) {
isInGenotype = true;
- q = qual;
- } else {
- q = 1 - qual;
- }
-
- if (onPartials) {
- q *= scale; // distribute partial support evenly across supported haplotypes
+ // use the matched allele to estimate the asampl
+ asampl = max(asampl, (long double)genotype.alleleSamplingProb(allele));
}
+ }
- double asampl = genotype.alleleSamplingProb(base);
- //double freq = freqs[base];
-
- if (asampl == 0) {
- // scale by frequency of (this) possibly contaminating allele
- asampl = contamination.probRefGivenHomAlt;
- } else if (asampl == 1) {
- // scale by frequency of (other) possibly contaminating alleles
- asampl = 1 - contamination.probRefGivenHomAlt;
+ if (asampl == 0) {
+ // scale by frequency of (this) possibly contaminating allele
+ asampl = contamination.probRefGivenHomAlt;
+ } else if (asampl == 1) {
+ // scale by frequency of (other) possibly contaminating alleles
+ asampl = 1 - contamination.probRefGivenHomAlt;
+ } else { //if (genotype.ploidy == 2) {
+ // to deal with polyploids
+ // note that this reduces to 1 for diploid heterozygotes
+ // this term captures reference bias
+ if (obs.isReference()) {
+ asampl *= (contamination.probRefGivenHet / 0.5);
} else {
- // to deal with polyploids
- // note that this reduces to 1 for diploid heterozygotes
- double scale = asampl / 0.5;
- // this term captures reference bias
- if (allele.isReference()) {
- asampl = scale * contamination.probRefGivenHet;
- } else {
- asampl = 1 - (scale * contamination.probRefGivenHet);
- }
+ asampl *= ((1 - contamination.probRefGivenHet) / 0.5);
}
-
- probi += asampl * q;
-
}
- if (isInGenotype) {
- countIn += scale;
- }
-
- // bound to (0,1]
- if (probi > 0) {
- long double lnprobi = log(min(probi, (long double) 1.0));
- probObsGivenGt += lnprobi;
+ // distribute observation support across haplotypes
+ if (!isInGenotype) {
+ prodQout += log(1-qual);
+ countOut += scale;
+ } else {
+ prodSample += log(asampl*scale);
}
}
}
@@ -164,14 +152,15 @@ probObservedAllelesGivenGenotype(
if (sum(observationCounts) == 0) {
return prodQout;
} else {
+ //cerr << "P(obs|" << genotype << ") = " << prodQout + multinomialSamplingProbLn(alleleProbs, observationCounts) << endl << endl << string(80, '@') << endl << endl;
return prodQout + multinomialSamplingProbLn(alleleProbs, observationCounts);
+ //return prodQout + samplingProbLn(alleleProbs, observationCounts);
}
} else {
- // read dependence factor, but inverted to deal with the new GL implementation
- if (countIn > 1) {
- probObsGivenGt *= (1 + (countIn - 1) * dependenceFactor) / countIn;
+ if (countOut > 1) {
+ prodQout *= (1 + (countOut - 1) * dependenceFactor) / countOut;
}
- //cerr << "P(obs|" << genotype << ") = " << probObsGivenGt << endl;
+ long double probObsGivenGt = prodQout + prodSample;
return isinf(probObsGivenGt) ? 0 : probObsGivenGt;
}
@@ -192,6 +181,7 @@ probObservedAllelesGivenGenotypes(
) {
vector<pair<Genotype*, long double> > results;
for (vector<Genotype*>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
+ Genotype& genotype = **g;
results.push_back(
make_pair(*g,
probObservedAllelesGivenGenotype(
@@ -207,3 +197,103 @@ probObservedAllelesGivenGenotypes(
}
return results;
}
+
+void
+calculateSampleDataLikelihoods(
+ Samples& samples,
+ Results& results,
+ AlleleParser* parser,
+ map<int, vector<Genotype> >& genotypesByPloidy,
+ Parameters& parameters,
+ bool usingNull,
+ Bias& observationBias,
+ vector<Allele>& genotypeAlleles,
+ Contamination& contaminationEstimates,
+ map<string, double>& estimatedAlleleFrequencies,
+ map<string, vector<vector<SampleDataLikelihood> > >& sampleDataLikelihoodsByPopulation,
+ map<string, vector<vector<SampleDataLikelihood> > >& variantSampleDataLikelihoodsByPopulation,
+ map<string, vector<vector<SampleDataLikelihood> > >& invariantSampleDataLikelihoodsByPopulation) {
+
+ for (vector<string>::iterator n = parser->sampleList.begin(); n != parser->sampleList.end(); ++n) {
+ //string sampleName = s->first;
+ string& sampleName = *n;
+ //DEBUG2("sample: " << sampleName);
+ //Sample& sample = s->second;
+ if (samples.find(sampleName) == samples.end()
+ && !(parser->hasInputVariantAllelesAtCurrentPosition()
+ || parameters.reportMonomorphic)) {
+ continue;
+ }
+ Sample& sample = samples[sampleName];
+ vector<Genotype>& genotypes = genotypesByPloidy[parser->currentSamplePloidy(sampleName)];
+ vector<Genotype*> genotypesWithObs;
+ for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
+ if (parameters.excludePartiallyObservedGenotypes) {
+ if (g->sampleHasSupportingObservationsForAllAlleles(sample)) {
+ genotypesWithObs.push_back(&*g);
+ }
+ } else if (parameters.excludeUnobservedGenotypes && usingNull) {
+ if (g->sampleHasSupportingObservations(sample)) {
+ //cerr << sampleName << " has suppporting obs for " << *g << endl;
+ genotypesWithObs.push_back(&*g);
+ } else if (g->hasNullAllele() && g->homozygous) {
+ // this genotype will never be added if we are running in observed-only mode, but
+ // we still need it for consistency
+ genotypesWithObs.push_back(&*g);
+ }
+ } else {
+ genotypesWithObs.push_back(&*g);
+ }
+ }
+
+ // skip this sample if we have no observations supporting any of the genotypes we are going to evaluate
+ if (genotypesWithObs.empty()) {
+ continue;
+ }
+
+ vector<pair<Genotype*, long double> > probs
+ = probObservedAllelesGivenGenotypes(sample, genotypesWithObs,
+ parameters.RDF, parameters.useMappingQuality,
+ observationBias, parameters.standardGLs,
+ genotypeAlleles,
+ contaminationEstimates,
+ estimatedAlleleFrequencies);
+
+#ifdef VERBOSE_DEBUG
+ if (parameters.debug2) {
+ for (vector<pair<Genotype*, long double> >::iterator p = probs.begin(); p != probs.end(); ++p) {
+ cerr << parser->currentSequenceName << "," << (long unsigned int) parser->currentPosition + 1 << ","
+ << sampleName << ",likelihood," << *(p->first) << "," << p->second << endl;
+ }
+ }
+#endif
+
+ Result& sampleData = results[sampleName];
+ sampleData.name = sampleName;
+ sampleData.observations = &sample;
+ for (vector<pair<Genotype*, long double> >::iterator p = probs.begin(); p != probs.end(); ++p) {
+ sampleData.push_back(SampleDataLikelihood(sampleName, &sample, p->first, p->second, 0));
+ }
+
+ sortSampleDataLikelihoods(sampleData);
+
+ string& population = parser->samplePopulation[sampleName];
+ vector<vector<SampleDataLikelihood> >& sampleDataLikelihoods = sampleDataLikelihoodsByPopulation[population];
+ vector<vector<SampleDataLikelihood> >& variantSampleDataLikelihoods = variantSampleDataLikelihoodsByPopulation[population];
+ vector<vector<SampleDataLikelihood> >& invariantSampleDataLikelihoods = invariantSampleDataLikelihoodsByPopulation[population];
+
+ if (parameters.genotypeVariantThreshold != 0) {
+ if (sampleData.size() > 1
+ && abs(sampleData.at(1).prob - sampleData.front().prob)
+ < parameters.genotypeVariantThreshold) {
+ variantSampleDataLikelihoods.push_back(sampleData);
+ } else {
+ invariantSampleDataLikelihoods.push_back(sampleData);
+ }
+ } else {
+ variantSampleDataLikelihoods.push_back(sampleData);
+ }
+ sampleDataLikelihoods.push_back(sampleData);
+
+ }
+}
diff --git a/src/DataLikelihood.h b/src/DataLikelihood.h
index 7351904..07aaaf4 100644
--- a/src/DataLikelihood.h
+++ b/src/DataLikelihood.h
@@ -17,6 +17,8 @@
#include "Dirichlet.h"
#include "Bias.h"
#include "Contamination.h"
+#include "AlleleParser.h"
+#include "ResultData.h"
using namespace std;
@@ -44,4 +46,20 @@ probObservedAllelesGivenGenotypes(
Contamination& contaminations,
map<string, double>& freqs);
+void
+calculateSampleDataLikelihoods(
+ Samples& samples,
+ Results& results,
+ AlleleParser* parser,
+ map<int, vector<Genotype> >& genotypesByPloidy,
+ Parameters& parameters,
+ bool usingNull,
+ Bias& observationBias,
+ vector<Allele>& genotypeAlleles,
+ Contamination& contaminationEstimates,
+ map<string, double>& estimatedAlleleFrequencies,
+ map<string, vector<vector<SampleDataLikelihood> > >& sampleDataLikelihoodsByPopulation,
+ map<string, vector<vector<SampleDataLikelihood> > >& variantSampleDataLikelihoodsByPopulation,
+ map<string, vector<vector<SampleDataLikelihood> > >& invariantSampleDataLikelihoodsByPopulation);
+
#endif
diff --git a/src/Genotype.cpp b/src/Genotype.cpp
index 5a1b1a2..779a03d 100644
--- a/src/Genotype.cpp
+++ b/src/Genotype.cpp
@@ -3,10 +3,11 @@
#include "multipermute.h"
-vector<Allele> Genotype::uniqueAlleles(void) {
- vector<Allele> uniques;
- for (Genotype::iterator g = this->begin(); g != this->end(); ++g)
- uniques.push_back(g->allele);
+vector<Allele*> Genotype::uniqueAlleles(void) {
+ vector<Allele*> uniques;
+ for (Genotype::iterator g = this->begin(); g != this->end(); ++g) {
+ uniques.push_back(&g->allele);
+ }
return uniques;
}
@@ -64,6 +65,15 @@ int Genotype::alleleCount(Allele& allele) {
}
}
+// returns true when the genotype is composed of a subset of the alleles
+bool Genotype::matchesAlleles(vector<Allele>& alleles) {
+ int p = 0;
+ for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+ p += alleleCount(*a);
+ }
+ return ploidy == p;
+}
+
double Genotype::alleleSamplingProb(const string& base) {
map<string, int>::iterator ge = alleleCounts.find(base);
if (ge == alleleCounts.end()) {
diff --git a/src/Genotype.h b/src/Genotype.h
index ab30ed6..98cdb6b 100644
--- a/src/Genotype.h
+++ b/src/Genotype.h
@@ -71,12 +71,14 @@ public:
}
- vector<Allele> uniqueAlleles(void);
+ vector<Allele*> uniqueAlleles(void);
int getPloidy(void);
int alleleCount(const string& base);
int alleleCount(Allele& allele);
bool containsAllele(Allele& allele);
bool containsAllele(const string& base);
+ // returns true when the genotype is composed of a subset of the alleles
+ bool matchesAlleles(vector<Allele>& alleles);
vector<Allele> alternateAlleles(string& refbase);
vector<string> alternateBases(string& refbase);
vector<int> counts(void);
diff --git a/src/IndelAllele.cpp b/src/IndelAllele.cpp
index 237699b..20e6888 100644
--- a/src/IndelAllele.cpp
+++ b/src/IndelAllele.cpp
@@ -23,7 +23,8 @@ bool FBhomopolymer(string sequence) {
ostream& operator<<(ostream& out, const FBIndelAllele& indel) {
string t = indel.insertion ? "i" : "d";
- out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.sequence;
+ out << t << ":" << indel.position << ":" << indel.readPosition
+ << ":" << indel.sequence << ":" << (indel.splice?"splice":"");
return out;
}
@@ -31,7 +32,8 @@ bool operator==(const FBIndelAllele& a, const FBIndelAllele& b) {
return (a.insertion == b.insertion
&& a.length == b.length
&& a.position == b.position
- && a.sequence == b.sequence);
+ && a.sequence == b.sequence
+ && a.splice == b.splice);
}
bool operator!=(const FBIndelAllele& a, const FBIndelAllele& b) {
diff --git a/src/IndelAllele.h b/src/IndelAllele.h
index 0e8d5fd..55c59e6 100644
--- a/src/IndelAllele.h
+++ b/src/IndelAllele.h
@@ -18,11 +18,12 @@ public:
int position;
int readPosition;
string sequence;
+ bool splice;
bool homopolymer(void);
- FBIndelAllele(bool i, int l, int p, int rp, string s)
- : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
+ FBIndelAllele(bool i, int l, int p, int rp, string s, bool n)
+ : insertion(i), length(l), position(p), readPosition(rp), sequence(s), splice(n)
{ }
};
diff --git a/src/LeftAlign.cpp b/src/LeftAlign.cpp
index 4673a55..78b7046 100644
--- a/src/LeftAlign.cpp
+++ b/src/LeftAlign.cpp
@@ -48,12 +48,17 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
sp += l;
rp += l;
} else if (t == 'D') { // deletion
- indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
+ indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), false));
+ alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
+ aabOffset += l;
+ sp += l; // update reference sequence position
+ } else if (t == 'N') {
+ indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), true));
alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
aabOffset += l;
sp += l; // update reference sequence position
} else if (t == 'I') { // insertion
- indels.push_back(FBIndelAllele(true, l, sp, rp, alignment.QueryBases.substr(rp, l)));
+ indels.push_back(FBIndelAllele(true, l, sp, rp, alignment.QueryBases.substr(rp, l), false));
alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
arsOffset += l;
rp += l;
@@ -68,8 +73,8 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
}
rp += l;
} else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
- } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
- sp += l;
+ //} else if (t == 'N') { // skipped region in the reference not present in read, aka splice
+ //sp += l;
}
}
@@ -117,6 +122,7 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
}
#endif
while (steppos >= 0 && readsteppos >= 0
+ && !indel.splice
&& indel.sequence == referenceSequence.substr(steppos, indel.length)
&& indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
&& (id == indels.begin()
@@ -178,7 +184,8 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
// if so, do it
int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
- if (previous->insertion == indel.insertion
+ if (!previous->splice && !indel.splice &&
+ previous->insertion == indel.insertion
&& ((previous->insertion
&& (previous->position < indel.position
&& previous->readPosition + previous->readPosition < indel.readPosition))
@@ -240,9 +247,13 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
FBIndelAllele last = *id++;
if (last.position > 0) {
newCigar.push_back(CigarOp('M', last.position));
- newCigar.push_back(CigarOp((last.insertion ? 'I' : 'D'), last.length));
+ }
+ if (last.insertion) {
+ newCigar.push_back(CigarOp('I', last.length));
+ } else if (last.splice) {
+ newCigar.push_back(CigarOp('N', last.length));
} else {
- newCigar.push_back(CigarOp((last.insertion ? 'I' : 'D'), last.length));
+ newCigar.push_back(CigarOp('D', last.length));
}
int lastend = last.insertion ? last.position : (last.position + last.length);
LEFTALIGN_DEBUG(last << ",");
@@ -259,7 +270,14 @@ bool leftAlign(BamAlignment& alignment, string& referenceSequence, bool debug) {
op.Length += indel.length;
} else if (indel.position >= lastend) { // also catches differential indels, but with the same position
newCigar.push_back(CigarOp('M', indel.position - lastend));
- newCigar.push_back(CigarOp((indel.insertion ? 'I' : 'D'), indel.length));
+ if (indel.insertion) {
+ newCigar.push_back(CigarOp('I', indel.length));
+ } else if (indel.splice) {
+ newCigar.push_back(CigarOp('N', indel.length));
+ } else { // deletion
+ newCigar.push_back(CigarOp('D', indel.length));
+ }
+
}
last = *id;
lastend = last.insertion ? last.position : (last.position + last.length);
diff --git a/src/Makefile b/src/Makefile
index bd64279..86adbb2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,7 +9,7 @@ CXX=g++
C=gcc
# Compiler flags
-CFLAGS=-O3 -D_FILE_OFFSET_BITS=64
+CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -g
#CFLAGS=-O3 -static -D VERBOSE_DEBUG # enables verbose debugging via --debug2
BAMTOOLS_ROOT=../bamtools
@@ -61,6 +61,7 @@ OBJECTS=BedReader.o \
IndelAllele.o \
Bias.o \
Contamination.o \
+ NonCall.o \
SegfaultHandler.o \
../vcflib/tabixpp/tabix.o \
../vcflib/tabixpp/bgzf.o \
@@ -151,6 +152,9 @@ ResultData.o: ResultData.cpp ResultData.h Result.h Result.cpp Allele.h Utility.h
Result.o: Result.cpp Result.h
$(CXX) $(CFLAGS) $(INCLUDE) -c Result.cpp
+NonCall.o: NonCall.cpp NonCall.h
+ $(CXX) $(CFLAGS) $(INCLUDE) -c NonCall.cpp
+
BedReader.o: BedReader.cpp BedReader.h
$(CXX) $(CFLAGS) $(INCLUDE) -c BedReader.cpp
diff --git a/src/Multinomial.cpp b/src/Multinomial.cpp
index 3273e37..7453afb 100644
--- a/src/Multinomial.cpp
+++ b/src/Multinomial.cpp
@@ -37,3 +37,13 @@ long double multinomialCoefficientLn(int n, const vector<int>& counts) {
transform(counts.begin(), counts.end(), count_factorials.begin(), factorialln);
return factorialln(n) - sum(count_factorials);
}
+
+long double samplingProbLn(const vector<long double>& probs, const vector<int>& obs) {
+ vector<long double>::const_iterator p = probs.begin();
+ vector<int>::const_iterator o = obs.begin();
+ long double r = 0;
+ for (; p != probs.end() && o != obs.end(); ++p, ++o) {
+ r += powln(log(*p), *o);
+ }
+ return r;
+}
diff --git a/src/Multinomial.h b/src/Multinomial.h
index eff9102..e2ab0bd 100644
--- a/src/Multinomial.h
+++ b/src/Multinomial.h
@@ -8,4 +8,6 @@ long double multinomialSamplingProb(const vector<long double>& probs, const vect
long double multinomialSamplingProbLn(const vector<long double>& probs, const vector<int>& obs);
long double multinomialCoefficientLn(int n, const vector<int>& counts);
+long double samplingProbLn(const vector<long double>& probs, const vector<int>& obs);
+
#endif
diff --git a/src/NonCall.cpp b/src/NonCall.cpp
new file mode 100644
index 0000000..e9a7fe4
--- /dev/null
+++ b/src/NonCall.cpp
@@ -0,0 +1,86 @@
+#include "NonCall.h"
+
+NonCall NonCalls::aggregateAll(void) {
+ NonCall aggregate;
+ bool first = true;
+ for (NonCalls::const_iterator nc = this->begin(); nc != this->end(); ++nc) {
+ for (map<long, map<string, NonCall> >::const_iterator p = nc->second.begin();
+ p != nc->second.end(); ++p) {
+ for (map<string, NonCall>::const_iterator s = p->second.begin();
+ s != p->second.end(); ++s) {
+ const NonCall& nonCall = s->second;
+ aggregate.refCount += nonCall.refCount;
+ aggregate.altCount += nonCall.altCount;
+ aggregate.reflnQ += nonCall.reflnQ;
+ aggregate.altlnQ += nonCall.altlnQ;
+ if (first) {
+ aggregate.minDepth = nonCall.refCount + nonCall.altCount;
+ first = false;
+ } else {
+ aggregate.minDepth = min(aggregate.minDepth, nonCall.refCount + nonCall.altCount);
+ }
+ }
+ }
+ }
+ return aggregate;
+}
+
+void NonCalls::aggregatePerSample(map<string, NonCall>& perSample) {
+ set<string> seen;
+ for (NonCalls::const_iterator nc = this->begin(); nc != this->end(); ++nc) {
+ for (map<long, map<string, NonCall> >::const_iterator p = nc->second.begin();
+ p != nc->second.end(); ++p) {
+ for (map<string, NonCall>::const_iterator s = p->second.begin();
+ s != p->second.end(); ++s) {
+ const string& name = s->first;
+ const NonCall& nonCall = s->second;
+ NonCall& aggregate = perSample[name];
+ aggregate.refCount += nonCall.refCount;
+ aggregate.altCount += nonCall.altCount;
+ aggregate.reflnQ += nonCall.reflnQ;
+ aggregate.altlnQ += nonCall.altlnQ;
+ if (!seen.count(name)) {
+ aggregate.minDepth = nonCall.refCount + nonCall.altCount;
+ seen.insert(name);
+ } else {
+ aggregate.minDepth = min(aggregate.minDepth, nonCall.refCount + nonCall.altCount);
+ }
+ }
+ }
+ }
+}
+
+void NonCalls::record(const string& seqName, long pos, const Samples& samples) {
+ map<string, NonCall>& site = (*this)[seqName][pos];
+ for (Samples::const_iterator s = samples.begin(); s != samples.end(); ++s) {
+ // tally ref and non-ref alleles
+ const string& name = s->first;
+ const Sample& sample = s->second;
+ NonCall& noncall = site[name];
+ for (Sample::const_iterator a = sample.begin(); a != sample.end(); ++a) {
+ const vector<Allele*>& alleles = a->second;
+ for (vector<Allele*>::const_iterator o = alleles.begin(); o != alleles.end(); ++o) {
+ Allele& allele = **o;
+ if (allele.isReference()) {
+ ++noncall.refCount;
+ noncall.reflnQ += allele.lnquality;
+ } else {
+ ++noncall.altCount;
+ noncall.altlnQ += allele.lnquality;
+ }
+ }
+ }
+ }
+}
+
+pair<string, long> NonCalls::firstPos(void) {
+ const string& startChrom = begin()->first;
+ long startPos = begin()->second.begin()->first;
+ return make_pair(startChrom, startPos);
+}
+
+pair<string, long> NonCalls::lastPos(void) {
+ const string& endChrom = rbegin()->first;
+ long endPos = rbegin()->second.rbegin()->first;
+ return make_pair(endChrom, endPos);
+}
diff --git a/src/NonCall.h b/src/NonCall.h
new file mode 100644
index 0000000..c1715a1
--- /dev/null
+++ b/src/NonCall.h
@@ -0,0 +1,48 @@
+#ifndef __NONCALL_H
+#define __NONCALL_H
+
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include <utility>
+#include "Utility.h"
+#include "Allele.h"
+#include "Sample.h"
+
+using namespace std;
+
+class NonCall {
+public:
+ NonCall(void)
+ : refCount(0)
+ , reflnQ(0)
+ , altCount(0)
+ , altlnQ(0)
+ , minDepth(0)
+
+ { }
+ NonCall(int rc, long double rq, int ac, long double aq, int mdp)
+ : refCount(rc)
+ , reflnQ(rq)
+ , altCount(ac)
+ , altlnQ(aq)
+ , minDepth(mdp)
+ { }
+ int refCount;
+ int altCount;
+ int minDepth;
+ long double reflnQ;
+ long double altlnQ;
+};
+
+class NonCalls : public map<string, map<long, map<string, NonCall> > > {
+public:
+ void record(const string& seqName, long pos, const Samples& samples);
+ NonCall aggregateAll(void);
+ void aggregatePerSample(map<string, NonCall>& perSite);
+ pair<string, long> firstPos(void);
+ pair<string, long> lastPos(void);
+};
+
+#endif
diff --git a/src/Parameters.cpp b/src/Parameters.cpp
index 06c283b..3dfbf51 100644
--- a/src/Parameters.cpp
+++ b/src/Parameters.cpp
@@ -66,6 +66,9 @@ void Parameters::usage(char** argv) {
<< " # call variants assuming a diploid sample" << endl
<< " freebayes -f ref.fa aln.bam >var.vcf" << endl
<< endl
+ << " # call variants assuming a diploid sample, providing gVCF output" << endl
+ << " freebayes -f ref.fa --gvcf aln.bam >var.gvcf" << endl
+ << endl
<< " # require at least 5 supporting observations to consider a variant" << endl
<< " freebayes -f ref.fa -C 5 aln.bam >var.vcf" << endl
<< endl
@@ -95,13 +98,12 @@ void Parameters::usage(char** argv) {
<< " -h --help Prints this help dialog." << endl
<< " --version Prints the release number and the git commit id." << endl
<< endl
- << "input and output:" << endl
+ << "input:" << endl
<< endl
<< " -b --bam FILE Add FILE to the set of BAM files to be analyzed." << endl
<< " -L --bam-list FILE" << endl
<< " A file containing a list of BAM files to be analyzed." << endl
<< " -c --stdin Read BAM input on stdin." << endl
- << " -v --vcf FILE Output VCF-format results to FILE." << endl
<< " -f --fasta-reference FILE" << endl
<< " Use FILE as the reference sequence for analysis." << endl
<< " An index file (FILE.fai) will be created if none exists." << endl
@@ -127,10 +129,14 @@ void Parameters::usage(char** argv) {
<< " reference sequence, start, end, sample name, copy number" << endl
<< " ... for each region in each sample which does not have the" << endl
<< " default copy number as set by --ploidy." << endl
- << " --trace FILE Output an algorithmic trace to FILE." << endl
- << " --failed-alleles FILE" << endl
- << " Write a BED file of the analyzed positions which do not" << endl
- << " pass --pvar to FILE." << endl
+ << endl
+ << "output:" << endl
+ << endl
+ << " -v --vcf FILE Output VCF-format results to FILE. (default: stdout)" << endl
+ << " --gvcf" << endl
+ << " Write gVCF output, which indicates coverage in uncalled regions." << endl
+ << " --gvcf-chunk NUM" << endl
+ << " When writing gVCF output emit a record for every NUM bases." << endl
<< " -@ --variant-input VCF" << endl
<< " Use variants reported in VCF file as input to the algorithm." << endl
<< " Variants in this file will included in the output even if" << endl
@@ -152,9 +158,6 @@ void Parameters::usage(char** argv) {
<< " Report even loci which appear to be monomorphic, and report all" << endl
<< " considered alleles, even those which are not in called genotypes." << endl
<< " Loci which do not have any potential alternates have '.' for ALT." << endl
- << endl
- << "reporting:" << endl
- << endl
<< " -P --pvar N Report sites if the probability that there is a polymorphism" << endl
<< " at the site is greater than N. default: 0.0. Note that post-" << endl
<< " filtering is generally recommended over the use of this parameter." << endl
@@ -267,8 +270,10 @@ void Parameters::usage(char** argv) {
<< " Require at least this count of observations supporting" << endl
<< " an alternate allele within the total population in order" << endl
<< " to use the allele in analysis. default: 1" << endl
- << " -! --min-coverage N" << endl
- << " Require at least this coverage to process a site. default: 0" << endl
+ << " --min-coverage N" << endl
+ << " Require at least this coverage to process a site. default: 0" << endl
+ << " --max-coverage N" << endl
+ << " Do not process sites with greater than this coverage. default: no limit" << endl
<< endl
<< "population priors:" << endl
<< endl
@@ -298,13 +303,9 @@ void Parameters::usage(char** argv) {
<< " where the efficiency is 1 if there is no relative observation bias." << endl
<< " --base-quality-cap Q" << endl
<< " Limit estimated observation quality by capping base quality at Q." << endl
- << " --experimental-gls" << endl
- << " Generate genotype likelihoods using 'effective base depth' metric" << endl
- << " qual = 1-BaseQual * 1-MapQual. Incorporate partial observations." << endl
- << " This is the default when contamination estimates are provided." << endl
- << " Optimized for diploid samples." << endl
<< " --prob-contamination F" << endl
<< " An estimate of contamination to use for all samples. default: 10e-9" << endl
+ << " --legacy-gls Use legacy (polybayes equivalent) genotype likelihood calculations" << endl
<< " --contamination-estimates FILE" << endl
<< " A file containing per-sample estimates of contamination, such as" << endl
<< " those generated by VerifyBamID. The format should be:" << endl
@@ -381,13 +382,11 @@ Parameters::Parameters(int argc, char** argv) {
cnvFile = "";
output = "vcf"; // -v --vcf
outputFile = "";
- traceFile = "";
- failedFile = "";
+ gVCFout = false;
+ gVCFchunk = 0;
alleleObservationBiasFile = "";
// operation parameters
- outputAlleles = false; //
- trace = false; // -L --trace
useDuplicateReads = false; // -E --use-duplicate-reads
suppressOutput = false; // -N --suppress-output
useBestNAlleles = 0; // -n --use-best-n-alleles
@@ -427,7 +426,7 @@ Parameters::Parameters(int argc, char** argv) {
reportMonomorphic = false;
boundIndels = true; // ignore indels at ends of reads
onlyUseInputAlleles = false;
- standardGLs = true;
+ standardGLs = false; // use experimental gls by default // XXX
MQR = 100; // -M --reference-mapping-quality
BQR = 60; // -B --reference-base-quality
ploidy = 2; // -p --ploidy
@@ -457,6 +456,7 @@ Parameters::Parameters(int argc, char** argv) {
probContamination = 10e-9;
//minAltQSumTotal = 0;
minCoverage = 0;
+ maxCoverage = 0;
debuglevel = 0;
debug = false;
debug2 = false;
@@ -479,8 +479,8 @@ Parameters::Parameters(int argc, char** argv) {
{"populations", required_argument, 0, '2'},
{"cnv-map", required_argument, 0, 'A'},
{"vcf", required_argument, 0, 'v'},
- {"trace", required_argument, 0, '&'},
- {"failed-alleles", required_argument, 0, '8'},
+ {"gvcf", no_argument, 0, '8'},
+ {"gvcf-chunk", required_argument, 0, '&'},
{"use-duplicate-reads", no_argument, 0, '4'},
{"no-partial-observations", no_argument, 0, '['},
{"use-best-n-alleles", required_argument, 0, 'n'},
@@ -527,6 +527,7 @@ Parameters::Parameters(int argc, char** argv) {
//{"min-alternate-mean-mapq", required_argument, 0, 'k'},
{"min-alternate-qsum", required_argument, 0, '3'},
{"min-coverage", required_argument, 0, '!'},
+ {"max-coverage", required_argument, 0, '+'},
{"genotype-qualities", no_argument, 0, '='},
{"variant-input", required_argument, 0, '@'},
{"only-use-input-alleles", no_argument, 0, 'l'},
@@ -539,8 +540,8 @@ Parameters::Parameters(int argc, char** argv) {
{"haplotype-basis-alleles", required_argument, 0, '9'},
{"report-genotype-likelihood-max", no_argument, 0, '5'},
{"report-all-haplotype-alleles", no_argument, 0, '6'},
- {"experimental-gls", no_argument, 0, ')'},
{"base-quality-cap", required_argument, 0, '('},
+ {"legacy-gls", no_argument, 0, ')'},
{"prob-contamination", required_argument, 0, '_'},
{"contamination-estimates", required_argument, 0, ','},
{"report-monomorphic", no_argument, 0, '6'},
@@ -552,7 +553,7 @@ Parameters::Parameters(int argc, char** argv) {
while (true) {
int option_index = 0;
- c = getopt_long(argc, argv, "hcO4ZKjH[0diN5a)Ik=wl6#uVXJY:b:G:M:x:@:A:f:t:r:s:v:n:B:p:m:q:R:Q:U:$:e:T:P:D:^:S:W:F:C:&:L:8:z:1:3:E:7:2:9:%:(:_:,:",
+ c = getopt_long(argc, argv, "hcO4ZKjH[0diN5a)Ik=wl6#uVXJY:b:G:M:x:@:A:f:t:r:s:v:n:B:p:m:q:R:Q:U:$:e:T:P:D:^:S:W:F:C:&:L:8z:1:3:E:7:2:9:%:_:,:(:!:+:",
long_options, &option_index);
if (c == -1) // end of options
@@ -618,20 +619,18 @@ Parameters::Parameters(int argc, char** argv) {
leftAlignIndels = false;
break;
- // -L --trace
- case '&':
- traceFile = optarg;
- trace = true;
- break;
-
// --bam-list
case 'L':
addLinesFromFile(bams, string(optarg));
break;
- // -8 --failed-alleles
+ // -8 --gvcf
case '8':
- failedFile = optarg;
+ gVCFout = true;
+ break;
+
+ case '&':
+ gVCFchunk = atoi(optarg);
break;
// -4 --use-duplicate-reads
@@ -663,6 +662,14 @@ Parameters::Parameters(int argc, char** argv) {
}
break;
+ // -+ --max-coverage
+ case '+':
+ if (!convert(optarg, maxCoverage)) {
+ cerr << "could not parse max-coverage" << endl;
+ exit(1);
+ }
+ break;
+
// -n --use-best-n-alleles
case 'n':
if (!convert(optarg, useBestNAlleles)) {
@@ -812,7 +819,7 @@ Parameters::Parameters(int argc, char** argv) {
}
break;
- case '5':
+ case '5':
reportGenotypeLikelihoodMax = true;
break;
@@ -904,7 +911,7 @@ Parameters::Parameters(int argc, char** argv) {
break;
// -% --observation-bias
- case '%':
+ case '%':
alleleObservationBiasFile = optarg;
break;
@@ -970,7 +977,7 @@ Parameters::Parameters(int argc, char** argv) {
variantPriorsFile = optarg;
break;
- case '9':
+ case '9':
haplotypeVariantFile = optarg;
break;
@@ -978,7 +985,7 @@ Parameters::Parameters(int argc, char** argv) {
onlyUseInputAlleles = true;
break;
- case '6':
+ case '6':
{
string arg(argv[optind - 1]);
if (arg == "--report-monomorphic") {
@@ -997,16 +1004,14 @@ Parameters::Parameters(int argc, char** argv) {
cerr << "could not parse prob-contamination" << endl;
exit(1);
}
- standardGLs = false;
break;
case ',':
contaminationEstimateFile = optarg;
- standardGLs = false;
break;
case ')':
- standardGLs = false;
+ standardGLs = true;
break;
case '(':
@@ -1021,12 +1026,12 @@ Parameters::Parameters(int argc, char** argv) {
++debuglevel;
break;
- case '#':
-
- // --version
+ case '#':
+
+ // --version
cout << "version: " << VERSION_GIT << endl;
- exit(0);
- break;
+ exit(0);
+ break;
case 'h':
usage(argv);
@@ -1087,4 +1092,14 @@ Parameters::Parameters(int argc, char** argv) {
exit(1);
}
+ // check that there aren't duplicates in the bams list
+ for( int i=1; i<bams.size(); ++i ){
+ for( int j=0; j<i; ++j ){
+ if( bams[i] == bams[j] ){
+ cerr << "Error: Duplicate bam file '" << bams[i] << "'" << endl;
+ exit(1);
+ }
+ }
+ }
+
}
diff --git a/src/Parameters.h b/src/Parameters.h
index 4502bac..80d20ba 100644
--- a/src/Parameters.h
+++ b/src/Parameters.h
@@ -35,8 +35,8 @@ public:
//string log;
string output; // -v --vcf
string outputFile;
- string traceFile;
- string failedFile; // -l --failed-alleles
+ bool gVCFout; // -l --gvcf
+ int gVCFchunk;
string variantPriorsFile;
string haplotypeVariantFile;
bool reportAllHaplotypeAlleles;
@@ -50,8 +50,6 @@ public:
string contaminationEstimateFile;
// operation parameters
- bool outputAlleles; // unused...
- bool trace; // -L --trace
bool useDuplicateReads; // -E --use-duplicate-reads
bool suppressOutput; // -S --suppress-output
int useBestNAlleles; // -n --use-best-n-alleles
@@ -121,6 +119,7 @@ public:
int minAltCount; // -C --min-alternate-count
int minAltTotal; // -G --min-alternate-total
int minCoverage; // -! --min-coverage
+ int maxCoverage; // -+ --max-coverage
int debuglevel; // -d --debug increments
bool debug; // set if debuglevel >=1
bool debug2; // set if debuglevel >=2
diff --git a/src/ResultData.cpp b/src/ResultData.cpp
index de85c7b..b92dfac 100644
--- a/src/ResultData.cpp
+++ b/src/ResultData.cpp
@@ -628,3 +628,71 @@ vcf::Variant& Results::vcf(
return var;
}
+
+
+vcf::Variant& Results::gvcf(
+ vcf::Variant& var,
+ NonCalls& nonCalls,
+ AlleleParser* parser) {
+
+ // what is the first position in the nonCalls?
+ pair<string, long> start = nonCalls.firstPos();
+ const string& startChrom = start.first;
+ long startPos = start.second;
+ // startPos and endPos are zero-based, half-open -- [startPos,endPos)
+
+ // what is the current position? nb: can't be on a different chrom
+ long endPos;
+ if (startChrom != parser->currentSequenceName) {
+ endPos = parser->reference.sequenceLength(startChrom);
+ } else {
+ endPos = parser->currentPosition;
+ }
+ long numSites = endPos - startPos;
+ assert(numSites > 0);
+
+ // set up site call
+ var.ref = parser->currentReferenceBaseString();
+ var.alt.push_back("<*>");
+ var.sequenceName = parser->currentSequenceName;
+ var.position = startPos + 1; // output text field is one-based
+ var.id = ".";
+ var.filter = ".";
+ // TODO change to actual quality
+ var.quality = 0;
+ // set up format string
+ var.format.clear();
+ var.format.push_back("GQ");
+ var.format.push_back("DP");
+ var.format.push_back("MIN");
+ var.format.push_back("QR");
+ var.format.push_back("QA");
+
+ NonCall total = nonCalls.aggregateAll();
+ var.info["DP"].push_back(convert((total.refCount+total.altCount) / numSites));
+ var.info["MIN"].push_back(convert(total.minDepth));
+ // The text END field is one-based, inclusive. We proudly conflate this
+ // with our zero-based, exclusive endPos.
+ var.info["END"].push_back(convert(endPos));
+
+ // genotype quality is 1- p(polymorphic)
+
+ map<string, NonCall> perSample;
+ nonCalls.aggregatePerSample(perSample);
+
+ // iterate through the samples and aggregate information about them
+ for (vector<string>::const_iterator s = parser->sampleList.begin();
+ s != parser->sampleList.end(); ++s) {
+ const string& sampleName = *s;
+ const NonCall& nc = perSample[sampleName];
+ map<string, vector<string> >& sampleOutput = var.samples[sampleName];
+ long double qual = nc.reflnQ - nc.altlnQ;
+ sampleOutput["GQ"].push_back(convert(ln2phred(qual)));
+ sampleOutput["DP"].push_back(convert((nc.refCount+nc.altCount) / numSites));
+ sampleOutput["MIN"].push_back(convert(nc.minDepth));
+ sampleOutput["QR"].push_back(convert(ln2phred(nc.reflnQ)));
+ sampleOutput["QA"].push_back(convert(ln2phred(nc.altlnQ)));
+ }
+
+ return var;
+}
diff --git a/src/ResultData.h b/src/ResultData.h
index d067ee1..ef02cae 100644
--- a/src/ResultData.h
+++ b/src/ResultData.h
@@ -11,6 +11,7 @@
#include "Variant.h"
#include "version_git.h"
#include "Result.h"
+#include "NonCall.h"
using namespace std;
@@ -49,7 +50,7 @@ public:
string refbase,
vector<Allele>& altAlleles,
map<string, int> repeats,
- int genotypingIterations,
+ int genotypingIterations,
vector<string>& sampleNames,
int coverage,
GenotypeCombo& genotypeCombo,
@@ -59,6 +60,11 @@ public:
map<int, vector<Genotype> >& genotypesByPloidy,
vector<string>& sequencingTechnologies,
AlleleParser* parser);
+
+ vcf::Variant& gvcf(
+ vcf::Variant& var,
+ NonCalls& noncalls,
+ AlleleParser* parser);
};
diff --git a/src/Sample.cpp b/src/Sample.cpp
index 811cb0c..06207cd 100644
--- a/src/Sample.cpp
+++ b/src/Sample.cpp
@@ -346,7 +346,7 @@ int countAlleles(Samples& samples) {
ostream& operator<<(ostream& out, Sample& sample) {
for (Sample::iterator s = sample.begin(); s != sample.end(); ++s) {
- out << s->first << ":" << s->second.size() << " ";
+ out << s->first << " #" << s->second.size() << endl << s->second << endl;
}
return out;
}
diff --git a/src/Sample.h b/src/Sample.h
index 2449156..e74d1eb 100644
--- a/src/Sample.h
+++ b/src/Sample.h
@@ -124,8 +124,6 @@ public:
void setSupportedAlleles(void);
};
-
-
int countAlleles(Samples& samples);
// using this one...
void groupAlleles(Samples& samples, map<string, vector<Allele*> >& alleleGroups);
diff --git a/src/freebayes.cpp b/src/freebayes.cpp
index 83ca759..be412da 100644
--- a/src/freebayes.cpp
+++ b/src/freebayes.cpp
@@ -16,6 +16,7 @@
#include <cmath>
#include <time.h>
#include <float.h>
+#include <stdlib.h>
// private libraries
#include "api/BamReader.h"
@@ -38,6 +39,7 @@
#include "Bias.h"
#include "Contamination.h"
+#include "NonCall.h"
// local helper debugging macros to improve code readability
@@ -59,6 +61,13 @@
using namespace std;
+// todo
+// generalize the main function to take the parameters as input
+// so that we can invoke the entire algorithm on different regions
+// when requested to run in parallel
+// take the targets (or whole genome) and make small jobs
+// run the main function for each region in an omp parallel for loop
+// only do this if the --parallel flag is set > 1
// freebayes main
int main (int argc, char *argv[]) {
@@ -71,6 +80,7 @@ int main (int argc, char *argv[]) {
list<Allele*> alleles;
Samples samples;
+ NonCalls nonCalls;
ostream& out = *(parser->output);
@@ -110,6 +120,10 @@ int main (int argc, char *argv[]) {
if (parameters.output == "vcf") {
out << parser->variantCallFile.header << endl;
}
+
+ if (0 < parameters.maxCoverage) {
+ srand(13);
+ }
Allele nullAllele = genotypeAllele(ALLELE_NULL, "N", 1, "1N");
@@ -122,6 +136,19 @@ int main (int argc, char *argv[]) {
DEBUG2("at start of main loop");
+ // did we switch chromosomes or exceed our gVCF chunk size?
+ // if so, we may need to output a gVCF record
+ Results results;
+ if (parameters.gVCFout && !nonCalls.empty() &&
+ ( nonCalls.begin()->first != parser->currentSequenceName
+ || (parameters.gVCFchunk &&
+ nonCalls.lastPos().second - nonCalls.firstPos().second
+ > parameters.gVCFchunk))) {
+ vcf::Variant var(parser->variantCallFile);
+ out << results.gvcf(var, nonCalls, parser) << endl;
+ nonCalls.clear();
+ }
+
// don't process non-ATGC's in the reference
string cb = parser->currentReferenceBaseString();
if (cb != "A" && cb != "T" && cb != "C" && cb != "G") {
@@ -129,43 +156,80 @@ int main (int argc, char *argv[]) {
continue;
}
- if (parameters.trace) {
- for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
- const string& name = s->first;
- for (Sample::iterator g = s->second.begin(); g != s->second.end(); ++g) {
- vector<Allele*>& group = g->second;
- for (vector<Allele*>::iterator a = group.begin(); a != group.end(); ++a) {
- Allele& allele = **a;
- parser->traceFile << parser->currentSequenceName << "," << (long unsigned int) parser->currentPosition + 1
- << ",allele," << name << "," << allele.readID << "," << allele.base() << ","
- << allele.currentQuality() << "," << allele.mapQuality << endl;
- }
- }
- }
- DEBUG2("after trace generation");
- }
-
- if (!parser->inTarget()) {
- DEBUG("position: " << parser->currentSequenceName << ":" << (long unsigned int) parser->currentPosition + 1
- << " is not inside any targets, skipping");
- continue;
- }
-
int coverage = countAlleles(samples);
DEBUG("position: " << parser->currentSequenceName << ":" << (long unsigned int) parser->currentPosition + 1 << " coverage: " << coverage);
+ bool skip = false;
if (!parser->hasInputVariantAllelesAtCurrentPosition()) {
// skips 0-coverage regions
if (coverage == 0) {
DEBUG("no alleles left at this site after filtering");
- continue;
+ skip = true;
} else if (coverage < parameters.minCoverage) {
DEBUG("post-filtering coverage of " << coverage << " is less than --min-coverage of " << parameters.minCoverage);
- continue;
+ skip = true;
} else if (parameters.onlyUseInputAlleles) {
DEBUG("no input alleles, but using only input alleles for analysis, skipping position");
- continue;
+ skip = true;
+ } else if (0 < parameters.maxCoverage) {
+ // go through each sample
+ for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
+ string sampleName = s->first;
+ Sample& sample = s->second;
+ // get the coverage for this sample
+ int sampleCoverage = 0;
+ for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
+ sampleCoverage += sg->second.size();
+ }
+ if (sampleCoverage <= parameters.maxCoverage) {
+ skip = true;
+ continue;
+ }
+
+ DEBUG("coverage " << sampleCoverage << " for sample " << sampleName << " was > " << parameters.maxCoverage << ", so we will remove " << (sampleCoverage - parameters.maxCoverage) << " genotypes");
+ vector<string> genotypesToErase;
+ do {
+ double probRemove = (sampleCoverage - parameters.maxCoverage) / (double)sampleCoverage;
+ vector<string> genotypesToErase;
+ // iterate through the genotypes
+ for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
+ vector<Allele*> allelesToKeep;
+ // iterate through each allele
+ for (int alleleIndex = 0; alleleIndex < sg->second.size(); alleleIndex++) {
+ // only if we have more alleles to remove
+ if (parameters.maxCoverage < sampleCoverage) {
+ double r = rand() / (double)RAND_MAX;
+ if (r < probRemove) { // skip over this allele
+ sampleCoverage--;
+ continue;
+ }
+ }
+ // keep it
+ allelesToKeep.push_back(sg->second[alleleIndex]);
+ }
+ // re-assign the alleles to this genotype
+ if (allelesToKeep.size() < sg->second.size()) {
+ sg->second.assign(allelesToKeep.begin(), allelesToKeep.end());
+ }
+ // if no more alleles for this genotype, remove it later
+ if (sg->second.empty()) {
+ genotypesToErase.push_back(sg->first);
+ }
+ }
+ // remove empty genotypes
+ for (vector<string>::iterator gt = genotypesToErase.begin(); gt != genotypesToErase.end(); ++gt) {
+ sample.erase(*gt);
+ }
+ } while (parameters.maxCoverage < sampleCoverage);
+ sampleCoverage = 0;
+ for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
+ sampleCoverage += sg->second.size();
+ }
+ DEBUG("coverage for sample " << sampleName << " is now " << sampleCoverage);
+ }
+ // update coverage
+ coverage = countAlleles(samples);
}
DEBUG2("coverage " << parser->currentSequenceName << ":" << parser->currentPosition << " == " << coverage);
@@ -175,7 +239,7 @@ int main (int argc, char *argv[]) {
if (!parameters.reportMonomorphic
&& !sufficientAlternateObservations(samples, parameters.minAltCount, parameters.minAltFraction)) {
DEBUG("insufficient alternate observations");
- continue;
+ skip = true;
}
if (parameters.reportMonomorphic) {
DEBUG("calling at site even though there are no alternate observations");
@@ -183,13 +247,22 @@ int main (int argc, char *argv[]) {
} else {
/*
cerr << "has input variants at " << parser->currentSequenceName << ":" << parser->currentPosition << endl;
- vector<Allele>& inputs = parser->inputVariantAlleles[parser->currentPosition];
+ vector<Allele>& inputs = parser->inputVariantAlleles[parser->currentSequenceName][parser->currentPosition];
for (vector<Allele>::iterator a = inputs.begin(); a != inputs.end(); ++a) {
cerr << *a << endl;
}
*/
}
+ if (skip) {
+ // record data for gVCF
+ if (parameters.gVCFout) {
+ nonCalls.record(parser->currentSequenceName, parser->currentPosition, samples);
+ }
+ // and step ahead
+ continue;
+ }
+
// to ensure proper ordering of output stream
vector<string> sampleListPlusRef;
@@ -294,7 +367,6 @@ int main (int argc, char *argv[]) {
//cerr << "estimated minor count " << estimatedMinorAllelesAtLocus << endl;
- Results results;
map<string, vector<vector<SampleDataLikelihood> > > sampleDataLikelihoodsByPopulation;
map<string, vector<vector<SampleDataLikelihood> > > variantSampleDataLikelihoodsByPopulation;
map<string, vector<vector<SampleDataLikelihood> > > invariantSampleDataLikelihoodsByPopulation;
@@ -303,127 +375,28 @@ int main (int argc, char *argv[]) {
int inputLikelihoodCount = 0;
DEBUG2("calculating data likelihoods");
- // calculate data likelihoods
- //for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
- for (vector<string>::iterator n = parser->sampleList.begin(); n != parser->sampleList.end(); ++n) {
-
- //string sampleName = s->first;
- string& sampleName = *n;
- //DEBUG2("sample: " << sampleName);
- //Sample& sample = s->second;
- if (samples.find(sampleName) == samples.end()
- && !(parser->hasInputVariantAllelesAtCurrentPosition()
- || parameters.reportMonomorphic)) {
- continue;
- }
- Sample& sample = samples[sampleName];
- vector<Genotype>& genotypes = genotypesByPloidy[parser->currentSamplePloidy(sampleName)];
- vector<Genotype*> genotypesWithObs;
- for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
- if (parameters.excludePartiallyObservedGenotypes) {
- if (g->sampleHasSupportingObservationsForAllAlleles(sample)) {
- genotypesWithObs.push_back(&*g);
- }
- } else if (parameters.excludeUnobservedGenotypes && usingNull) {
- if (g->sampleHasSupportingObservations(sample)) {
- //cerr << sampleName << " has suppporting obs for " << *g << endl;
- genotypesWithObs.push_back(&*g);
- } else if (g->hasNullAllele() && g->homozygous) {
- // this genotype will never be added if we are running in observed-only mode, but
- // we still need it for consistency
- genotypesWithObs.push_back(&*g);
- }
- } else {
- genotypesWithObs.push_back(&*g);
- }
- }
-
- // skip this sample if we have no observations supporting any of the genotypes we are going to evaluate
- if (genotypesWithObs.empty()) {
- continue;
- }
-
- vector<pair<Genotype*, long double> > probs
- = probObservedAllelesGivenGenotypes(sample, genotypesWithObs,
- parameters.RDF, parameters.useMappingQuality,
- observationBias, parameters.standardGLs,
- genotypeAlleles,
- contaminationEstimates,
- estimatedAlleleFrequencies);
-
-#ifdef VERBOSE_DEBUG
- if (parameters.debug2) {
- for (vector<pair<Genotype*, long double> >::iterator p = probs.begin(); p != probs.end(); ++p) {
- cerr << parser->currentSequenceName << "," << (long unsigned int) parser->currentPosition + 1 << ","
- << sampleName << ",likelihood," << *(p->first) << "," << p->second << endl;
- }
- }
-#endif
-
- Result& sampleData = results[sampleName];
- sampleData.name = sampleName;
- sampleData.observations = &sample;
- for (vector<pair<Genotype*, long double> >::iterator p = probs.begin(); p != probs.end(); ++p) {
- sampleData.push_back(SampleDataLikelihood(sampleName, &sample, p->first, p->second, 0));
- }
-
- sortSampleDataLikelihoods(sampleData);
-
- string& population = parser->samplePopulation[sampleName];
- vector<vector<SampleDataLikelihood> >& sampleDataLikelihoods = sampleDataLikelihoodsByPopulation[population];
- vector<vector<SampleDataLikelihood> >& variantSampleDataLikelihoods = variantSampleDataLikelihoodsByPopulation[population];
- vector<vector<SampleDataLikelihood> >& invariantSampleDataLikelihoods = invariantSampleDataLikelihoodsByPopulation[population];
-
- if (parameters.genotypeVariantThreshold != 0) {
- if (sampleData.size() > 1
- && abs(sampleData.at(1).prob - sampleData.front().prob)
- < parameters.genotypeVariantThreshold) {
- variantSampleDataLikelihoods.push_back(sampleData);
- } else {
- invariantSampleDataLikelihoods.push_back(sampleData);
- }
- } else {
- variantSampleDataLikelihoods.push_back(sampleData);
- }
- sampleDataLikelihoods.push_back(sampleData);
-
- DEBUG2("obtaining genotype likelihoods input from VCF");
- int prevcount = sampleDataLikelihoods.size();
- parser->addCurrentGenotypeLikelihoods(genotypesByPloidy, sampleDataLikelihoods);
- // add these sample data likelihoods to 'invariant' likelihoods
- inputLikelihoodCount += sampleDataLikelihoods.size() - prevcount;
- parser->addCurrentGenotypeLikelihoods(genotypesByPloidy, invariantSampleDataLikelihoods);
-
- }
-
- // if there are not any input GLs, attempt to use the input ACs
- if (inputLikelihoodCount == 0) {
- parser->getInputAlleleCounts(genotypeAlleles, inputAlleleCounts);
- }
+ calculateSampleDataLikelihoods(
+ samples,
+ results,
+ parser,
+ genotypesByPloidy,
+ parameters,
+ usingNull,
+ observationBias,
+ genotypeAlleles,
+ contaminationEstimates,
+ estimatedAlleleFrequencies,
+ sampleDataLikelihoodsByPopulation,
+ variantSampleDataLikelihoodsByPopulation,
+ invariantSampleDataLikelihoodsByPopulation);
DEBUG2("finished calculating data likelihoods");
- // this section is a hack to make output of trace identical to BamBayes trace
- // and also outputs the list of samples
- vector<bool> samplesWithData;
- if (parameters.trace) {
- parser->traceFile << parser->currentSequenceName << "," << (long unsigned int) parser->currentPosition + 1 << ",samples,";
- for (vector<string>::iterator s = sampleListPlusRef.begin(); s != sampleListPlusRef.end(); ++s) {
- if (parameters.trace) parser->traceFile << *s << ":";
- Results::iterator r = results.find(*s);
- if (r != results.end()) {
- samplesWithData.push_back(true);
- } else {
- samplesWithData.push_back(false);
- }
- }
- parser->traceFile << endl;
- }
-
// if somehow we get here without any possible sample genotype likelihoods, bail out
bool hasSampleLikelihoods = false;
- for (map<string, vector<vector<SampleDataLikelihood> > >::iterator s = sampleDataLikelihoodsByPopulation.begin(); s != sampleDataLikelihoodsByPopulation.end(); ++s) {
+ for (map<string, vector<vector<SampleDataLikelihood> > >::iterator s = sampleDataLikelihoodsByPopulation.begin();
+ s != sampleDataLikelihoodsByPopulation.end(); ++s) {
if (!s->second.empty()) {
hasSampleLikelihoods = true;
break;
@@ -457,50 +430,6 @@ int main (int argc, char *argv[]) {
bool bestOverallComboIsHet = false;
GenotypeCombo bestCombo; // = NULL;
- // what a hack...
- /*
- if (parameters.trace) {
- for (list<GenotypeCombo>::iterator gc = genotypeCombos.begin(); gc != genotypeCombos.end(); ++gc) {
- vector<Genotype*> comboGenotypes;
- for (GenotypeCombo::iterator g = gc->begin(); g != gc->end(); ++g)
- comboGenotypes.push_back((*g)->genotype);
- long double posteriorProb = gc->posteriorProb;
- long double dataLikelihoodln = gc->probObsGivenGenotypes;
- long double priorln = gc->posteriorProb;
- long double priorlnG_Af = gc->priorProbG_Af;
- long double priorlnAf = gc->priorProbAf;
- long double priorlnBin = gc->priorProbObservations;
-
- parser->traceFile << parser->currentSequenceName << "," << (long unsigned int) parser->currentPosition + 1 << ",genotypecombo,";
-
- int j = 0;
- GenotypeCombo::iterator i = gc->begin();
- for (vector<bool>::iterator d = samplesWithData.begin(); d != samplesWithData.end(); ++d) {
- if (*d) {
- parser->traceFile << IUPAC(*(*i)->genotype);
- ++i;
- } else {
- parser->traceFile << "?";
- }
- }
- // TODO cleanup this and above
- parser->traceFile
- << "," << dataLikelihoodln
- << "," << priorln
- << "," << priorlnG_Af
- << "," << priorlnAf
- << "," << priorlnBin
- << "," << posteriorProb
- << "," << safe_exp(posteriorProb - posteriorNormalizer)
- << endl;
- }
- }
- */
-
- // the second clause guards against float underflow causing us not to output a position
- // practically, parameters.PVL == 0 means "report all genotypes which pass our input filters"
-
-
GenotypeCombo bestGenotypeComboByMarginals;
vector<vector<SampleDataLikelihood> > allSampleDataLikelihoods;
@@ -628,14 +557,6 @@ int main (int argc, char *argv[]) {
}
}
- // report the maximum a posteriori estimate
- // unless we're reporting the GL maximum
- if (parameters.reportGenotypeLikelihoodMax) {
- bestCombo = glMax;
- }
-
- DEBUG("best combo: " << bestCombo);
-
// odds ratio between the first and second-best combinations
if (genotypeCombos.size() > 1) {
bestComboOddsRatio = genotypeCombos.front().posteriorProb - (++genotypeCombos.begin())->posteriorProb;
@@ -690,10 +611,58 @@ int main (int argc, char *argv[]) {
}
}
- //if (alts.empty()) alts = genotypeAlleles;
+ // reporting the GL maximum *over all alleles*
+ if (parameters.reportGenotypeLikelihoodMax) {
+ bestCombo = glMax;
+ } else {
+ // the default behavior is to report the GL maximum genotyping over the alleles in the best posterior genotyping
+
+ // select the maximum-likelihood GL given the alternates we have
+ // this is not the same thing as the GL max over all alleles!
+ // it is the GL max over the selected alleles at this point
+
+ vector<Allele> alleles = alts;
+ for (vector<Allele>::iterator a = genotypeAlleles.begin(); a != genotypeAlleles.end(); ++a) {
+ if (a->isReference()) {
+ alleles.push_back(*a);
+ }
+ }
+ map<string, list<GenotypeCombo> > glMaxComboBasedOnAltsByPop;
+ for (map<string, SampleDataLikelihoods>::iterator p = sampleDataLikelihoodsByPopulation.begin(); p != sampleDataLikelihoodsByPopulation.end(); ++p) {
+ const string& population = p->first;
+ SampleDataLikelihoods& sampleDataLikelihoods = p->second;
+ GenotypeCombo glMaxBasedOnAlts;
+ for (SampleDataLikelihoods::iterator v = sampleDataLikelihoods.begin(); v != sampleDataLikelihoods.end(); ++v) {
+ SampleDataLikelihood* m = NULL;
+ for (vector<SampleDataLikelihood>::iterator d = v->begin(); d != v->end(); ++d) {
+ if (d->genotype->matchesAlleles(alleles)) {
+ m = &*d;
+ break;
+ }
+ }
+ assert(m != NULL);
+ glMaxBasedOnAlts.push_back(m);
+ }
+ glMaxComboBasedOnAltsByPop[population].push_back(glMaxBasedOnAlts);
+ }
+ list<GenotypeCombo> glMaxBasedOnAltsGenotypeCombos; // build new combos into this list
+ combinePopulationCombos(glMaxBasedOnAltsGenotypeCombos, glMaxComboBasedOnAltsByPop);
+ bestCombo = glMaxBasedOnAltsGenotypeCombos.front();
+ }
+
+ DEBUG("best combo: " << bestCombo);
+
+ // output
if (!alts.empty() && (1 - pHom.ToDouble()) >= parameters.PVL || parameters.PVL == 0) {
+ // write the last gVCF record(s)
+ if (parameters.gVCFout && !nonCalls.empty()) {
+ vcf::Variant var(parser->variantCallFile);
+ out << results.gvcf(var, nonCalls, parser) << endl;
+ nonCalls.clear();
+ }
+
vcf::Variant var(parser->variantCallFile);
out << results.vcf(
@@ -716,24 +685,22 @@ int main (int argc, char *argv[]) {
parser)
<< endl;
- } else if (!parameters.failedFile.empty()) {
- // get the unique alternate alleles in this combo, sorted by frequency in the combo
- long unsigned int position = parser->currentPosition;
- for (vector<Allele>::iterator ga = genotypeAlleles.begin(); ga != genotypeAlleles.end(); ++ga) {
- if (ga->type == ALLELE_REFERENCE)
- continue;
- parser->failedFile
- << parser->currentSequenceName << "\t"
- << position << "\t"
- << position + ga->length << "\t"
- << *ga << endl;
- }
- // BED format
+ } else if (parameters.gVCFout) {
+ // record statistics for gVCF output
+ nonCalls.record(parser->currentSequenceName, parser->currentPosition, samples);
}
DEBUG2("finished position");
}
+ // write the last gVCF record
+ if (parameters.gVCFout && !nonCalls.empty()) {
+ Results results;
+ vcf::Variant var(parser->variantCallFile);
+ out << results.gvcf(var, nonCalls, parser) << endl;
+ nonCalls.clear();
+ }
+
DEBUG("total sites: " << total_sites << endl
<< "processed sites: " << processed_sites << endl
<< "ratio: " << (float) processed_sites / (float) total_sites);
diff --git a/src/version_release.txt b/src/version_release.txt
index 82f7124..875d109 100644
--- a/src/version_release.txt
+++ b/src/version_release.txt
@@ -1,5 +1,5 @@
-# This file was auto-generated by running "make setversion VERSION=v0.9.20"
-# on Fri Dec 12 16:34:49 EST 2014 .
+# This file was auto-generated by running "make setversion VERSION=v1.0.0"
+# on Sat Nov 14 16:39:28 CET 2015 .
# Please do not edit or commit this file manually.
#
-v0.9.20
+v1.0.0
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..6285c24
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,15 @@
+.PHONY: all clean
+
+freebayes=../bin/freebayes
+vcfuniq=../vcflib/bin/vcfuniq
+
+all: test
+
+test: $(freebayes) $(vcfuniq)
+ prove -v t
+
+$(freebayes):
+ cd .. && $(MAKE)
+
+$(vcfuniq):
+ cd ../vcflib && $(MAKE)
diff --git a/test/region-and-target-handling.t b/test/region-and-target-handling.t
new file mode 100644
index 0000000..b99b6ee
--- /dev/null
+++ b/test/region-and-target-handling.t
@@ -0,0 +1,104 @@
+#!/bin/bash
+# vi: set ft=sh :
+test=$(dirname $0)
+root=$(dirname $0)/..
+source $test/test-simple-bash/lib/test-simple.bash \
+ tests 8
+
+PATH=$root/bin:$PATH
+
+ref=$test/`basename $0`.ref
+alt=$test/`basename $0`.alt
+bam=$test/`basename $0`.bam
+bed=$test/`basename $0`.bed
+vcf=$test/`basename $0`.vcf
+
+trap 'rm -f $ref* $alt $bam* $bed $vcf' EXIT
+
+# 01234567 start
+# ATCGGCTA
+# 12345678 end
+
+cat >$ref <<REF
+>ref
+ATCGGCTA
+REF
+samtools faidx $ref
+
+cat >$alt <<ALT
+>alt
+GTTAGGTT
+ALT
+
+samtools view -b - >$bam <<SAM
+ at HD VN:1.5 SO:coordinate
+ at SQ SN:ref LN:8
+alt 0 ref 1 30 1X1=2X1=1X1=1X * 0 0 GTTAGGTT *
+SAM
+samtools index $bam
+
+cat >$bed <<BED
+ref 0 1 first_base
+ref 2 4 third_and_fourth_base
+ref 5 6 sixth_base
+ref 7 8 eigth_base
+BED
+
+cat >$vcf <<VCF
+##fileformat=VCFv4.1
+##INFO=<ID=NAME,Number=0,Type=String,Description="Test name">
+#CHROM POS ID REF ALT QUAL FILTER INFO
+ref 1 . A G 1234 PASS NAME=first base
+ref 2 . T . 1234 PASS NAME=second base
+ref 3 . C T 1234 PASS NAME=third base
+ref 4 . G A 1234 PASS NAME=fourth base
+ref 5 . G . 1234 PASS NAME=fifth base
+ref 6 . C G 1234 PASS NAME=sixth base
+ref 7 . T . 1234 PASS NAME=seventh base
+ref 8 . A T 1234 PASS NAME=eigth base
+VCF
+
+PS4='\n+ '
+
+function run_freebayes() {
+ ($root/bin/freebayes "$@" \
+ --haplotype-length 0 --min-alternate-count 1 \
+ --min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
+ --ploidy 1 \
+ -f $ref $bam \
+ | grep -vE '^#' | cut -f1-5)
+}
+
+if [[ -n $TEST_DEBUG ]]; then
+ cat $ref >&2
+ cat $bed >&2
+ cat $vcf >&2
+ vcfannotate --bed $bed --key MATCH $vcf >&2
+ vcfintersect --bed $bed $vcf >&2
+ bedtools intersect -a $vcf -b $bed >&2
+fi
+
+[[ -z $(run_freebayes --region ref:4-5 --region ref:6-7) ]]; ok $? 'ref:4-5 ref:6-7 are empty'
+[[ -z $(run_freebayes --region ref:4 --region ref:6) ]]; ok $? 'ref:4 ref:6 are empty'
+
+expected=`cat <<END
+ref 6 . C G
+ref 8 . A T
+END
+`
+[[ $(run_freebayes --region ref:5-6 --region ref:7-8) == $expected ]]; ok $? 'ref:5-6 ref:7-8'
+[[ $(run_freebayes --region ref:5 --region ref:7) == $expected ]]; ok $? 'ref:5 ref:7'
+[[ $(run_freebayes --region ref:5-) == $expected ]]; ok $? 'ref:5-'
+
+expected=`cat <<END
+ref 1 . A G
+ref 3 . CG TA
+ref 6 . C G
+ref 8 . A T
+END
+`
+[[ $(run_freebayes --targets $bed) == $expected ]]; ok $? "--targets $bed"
+[[ $(run_freebayes --region ref) == $expected ]]; ok $? "--region ref"
+
+[[ $(run_freebayes --region ref:1-20 2>&1) =~ "Target region coordinates (ref 1 19) outside of reference sequence bounds (ref 8)" ]]
+ ok $? 'region outside of bounds error'
diff --git a/test/splice/1:883884-887618.bam b/test/splice/1:883884-887618.bam
new file mode 100644
index 0000000..61bdcb9
Binary files /dev/null and b/test/splice/1:883884-887618.bam differ
diff --git a/test/splice/1:883884-887618.bam.bai b/test/splice/1:883884-887618.bam.bai
new file mode 100644
index 0000000..09649e4
Binary files /dev/null and b/test/splice/1:883884-887618.bam.bai differ
diff --git a/test/splice/1:883884-887618.fa b/test/splice/1:883884-887618.fa
new file mode 100644
index 0000000..6f31855
--- /dev/null
+++ b/test/splice/1:883884-887618.fa
@@ -0,0 +1,64 @@
+>1
+CCTGCAGGTTGACATTGGACAGCTTCAGGATCACGGAGAAGTTGATGGGCTTGGAGCTCA
+TGCGCCCTGGCTTCCTGTTGAAGTCGACCTGCTGGAACATCTGCCCCAAGGGCCGTGTCA
+GGCTCTCTCGGCCCCATGCCTGGTCACCCTGGCTTCACCCTGGCTGCACCCTGGTCCCCC
+TGGTCCCTTTGGCCCTGCACCTGGCTGCACCCTGGTCACCCTGGTCCTCAGGCTTCAGCA
+CGAGGAGACCCAACCCGAAACAAGAGATAAGCAGCTAGGCACGGTCCCAGCCCCATGGCC
+CCACACAGCCTGGCGTGTCCCCGAGTGGGGCTCTGATCAGCAGGGAAGGATCAGGAACAC
+GGGCCCTCTCCCAAAATCCTGGGAAGCACTAAGGACCCACGTCTGGAGACCTAGAGGTCC
+CTGCCACACAGAGTACCAGGACAGCCCCCATTCCTGACAACATGGCACTGAGGCCCATCC
+TGGCCCTCAGCACAGCAATGCCACATGGGGCTCCGTCCCGAGCAACCAGAAGACGCAGCA
+GCAACAGCAAAGAGTGCCAGACACAGGACGGGACAGATGGAGGTCACGGGAGGCCTGGGG
+GGCCCCTCCCACACCCTCCACAGAAGCCACAGGCCATGCACCTCCGTAGGACATGCCAGG
+AGGAGCAGAGAAAGAGTCAGCCTGGCCTCTCAGTCTTGTGACCCCTCCCCAACCACTAGG
+AGCCCTCAGGCTGTGAACCAGAGAGATCCAGGGTACATGCTGGGGCACCAGAGGACAGAA
+GCAGATTATGGGGGGCCCAAGGGTGGAGGCCCCAGCAAGACACCCCACTCAGCAGGGGTG
+GCTGCCAGACAGCTGAGCTCAGGTCCAGATGGGAAAACGCCCTGGTACTGATGAACCTCT
+GGGAGGACACTGCCCTTCTCTAGCCTGGGCCAAAAGTATGGGCAGGCGACTGGCTTCCTG
+TCCAGGCACCGGGGAAGGGCAGGCACGGCCTACAGCTGAGGGCTTTGTTTTAGCTTTTAG
+GGTTTGCTTTTTCCGGAGAGGAGAGGCCTGCATCCCTGCGTGAGAGGGGCCTACAGGGTG
+CGGCTGCAGTGAACCAGGGCAGTGCACAAGTGAAGAAACAGACACTGCAAGAGAGTAAGA
+AAGCACGTAGCCAAGGAGATTGGACAAGGGAAACAGACACAAAGCATGCCTGGAGCTGGG
+CAGCAGAGCTGGAGGCCCACAGACACAGGGGCCAGGCCAAGGGCTGGGCAATCAGAGACC
+ACTGCTGGCTGCCATGGGAGTGAACATGGCTTCCTAAAGATGTTCAGCATCCCTGCCTCA
+GTGGAAGGGGAATCAGAACGTGCCTGGGGCTTCCATACCTTCTGGACCCAGCAACCTGGG
+GACCCAGCCCCAAGGACAGCTGTGACACGACTCTGGGCCTTCATGAGGCTGAGAACTCAT
+TAGATAAACAGGGTGGAGATCGTGAATTCCTAAATTTACAGCGGGGTGGGGCCTAACCCA
+CCCACTGAAAGTTGAAACTTCGGACTCCAATCTTTCCCTACCTGAGTGGCATGTGCACGT
+GCAAAGATGCATACACACAGGTACACACAGGCGCACACATGCACAGGCACACACGTAGAC
+GCACGTACATGCACACAGGTGCACACACGCACAGGCACACACAAGCAGACGTGCATGCAT
+GCACCCAAGTGTACAGGTACACGCACAGGTACACACGCACAGTACACACATGCAGATGCA
+AATGCATGCACACAAGTGCACACAGACACGTGTGTATGCACATAGGTGCACACAGGTACA
+TAGATGCACGCAGACACACATGCATGCACACAGATGCACACATGCAAAGGTTCATGCATG
+CACAGGTAGACACATGCAGACGCAAATGCATGCACACAGGTGCACACACGCACAGGTACA
+CACATGCAGACAGGTGCACACAGGTACACACGCATGTGCATGCATACAGGTACACACAGG
+TACACACTGGTATACAAAGACACATGCACAGGTGCACACACGCAAAGGTACACGCAAAGG
+TACACACATGCATACACACAGGTGTACACACGCACAGATGCACACACCTGCAGGCACACA
+GGCATTCATGGATACACGTGCATACACACATACAGATGCACACATGAACTGATATGCACA
+CACACACAGATTTACAGACCCATGCACCCAGATGGTGCACACACACGCATGCATGCACAG
+ACAAACGCACCCACCCAGATGTACACACATCCCCCATACAAATGCATACACATGCACACA
+ACTGTCAGGGCAGGGCACTGTTCAGTGTACAGTTCAGCAATCAGACTTGGGTTTTGGCCC
+ATCTCGCCTCCTGCCTACTCCAGTCTTGGGCAAGCACCTACAGGGTCAGGTCCCGGCATC
+CTCAAGCACAGCCCTAGCAGCAGCCGCCTTCATCCCCAGTGGTCCCACGTTCCCCAACAC
+TGCCCCCAGCACACGGAGGAGCACTGTCCTCGTGACACCCGTGACAAGGAGGCGGCCAGG
+GCTCCGGGAAGTCGCCGGACAACTCAGCACAGACGCCCGGAGCAGCAGATGCTCCCTACA
+GGAGCAGGCAGAGGTGCCACACGCCCACCACAGCCTCACTCACCTCCAGGATGAAAGGCA
+GCACCGGGATGAAGGCCCCCGAGCTCCCCGAGAGCAGCGTCAGGGCACGGATGCAGTGCA
+TTCGCAGCGGGTAGAAGCGGGCAGTGGGGATGAGCCTGGGGGTGGGAAGGCCGAGTGAGC
+AGAGGCCCCGGCTCTGGCAGCCCCTGCCCCTCCCCCTGCTGTCTTGGACCTCCTCCCCCA
+CTCACCGACATCAGACCCCTAAAGCAGGACAAGGCACGTCTGAACCCCAGGGACCTGCAC
+CCCTTGCCCAGCTATGGGGCTGCAGGTACCTTACCCAGCTTTCAGCTTTATTTCAACACG
+GTGGCAGGGGTCATCTGTTCCTGCCTGCTCTCCTCCCTGTTCCCGCCTTTCCTACTTCTT
+CCCTTGGACTTCCCACCTCCCTTCCCAGCCTTGAGTTTCTAAGGCTCAGCCAGACAGATG
+CCCCCTCCTGGCCAAAGAGGCCACAGTACTGCTCCCACCTTTTCCTGCTGCAGCCCCTAC
+CGTGCCTGGACCCTCCTAAGCGCTCACACCCAAAATGCAGTTCACTGCATCAGATGTGCT
+CCACCCAAGAACCCCACGTAATGAACCAAACTCACTGCAAACTGCACACAAGCCCCCAGA
+ACACACGCTGGGTGGCACCCGAGGCGGCTAAACCGCTGTTGGAGGAGCTCATGTCACAGG
+TGCCCACCAGCCCTAGCTCTCCAGAATCCAGAGCATCTCCCCGTAGCAGGTACAGGGACC
+CCAGCCCTTCCTCAGGGACGGCCAGGACACAGACTCAGGCCCCAAGGCCCTGAACGCCAG
+AGGCACCGCCCACACAGTCCCAGCAAATTTGCTTCTCCTGACCCTCCCGCACAACCCTGC
+CCACCCCACAACTCACTTGATACAGCCAATGATGACTTGGGCAAGGGGGTAGACCAAGGG
+CTGGAGGGCTTCGCTGGGGCCCGCAGTGCTCAGGACCCGGCACCACAGGAAGAGGCAGTG
+CACATACTGCCAGTTGTACACAGACTGGTATGTTTCCTGGTCAGAGAGAACCACGTCAGC
+TACTGGCCAGGCTGACAAGTCAGGCTGATGCACGTTCCTCCTGGGCCGGCACAGGAATCA
+TTTCCTCATCTTGCA
diff --git a/test/splice/1:883884-887618.fa.fai b/test/splice/1:883884-887618.fa.fai
new file mode 100644
index 0000000..dc79067
--- /dev/null
+++ b/test/splice/1:883884-887618.fa.fai
@@ -0,0 +1 @@
+1 3735 3 60 61
diff --git a/test/t/01_call_variants.t b/test/t/01_call_variants.t
new file mode 100644
index 0000000..abc25f1
--- /dev/null
+++ b/test/t/01_call_variants.t
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=bash-tap
+. ./bash-tap/bash-tap-bootstrap
+
+PATH=../bin:$PATH # for freebayes
+PATH=../scripts:$PATH # for freebayes-parallel
+PATH=../vcflib/bin:$PATH # for vcf binaries used by freebayes-parallel
+
+plan tests 19
+
+is $(echo "$(comm -12 <(cat tiny/NA12878.chr22.tiny.giab.vcf | grep -v "^#" | cut -f 2 | sort) <(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f 2 | sort) | wc -l) >= 13" | bc) 1 "variant calling recovers most of the GiAB variants in a test region"
+
+by_region=$((for region in \
+ q:180-191 \
+ q:1002-1013 \
+ q:1811-1825 \
+ q:1911-1922 \
+ q:2344-2355 \
+ q:3257-3268 \
+ q:4443-4454 \
+ q:5003-5014 \
+ q:5074-5085 \
+ q:5089-5100 \
+ q:5632-5646 \
+ q:6412-6423 \
+ q:8840-8851 \
+ q:9245-9265 \
+ q:9785-9796 \
+ q:10526-10537 \
+ q:11255-11266 \
+ q:11530-11541 \
+ q:12119-12130;
+do
+ freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam -r $region | grep -v "^#"
+done) |wc -l)
+
+at_once=$(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l)
+
+is $by_region $at_once "freebayes produces the same number of calls if targeted per site or called without targets"
+
+cat >targets.bed <<EOF
+q 180 191
+q 1002 1013
+q 1811 1825
+q 1911 1922
+q 2344 2355
+q 3257 3268
+q 4443 4454
+q 5003 5014
+q 5074 5085
+q 5089 5100
+q 5632 5646
+q 6412 6423
+q 8840 8851
+q 9245 9265
+q 9785 9796
+q 10526 10537
+q 11255 11266
+q 11530 11541
+q 12119 12130
+EOF
+
+is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam -t targets.bed | grep -v "^#" | wc -l) $by_region "a targets bed file can be used with the same effect as running by region"
+#rm targets.bed
+
+
+is $(samtools view -u tiny/NA12878.chr22.tiny.bam | freebayes -f tiny/q.fa --stdin | grep -v "^#" | wc -l) \
+ $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) "reading from stdin or not makes no difference"
+
+is $(samtools view tiny/NA12878.chr22.tiny.bam | wc -l) $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam -d 2>&1 | grep ^alignment: | wc -l) "freebayes processes all alignments in input"
+
+# ensure targeting works even when there are no reads
+is $(freebayes -f tiny/q.fa -@ tiny/q.vcf.gz tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) 19 "freebayes correctly handles variant input"
+
+# ensure that positions at which no variants exist get put in the out vcf
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes puts required variants in output"
+
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | wc -l) 3 "freebayes limits calls to input variants correctly"
+
+is $(freebayes -f tiny/q.fa -@ tiny/q.vcf.gz -l tiny/1read.bam | grep -v "^#" | wc -l) 20 "freebayes reports all input variants even when there is no input data"
+
+# check variant input with region specified
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 2 "freebayes handles region and variant input"
+
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 tiny/NA12878.chr22.tiny.bam -l | grep -v "^#" | wc -l) 2 "freebayes limits to variant input correctly when region is given"
+
+# check variant input when reading from stdin
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | cut -f1,2 | grep -P "(\t500$|\t11000$|\t1000$)" | wc -l) 3 "freebayes handles variant input and reading from stdin"
+
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) 3 "freebayes limits to variant input when reading from stdin"
+
+is $(freebayes -f tiny/q.fa -@ tiny/q_spiked.vcf.gz -r q:1-10000 -l --stdin < tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) 2 "freebayes handles region, stdin, and variant input"
+
+gzip -c tiny/q.fa >tiny/q.fa.gz
+cp tiny/q.fa.fai tiny/q.fa.gz.fai
+freebayes -f tiny/q.fa.gz -@ tiny/q_spiked.vcf.gz -r q:1-10000 -l - < tiny/NA12878.chr22.tiny.bam >/dev/null 2>/dev/null
+is $? 1 "freebayes bails out when given a gzipped or corrupted reference"
+rm tiny/q.fa.gz.*
+
+is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) $(freebayes-parallel tiny/q.regions 2 -f tiny/q.fa tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) "running in parallel makes no difference"
+
+#is $(freebayes -f 'tiny/q with spaces.fa' tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) $(freebayes-parallel 'tiny/q with spaces.regions' 2 -f 'tiny/q with spaces.fa' tiny/NA12878.chr22.tiny.bam | grep -v "^#" | wc -l) "freebayes handles spaces in file names"
+
+
+is $(freebayes -f splice/1:883884-887618.fa splice/1:883884-887618.bam | grep ^1 | wc -l) 1 "freebayes can handle spliced reads"
+
+is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam --gvcf | grep '<\*>' | wc -l) 20 "freebayes produces the expected number of lines of gVCF output"
+
+is $(freebayes -f tiny/q.fa tiny/NA12878.chr22.tiny.bam --gvcf --gvcf-chunk 50 | grep '<\*>' | wc -l) 245 "freebayes produces the expected number of lines of gVCF output"
diff --git a/test/tiny/1read.bam b/test/tiny/1read.bam
new file mode 100644
index 0000000..f052290
Binary files /dev/null and b/test/tiny/1read.bam differ
diff --git a/test/tiny/1read.bam.bai b/test/tiny/1read.bam.bai
new file mode 100644
index 0000000..38bec45
Binary files /dev/null and b/test/tiny/1read.bam.bai differ
diff --git a/test/tiny/NA12878.chr22.tiny.bam b/test/tiny/NA12878.chr22.tiny.bam
new file mode 100644
index 0000000..1c7f605
Binary files /dev/null and b/test/tiny/NA12878.chr22.tiny.bam differ
diff --git a/test/tiny/NA12878.chr22.tiny.bam.bai b/test/tiny/NA12878.chr22.tiny.bam.bai
new file mode 100644
index 0000000..066e9bd
Binary files /dev/null and b/test/tiny/NA12878.chr22.tiny.bam.bai differ
diff --git a/test/tiny/NA12878.chr22.tiny.giab.vcf b/test/tiny/NA12878.chr22.tiny.giab.vcf
new file mode 100644
index 0000000..1b27478
--- /dev/null
+++ b/test/tiny/NA12878.chr22.tiny.giab.vcf
@@ -0,0 +1,89 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=Uncertain,Description="Uncertain genotype due to reason in filter INFO field">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth summed across all datasets, excluding MQ0 reads">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Net Genotype quality across all datasets, defined as difference between most likely and next most likely genotype likelihoods">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Net Genotype across all datasets">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods summed across all unfiltered datasets for genotypes as defined in the VCF specification">
+##INFO=<ID=allalts,Number=1,Type=Integer,Description="All ALT alleles originally considered at this position">
+##INFO=<ID=datasetcalls,Number=1,Type=Integer,Description="Number of datasets with any genotype call at this position">
+##INFO=<ID=DPSum,Number=1,Type=Integer,Description="Total read depth summed across all datasets, excluding MQ0 reads">
+##INFO=<ID=Entropy,Number=1,Type=Float,Description="Shannon entropy of variant flanking regions, 12bp on both sides">
+##INFO=<ID=filter,Number=1,Type=String,Description="Reason for filtering this genotype as uncertain">
+##INFO=<ID=geno,Number=1,Type=Integer,Description="Most probable genotype, corresponding to the minimum entry in the PL field (e.g., 1=0/0,2=0/1,3=1/1,4=0/2,etc)">
+##INFO=<ID=genoMapGood,Number=1,Type=Integer,Description="Number of datasets calling this genotype with VQSR mapping tranche <= 95">
+##INFO=<ID=HapNoVar,Number=1,Type=Integer,Description="Number of datasets for which HaplotypeCaller called a variant within 35bp and did not call a variant at this location">
+##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
+##INFO=<ID=NoCG,Number=0,Type=Flag,Description="Present if no consensus reached, so looked at all datasets except Complete Genomics since it may have a different representation of complex variants">
+##INFO=<ID=NoPLTot,Number=1,Type=Integer,Description="Number of datasets with likelihood ratio > 20 for a genotype different from the called genotype">
+##INFO=<ID=platforms,Number=1,Type=Integer,Description="Number of different platforms that called this genotype">
+##INFO=<ID=platformbias,Number=.,Type=String,Description="Names of platforms that have at more than twice as many incorrect than correct genotypes at this location, indicating platform-specific bias (ill=Illumina,sol=SOLiD,454=454,ion=Ion Torrent,cg=Complete Genomics)">
+##INFO=<ID=platformnames,Number=.,Type=String,Description="Names of platforms that called this genotype (ill=Illumina,sol=SOLiD,454=454,ion=Ion Torrent,cg=Complete Genomics)">
+##INFO=<ID=PL454WG,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~16x 454 whole genome sequencing from 1000 Genomes Project, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLCG,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~73x Complete Genomics whole genome sequencing, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLHSWEx,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~66x 2x100bp Illumina exome sequencing from Broad Institute, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLHSWG,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~68x 2x100bp Illumina whole genome sequencing from Broad Institute, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLILL250,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~50x 2x250bp Illumina PCR-free whole genome sequencing from Broad Institute, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLILLCLIA,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~80x 2x100bp Illumina whole genome sequencing from Illumina CLIA lab, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLIllPCRFree,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~56x 2x100bp Illumina PCR-free whole genome sequencing from Illumina Platinum Genomes Project, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLILLWEx,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~30x 2x54bp Illumina exome sequencing from Broad Institute, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLILLWG,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~39x 2x44bp Illumina whole genome sequencing from Broad Institute, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLIonEx,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~80x mean 237bp Ion Torrent exome sequencing from Life Technologies, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLPlatGen,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~190x 2x100bp Illumina PCR-free whole genome sequencing from Illumina Platinum Genomes Project, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLXIll,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~37x 2x100bp Illumina whole genome sequencing from X Prize, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLXPSolWGLS,Number=.,Type=String,Description="Genotype likelihoods (PL) for ~24x 50bpx35bp SOLiD whole genome sequencing from X Prize, preceded by filtering info if this dataset was not used due to evidence of bias">
+##INFO=<ID=PLminsum,Number=1,Type=Integer,Description="Net Genotype quality across all datasets, defined as difference between most likely and next most likely genotype likelihoods">
+##INFO=<ID=PLminsumOverDP,Number=1,Type=Float,Description="Net Genotype quality across all datasets, defined as difference between most likely and next most likely genotype likelihoods, divided by the depth of coverage">
+##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
+##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
+##INFO=<ID=TrancheABQDmin2,Number=1,Type=Float,Description="2nd lowest VQSR tranche for the called genotype for annotations associated with abnormal allele balance (AB and QD)">
+##INFO=<ID=TrancheAlignmin2,Number=1,Type=Float,Description="2nd lowest VQSR tranche for the called genotype for annotations associated with local alignment errors (distance from the end of the read and clipping)">
+##INFO=<ID=TrancheMapmin2,Number=1,Type=Float,Description="2nd lowest VQSR tranche for the called genotype for annotations associated with mapping errors (mapping quality and depth of coverage)">
+##INFO=<ID=TrancheSSEmin2,Number=1,Type=Float,Description="2nd lowest VQSR tranche for the called genotype for annotations associated with systematic sequencing errors (strand bias and neighboring base quality)">
+##INFO=<ID=varType,Number=1,Type=String,Description="Type of variant">
+##INFO=<ID=YesPLtot,Number=1,Type=Integer,Description="Number of datasets with likelihood ratio > 20 for the called genotype">
+##contig=<ID=1,length=249250621,assembly=b37>
+##contig=<ID=2,length=243199373,assembly=b37>
+##contig=<ID=3,length=198022430,assembly=b37>
+##contig=<ID=4,length=191154276,assembly=b37>
+##contig=<ID=5,length=180915260,assembly=b37>
+##contig=<ID=6,length=171115067,assembly=b37>
+##contig=<ID=7,length=159138663,assembly=b37>
+##contig=<ID=8,length=146364022,assembly=b37>
+##contig=<ID=9,length=141213431,assembly=b37>
+##contig=<ID=10,length=135534747,assembly=b37>
+##contig=<ID=11,length=135006516,assembly=b37>
+##contig=<ID=12,length=133851895,assembly=b37>
+##contig=<ID=13,length=115169878,assembly=b37>
+##contig=<ID=14,length=107349540,assembly=b37>
+##contig=<ID=15,length=102531392,assembly=b37>
+##contig=<ID=16,length=90354753,assembly=b37>
+##contig=<ID=17,length=81195210,assembly=b37>
+##contig=<ID=18,length=78077248,assembly=b37>
+##contig=<ID=19,length=59128983,assembly=b37>
+##contig=<ID=20,length=63025520,assembly=b37>
+##contig=<ID=21,length=48129895,assembly=b37>
+##contig=<ID=22,length=51304566,assembly=b37>
+##contig=<ID=X,length=155270560,assembly=b37>
+##contig=<ID=Y,length=59373566,assembly=b37>
+##contig=<ID=MT,length=16569,assembly=b37>
+##fileDate=20130719
+##phasing=none
+##reference=human_g1k_v37.fasta
+##variants_justified=left
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
+##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
+q 186 . T C 11488 PASS DPSum=824;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=353,0,389;PLCG=698,0,750;PLHSWG=1303,0,1514;PLILL250=694,0,744;PLILLCLIA=1984,0,1727;PLILLWG=652,0,578;PLIllPCRFree=749,0,1617;PLNCIIonWG=67,0,116;PLPlatGen=4041,0,4291;PLXIll=515,0,763;PLXPSolWGLS=432,0,608;PLminsum=11488;PLminsumOverDP=13.94;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=C;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,sol,cg,i [...]
+q 1008 . C T 6462 PASS DPSum=569;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=112,0,152;PLCG=471,0,330;PLHSWG=579,0,778;PLILL250=335,0,178;PLILLCLIA=1013,0,1657;PLILLWG=173,0,294;PLIllPCRFree=781,0,698;PLPlatGen=2598,0,2675;PLXIll=303,0,376;PLXPSolWGLS=97,0,57;PLminsum=6462;PLminsumOverDP=11.36;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=10;allalts=T;datasetcalls=10;geno=2;genoMapGood=10;platformbias=none;platformnames=ill,454,sol,cg;platforms=4;varType=SNP GT: [...]
+q 1817 . G A 3061 PASS DPSum=361;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=98,0,24;PLCG=125,0,138;PLHSWG=164,0,80;PLILL250=325,0,792;PLILLCLIA=316,0,212;PLILLWG=65,0,27;PLIllPCRFree=546,0,489;PLPlatGen=1422,0,2430;PLminsum=3061;PLminsumOverDP=8.48;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=8;allalts=A;datasetcalls=8;geno=2;genoMapGood=8;platformbias=none;platformnames=ill,454,cg;platforms=3;varType=SNP GT:DP:GQ:PL 0/1:361:3061:3061,0,4192
+q 1820 . C T 3594 PASS DPSum=358;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=97,0,20;PLCG=140,0,160;PLHSWG=165,0,106;PLILL250=332,0,821;PLILLCLIA=317,0,191;PLILLWG=63,0,62;PLIllPCRFree=695,0,582;PLPlatGen=1785,0,2767;PLminsum=3594;PLminsumOverDP=10.04;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=7;allalts=T;datasetcalls=8;geno=2;genoMapGood=7;platformbias=none;platformnames=ill,cg;platforms=2;varType=SNP GT:DP:GQ:PL 0/1:358:3594:3594,0,4709
+q 1917 . A G 5718 PASS DPSum=430;HRun=0;HapNoVar=0;NoPLTot=0;PLCG=175,0,339;PLHSWG=231,0,186;PLILL250=450,0,787;PLILLCLIA=697,0,510;PLILLWG=76,0,179;PLIllPCRFree=776,0,547;PLNCIIonWG=44,0,22;PLPlatGen=3171,0,2815;PLXIll=98,0,94;PLminsum=5479;PLminsumOverDP=12.74;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=9;allalts=G;datasetcalls=9;geno=2;genoMapGood=9;platformbias=none;platformnames=ill,cg,ion;platforms=3;varType=SNP GT:DP:GQ:PL 0/1:430:5479:5718,0,5479
+q 4449 . G A 11413 PASS DPSum=725;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=242,0,405;PLCG=685,0,483;PLHSWG=1176,0,998;PLILL250=967,0,487;PLILLCLIA=956,0,1221;PLILLWG=615,0,542;PLIllPCRFree=1346,0,1143;PLNCIIonWG=1,0,192;PLPlatGen=4726,0,3765;PLXIll=699,0,761;PLminsum=9997;PLminsumOverDP=13.79;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=9;allalts=A;datasetcalls=10;geno=2;genoMapGood=9;platformbias=none;platformnames=ill,454,cg;platforms=3;varType=SNP GT:DP:G [...]
+q 5009 . C T 8517 PASS DPSum=639;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=205,0,204;PLCG=744,0,686;PLHSWG=932,0,971;PLILL250=315,0,370;PLILLCLIA=774,0,943;PLILLWG=273,0,298;PLIllPCRFree=878,0,1020;PLNCIIonWG=28,0,82;PLPlatGen=3611,0,4445;PLXIll=567,0,492;PLXPSolWGLS=190,0,302;PLminsum=8517;PLminsumOverDP=13.33;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=T;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,sol,cg,ion;pla [...]
+q 6418 . G A 9700 PASS DPSum=777;HRun=1;HapNoVar=0;NoPLTot=0;PL454WG=117,0,141;PLCG=519,0,688;PLHSWG=1193,0,1365;PLILL250=760,0,399;PLILLCLIA=1473,0,1473;PLILLWG=477,0,680;PLIllPCRFree=743,0,1372;PLPlatGen=3923,0,3639;PLXIll=382,0,826;PLXPSolWGLS=113,0,198;PLminsum=9700;PLminsumOverDP=12.48;RPA=16,17;RU=A;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=10;allalts=A;datasetcalls=10;geno=2;genoMapGood=10;platformbias=none;platformnames=ill,454,sol,cg;platfor [...]
+q 8846 . T C 9797 PASS DPSum=727;HRun=1;HapNoVar=0;NoPLTot=0;PLCG=470,0,525;PLHSWG=1228,0,1675;PLILL250=516,0,649;PLILLCLIA=1506,0,1304;PLILLWG=541,0,453;PLIllPCRFree=1022,0,801;PLNCIIonWG=36,0,132;PLPlatGen=3724,0,4719;PLXIll=623,0,669;PLXPSolWGLS=131,0,408;PLminsum=9797;PLminsumOverDP=13.48;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=10;allalts=C;datasetcalls=10;geno=2;genoMapGood=10;platformbias=none;platformnames=ill,sol,cg,ion;platforms=4;varType= [...]
+q 9791 . A C 10272 PASS DPSum=777;HRun=1;HapNoVar=0;NoPLTot=0;PL454WG=239,0,213;PLCG=839,0,825;PLHSWG=1051,0,1286;PLILL250=337,0,490;PLILLCLIA=1591,0,1298;PLILLWG=458,0,844;PLIllPCRFree=716,0,1064;PLNCIIonWG=254,0,57;PLPlatGen=3778,0,4302;PLXIll=513,0,812;PLXPSolWGLS=496,0,558;PLminsum=10272;PLminsumOverDP=13.22;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=C;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,sol,cg, [...]
+q 10532 . C A 8913 PASS DPSum=705;HRun=2;HapNoVar=0;NoPLTot=0;PLCG=653,0,743;PLHSWG=1626,0,1610;PLILL250=501,0,624;PLILLCLIA=896,0,1250;PLILLWG=320,0,420;PLIllPCRFree=908,0,1146;PLPlatGen=3625,0,4125;PLXIll=384,0,581;PLminsum=8913;PLminsumOverDP=12.64;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=8;allalts=A;datasetcalls=8;geno=2;genoMapGood=8;platformbias=none;platformnames=ill,cg;platforms=2;varType=SNP GT:DP:GQ:PL 0/1:705:8913:8913,0,10499
+q 11261 . T C 9848 PASS DPSum=709;HRun=1;HapNoVar=0;NoPLTot=0;PL454WG=106,0,86;PLCG=471,0,482;PLHSWG=1029,0,914;PLILL250=632,0,656;PLILLCLIA=1189,0,1776;PLILLWG=315,0,760;PLIllPCRFree=987,0,701;PLNCIIonWG=91,0,94;PLPlatGen=4150,0,4008;PLXIll=829,0,904;PLXPSolWGLS=49,0,135;PLminsum=9848;PLminsumOverDP=13.89;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=C;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,sol,cg,ion;pl [...]
+q 11536 . T C 11666 PASS DPSum=841;HRun=1;HapNoVar=0;NoPLTot=0;PL454WG=172,0,410;PLCG=652,0,960;PLHSWEx=31,0,32;PLHSWG=1156,0,1240;PLILL250=818,0,685;PLILLCLIA=1490,0,1870;PLILLWG=807,0,626;PLIllPCRFree=1042,0,1136;PLNCIIonWG=303,0,116;PLPlatGen=4105,0,3971;PLXIll=795,0,850;PLXPSolWGLS=295,0,414;PLminsum=11666;PLminsumOverDP=13.87;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=12;allalts=C;datasetcalls=12;geno=2;genoMapGood=12;platformbias=none;platformna [...]
+q 12125 . T C 11323 PASS DPSum=806;HRun=1;HapNoVar=0;NoPLTot=0;PL454WG=254,0,312;PLCG=512,0,595;PLHSWG=1073,0,1401;PLILL250=603,0,978;PLILLCLIA=1833,0,1747;PLILLWG=597,0,727;PLIllPCRFree=1219,0,658;PLNCIIonWG=256,0,148;PLPlatGen=4169,0,3563;PLXIll=641,0,783;PLXPSolWGLS=166,0,106;PLminsum=11018;PLminsumOverDP=13.67;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=C;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,sol,c [...]
diff --git a/test/tiny/q with spaces.fa b/test/tiny/q with spaces.fa
new file mode 100644
index 0000000..085a809
--- /dev/null
+++ b/test/tiny/q with spaces.fa
@@ -0,0 +1,207 @@
+>q
+GATTCAACAAATATTTACTGAGTACCTACTATGTGCAATGTGTATACTAGGTGCTGGGGA
+TACAGCAGAGAACAAAGTCCCCACTCTCAATGAAAGGGGCCAGTCTTTTTATTCATCTTT
+AGTTAATTTTAAGCACACAACAGCTATCTCAAACTTTCTTCACACTTTCCAAGCCCCTGA
+TCCCTTATGTACTCTTGGCAGATGCCCCACCTTATCTCTTACCAGAACCTAAGGCCAACT
+AGCATAAAATCCCTTGAGCACACTTAGATTGGCATGCTTTTACTGTGACATCTGTTCCTA
+CAGCACCCTTACTTTCTTCTCTTGAGCTGAGGGTGAAGATCTCTTTCTTTCCCATCTGTA
+ATCTCACAATATGCACAACTACACCAGAGACAATGGAGGCACAATACAGGCCAGAGGTCT
+GTTCCTCATTCCTTCTGCGGGGCCTTCCTCCAGTGAATATCTCTTCTAACATTGTCAATC
+ATATCCAACAGAAAAAGAGATCTCTCCATTGGTACAAAAACCTACAGCTCCTAAGAAGAT
+TTTGACTCTCACTGCTCAACTTCTCAAATAAAAATTCCTCCACTTTAGTTCTAATTATCA
+CTCAAATGCTGTTAGACCTGTGGGCAGGTCACTATATCCCTGCCTAGGTCTGTTTCCCCA
+TCTGCGTAACAGAGAAATAATTGTGCTTTATCTGTCCTACAGAAGTGTGAAGAGGTTTGA
+TGTGACAGCAGTGAAAGCAATTTAAAAAGTACAAAGTGCTGTATAAAAGAAGACGTTTTC
+ATAATTATTCTGTTTTCACATGCTATCTTTATTCCAATGATCTCAAAGTACCTACCCAAG
+CACAGTTCTAACTCTAGTATATGTAAGCAAACGTGCAAGTCTGTCACCTTCCACACAAAG
+AGTCCCCCCACCCCCGCCAAACACACCACTCCCTCTCCCTCTCCTTCTCCCTCTCCCTCT
+CTCCTTCTCCCGCTTTCCACGGTCTCCCTCTGTTGCCGAAGCTGGACCGTACCGCCGTGA
+TCTCCGCTCGCTGCAACCTTCCTGCCTGTTTCTCCTGCCTCAGCCTGCCGAGTGCCTGGG
+ATTGCAGGCGCGCGCCGCCACGCCTGACTGGTTTTTGTATTTTTTGGAGACGGGGTTTCG
+CCATGTTGGATCGGCTGGTCTCCAGCTCCTAACCGCGAGTGATCTGCCTGCCTCGGCTTC
+CCGAGGTGCCGGGATTGCAGACGGAGTCTCGCTCACTCAGTGCTCAATGTTGCCCAGGCT
+GGAGTGCAGTGGCGTGATCTTGGCTCGCTACAACCTCCACCTCCCAGCCACCTGCCTTGG
+CCTCCTAAAGTGCCGAGATTGCAGCCTCTGCCTGGCTGCCACCCCGTCTGGGAAGTGAGG
+AGCGTCTCTGCCTGGCCGCCCATTGTCTGGGATGTGAAGAGCCCCTCTGCCTGGCCGCCC
+AGTCTGGGAAGTGAGGAGCGTCTCTGCCCAGCCGCCCATCGTCTGGGATGTGGGGAGCGC
+CTCTGCCCCGCCGCCCCGTCTGGGATGTGAGGAGCGCCTCTACCCGGCCGCGACCCCGTC
+TGGGAACTGAGGAGCGTCTCTGCCCGGCCGCCACCCCATCTGGGAGGTGAGGAGCGTCTC
+TGCCCAGCCGCCCCGTCTGAGAAGTAAGGAGCCCCTCCGCCCGGCAGCCGCCCCGTCTGG
+GAAGTGAGGAGCGTCTCCGCCCGGCAGCCAGCCCGTCTGGGAGGTGGGGGGCAGCCCCCG
+CCCGGCCAGCCGCCCCGTCCAGGAGGTGGGGGGCACCCCCCGCCCGGCAGCTGCCCCGTC
+GGTGGGGGGCGCCTCCGCCCGGCCGCCCCGTCTGGGAAGTGAGGAGCCCCTCTGCCGGGC
+CGCCACCCCGTCTGGGAGGTGTGCCCGGCAGCTCATTGAGAGCGGGCCATGATGACAATG
+GCGGTTTTGTCGAATAGAGGGGGGAAATGTGGGGAGAGGAGAGAGATCAGATTGTTACTG
+TGTCTGTGTGGAGGGAGGTGGACATGGGAGACTCCATTTTGTTCTGTACTGGGAAAAATT
+CTTCCGCCTTGGGATGCTGTTGATCTATGACCTTACCCCCAACCCCGTGCTCTCTGAAAC
+ATGTGCTGTGTCCACTCAGGGTTAAATGGATTAAGGGCAGTACAAGATGTGCTTTGTTAA
+ACAGATGCTTGAAGGCAGCATGCTAGTTAAGAGTCATCACCACTCCCTAATCTCAAGTAC
+CCAGGGACACGAACACTGAGGAAGGCCGCAGGGTCCTCCTCTGCCTAGGAAAACCAGAGA
+CCTTTGTTCACATGTTTATCTACTGACCTTCCCTCCACTATTGTCCTATGACCCTGCCAA
+ATCCCCCTCTCCGAGAAACACCCAAGAATGATCAATAAATACTAAAAAAAAAAAAAAAAA
+AAAAGGAATATTTTCACGTGCAAATGAAAAAGCCCTAAACAAATCTTAAAATATATAAAG
+TGGTTCAAAAAATGTTTATTGTTATTATTTTAATAGAGGCCTATACATGGAGCATTTATT
+TGGGAAAGTTTTAAATGAATGTTTTCCCCTGTTTTGGTTGTTATCAATCTACACTTTACC
+AATTTTCAATAATGATCATGCCTAATTAATTCAAATTAAAAGGTGTGTAATACAAAAAAA
+AAAAAAAAACTGCAACCAACCCTCGAATCAGCTCAATCTCTCACTGAATTATGGTTATCT
+GTCACCACTGTATCTGCCTATCAGTGATTAGGGAGAGCTCTCTCTGGAGAAAAATGTCAC
+CTAGAGCATCAACAATTTTCATACACAGTATCCAGCATTCACTTGAACGTTACCAAGCAT
+ACAAGGAAACAGAAACGAGAAAAAAGAGACAACATATCCACATATCCACAGTGACCCAGT
+TACTAGTTACTGAAGATGATCTTTAAAATAGCTGTAATTCGGCCGGGCGCAGTGGCTCAC
+GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGAGGTCAGGAGTTCGAGACCAGCCTA
+GCCAACATGGTGAAATCCCGTCTCTAAAAATACAAAAATTAGCTGGGCGTGGTGGGGCGT
+GCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTGAACCTAGGAAGCA
+GAGGTTGCAGTGAGCCTAGGGTGTGGTGGGGCGTGCCTGTAATCCCAGCTACTTGGGAGG
+CTGAGGCAGGAGAATCACTTGAACCTAGGAAGCAGAGATTGCACCATTGCACTCCAGCCT
+GGGCGACAAATTGCGACTGTATCAAAAAAAAAAAAAAAAAAAGAGGTAACTCTTGGAAAG
+GAGAAAATATCTGCAGCACATGAATCTAAAAAAAGACTCATCAAAATCTACAAAGAACTC
+CTATAATTTAATGAAACAAAAATCTCCACAGAAAAATTTGCAAAATAACCCTTAAATACT
+TCACGAGATAGCCGAATGTCATAAACATACAAAAAAGAGTTAACCACCAAAAAAGAGCTA
+AAATATCAGAGAAATGCAAACAAAACCCACAATGCATACCACTTCATACCCACTAGAGAG
+GCTAAAAATGATACCAAGTGTTGTTGAGAATATGCACTGGTCTGAACTCTAATACACTGC
+TAATGGAAATATTTTACCACTTTGGAAAACAGTTTGGCACATCTACTAAAATGGAATCTA
+TGCACAGACTGACACAGTCATTCTGCTACTAAATATATACCCAGCAGGACAATGTGGCTC
+ACGTCTATAATCAGCCCAGGCTGGTCTTGAACTCCTGAGCTCAAAGTGATCCTCTCACCT
+TGGCTTCCTAAAGTGTTGGGATTACAGGCATGAGCCACCACATCTGGCCACTTTTCTTTA
+TGTATATTATACATCAATAAAAGTAAAGAATAAAAAGGGAGAAGAGTCTGAAAAGAACAA
+TTATCACAAAATTAAGGAACGTACTCAACTCTTAAAGATTTGAGAAGGCGTTTCTAAGGT
+ACTAGCAATGTTCCATTTTGCAATCTGCACAGTGATGACAAGTATTTAACTTATTCTGTA
+AACAGTACCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTTGCTCTGTCGCCCAGGATAGA
+GCACAGTGGCGCGATCTCAGCTCACGGCAGCCTCTGCCTCCTGGGGTGACGTGATTCTCA
+TGCCTCAGCCTCTGGAGTAGCTAGGATTACAGGCAGGCGCCACCACGCCCAGCTAATTTT
+TGTATTTTTAGTAGAGACAAGATTTTGCCATGTTGGCCAGGCTGGTCTTGAACTCCTGAC
+CTCAAGTGATCTGCCCGCCTTGGCCTTCCAAAGTGCTAGGATTACAGGCATGAGCCACTG
+TGCCCAGCCACAAACAGTACTTATATCTTACCTATGTTTAAATGGATATTCAAAATAAGA
+TTAAAAGAGGTAAGAAAATACAATACTCTAGTAAATGTAAAAATAAGTGAGAAGAATAAA
+TGCCAAGAGAAATATAATTACCAAGAAGAAATTCAAAGTAGGAACAGTCGTATAACCATT
+TTTAAAATAATTTTGAATCAGCTTAACATGTTTTCAACAAAAAACCAAACACATACCAAA
+ACCAAAAAACCACCCAGGTCCAGACTTGTACAGGCAAGTTCTACCAAACTTTGAAGGAAT
+CATTCTAATTTTACAGAAATAAAAACAGAAGAATAGGAAAAGAAACACTTGCCAATTCAT
+TTTGTAAGGGTAGTCTAATTTTCATACCAAACCACATGAGAAAAACCCTATTATTTATTA
+ACAAGCCAAATCCAGCAATATATAATAATAATACATCATGACCAAGTTGGGTTTAATCCC
+AGGAATGCAAAATTGATTCAATATTAGAAAATCTATTAGGGTAACATGAAACTTAAAGAA
+AAAAAACAAGAAAATTTCTATACATGCAATTAAAAAGCATCTGTAGAAATTCAATCATTC
+AGGATTCTTTTTTTCTTTAAGTTTAAAAAAACTTGACAAAGAAAGCTTTATGAAGCCAGG
+TGCTGTGGCATGTGCCTGTAATCCCAGCCATTCAGGAGGGTAAAGTGGGATGATCACTTG
+AGCCCAGGAGTTTGAGACCAGTCTGAGCAACATAGCAAGACCCTGTTTCCGGGGAGGGGG
+TGGGTGGGAAGTAAGCCTTATGAAAAACCAGTAGCCCCTATCACTCAATAAAAGCTTTCC
+TTTTAACATCAGAACAAGTCAGGGATGCCCTCTGTATCACCACTGCTCTTCAACTTTGTA
+ATGGAGAGTCCGGCCAGCACAGTTAAGACAAGATAAAATATATCTAAGTCATGAAAAGAA
+ACAAAATATCATGATTTTCAGATGATATGTTTGTCCTTAAAAAATTTAAAGAGGGATTAC
+TGGAATAAGAGTTTGTTGGACAATCTTTTTATTCCAGTAATTCATCAGTTTACAAAAGTC
+AAATGCATTTCTATAAACCATCTACAGATATCTCTTAGAATATAGCCGGGCATGGTGGTG
+GGCGCCTGTAATCCCAGCTACTAGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCAGGAG
+GCGGAGCTTGCAATGAGCTGAGATCGCCACTGCACTCCAGCCTGGGCCACAGAGCGAGAC
+TCTGTCTCAAAAAAAAAAAAAAAAAAAAAAGATATCACTTAGAATAGCAACAACAAACAT
+AAAGTATCAAGGAATCAATTAAACACATGATGTGCAATAATAATAACACAAAACTTTACT
+GGGAAAACAACTTAGGAAAACAATGTGGAATACCCGGTGAATGTACATATGCCCTACCAT
+CCAGCAATTCCACTACTGGGGAATCTGTCTAAAGATCTGTGAAACTCTTGTGTATCAGGA
+GAAATGTGCAAGGTTACTTGCAGGAACACTGTAATAACAACAAACCTAATGTCCATTAAT
+ACTAGAATGGATAAATTGTGGTATATTTATACAATGGTATATAAAGCAGTGAAAACAAGT
+AACTATACTATACTCAAAAACATAATGATGAATGAAAAAAGCTAGTTGAAGAATACACAC
+AGTATGATTTCACTTATATGAAATTCAAAACCAGGAAATACTAAGTACTGTATTGTTTAA
+TAGAAAGATAATAGTCCAGCCCGGGTGCGGCGGCTCAAGCCTGTAATCCCAGCACTTTGG
+GAGGCTGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACATGG
+AGAAACCCCATCTCTACTAAAAATACAAAACTACCTGGGCGTAGTGGCACATGCCTGTAA
+TTCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCGCTTGAATCTGGGAGGTGGAGGTTGC
+AGTGAGTCAAGATTGCACCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTTCATCTC
+AAAAAAACAAAAAAAAAAACAACAGAAAAACAAGAAAAAAACAAGAATAAACACATCGAG
+CCGTGATCATGCCACTGCACTCAAGCCTGAGTGAGAGAGTGACACCCTGTCTCAAAGAAA
+ACAAAAGACAAAAATACATAACAGGCTGGGCTCAGTGGCTCACACCTATAATCCCAGGAT
+TTTGGGAGGTAGAGGTGGGTGGATCACTTGAGGTCAGGAGTTAGAGACCACCCTGGCCAA
+GGTGATGAAACCCTGTCTCTACTAAAAATACAAAAATTAGCCAGATGTGGTAGCGCACAC
+CTGTAATCCCAGCTACTTGGGAGGCTGAGGCTGGAGAACTGCTTGAACCCGGGAGGTGGA
+GGTTGGATTCAAGTGACAGCGCCACTGCAGTCCAGCCTGGGCGACAGTGAGACTTTGTAT
+CAAAACAACAAAACACATAACAGTCATTAATTACCACACATAAAGGTGCTTAATGGATAA
+TAAGTGCTCAAGGAAATGGCAGTCATGGTGGTGGTTGCAGAGAACAACTGGAGGGGATTA
+AGAGTGGTGGTAGAAACCAAGCAGAAACCAGTCTAAGAAAACCAAAAAAATACCAGGCAA
+ATAGTTTCAAAGGAAAATTATACCATTTTAAGATAAGGTAGACTGGTGGCTGGCGCCTGT
+AATCCCAGCACTTTGGGAGGCCGAGGTGGGTAGATCACCTGAGGTCAGGAGTTTGAGACC
+AGTCTGGCCAACATAGTGAAACCCCATCTCTACTAAAAATATAAAAATTAGTTACACGTG
+GTGGTGTGCACCTGTAGTCCCAACTTCTTGGGAGGCTGAGGCAGGAGAATCACTTGAACC
+TGGGAGGAAGAGGTTGCAGTCAGCCGAGATCATGCCACTGCATTCCAGCCTGGGTGACAG
+AGCAAGACTCTGTGTCAAAAAAAAAAAAAAAAAGACAAAGTAGTCTTAATGCTATTTCAG
+AACACAGAAAAGGAAAGAAAACTTCCACACTCATTTTATGAAGCAAGTGTAATGGTAACC
+CCAACCTGACAAAGACTGCACAAAAGATACTACAGTTGAATACTACTTACGAATATCAAC
+ACAAAATCTCTGAATATTAGCAAACAGAATCCAATGGCACATTATAAAAAGGAATACAAA
+TAAACAAGCACAGTTTATTCCAGGTATGAGAGATTCCATATAAGGAAATCTATTGACATG
+ATCTTCACATTAACAATGTTAAATGTGATGAAATCATATCATCTTCACGGAGGCAGTAAA
+GACATCTGACAGCTGGGTGCGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCAA
+GGCGGGTGGATTGCCTGAGCTCAGGAGTTCACGACCAGCATGGGCAACAGGGTGAAACCC
+TGTCTCTACTAAAATACAAAAAATTACCTGGGGGTGGCAATACGCGCTTGTAGTCCCAGC
+TACTTGGGAGGCTGAGGCAGGAGAATTGCTTGAAACCAGGAGGTGGAGGTTGCAGTGAGC
+TGAGATGGCACCACTGCACTCCAGCCTGAGAGACAGAGCAAGACTCCATCTCAAAAAACA
+AACAAACAAAAAGAAACAAAAAACTAAAATAGGAAATGATGGATACTGCCTTAAGTAAAC
+AAAATACACCTCGGTCCAAAGCCAGCATCTGACTTCATGTGAAAACATTAGAGACTTCCC
+TGCTAACATCACACACCACATAAGGCTAAAATATTTTCCAATTTACATTTTTAGAGGTAA
+GCATTAAATTCTATGATGCCAGACAAACATTTACCAGATTTTTTTTTTTGAGATAGAGTT
+TCACTCTTGTTGCTCAGGCTGGAGTGTAATGGCGCACAATCTCGGCTCGCCACAAGCTCC
+GCCTCCCAGGTTCAAGCGACTCTCCTGCCTCGGCCTCCCGAGTAGCTGGGATTACAGGCA
+TGTGCCACCACACCCAGCTAATTTTGTATTTTTTGTAGAGACGGGGTTTCTCCATGTTGG
+TCAGGCTGGTCTCGAACTCCTGACCTCAGGTGATCCGCCTGCCTCGGCCTCCCAAAGTGC
+TGGGATTACAGGTGTGAGCCACCGCGTCTGGCCAAAATTTACCAGTTCTTGAATGCTGAA
+TCCTGCTGGGATAACTGGAAACACTCCTAATGTTACTATAAACTCTCACACTGCCTTGTT
+GGCAATAATTAAATAGCTCTTTCTAGCAGTTGTGTTCAAGTGGACAGATAATCACAAATG
+TCTTTTCTTTCTAGCTCCCTTTGGACGAAAGAAAGAAAGAATTTACAACACGTCTAAAGA
+TTGTCACTTTTATCATCCCTTTGAATGATAAGTGATCAGGGCTGTCCCAAAAGAATGCCT
+GAAAACATTACAAAATGTTATCATAAGCAGAATGGCTCACTAAGGGGATGAACATTCTGG
+CAAGTCTTAATCTAGCATCACTACCAACTGTTTTTTCACTGGTCTAAAACATAACATCTA
+ATATTTTTATTTGCCTATATAGCCCTGCAGTAATAATTAACTTGTGAAAGGTCTACACTG
+ATTTTTTTCTTTATAAAGTATTTGCTTCAACCATGGAACAGAAACAAAGTTGATTTAATG
+GGAAAAATGTAAATTTATTTGAGGATAGGTTCAGCACTGCTAAATTGAAAAGGGAAAGAA
+AATTTCAAACAAACCCCAAAAAGAAACATTTTCACATCTGTAGTGAAAAAGCTAAATAGC
+AGTACACTAAACTTGGTTACTGACAAAATAAATCCTTGTGGCCAAGGTGAGGCAAAGATA
+TTACTCTCTACTTTGCTAGCCTACCTTGTGTTCAGTGAATTCAAACATTAATTGGGGCAA
+TGTATTGTTTTTGCTTCATTTTCTTTAGCTGTGCAGTAAAGTCTTTGCTTTCATAGAAAG
+ATTGTGCCAAGTTCTTTCTTTTTCAGAATTGCTTTTAAATCAGTTCTATTTAAGTATTTC
+TCTCTCTCTCAAACTTGACATTTTTCATTACACCTTTTTTTTCCAATGCCAAAATAGCAG
+CATTAACATTCATTCCAGTCACTAGGTAAAAACTCTCTTCAGTCAAACACACATGAATAA
+TCTAATTCGTGAGGTAAACGACTACTGTCATACAGTATTACGAAGTTCCAACCTTACTTG
+CAGAGCAAAAAATAAAAAATAAAAATAGTCATTTTCCTCAAGGGTTTAATTGTGGGCTTT
+TAATGTTGAAGTAAATCAATTTACTAGTTTTTAACCTCAAGTTTTTCCCTATTTCATTAT
+ATCTCAATAATGATTATCCAGAGATTTCTGAGGTAAATTACATTAAAGACAGCGCTAGTA
+ACTGGGAAACAATCCACACATGAATCTTAATATATTGCATATATGCTAATTTCATGTAAT
+GTTCCTGAGGGCGGTGACAATATTAGAGCTGTACGAGACCCTAGAGACCATCCCAATCCC
+TTAACTCCACAGATGAGAAACACAAAAACTTGCAGAAGCAGAGTGACGAGCACATGCCCA
+TGTGACATGTCATCAGAAGCAGGGACCAAGACTCAGGTCTCTTGCCTTCCAGGCCAGTCC
+TCTTTTGGTAGGCTAAAAGCTGCCTTTCACAGCTTCCCACAAATTTCAACACATCCTCAC
+TCAAGTACGTTTCTGAAAGGGACTCACCGACAACTCTTTAGAGAAAAGGAACTATATCTC
+AAACCATAGGAAAAGTTCTATATCTAGCTTATCAAACTTTTAGTCTCTGAAGCATGTGAG
+AGTAAGTTTGCCAGTATTTCTTTCTCATTATCCCCCAATACTTTAGTGACTATTTCCAAC
+AAACAAAACTAGACTATTCTCCCATACAAACAAAATGCTACCATCAAAATCGTCAACGCT
+CACCTGTCAGATTATGACCAAAGGCCTCTAAGTCAGAATCCCGCCCAGGCAAAATGACAT
+GGCAGCCAACTGTGGAGCCTCGGTTGGCCTCGGATAGCTGGTCCCCCCTTCATTCACACC
+ACATATCTGAGGACCAGATATGGAAAGCCTCTCATGCTGGAAAACTTGGCACAGTAGAAA
+GGCGGCCACACTTTTGCCCGTCACACAATGCACATTCACGGGGGACCTGTGCTAAACCAT
+TCACAGACAACCTGTTTCTGGGTCAGGGTTTCGTATACAGCAGAGCAGTCAGCTATCGAC
+ACAAGGGTTTGTAACAAAAAACAAAAAACAACAAAAAAAAGGAAGGACAAACCAAAACAA
+AAAAGAAAATACCAAAAAAAAGAAGAGTTAAAAAAAAAAGAAGATAAAAATAAAATCATT
+AACACAGATACACGACTACAATCTAATTTTCAGACATTCTAGTTTTGCCAATCATGCCGA
+TTCAGAATCATGAACCACATGGAGTTGCCATGTTTCTGCACACTTCTTTAGTGTGGAACA
+ATTCCTCAGTGTTTTCTTGGTTCCATGATCTTGACTTAATTTTGAGGTTTACAGGCCAGT
+TATTTTGCAGAATATCCCTAAACTCGGGTTACCTTGATGTTGCCTCCTGAAGATTTGGAT
+TATGCATGAAACACTTGACTGTCCCTATTCTAACTATATAAAGATGCTCCTTGACTTACT
+ATGGACTTACATCCCAATAAACCCTTCATAAATGGAAAACCTCATAAGGTGAAATGCTTG
+TTTTTTTTGGGAAAGTTTCTTGCTCTGTTGCCAGGCGGGAATGCGGTGACACGACGACGG
+CTCACTGTAGCCTTGACCACTCAGGTTCAAGCGATCCTCAGCCCCAGGCTCCCAAGTAGC
+TGGGACTATAGGCACGTACCACCACACCTGGCTAATTTTTTTTTTTTTTAATTTTTAGTA
+CCTTTAGTACAGAAAAGGCCTCACTATGTTGCCTAGATTGGTCTCAAACTCCTAAGCTCA
+AGGAATCCTCATGCCTCGGTCTCCCAAAGTCCTGGGATTATAGGCATGAGTCCTGCACCC
+AGCCTAGCTGAAAATACATTTAATACACCTAACTTACCAAACATCACAGCTTAGCTTAGC
+CTACCTTAAACATGTTCAGAGCACTTACATTAGCCTACAGTTGGGCAAAATCATCTAACG
+CAAAGCCCATTTTATAATAAAAGTGTTGAGTAACTCATATGATGTATGAAATACTATACT
+GAAAATGAAAAACAGAACAGTTGCGTGGGTACTCGGAAGTAAGGTTTCTACTGCATGTGT
+ATCACTTCCATACCATTGTAAAGTCATCTGACCACCAAGCCAAATATTTGTACTTCTGCA
+CATGCTGCTTTACATAGAGACCGAGACAAAGGGGGATTCACATCTTGTCAACCCTGTTTA
+CTTCAAAGGAATAACAGAGCATTAACTAGGTACCAAGTGCTCAGGACTTCAAGTATACTG
+TTTTATTTAATACTCATGACAATCCTTTGACCCCTTACTTGTGACTGGCAAGTTACCCAA
+CTTTTGTTTCATTTTCCTCATCTGTAAAAACCGGAGAGTGATACACACTCCCTGGAGAAT
+GACAGGGAAAACAGTACCAAGAACTTTAACCTAACAAGTCAGAAATCAATGAGACCCCTT
+TTGACAATTCCCACCTTGCCCTTTAGACATTCCGAGATATGCAGCTTTATTTAATAAATG
+ACCCCACCTTGGGCCTCAGCTGACTAGACCAGAGGCAGACACTCTCCCAGGCTGGAAAGG
+CCAGGTGGAGTGAGCTGTTCTGTGCAGAATGATCTCCCTAACAGTGTTTTTACACATTCT
+GTTCCACTTGGAATTGGATTCAGAGTGTTCTCAGCCTCTGGTCATCAAAAAACCTGAAGA
+CACATACCTTCAGGGAGCTAGGGTCAGCTCTCCTCTGCTGAGTGCATGGGGAAGTGACAA
+AAGAGAAAAGAGGGGGGAAAATCCAGCTGCTGAGAGATGTAGGGTACAAGAAGCTTTTGT
+AACTGTAAAGAGGGAGGGAGGTAGCTACGTTGGTCTATTTTCCAGTTCCTGGTTCCAACT
+ATACCTAAAGCTTTAACTGCACTTTCTATCCTTGCAGTAATATGACAAAGGCCCCC
diff --git a/test/tiny/q with spaces.fa.fai b/test/tiny/q with spaces.fa.fai
new file mode 100644
index 0000000..dd7673b
--- /dev/null
+++ b/test/tiny/q with spaces.fa.fai
@@ -0,0 +1 @@
+q 12356 3 60 61
diff --git a/test/tiny/q with spaces.regions b/test/tiny/q with spaces.regions
new file mode 100644
index 0000000..d129bcc
--- /dev/null
+++ b/test/tiny/q with spaces.regions
@@ -0,0 +1,3 @@
+q:0-5000
+q:5000-10000
+q:10000-12356
diff --git a/test/tiny/q.fa b/test/tiny/q.fa
new file mode 100644
index 0000000..085a809
--- /dev/null
+++ b/test/tiny/q.fa
@@ -0,0 +1,207 @@
+>q
+GATTCAACAAATATTTACTGAGTACCTACTATGTGCAATGTGTATACTAGGTGCTGGGGA
+TACAGCAGAGAACAAAGTCCCCACTCTCAATGAAAGGGGCCAGTCTTTTTATTCATCTTT
+AGTTAATTTTAAGCACACAACAGCTATCTCAAACTTTCTTCACACTTTCCAAGCCCCTGA
+TCCCTTATGTACTCTTGGCAGATGCCCCACCTTATCTCTTACCAGAACCTAAGGCCAACT
+AGCATAAAATCCCTTGAGCACACTTAGATTGGCATGCTTTTACTGTGACATCTGTTCCTA
+CAGCACCCTTACTTTCTTCTCTTGAGCTGAGGGTGAAGATCTCTTTCTTTCCCATCTGTA
+ATCTCACAATATGCACAACTACACCAGAGACAATGGAGGCACAATACAGGCCAGAGGTCT
+GTTCCTCATTCCTTCTGCGGGGCCTTCCTCCAGTGAATATCTCTTCTAACATTGTCAATC
+ATATCCAACAGAAAAAGAGATCTCTCCATTGGTACAAAAACCTACAGCTCCTAAGAAGAT
+TTTGACTCTCACTGCTCAACTTCTCAAATAAAAATTCCTCCACTTTAGTTCTAATTATCA
+CTCAAATGCTGTTAGACCTGTGGGCAGGTCACTATATCCCTGCCTAGGTCTGTTTCCCCA
+TCTGCGTAACAGAGAAATAATTGTGCTTTATCTGTCCTACAGAAGTGTGAAGAGGTTTGA
+TGTGACAGCAGTGAAAGCAATTTAAAAAGTACAAAGTGCTGTATAAAAGAAGACGTTTTC
+ATAATTATTCTGTTTTCACATGCTATCTTTATTCCAATGATCTCAAAGTACCTACCCAAG
+CACAGTTCTAACTCTAGTATATGTAAGCAAACGTGCAAGTCTGTCACCTTCCACACAAAG
+AGTCCCCCCACCCCCGCCAAACACACCACTCCCTCTCCCTCTCCTTCTCCCTCTCCCTCT
+CTCCTTCTCCCGCTTTCCACGGTCTCCCTCTGTTGCCGAAGCTGGACCGTACCGCCGTGA
+TCTCCGCTCGCTGCAACCTTCCTGCCTGTTTCTCCTGCCTCAGCCTGCCGAGTGCCTGGG
+ATTGCAGGCGCGCGCCGCCACGCCTGACTGGTTTTTGTATTTTTTGGAGACGGGGTTTCG
+CCATGTTGGATCGGCTGGTCTCCAGCTCCTAACCGCGAGTGATCTGCCTGCCTCGGCTTC
+CCGAGGTGCCGGGATTGCAGACGGAGTCTCGCTCACTCAGTGCTCAATGTTGCCCAGGCT
+GGAGTGCAGTGGCGTGATCTTGGCTCGCTACAACCTCCACCTCCCAGCCACCTGCCTTGG
+CCTCCTAAAGTGCCGAGATTGCAGCCTCTGCCTGGCTGCCACCCCGTCTGGGAAGTGAGG
+AGCGTCTCTGCCTGGCCGCCCATTGTCTGGGATGTGAAGAGCCCCTCTGCCTGGCCGCCC
+AGTCTGGGAAGTGAGGAGCGTCTCTGCCCAGCCGCCCATCGTCTGGGATGTGGGGAGCGC
+CTCTGCCCCGCCGCCCCGTCTGGGATGTGAGGAGCGCCTCTACCCGGCCGCGACCCCGTC
+TGGGAACTGAGGAGCGTCTCTGCCCGGCCGCCACCCCATCTGGGAGGTGAGGAGCGTCTC
+TGCCCAGCCGCCCCGTCTGAGAAGTAAGGAGCCCCTCCGCCCGGCAGCCGCCCCGTCTGG
+GAAGTGAGGAGCGTCTCCGCCCGGCAGCCAGCCCGTCTGGGAGGTGGGGGGCAGCCCCCG
+CCCGGCCAGCCGCCCCGTCCAGGAGGTGGGGGGCACCCCCCGCCCGGCAGCTGCCCCGTC
+GGTGGGGGGCGCCTCCGCCCGGCCGCCCCGTCTGGGAAGTGAGGAGCCCCTCTGCCGGGC
+CGCCACCCCGTCTGGGAGGTGTGCCCGGCAGCTCATTGAGAGCGGGCCATGATGACAATG
+GCGGTTTTGTCGAATAGAGGGGGGAAATGTGGGGAGAGGAGAGAGATCAGATTGTTACTG
+TGTCTGTGTGGAGGGAGGTGGACATGGGAGACTCCATTTTGTTCTGTACTGGGAAAAATT
+CTTCCGCCTTGGGATGCTGTTGATCTATGACCTTACCCCCAACCCCGTGCTCTCTGAAAC
+ATGTGCTGTGTCCACTCAGGGTTAAATGGATTAAGGGCAGTACAAGATGTGCTTTGTTAA
+ACAGATGCTTGAAGGCAGCATGCTAGTTAAGAGTCATCACCACTCCCTAATCTCAAGTAC
+CCAGGGACACGAACACTGAGGAAGGCCGCAGGGTCCTCCTCTGCCTAGGAAAACCAGAGA
+CCTTTGTTCACATGTTTATCTACTGACCTTCCCTCCACTATTGTCCTATGACCCTGCCAA
+ATCCCCCTCTCCGAGAAACACCCAAGAATGATCAATAAATACTAAAAAAAAAAAAAAAAA
+AAAAGGAATATTTTCACGTGCAAATGAAAAAGCCCTAAACAAATCTTAAAATATATAAAG
+TGGTTCAAAAAATGTTTATTGTTATTATTTTAATAGAGGCCTATACATGGAGCATTTATT
+TGGGAAAGTTTTAAATGAATGTTTTCCCCTGTTTTGGTTGTTATCAATCTACACTTTACC
+AATTTTCAATAATGATCATGCCTAATTAATTCAAATTAAAAGGTGTGTAATACAAAAAAA
+AAAAAAAAACTGCAACCAACCCTCGAATCAGCTCAATCTCTCACTGAATTATGGTTATCT
+GTCACCACTGTATCTGCCTATCAGTGATTAGGGAGAGCTCTCTCTGGAGAAAAATGTCAC
+CTAGAGCATCAACAATTTTCATACACAGTATCCAGCATTCACTTGAACGTTACCAAGCAT
+ACAAGGAAACAGAAACGAGAAAAAAGAGACAACATATCCACATATCCACAGTGACCCAGT
+TACTAGTTACTGAAGATGATCTTTAAAATAGCTGTAATTCGGCCGGGCGCAGTGGCTCAC
+GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGAGGTCAGGAGTTCGAGACCAGCCTA
+GCCAACATGGTGAAATCCCGTCTCTAAAAATACAAAAATTAGCTGGGCGTGGTGGGGCGT
+GCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTGAACCTAGGAAGCA
+GAGGTTGCAGTGAGCCTAGGGTGTGGTGGGGCGTGCCTGTAATCCCAGCTACTTGGGAGG
+CTGAGGCAGGAGAATCACTTGAACCTAGGAAGCAGAGATTGCACCATTGCACTCCAGCCT
+GGGCGACAAATTGCGACTGTATCAAAAAAAAAAAAAAAAAAAGAGGTAACTCTTGGAAAG
+GAGAAAATATCTGCAGCACATGAATCTAAAAAAAGACTCATCAAAATCTACAAAGAACTC
+CTATAATTTAATGAAACAAAAATCTCCACAGAAAAATTTGCAAAATAACCCTTAAATACT
+TCACGAGATAGCCGAATGTCATAAACATACAAAAAAGAGTTAACCACCAAAAAAGAGCTA
+AAATATCAGAGAAATGCAAACAAAACCCACAATGCATACCACTTCATACCCACTAGAGAG
+GCTAAAAATGATACCAAGTGTTGTTGAGAATATGCACTGGTCTGAACTCTAATACACTGC
+TAATGGAAATATTTTACCACTTTGGAAAACAGTTTGGCACATCTACTAAAATGGAATCTA
+TGCACAGACTGACACAGTCATTCTGCTACTAAATATATACCCAGCAGGACAATGTGGCTC
+ACGTCTATAATCAGCCCAGGCTGGTCTTGAACTCCTGAGCTCAAAGTGATCCTCTCACCT
+TGGCTTCCTAAAGTGTTGGGATTACAGGCATGAGCCACCACATCTGGCCACTTTTCTTTA
+TGTATATTATACATCAATAAAAGTAAAGAATAAAAAGGGAGAAGAGTCTGAAAAGAACAA
+TTATCACAAAATTAAGGAACGTACTCAACTCTTAAAGATTTGAGAAGGCGTTTCTAAGGT
+ACTAGCAATGTTCCATTTTGCAATCTGCACAGTGATGACAAGTATTTAACTTATTCTGTA
+AACAGTACCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTTGCTCTGTCGCCCAGGATAGA
+GCACAGTGGCGCGATCTCAGCTCACGGCAGCCTCTGCCTCCTGGGGTGACGTGATTCTCA
+TGCCTCAGCCTCTGGAGTAGCTAGGATTACAGGCAGGCGCCACCACGCCCAGCTAATTTT
+TGTATTTTTAGTAGAGACAAGATTTTGCCATGTTGGCCAGGCTGGTCTTGAACTCCTGAC
+CTCAAGTGATCTGCCCGCCTTGGCCTTCCAAAGTGCTAGGATTACAGGCATGAGCCACTG
+TGCCCAGCCACAAACAGTACTTATATCTTACCTATGTTTAAATGGATATTCAAAATAAGA
+TTAAAAGAGGTAAGAAAATACAATACTCTAGTAAATGTAAAAATAAGTGAGAAGAATAAA
+TGCCAAGAGAAATATAATTACCAAGAAGAAATTCAAAGTAGGAACAGTCGTATAACCATT
+TTTAAAATAATTTTGAATCAGCTTAACATGTTTTCAACAAAAAACCAAACACATACCAAA
+ACCAAAAAACCACCCAGGTCCAGACTTGTACAGGCAAGTTCTACCAAACTTTGAAGGAAT
+CATTCTAATTTTACAGAAATAAAAACAGAAGAATAGGAAAAGAAACACTTGCCAATTCAT
+TTTGTAAGGGTAGTCTAATTTTCATACCAAACCACATGAGAAAAACCCTATTATTTATTA
+ACAAGCCAAATCCAGCAATATATAATAATAATACATCATGACCAAGTTGGGTTTAATCCC
+AGGAATGCAAAATTGATTCAATATTAGAAAATCTATTAGGGTAACATGAAACTTAAAGAA
+AAAAAACAAGAAAATTTCTATACATGCAATTAAAAAGCATCTGTAGAAATTCAATCATTC
+AGGATTCTTTTTTTCTTTAAGTTTAAAAAAACTTGACAAAGAAAGCTTTATGAAGCCAGG
+TGCTGTGGCATGTGCCTGTAATCCCAGCCATTCAGGAGGGTAAAGTGGGATGATCACTTG
+AGCCCAGGAGTTTGAGACCAGTCTGAGCAACATAGCAAGACCCTGTTTCCGGGGAGGGGG
+TGGGTGGGAAGTAAGCCTTATGAAAAACCAGTAGCCCCTATCACTCAATAAAAGCTTTCC
+TTTTAACATCAGAACAAGTCAGGGATGCCCTCTGTATCACCACTGCTCTTCAACTTTGTA
+ATGGAGAGTCCGGCCAGCACAGTTAAGACAAGATAAAATATATCTAAGTCATGAAAAGAA
+ACAAAATATCATGATTTTCAGATGATATGTTTGTCCTTAAAAAATTTAAAGAGGGATTAC
+TGGAATAAGAGTTTGTTGGACAATCTTTTTATTCCAGTAATTCATCAGTTTACAAAAGTC
+AAATGCATTTCTATAAACCATCTACAGATATCTCTTAGAATATAGCCGGGCATGGTGGTG
+GGCGCCTGTAATCCCAGCTACTAGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCAGGAG
+GCGGAGCTTGCAATGAGCTGAGATCGCCACTGCACTCCAGCCTGGGCCACAGAGCGAGAC
+TCTGTCTCAAAAAAAAAAAAAAAAAAAAAAGATATCACTTAGAATAGCAACAACAAACAT
+AAAGTATCAAGGAATCAATTAAACACATGATGTGCAATAATAATAACACAAAACTTTACT
+GGGAAAACAACTTAGGAAAACAATGTGGAATACCCGGTGAATGTACATATGCCCTACCAT
+CCAGCAATTCCACTACTGGGGAATCTGTCTAAAGATCTGTGAAACTCTTGTGTATCAGGA
+GAAATGTGCAAGGTTACTTGCAGGAACACTGTAATAACAACAAACCTAATGTCCATTAAT
+ACTAGAATGGATAAATTGTGGTATATTTATACAATGGTATATAAAGCAGTGAAAACAAGT
+AACTATACTATACTCAAAAACATAATGATGAATGAAAAAAGCTAGTTGAAGAATACACAC
+AGTATGATTTCACTTATATGAAATTCAAAACCAGGAAATACTAAGTACTGTATTGTTTAA
+TAGAAAGATAATAGTCCAGCCCGGGTGCGGCGGCTCAAGCCTGTAATCCCAGCACTTTGG
+GAGGCTGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACATGG
+AGAAACCCCATCTCTACTAAAAATACAAAACTACCTGGGCGTAGTGGCACATGCCTGTAA
+TTCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCGCTTGAATCTGGGAGGTGGAGGTTGC
+AGTGAGTCAAGATTGCACCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTTCATCTC
+AAAAAAACAAAAAAAAAAACAACAGAAAAACAAGAAAAAAACAAGAATAAACACATCGAG
+CCGTGATCATGCCACTGCACTCAAGCCTGAGTGAGAGAGTGACACCCTGTCTCAAAGAAA
+ACAAAAGACAAAAATACATAACAGGCTGGGCTCAGTGGCTCACACCTATAATCCCAGGAT
+TTTGGGAGGTAGAGGTGGGTGGATCACTTGAGGTCAGGAGTTAGAGACCACCCTGGCCAA
+GGTGATGAAACCCTGTCTCTACTAAAAATACAAAAATTAGCCAGATGTGGTAGCGCACAC
+CTGTAATCCCAGCTACTTGGGAGGCTGAGGCTGGAGAACTGCTTGAACCCGGGAGGTGGA
+GGTTGGATTCAAGTGACAGCGCCACTGCAGTCCAGCCTGGGCGACAGTGAGACTTTGTAT
+CAAAACAACAAAACACATAACAGTCATTAATTACCACACATAAAGGTGCTTAATGGATAA
+TAAGTGCTCAAGGAAATGGCAGTCATGGTGGTGGTTGCAGAGAACAACTGGAGGGGATTA
+AGAGTGGTGGTAGAAACCAAGCAGAAACCAGTCTAAGAAAACCAAAAAAATACCAGGCAA
+ATAGTTTCAAAGGAAAATTATACCATTTTAAGATAAGGTAGACTGGTGGCTGGCGCCTGT
+AATCCCAGCACTTTGGGAGGCCGAGGTGGGTAGATCACCTGAGGTCAGGAGTTTGAGACC
+AGTCTGGCCAACATAGTGAAACCCCATCTCTACTAAAAATATAAAAATTAGTTACACGTG
+GTGGTGTGCACCTGTAGTCCCAACTTCTTGGGAGGCTGAGGCAGGAGAATCACTTGAACC
+TGGGAGGAAGAGGTTGCAGTCAGCCGAGATCATGCCACTGCATTCCAGCCTGGGTGACAG
+AGCAAGACTCTGTGTCAAAAAAAAAAAAAAAAAGACAAAGTAGTCTTAATGCTATTTCAG
+AACACAGAAAAGGAAAGAAAACTTCCACACTCATTTTATGAAGCAAGTGTAATGGTAACC
+CCAACCTGACAAAGACTGCACAAAAGATACTACAGTTGAATACTACTTACGAATATCAAC
+ACAAAATCTCTGAATATTAGCAAACAGAATCCAATGGCACATTATAAAAAGGAATACAAA
+TAAACAAGCACAGTTTATTCCAGGTATGAGAGATTCCATATAAGGAAATCTATTGACATG
+ATCTTCACATTAACAATGTTAAATGTGATGAAATCATATCATCTTCACGGAGGCAGTAAA
+GACATCTGACAGCTGGGTGCGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCAA
+GGCGGGTGGATTGCCTGAGCTCAGGAGTTCACGACCAGCATGGGCAACAGGGTGAAACCC
+TGTCTCTACTAAAATACAAAAAATTACCTGGGGGTGGCAATACGCGCTTGTAGTCCCAGC
+TACTTGGGAGGCTGAGGCAGGAGAATTGCTTGAAACCAGGAGGTGGAGGTTGCAGTGAGC
+TGAGATGGCACCACTGCACTCCAGCCTGAGAGACAGAGCAAGACTCCATCTCAAAAAACA
+AACAAACAAAAAGAAACAAAAAACTAAAATAGGAAATGATGGATACTGCCTTAAGTAAAC
+AAAATACACCTCGGTCCAAAGCCAGCATCTGACTTCATGTGAAAACATTAGAGACTTCCC
+TGCTAACATCACACACCACATAAGGCTAAAATATTTTCCAATTTACATTTTTAGAGGTAA
+GCATTAAATTCTATGATGCCAGACAAACATTTACCAGATTTTTTTTTTTGAGATAGAGTT
+TCACTCTTGTTGCTCAGGCTGGAGTGTAATGGCGCACAATCTCGGCTCGCCACAAGCTCC
+GCCTCCCAGGTTCAAGCGACTCTCCTGCCTCGGCCTCCCGAGTAGCTGGGATTACAGGCA
+TGTGCCACCACACCCAGCTAATTTTGTATTTTTTGTAGAGACGGGGTTTCTCCATGTTGG
+TCAGGCTGGTCTCGAACTCCTGACCTCAGGTGATCCGCCTGCCTCGGCCTCCCAAAGTGC
+TGGGATTACAGGTGTGAGCCACCGCGTCTGGCCAAAATTTACCAGTTCTTGAATGCTGAA
+TCCTGCTGGGATAACTGGAAACACTCCTAATGTTACTATAAACTCTCACACTGCCTTGTT
+GGCAATAATTAAATAGCTCTTTCTAGCAGTTGTGTTCAAGTGGACAGATAATCACAAATG
+TCTTTTCTTTCTAGCTCCCTTTGGACGAAAGAAAGAAAGAATTTACAACACGTCTAAAGA
+TTGTCACTTTTATCATCCCTTTGAATGATAAGTGATCAGGGCTGTCCCAAAAGAATGCCT
+GAAAACATTACAAAATGTTATCATAAGCAGAATGGCTCACTAAGGGGATGAACATTCTGG
+CAAGTCTTAATCTAGCATCACTACCAACTGTTTTTTCACTGGTCTAAAACATAACATCTA
+ATATTTTTATTTGCCTATATAGCCCTGCAGTAATAATTAACTTGTGAAAGGTCTACACTG
+ATTTTTTTCTTTATAAAGTATTTGCTTCAACCATGGAACAGAAACAAAGTTGATTTAATG
+GGAAAAATGTAAATTTATTTGAGGATAGGTTCAGCACTGCTAAATTGAAAAGGGAAAGAA
+AATTTCAAACAAACCCCAAAAAGAAACATTTTCACATCTGTAGTGAAAAAGCTAAATAGC
+AGTACACTAAACTTGGTTACTGACAAAATAAATCCTTGTGGCCAAGGTGAGGCAAAGATA
+TTACTCTCTACTTTGCTAGCCTACCTTGTGTTCAGTGAATTCAAACATTAATTGGGGCAA
+TGTATTGTTTTTGCTTCATTTTCTTTAGCTGTGCAGTAAAGTCTTTGCTTTCATAGAAAG
+ATTGTGCCAAGTTCTTTCTTTTTCAGAATTGCTTTTAAATCAGTTCTATTTAAGTATTTC
+TCTCTCTCTCAAACTTGACATTTTTCATTACACCTTTTTTTTCCAATGCCAAAATAGCAG
+CATTAACATTCATTCCAGTCACTAGGTAAAAACTCTCTTCAGTCAAACACACATGAATAA
+TCTAATTCGTGAGGTAAACGACTACTGTCATACAGTATTACGAAGTTCCAACCTTACTTG
+CAGAGCAAAAAATAAAAAATAAAAATAGTCATTTTCCTCAAGGGTTTAATTGTGGGCTTT
+TAATGTTGAAGTAAATCAATTTACTAGTTTTTAACCTCAAGTTTTTCCCTATTTCATTAT
+ATCTCAATAATGATTATCCAGAGATTTCTGAGGTAAATTACATTAAAGACAGCGCTAGTA
+ACTGGGAAACAATCCACACATGAATCTTAATATATTGCATATATGCTAATTTCATGTAAT
+GTTCCTGAGGGCGGTGACAATATTAGAGCTGTACGAGACCCTAGAGACCATCCCAATCCC
+TTAACTCCACAGATGAGAAACACAAAAACTTGCAGAAGCAGAGTGACGAGCACATGCCCA
+TGTGACATGTCATCAGAAGCAGGGACCAAGACTCAGGTCTCTTGCCTTCCAGGCCAGTCC
+TCTTTTGGTAGGCTAAAAGCTGCCTTTCACAGCTTCCCACAAATTTCAACACATCCTCAC
+TCAAGTACGTTTCTGAAAGGGACTCACCGACAACTCTTTAGAGAAAAGGAACTATATCTC
+AAACCATAGGAAAAGTTCTATATCTAGCTTATCAAACTTTTAGTCTCTGAAGCATGTGAG
+AGTAAGTTTGCCAGTATTTCTTTCTCATTATCCCCCAATACTTTAGTGACTATTTCCAAC
+AAACAAAACTAGACTATTCTCCCATACAAACAAAATGCTACCATCAAAATCGTCAACGCT
+CACCTGTCAGATTATGACCAAAGGCCTCTAAGTCAGAATCCCGCCCAGGCAAAATGACAT
+GGCAGCCAACTGTGGAGCCTCGGTTGGCCTCGGATAGCTGGTCCCCCCTTCATTCACACC
+ACATATCTGAGGACCAGATATGGAAAGCCTCTCATGCTGGAAAACTTGGCACAGTAGAAA
+GGCGGCCACACTTTTGCCCGTCACACAATGCACATTCACGGGGGACCTGTGCTAAACCAT
+TCACAGACAACCTGTTTCTGGGTCAGGGTTTCGTATACAGCAGAGCAGTCAGCTATCGAC
+ACAAGGGTTTGTAACAAAAAACAAAAAACAACAAAAAAAAGGAAGGACAAACCAAAACAA
+AAAAGAAAATACCAAAAAAAAGAAGAGTTAAAAAAAAAAGAAGATAAAAATAAAATCATT
+AACACAGATACACGACTACAATCTAATTTTCAGACATTCTAGTTTTGCCAATCATGCCGA
+TTCAGAATCATGAACCACATGGAGTTGCCATGTTTCTGCACACTTCTTTAGTGTGGAACA
+ATTCCTCAGTGTTTTCTTGGTTCCATGATCTTGACTTAATTTTGAGGTTTACAGGCCAGT
+TATTTTGCAGAATATCCCTAAACTCGGGTTACCTTGATGTTGCCTCCTGAAGATTTGGAT
+TATGCATGAAACACTTGACTGTCCCTATTCTAACTATATAAAGATGCTCCTTGACTTACT
+ATGGACTTACATCCCAATAAACCCTTCATAAATGGAAAACCTCATAAGGTGAAATGCTTG
+TTTTTTTTGGGAAAGTTTCTTGCTCTGTTGCCAGGCGGGAATGCGGTGACACGACGACGG
+CTCACTGTAGCCTTGACCACTCAGGTTCAAGCGATCCTCAGCCCCAGGCTCCCAAGTAGC
+TGGGACTATAGGCACGTACCACCACACCTGGCTAATTTTTTTTTTTTTTAATTTTTAGTA
+CCTTTAGTACAGAAAAGGCCTCACTATGTTGCCTAGATTGGTCTCAAACTCCTAAGCTCA
+AGGAATCCTCATGCCTCGGTCTCCCAAAGTCCTGGGATTATAGGCATGAGTCCTGCACCC
+AGCCTAGCTGAAAATACATTTAATACACCTAACTTACCAAACATCACAGCTTAGCTTAGC
+CTACCTTAAACATGTTCAGAGCACTTACATTAGCCTACAGTTGGGCAAAATCATCTAACG
+CAAAGCCCATTTTATAATAAAAGTGTTGAGTAACTCATATGATGTATGAAATACTATACT
+GAAAATGAAAAACAGAACAGTTGCGTGGGTACTCGGAAGTAAGGTTTCTACTGCATGTGT
+ATCACTTCCATACCATTGTAAAGTCATCTGACCACCAAGCCAAATATTTGTACTTCTGCA
+CATGCTGCTTTACATAGAGACCGAGACAAAGGGGGATTCACATCTTGTCAACCCTGTTTA
+CTTCAAAGGAATAACAGAGCATTAACTAGGTACCAAGTGCTCAGGACTTCAAGTATACTG
+TTTTATTTAATACTCATGACAATCCTTTGACCCCTTACTTGTGACTGGCAAGTTACCCAA
+CTTTTGTTTCATTTTCCTCATCTGTAAAAACCGGAGAGTGATACACACTCCCTGGAGAAT
+GACAGGGAAAACAGTACCAAGAACTTTAACCTAACAAGTCAGAAATCAATGAGACCCCTT
+TTGACAATTCCCACCTTGCCCTTTAGACATTCCGAGATATGCAGCTTTATTTAATAAATG
+ACCCCACCTTGGGCCTCAGCTGACTAGACCAGAGGCAGACACTCTCCCAGGCTGGAAAGG
+CCAGGTGGAGTGAGCTGTTCTGTGCAGAATGATCTCCCTAACAGTGTTTTTACACATTCT
+GTTCCACTTGGAATTGGATTCAGAGTGTTCTCAGCCTCTGGTCATCAAAAAACCTGAAGA
+CACATACCTTCAGGGAGCTAGGGTCAGCTCTCCTCTGCTGAGTGCATGGGGAAGTGACAA
+AAGAGAAAAGAGGGGGGAAAATCCAGCTGCTGAGAGATGTAGGGTACAAGAAGCTTTTGT
+AACTGTAAAGAGGGAGGGAGGTAGCTACGTTGGTCTATTTTCCAGTTCCTGGTTCCAACT
+ATACCTAAAGCTTTAACTGCACTTTCTATCCTTGCAGTAATATGACAAAGGCCCCC
diff --git a/test/tiny/q.fa.fai b/test/tiny/q.fa.fai
new file mode 100644
index 0000000..dd7673b
--- /dev/null
+++ b/test/tiny/q.fa.fai
@@ -0,0 +1 @@
+q 12356 3 60 61
diff --git a/test/tiny/q.regions b/test/tiny/q.regions
new file mode 100644
index 0000000..d129bcc
--- /dev/null
+++ b/test/tiny/q.regions
@@ -0,0 +1,3 @@
+q:0-5000
+q:5000-10000
+q:10000-12356
diff --git a/test/tiny/q.vcf.gz b/test/tiny/q.vcf.gz
new file mode 100644
index 0000000..87d557e
Binary files /dev/null and b/test/tiny/q.vcf.gz differ
diff --git a/test/tiny/q.vcf.gz.tbi b/test/tiny/q.vcf.gz.tbi
new file mode 100644
index 0000000..0781014
Binary files /dev/null and b/test/tiny/q.vcf.gz.tbi differ
diff --git a/test/tiny/q_spiked.vcf.gz b/test/tiny/q_spiked.vcf.gz
new file mode 100644
index 0000000..16f3ac4
Binary files /dev/null and b/test/tiny/q_spiked.vcf.gz differ
diff --git a/test/tiny/q_spiked.vcf.gz.tbi b/test/tiny/q_spiked.vcf.gz.tbi
new file mode 100644
index 0000000..0f0f0da
Binary files /dev/null and b/test/tiny/q_spiked.vcf.gz.tbi differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/freebayes.git
More information about the debian-med-commit
mailing list