[med-svn] [tvc] 03/05: New upstream version 5.0.3+dfsg1
Andreas Tille
tille at debian.org
Fri Jan 13 12:46:02 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository tvc.
commit c7f1a0f78ce0f69ce88e7d5f9d71012d21e181fd
Author: Andreas Tille <tille at debian.org>
Date: Fri Jan 13 13:31:39 2017 +0100
New upstream version 5.0.3+dfsg1
---
external/vcflib/BedReader.h | 123 --
external/vcflib/LICENSE | 19 -
external/vcflib/README | 29 -
external/vcflib/Variant.cpp | 2004 --------------------
external/vcflib/Variant.h | 480 -----
external/vcflib/convert.h | 22 -
external/vcflib/join.h | 36 -
external/vcflib/multichoose/Makefile | 17 -
external/vcflib/multichoose/README | 40 -
external/vcflib/multichoose/multichoose.c | 53 -
external/vcflib/multichoose/multichoose.cpp | 66 -
external/vcflib/multichoose/multichoose.h | 79 -
external/vcflib/multichoose/multichoose.py | 55 -
external/vcflib/multichoose/multipermute.cpp | 66 -
external/vcflib/multichoose/multipermute.h | 132 --
external/vcflib/multichoose/multipermute.py | 98 -
.../vcflib/smithwaterman/BandedSmithWaterman.cpp | 670 -------
.../vcflib/smithwaterman/BandedSmithWaterman.h | 111 --
external/vcflib/smithwaterman/IndelAllele.cpp | 62 -
external/vcflib/smithwaterman/IndelAllele.h | 37 -
external/vcflib/smithwaterman/LeftAlign.cpp | 853 ---------
external/vcflib/smithwaterman/LeftAlign.h | 32 -
external/vcflib/smithwaterman/Makefile | 34 -
external/vcflib/smithwaterman/Mosaik.h | 73 -
external/vcflib/smithwaterman/Repeats.cpp | 69 -
external/vcflib/smithwaterman/Repeats.h | 8 -
external/vcflib/smithwaterman/SWMain.cpp | 126 --
.../vcflib/smithwaterman/SmithWatermanGotoh.cpp | 741 --------
external/vcflib/smithwaterman/SmithWatermanGotoh.h | 100 -
external/vcflib/smithwaterman/convert.h | 22 -
external/vcflib/smithwaterman/smithwaterman.cpp | 246 ---
external/vcflib/split.cpp | 23 -
external/vcflib/split.h | 53 -
external/vcflib/tabixpp/ChangeLog | 593 ------
external/vcflib/tabixpp/NEWS | 126 --
external/vcflib/tabixpp/README | 6 -
external/vcflib/tabixpp/bam_endian.h | 42 -
external/vcflib/tabixpp/bedidx.c | 156 --
external/vcflib/tabixpp/bgzf.c | 711 -------
external/vcflib/tabixpp/bgzf.h | 157 --
external/vcflib/tabixpp/bgzip.c | 206 --
external/vcflib/tabixpp/index.c | 998 ----------
external/vcflib/tabixpp/khash.h | 486 -----
external/vcflib/tabixpp/knetfile.c | 632 ------
external/vcflib/tabixpp/knetfile.h | 75 -
external/vcflib/tabixpp/kseq.h | 227 ---
external/vcflib/tabixpp/ksort.h | 271 ---
external/vcflib/tabixpp/kstring.c | 165 --
external/vcflib/tabixpp/kstring.h | 68 -
external/vcflib/tabixpp/main.c | 290 ---
external/vcflib/tabixpp/main.cpp | 47 -
external/vcflib/tabixpp/tabix.1 | 132 --
external/vcflib/tabixpp/tabix.cpp | 90 -
external/vcflib/tabixpp/tabix.h | 145 --
external/vcflib/tabixpp/tabix.hpp | 31 -
55 files changed, 12233 deletions(-)
diff --git a/external/vcflib/BedReader.h b/external/vcflib/BedReader.h
deleted file mode 100644
index b78ca43..0000000
--- a/external/vcflib/BedReader.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef BEDREADER_H
-#define BEDREADER_H
-
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <deque>
-#include <map>
-#include <iterator>
-#include <algorithm>
-#include "intervaltree/IntervalTree.h"
-#include "split.h"
-
-using namespace std;
-
-string strip(string const& str, char const* separators = " \t") {
- string::size_type const first = str.find_first_not_of(separators);
- return (first == string::npos) ? string()
- : str.substr(first, str.find_last_not_of(separators) - first + 1);
-}
-
-// stores the posiitional information of a bed target entry
-class BedTarget {
-
-public:
-
- string seq; // sequence name
- int left; // left position
- int right; // right position, adjusted to 0-base
- string desc; // descriptive information, target name typically
-
- BedTarget(string s, int l, int r, string d = "")
- : seq(s)
- , left(l)
- , right(r)
- , desc(d)
- { }
-
-};
-
-
-class BedReader {
-
- bool _isOpen;
- ifstream file;
-
-public:
-
- bool isOpen(void) { return _isOpen; }
-
- vector<BedTarget> targets;
- map<string, IntervalTree<BedTarget*> > intervals; // intervals by reference sequence
-
- vector<BedTarget> entries(void) {
-
- vector<BedTarget> entries;
-
- if (!isOpen()) {
- cerr << "bed targets file is not open" << endl;
- exit(1);
- }
-
- string line;
- while (std::getline(file, line)) {
- vector<string> fields = split(line, " \t");
- BedTarget entry(strip(fields[0]),
- atoi(strip(fields[1]).c_str()),
- atoi(strip(fields[2]).c_str()),
- (fields.size() >= 4) ? strip(fields[3]) : "");
- entries.push_back(entry);
- }
-
- return entries;
-
- }
-
- vector<BedTarget*> targetsContained(BedTarget& target) {
- vector<Interval<BedTarget*> > results;
- intervals[target.seq].findContained(target.left, target.right, results);
- vector<BedTarget*> contained;
- for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
- contained.push_back(r->value);
- }
- return contained;
- }
-
- vector<BedTarget*> targetsOverlapping(BedTarget& target) {
- vector<Interval<BedTarget*> > results;
- intervals[target.seq].findOverlapping(target.left, target.right, results);
- vector<BedTarget*> overlapping;
- for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
- overlapping.push_back(r->value);
- }
- return overlapping;
- }
-
- BedReader(void)
- : _isOpen(false)
- { }
-
- BedReader(string& fname)
- : _isOpen(false) {
- open(fname);
- }
-
- void open(const string& fname) {
- file.open(fname.c_str());
- _isOpen = true;
- targets = entries();
- map<string, vector<Interval<BedTarget*> > > intervalsBySeq;
- for (vector<BedTarget>::iterator t = targets.begin(); t != targets.end(); ++t) {
- intervalsBySeq[t->seq].push_back(Interval<BedTarget*>(t->left, t->right, &*t));
- }
- for (map<string, vector<Interval<BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
- intervals[s->first] = IntervalTree<BedTarget*>(s->second);
- }
- }
-
-};
-
-#endif
-
diff --git a/external/vcflib/LICENSE b/external/vcflib/LICENSE
deleted file mode 100644
index 0708937..0000000
--- a/external/vcflib/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2012 Erik Garrison
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/external/vcflib/README b/external/vcflib/README
deleted file mode 100644
index a321e75..0000000
--- a/external/vcflib/README
+++ /dev/null
@@ -1,29 +0,0 @@
-vcflib
- a simple C++ library for parsing and manipulating VCF files.
-
-author: Erik Garrison <erik.garrison at bc.edu>
-
-license: MIT
-
-The Variant Call Format (VCF) is a flat-file, tab-delimited textual format
-intended to concisely describe reference-indexed variations between
-individuals. The current specification can be found on the 1000 Genomes wiki
-(http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41)
-
-This library provides a simple API to map each VCF record into a C++ class.
-See included applications for example usage.
-
-note: vcflib includes submodules, so to obtain vcflib you have to use:
-
- % git clone --recursive git://github.com/ekg/vcflib.git
-
-or
-
- % git clone --recursive https://github.com/ekg/vcflib.git
-
-To build, use Make:
-
- % cd vcflib
- % make
-
-Executables are built into the root directory of the repository.
diff --git a/external/vcflib/Variant.cpp b/external/vcflib/Variant.cpp
deleted file mode 100644
index 2baa44f..0000000
--- a/external/vcflib/Variant.cpp
+++ /dev/null
@@ -1,2004 +0,0 @@
-#include "Variant.h"
-
-namespace vcf {
-
- void Variant::parse(string& line, bool parseSamples) {
-
- // clean up potentially variable data structures
- info.clear();
- infoFlags.clear();
- format.clear();
- alt.clear();
- alleles.clear();
-
- // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [SAMPLE1 .. SAMPLEN]
- vector<string> fields = split(line, '\t');
-
- sequenceName = fields.at(0);
- char* end; // dummy variable for strtoll
- position = strtoll(fields.at(1).c_str(), &end, 10);
- id = fields.at(2);
- ref = fields.at(3);
- alt = split(fields.at(4), ","); // a comma-separated list of alternate alleles
-
- // make a list of all (ref + alts) alleles, allele[0] = ref, alleles[1:] = alts
- // add the ref allele ([0]), resize for the alt alleles, and then add the alt alleles
- alleles.push_back(ref);
- alleles.resize(alt.size()+1);
- std::copy(alt.begin(), alt.end(), alleles.begin()+1);
-
- // set up reverse lookup of allele index
- altAlleleIndexes.clear();
- int n = 0;
- for (vector<string>::iterator a = alt.begin();
- a != alt.end(); ++a, ++n) {
- altAlleleIndexes[*a] = n;
- }
-
- convert(fields.at(5), quality);
- filter = fields.at(6);
- if (fields.size() > 7) {
- vector<string> infofields = split(fields.at(7), ';');
- for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
- if (*f == ".") {
- continue;
- }
- vector<string> kv = split(*f, '=');
- if (kv.size() == 2) {
- split(kv.at(1), ',', info[kv.at(0)]);
- } else
- if (kv.size() == 1) {
- infoFlags[kv.at(0)] = true;
- }
- }
- }
- // check if we have samples specified
- // and that we are supposed to parse them
- if (parseSamples && fields.size() > 8) {
- format = split(fields.at(8), ':');
- // if the format changed, we have to rebuild the samples
- if (fields.at(8) != lastFormat) {
- samples.clear();
- lastFormat = fields.at(8);
- }
- vector<string>::iterator sampleName = sampleNames.begin();
- vector<string>::iterator sample = fields.begin() + 9;
- for (; sample != fields.end() && sampleName != sampleNames.end(); ++sample, ++sampleName) {
- string& name = *sampleName;
- if (*sample == "." || *sample == "./.") {
- samples.erase(name);
- continue;
- }
- vector<string> samplefields = split(*sample, ':');
- vector<string>::iterator i = samplefields.begin();
- if (samplefields.size() != format.size()) {
- // ignore this case... malformed (or 'null') sample specs are caught above
- // /*
- // cerr << "inconsistent number of fields for sample " << name << endl
- // << "format is " << join(format, ":") << endl
- // << "sample is " << *sample << endl;
- // exit(1);
- // *
- } else {
- for (vector<string>::iterator f = format.begin(); f != format.end(); ++f) {
- samples[name][*f] = split(*i, ',');
- ++i;
- }
- }
- }
- if (sampleName != sampleNames.end()) {
- cerr << "error: more sample names in header than sample fields" << endl;
- cerr << "samples: " << join(sampleNames, " ") << endl;
- cerr << "line: " << line << endl;
- exit(1);
- }
- if (sample != fields.end()) {
- cerr << "error: more sample fields than samples listed in header" << endl;
- cerr << "samples: " << join(sampleNames, " ") << endl;
- cerr << "line: " << line << endl;
- exit(1);
- }
- }
-
- //return true; // we should be catching exceptions...
- }
-
- void Variant::setVariantCallFile(VariantCallFile& v) {
- sampleNames = v.sampleNames;
- outputSampleNames = v.sampleNames;
- vcf = &v;
- }
-
- void Variant::setVariantCallFile(VariantCallFile* v) {
- sampleNames = v->sampleNames;
- outputSampleNames = v->sampleNames;
- vcf = v;
- }
-
- ostream& operator<<(ostream& out, VariantFieldType type) {
- switch (type) {
- case FIELD_INTEGER:
- out << "integer";
- break;
- case FIELD_FLOAT:
- out << "float";
- break;
- case FIELD_BOOL:
- out << "bool";
- break;
- case FIELD_STRING:
- out << "string";
- break;
- default:
- out << "unknown";
- break;
- }
- return out;
- }
-
- VariantFieldType typeStrToVariantFieldType(string& typeStr) {
- if (typeStr == "Integer") {
- return FIELD_INTEGER;
- } else
- if (typeStr == "Float") {
- return FIELD_FLOAT;
- } else
- if (typeStr == "Flag") {
- return FIELD_BOOL;
- } else
- if (typeStr == "String") {
- return FIELD_STRING;
- } else {
- return FIELD_UNKNOWN;
- }
- }
-
- VariantFieldType Variant::infoType(string& key) {
- map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- if (key == "QUAL") { // hack to use QUAL as an "info" field
- return FIELD_INTEGER;
- }
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- return s->second;
- }
- }
-
- VariantFieldType Variant::formatType(string& key) {
- map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
- if (s == vcf->formatTypes.end()) {
- cerr << "no format field " << key << endl;
- exit(1);
- } else {
- return s->second;
- }
- }
-
- bool Variant::getInfoValueBool(string& key, int index) {
- map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- int count = vcf->infoCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- if (type == FIELD_BOOL) {
- map<string, bool>::iterator b = infoFlags.find(key);
- if (b == infoFlags.end())
- return false;
- else
- return true;
- } else {
- cerr << "not flag type " << key << endl;
- exit(1);
- }
- }
- }
-
- string Variant::getInfoValueString(string& key, int index) {
- map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- int count = vcf->infoCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- if (type == FIELD_STRING) {
- map<string, vector<string> >::iterator b = info.find(key);
- if (b == info.end())
- return "";
- return b->second.at(index);
- } else {
- cerr << "not string type " << key << endl;
- return "";
- }
- }
- }
-
- double Variant::getInfoValueFloat(string& key, int index) {
- map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- if (key == "QUAL") {
- return quality;
- }
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- int count = vcf->infoCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
- map<string, vector<string> >::iterator b = info.find(key);
- if (b == info.end())
- return false;
- double r;
- if (!convert(b->second.at(index), r)) {
- cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
- exit(1);
- }
- return r;
- } else {
- cerr << "unsupported type for variant record " << type << endl;
- exit(1);
- }
- }
- }
-
- int Variant::getNumSamples(void) {
- return sampleNames.size();
- }
-
- int Variant::getNumValidGenotypes(void) {
- int valid_genotypes = 0;
- map<string, map<string, vector<string> > >::const_iterator s = samples.begin();
- map<string, map<string, vector<string> > >::const_iterator sEnd = samples.end();
- for (; s != sEnd; ++s) {
- map<string, vector<string> > sample_info = s->second;
- if (sample_info["GT"].front() != "./.") {
- valid_genotypes++;
- }
- }
- return valid_genotypes;
- }
-
- bool Variant::getSampleValueBool(string& key, string& sample, int index) {
- map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- int count = vcf->formatCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- map<string, vector<string> >& sampleData = samples[sample];
- if (type == FIELD_BOOL) {
- map<string, vector<string> >::iterator b = sampleData.find(key);
- if (b == sampleData.end())
- return false;
- else
- return true;
- } else {
- cerr << "not bool type " << key << endl;
- exit(1);
- }
- }
- }
-
- string Variant::getSampleValueString(string& key, string& sample, int index) {
- map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- int count = vcf->formatCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- map<string, vector<string> >& sampleData = samples[sample];
- if (type == FIELD_STRING) {
- map<string, vector<string> >::iterator b = sampleData.find(key);
- if (b == sampleData.end()) {
- return "";
- } else {
- return b->second.at(index);
- }
- } else {
- cerr << "not string type " << key << endl;
- exit(1);
- }
- }
- }
-
- double Variant::getSampleValueFloat(string& key, string& sample, int index) {
- map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
- if (s == vcf->infoTypes.end()) {
- cerr << "no info field " << key << endl;
- exit(1);
- } else {
- // XXX TODO wrap this with a function call
- int count = vcf->formatCounts[key];
- // XXX TODO, fix for Genotype variants...
- if (count != ALLELE_NUMBER) {
- index = 0;
- }
- if (index == INDEX_NONE) {
- if (count != 1) {
- cerr << "no field index supplied and field count != 1" << endl;
- exit(1);
- } else {
- index = 0;
- }
- }
- VariantFieldType type = s->second;
- map<string, vector<string> >& sampleData = samples[sample];
- if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
- map<string, vector<string> >::iterator b = sampleData.find(key);
- if (b == sampleData.end())
- return false;
- double r;
- if (!convert(b->second.at(index), r)) {
- cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
- exit(1);
- }
- return r;
- } else {
- cerr << "unsupported type for sample " << type << endl;
- exit(1);
- }
- }
- }
-
- bool Variant::getValueBool(string& key, string& sample, int index) {
- if (sample.empty()) { // an empty sample name means
- return getInfoValueBool(key, index);
- } else {
- return getSampleValueBool(key, sample, index);
- }
- }
-
- double Variant::getValueFloat(string& key, string& sample, int index) {
- if (sample.empty()) { // an empty sample name means
- return getInfoValueFloat(key, index);
- } else {
- return getSampleValueFloat(key, sample, index);
- }
- }
-
- string Variant::getValueString(string& key, string& sample, int index) {
- if (sample.empty()) { // an empty sample name means
- return getInfoValueString(key, index);
- } else {
- return getSampleValueString(key, sample, index);
- }
- }
-
- int Variant::getAltAlleleIndex(string& allele) {
- map<string, int>::iterator f = altAlleleIndexes.find(allele);
- if (f == altAlleleIndexes.end()) {
- cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
- exit(1);
- } else {
- return f->second;
- }
- }
-
- void Variant::addFilter(string& tag) {
- if (filter == "" || filter == ".")
- filter = tag;
- else
- filter += "," + tag;
- }
-
- void Variant::addFormatField(string& key) {
- bool hasTag = false;
- for (vector<string>::iterator t = format.begin(); t != format.end(); ++t) {
- if (*t == key) {
- hasTag = true;
- break;
- }
- }
- if (!hasTag) {
- format.push_back(key);
- }
- }
-
- void Variant::printAlt(ostream& out) {
- for (vector<string>::iterator i = alt.begin(); i != alt.end(); ++i) {
- out << *i;
- // add a comma for all but the last alternate allele
- if (i != (alt.end() - 1))
- out << ",";
- }
- }
-
- void Variant::printAlleles(ostream& out) {
- for (vector<string>::iterator i = alleles.begin(); i != alleles.end(); ++i) {
- out << *i;
- // add a comma for all but the last alternate allele
- if (i != (alleles.end() - 1))
- out << ",";
- }
- }
-
- ostream& operator<<(ostream& out, Variant& var) {
- out << var.sequenceName << "\t"
- << var.position << "\t"
- << var.id << "\t"
- << var.ref << "\t";
- // report the list of alternate alleles.
- var.printAlt(out);
- out << "\t"
- << var.quality << "\t"
- << var.filter << "\t";
- for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ++i) {
- if (!i->second.empty()) {
- out << ((i == var.info.begin()) ? "" : ";") << i->first << "=" << join(i->second, ",");
- }
- }
- for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ++i) {
- if (i == var.infoFlags.end()) {
- out << "";
- } else
- if (i == var.infoFlags.begin() && var.info.empty()) {
- out << "";
- } else {
- out << ";";
- }
- out << i->first;
- }
- if (!var.format.empty()) {
- out << "\t";
- for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
- out << ((f == var.format.begin()) ? "" : ":") << *f;
- }
- for (vector<string>::iterator s = var.outputSampleNames.begin(); s != var.outputSampleNames.end(); ++s) {
- out << "\t";
- map<string, map<string, vector<string> > >::iterator sampleItr = var.samples.find(*s);
- if (sampleItr == var.samples.end()) {
- out << ".";
- } else {
- map<string, vector<string> >& sample = sampleItr->second;
- if (sample.size() == 0) {
- out << ".";
- } else {
- for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
- map<string, vector<string> >::iterator g = sample.find(*f);
- out << ((f == var.format.begin()) ? "" : ":");
- if (g != sample.end()) {
- out << join(g->second, ",");
- } else {
- out << ".";
- }
- }
- }
- }
- }
- }
- return out;
- }
-
- void Variant::setOutputSampleNames(vector<string>& samplesToOutput) {
- outputSampleNames = samplesToOutput;
- }
-
-
-// shunting yard algorithm
- void infixToPrefix(queue<RuleToken> tokens, queue<RuleToken>& prefixtokens) {
- stack<RuleToken> ops;
- while (!tokens.empty()) {
- RuleToken& token = tokens.front();
- if (isOperator(token)) {
- //cerr << "found operator " << token.value << endl;
- while (ops.size() > 0 && isOperator(ops.top())
- && ((isLeftAssociative(token) && priority(token) <= priority(ops.top()))
- || (isRightAssociative(token) && priority(token) < priority(ops.top())))) {
- prefixtokens.push(ops.top());
- ops.pop();
- }
- ops.push(token);
- } else
- if (isLeftParenthesis(token)) {
- //cerr << "found paran " << token.value << endl;
- ops.push(token);
- } else
- if (isRightParenthesis(token)) {
- //cerr << "found paran " << token.value << endl;
- while (ops.size() > 0 && !isLeftParenthesis(ops.top())) {
- prefixtokens.push(ops.top());
- ops.pop();
- }
- if (ops.size() == 0) {
- cerr << "error: mismatched parentheses" << endl;
- exit(1);
- }
- if (isLeftParenthesis(ops.top())) {
- ops.pop();
- }
- } else {
- //cerr << "found operand " << token.value << endl;
- prefixtokens.push(token);
- }
- tokens.pop();
- }
- while (ops.size() > 0) {
- if (isRightParenthesis(ops.top()) || isLeftParenthesis(ops.top())) {
- cerr << "error: mismatched parentheses" << endl;
- exit(1);
- }
- prefixtokens.push(ops.top());
- ops.pop();
- }
- }
-
- RuleToken::RuleToken(string tokenstr, map<string, VariantFieldType>& variables) {
- isVariable = false;
- if (tokenstr == "!") {
- type = RuleToken::NOT_OPERATOR;
- } else
- if (tokenstr == "&") {
- type = RuleToken::AND_OPERATOR;
- } else
- if (tokenstr == "|") {
- type = RuleToken::OR_OPERATOR;
- } else
- if (tokenstr == "+") {
- type = RuleToken::ADD_OPERATOR;
- } else
- if (tokenstr == "-") {
- type = RuleToken::SUBTRACT_OPERATOR;
- } else
- if (tokenstr == "*") {
- type = RuleToken::MULTIPLY_OPERATOR;
- } else
- if (tokenstr == "/") {
- type = RuleToken::DIVIDE_OPERATOR;
- } else
- if (tokenstr == "=") {
- type = RuleToken::EQUAL_OPERATOR;
- } else
- if (tokenstr == ">") {
- type = RuleToken::GREATER_THAN_OPERATOR;
- } else
- if (tokenstr == "<") {
- type = RuleToken::LESS_THAN_OPERATOR;
- } else
- if (tokenstr == "(") {
- type = RuleToken::LEFT_PARENTHESIS;
- } else
- if (tokenstr == ")") {
- type = RuleToken::RIGHT_PARENTHESIS;
- } else { // operand
- type = RuleToken::OPERAND;
- if (variables.find(tokenstr) == variables.end()) {
- if (convert(tokenstr, number)) {
- type = RuleToken::NUMBER;
- } else
- if (tokenstr == "QUAL") {
- isVariable = true;
- } else {
- type = RuleToken::STRING_VARIABLE;
- }
- } else {
- isVariable = true;
- }
- }
- value = tokenstr;
- }
-
-
- void tokenizeFilterSpec(string& filterspec, queue<RuleToken>& tokens, map<string, VariantFieldType>& variables) {
- string lastToken = "";
- bool inToken = false;
- for (unsigned int i = 0; i < filterspec.size(); ++i) {
- char c = filterspec.at(i);
- if (c == ' ' || c == '\n') {
- inToken = false;
- if (!inToken && lastToken.size() > 0) {
- tokens.push(RuleToken(lastToken, variables));
- lastToken = "";
- }
- } else
- if (!inToken && (isOperatorChar(c) || isParanChar(c))) {
- inToken = false;
- if (lastToken.size() > 0) {
- tokens.push(RuleToken(lastToken, variables));
- lastToken = "";
- }
- tokens.push(RuleToken(filterspec.substr(i,1), variables));
- } else {
- inToken = true;
- lastToken += c;
- }
- }
- // get the last token
- if (inToken) {
- tokens.push(RuleToken(lastToken, variables));
- }
- }
-
-// class which evaluates filter expressions
-// allow filters to be defined using boolean infix expressions e.g.:
-//
-// "GQ > 10 & (DP < 3 | DP > 5) & SAMPLE = NA12878"
-// or
-// "GT = 1/1 | GT = 0/0"
-//
-// on initialization, tokenizes the input sequence, and converts it from infix to postfix
-// on call to
-//
-
-
- VariantFilter::VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables) {
- type = filtertype;
- spec = filterspec;
- tokenizeFilterSpec(filterspec, tokens, variables);
- infixToPrefix(tokens, rules);
- /*while (!rules.empty()) {
- cerr << " " << rules.front().value << ((isNumeric(rules.front())) ? "f" : "");
- rules.pop();
- }
- */
- //cerr << endl;
- //cerr << join(" ", tokens) << endl;
- }
-
-// all alts pass
- bool VariantFilter::passes(Variant& var, string& sample) {
- for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
- string& allele = *a;
- if (!passes(var, sample, allele)) {
- return false;
- }
- }
- return true;
- }
-
- bool VariantFilter::passes(Variant& var, string& sample, string& allele) {
- // to evaluate a rpn boolean queue with embedded numbers and variables
- // make a result stack, use float to allow comparison of floating point
- // numbers, booleans, and integers
- stack<RuleToken> results;
- queue<RuleToken> rulesCopy = rules; // copy
-
- int index;
- if (allele.empty()) {
- index = 0; // apply to the whole record
- } else {
- // apply to a specific allele
- index = var.getAltAlleleIndex(allele);
- }
-
- while (!rulesCopy.empty()) {
- RuleToken token = rulesCopy.front();
- rulesCopy.pop();
- // pop operands from the front of the queue and push them onto the stack
- if (isOperand(token)) {
- //cout << "is operand: " << token.value << endl;
- // if the token is variable, i.e. not evaluated in this context, we
- // must evaluate it before pushing it onto the stack
- if (token.isVariable) {
- //cout << "is variable" << endl;
- // look up the variable using the Variant, depending on our filter type
- //cout << "token.value " << token.value << endl;
- VariantFieldType vtype;
- if (sample.empty()) { // means we are record-specific
- vtype = var.infoType(token.value);
- } else {
- vtype = var.formatType(token.value);
- //cout << "type = " << type << endl;
- }
- //cout << "type: " << type << endl;
-
- if (vtype == FIELD_INTEGER || vtype == FIELD_FLOAT) {
- token.type = RuleToken::NUMERIC_VARIABLE;
- token.number = var.getValueFloat(token.value, sample, index);
- //cerr << "number: " << token.number << endl;
- } else
- if (vtype == FIELD_BOOL) {
- token.type = RuleToken::BOOLEAN_VARIABLE;
- token.state = var.getValueBool(token.value, sample, index);
- //cerr << "state: " << token.state << endl;
- } else
- if (vtype == FIELD_STRING) {
- //cout << "token.value = " << token.value << endl;
- token.type = RuleToken::STRING_VARIABLE;
- token.str = var.getValueString(token.value, sample, index);
- } else
- if (isString(token)) {
- token.type = RuleToken::STRING_VARIABLE;
- token.str = var.getValueString(token.value, sample, index);
- //cerr << "string: " << token.str << endl;
- }
- } else {
- double f;
- string s;
- //cerr << "parsing operand" << endl;
- if (convert(token.value, f)) {
- token.type = RuleToken::NUMERIC_VARIABLE;
- token.number = f;
- //cerr << "number: " << token.number << endl;
- } else
- if (convert(token.value, s)) {
- token.type = RuleToken::STRING_VARIABLE;
- token.str = s;
- //cerr << "string: " << token.str << endl;
- } else {
- cerr << "could not parse non-variable operand " << token.value << endl;
- exit(1);
- }
- }
- results.push(token);
- }
- // apply operators to the first n elements on the stack and push the result back onto the stack
- else
- if (isOperator(token)) {
- //cerr << "is operator: " << token.value << endl;
- RuleToken a, b, r;
- // is it a not-operator?
- switch (token.type) {
- case(RuleToken::NOT_OPERATOR):
- a = results.top();
- results.pop();
- if (!isBoolean(a)) {
- cerr << "cannot negate a non-boolean" << endl;
- } else {
- a.state = !a.state;
- results.push(a);
- }
- break;
-
- case(RuleToken::EQUAL_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type) {
- switch (a.type) {
- case(RuleToken::STRING_VARIABLE):
- r.state = (a.str == b.str);
- break;
- case(RuleToken::NUMERIC_VARIABLE):
- r.state = (a.number == b.number);
- break;
- case(RuleToken::BOOLEAN_VARIABLE):
- r.state = (a.state == b.state);
- break;
- default:
- cerr << "should not get here" << endl;
- exit(1);
- break;
- }
- }
- results.push(r);
- break;
-
- case(RuleToken::GREATER_THAN_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.state = (b.number > a.number);
- } else {
- cerr << "cannot compare (>) objects of dissimilar types" << endl;
- ;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::LESS_THAN_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.state = (b.number < a.number);
- } else {
- cerr << "cannot compare (<) objects of dissimilar types" << endl;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::ADD_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.number = (b.number + a.number);
- r.type = RuleToken::NUMERIC_VARIABLE;
- } else {
- cerr << "cannot add objects of dissimilar types" << endl;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::SUBTRACT_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.number = (b.number - a.number);
- r.type = RuleToken::NUMERIC_VARIABLE;
- } else {
- cerr << "cannot subtract objects of dissimilar types" << endl;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::MULTIPLY_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.number = (b.number * a.number);
- r.type = RuleToken::NUMERIC_VARIABLE;
- } else {
- cerr << "cannot multiply objects of dissimilar types" << endl;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::DIVIDE_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
- r.number = (b.number / a.number);
- r.type = RuleToken::NUMERIC_VARIABLE;
- } else {
- cerr << "cannot divide objects of dissimilar types" << endl;
- cerr << a.type << " " << b.type << endl;
- exit(1);
- }
- results.push(r);
- break;
-
- case(RuleToken::AND_OPERATOR):
- case(RuleToken::OR_OPERATOR):
- a = results.top();
- results.pop();
- b = results.top();
- results.pop();
- if (a.type == b.type && a.type == RuleToken::BOOLEAN_VARIABLE) {
- if (token.type == RuleToken::AND_OPERATOR) {
- r.state = (a.state && b.state);
- } else {
- r.state = (a.state || b.state);
- }
- } else {
- cerr << "cannot compare (& or |) objects of dissimilar types" << endl;
- exit(1);
- }
- results.push(r);
- break;
- default:
- cerr << "should not get here!" << endl;
- exit(1);
- break;
- }
- }
- }
- // at the end you should have only one value on the stack, return it as a boolean
- if (results.size() == 1) {
- if (isBoolean(results.top())) {
- return results.top().state;
- } else {
- cerr << "error, non-boolean value left on stack" << endl;
- //cerr << results.top().value << endl;
- exit(1);
- }
- } else
- if (results.size() > 1) {
- cerr << "more than one value left on results stack!" << endl;
- while (!results.empty()) {
- cerr << results.top().value << endl;
- results.pop();
- }
- exit(1);
- } else {
- cerr << "results stack empty" << endl;
- exit(1);
- }
- }
-
- void VariantFilter::removeFilteredGenotypes(Variant& var) {
-
- for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
- string& name = *s;
- if (!passes(var, name)) {
- var.samples.erase(name);
- }
- }
- }
-
- /*
- bool VariantCallFile::openVCF(string& filename) {
- file.open(filename.c_str(), ifstream::in);
- if (!file.is_open()) {
- cerr << "could not open " << filename << endl;
- return false;
- } else {
- return parseHeader();
- }
- }
-
- bool VariantCallFile::openVCF(ifstream& stream) {
- file = stream;
- if (!file.is_open()) {
- cerr << "provided file is not open" << endl;
- return false;
- } else {
- return parseHeader();
- }
- }
- */
-
- void VariantCallFile::updateSamples(vector<string>& newSamples) {
- sampleNames = newSamples;
- // regenerate the last line of the header
- vector<string> headerLines = split(header, '\n');
- vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
- vector<string> newcolnames;
- newcolnames.resize(9 + sampleNames.size());
- copy(colnames.begin(), colnames.begin() + 9, newcolnames.begin());
- copy(sampleNames.begin(), sampleNames.end(), newcolnames.begin() + 9);
- headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
- header = join(headerLines, "\n");
- }
-
-// TODO cleanup, store header lines instead of bulk header
- void VariantCallFile::addHeaderLine(string line) {
- vector<string> headerLines = split(header, '\n');
- headerLines.insert(headerLines.end() - 1, line);
- header = join(unique(headerLines), "\n");
- }
-
-// helper to addHeaderLine
- vector<string>& unique(vector<string>& strings) {
- set<string> uniq;
- vector<string> res;
- for (vector<string>::const_iterator s = strings.begin(); s != strings.end(); ++s) {
- if (uniq.find(*s) == uniq.end()) {
- res.push_back(*s);
- uniq.insert(*s);
- }
- }
- strings = res;
- return strings;
- }
-
- vector<string> VariantCallFile::infoIds(void) {
- vector<string> tags;
- vector<string> headerLines = split(header, '\n');
- for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
- string& line = *s;
- if (line.find("##INFO") == 0) {
- size_t pos = line.find("ID=");
- if (pos != string::npos) {
- pos += 3;
- size_t tagend = line.find(",", pos);
- if (tagend != string::npos) {
- tags.push_back(line.substr(pos, tagend - pos));
- }
- }
- }
- }
- return tags;
- }
-
- vector<string> VariantCallFile::formatIds(void) {
- vector<string> tags;
- vector<string> headerLines = split(header, '\n');
- for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
- string& line = *s;
- if (line.find("##FORMAT") == 0) {
- size_t pos = line.find("ID=");
- if (pos != string::npos) {
- pos += 3;
- size_t tagend = line.find(",", pos);
- if (tagend != string::npos) {
- tags.push_back(line.substr(pos, tagend - pos));
- }
- }
- }
- }
- return tags;
- }
-
- void VariantCallFile::removeInfoHeaderLine(string tag) {
- vector<string> headerLines = split(header, '\n');
- vector<string> newHeader;
- string id = "ID=" + tag;
- for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
- string& line = *s;
- if (line.find("##INFO") == 0) {
- if (line.find(id) == string::npos) {
- newHeader.push_back(line);
- }
- } else {
- newHeader.push_back(line);
- }
- }
- header = join(newHeader, "\n");
- }
-
- void VariantCallFile::removeGenoHeaderLine(string tag) {
- vector<string> headerLines = split(header, '\n');
- vector<string> newHeader;
- string id = "ID=" + tag;
- for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
- string& headerLine = *s;
- if (headerLine.find("##FORMAT") == 0) {
- if (headerLine.find(id) == string::npos) {
- newHeader.push_back(headerLine);
- }
- } else {
- newHeader.push_back(headerLine);
- }
- }
- header = join(newHeader, "\n");
- }
-
- bool VariantCallFile::parseHeader(void) {
-
- string headerStr = "";
-
- if (usingTabix) {
- tabixFile->getHeader(headerStr);
- if (headerStr.empty()) {
- cerr << "error: no VCF header" << endl;
- exit(1);
- }
- tabixFile->getNextLine(line);
- firstRecord = true;
- } else {
- while (std::getline(*file, line)) {
- if (line.substr(0,1) == "#") {
- headerStr += line + '\n';
- } else {
- // done with header
- if (headerStr.empty()) {
- cerr << "error: no VCF header" << endl;
- exit(1);
- }
- firstRecord = true;
- break;
- }
- }
- }
-
- return parseHeader(headerStr);
-
- }
-
- bool VariantCallFile::parseHeader(string& hs) {
-
- if (hs.substr(hs.size() - 1, 1) == "\n") {
- hs.erase(hs.size() - 1, 1); // remove trailing newline
- }
- header = hs; // stores the header in the object instance
-
- vector<string> headerLines = split(header, "\n");
- for (vector<string>::iterator h = headerLines.begin(); h != headerLines.end(); ++h) {
- string headerLine = *h;
- if (headerLine.substr(0,2) == "##") {
- // meta-information headerLines
- // TODO parse into map from info/format key to type
- // ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
- // ##FORMAT=<ID=CB,Number=1,Type=String,Description="Called by S(Sanger), M(UMich), B(BI)">
- size_t found = headerLine.find_first_of("=");
- string entryType = headerLine.substr(2, found - 2);
- // handle reference here, no "<" and ">" given
- //} else if (entryType == "reference") {
- size_t dataStart = headerLine.find_first_of("<");
- size_t dataEnd = headerLine.find_first_of(">");
- if (dataStart != string::npos && dataEnd != string::npos) {
- string entryData = headerLine.substr(dataStart + 1, dataEnd - dataStart - 1);
- // XXX bad; this will break if anyone ever moves the order
- // of the fields around to include a "long form" string
- // including either a = or , in the first or second field
- if (entryType == "INFO" || entryType == "FORMAT") {
- vector<string> fields = split(entryData, "=,");
- if (fields[0] != "ID") {
- cerr << "header parse error at:" << endl
- << "fields[0] != \"ID\"" << endl
- << headerLine << endl;
- exit(1);
- }
- string id = fields[1];
- if (fields[2] != "Number") {
- cerr << "header parse error at:" << endl
- << "fields[2] != \"Number\"" << endl
- << headerLine << endl;
- exit(1);
- }
- int number;
- string numberstr = fields[3].c_str();
- // XXX TODO VCF has variable numbers of fields...
- if (numberstr == "A") {
- number = ALLELE_NUMBER;
- } else
- if (numberstr == "G") {
- number = GENOTYPE_NUMBER;
- } else
- if (numberstr == ".") {
- number = 1;
- } else {
- convert(numberstr, number);
- }
- if (fields[4] != "Type") {
- cerr << "header parse error at:" << endl
- << "fields[4] != \"Type\"" << endl
- << headerLine << endl;
- exit(1);
- }
- VariantFieldType type = typeStrToVariantFieldType(fields[5]);
- if (entryType == "INFO") {
- infoCounts[id] = number;
- infoTypes[id] = type;
- //cerr << id << " == " << type << endl;
- } else
- if (entryType == "FORMAT") {
- //cout << "found format field " << id << " with type " << type << endl;
- formatCounts[id] = number;
- formatTypes[id] = type;
- }
- }
- }
- } else
- if (headerLine.substr(0,1) == "#") {
- // field name headerLine
- vector<string> fields = split(headerLine, '\t');
- if (fields.size() > 8) {
- sampleNames.resize(fields.size() - 9);
- copy(fields.begin() + 9, fields.end(), sampleNames.begin());
- }
- }
- }
-
- return true;
- }
-
- bool VariantCallFile::getNextVariant(Variant& var) {
- if (firstRecord && !justSetRegion) {
- if (!line.empty()) {
- var.parse(line, parseSamples);
- firstRecord = false;
- _done = false;
- return true;
- } else {
- return false;
- }
- }
- if (usingTabix) {
- if (justSetRegion && !line.empty()) {
- if (firstRecord) {
- firstRecord = false;
- }
- var.parse(line, parseSamples);
- line.clear();
- justSetRegion = false;
- _done = false;
- return true;
- } else
- if (tabixFile->getNextLine(line)) {
- var.parse(line, parseSamples);
- _done = false;
- return true;
- } else {
- _done = true;
- return false;
- }
- } else {
- if (std::getline(*file, line)) {
- var.parse(line, parseSamples);
- _done = false;
- return true;
- } else {
- _done = true;
- return false;
- }
- }
- }
-
- bool VariantCallFile::setRegion(string seq, long int start, long int end) {
- stringstream regionstr;
- if (end) {
- regionstr << seq << ":" << start << "-" << end;
- } else {
- regionstr << seq << ":" << start;
- }
- return setRegion(regionstr.str());
- }
-
- bool VariantCallFile::setRegion(string region) {
- if (!usingTabix) {
- cerr << "cannot setRegion on a non-tabix indexed file " << region << endl;
- exit(1);
- }
- size_t dots = region.find("..");
- // convert between bamtools/freebayes style region string and tabix/samtools style
- if (dots != string::npos) {
- region.replace(dots, 2, "-");
- }
- if (tabixFile->setRegion(region)) {
- if (tabixFile->getNextLine(line)) {
- justSetRegion = true;
- return true;
- } else {
- return false;
- }
- } else {
- return false;
- }
- }
-
-
-// genotype manipulation
-
- /*
- map<string, int> decomposeGenotype(string& genotype) {
- string splitter = "/";
- if (genotype.find("|") != string::npos) {
- splitter = "|";
- }
- vector<string> haps = split(genotype, splitter);
- map<string, int> decomposed;
- for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
- ++decomposed[*h];
- }
- return decomposed;
- }
- */
-
- map<int, int> decomposeGenotype(string& genotype) {
- string splitter = "/";
- if (genotype.find("|") != string::npos) {
- splitter = "|";
- }
- vector<string> haps = split(genotype, splitter);
- map<int, int> decomposed;
- for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
- int alt;
- if (*h == ".") {
- ++decomposed[NULL_ALLELE];
- } else {
- convert(*h, alt);
- ++decomposed[alt];
- }
- }
- return decomposed;
- }
-
- string genotypeToString(map<int, int>& genotype) {
- vector<int> s;
- for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
- int a = g->first;
- int c = g->second;
- for (int i = 0; i < c; ++i)
- s.push_back(a);
- }
- sort(s.begin(), s.end());
- vector<string> r;
- for (vector<int>::iterator i = s.begin(); i != s.end(); ++i) {
- if (*i == NULL_ALLELE)
- r.push_back(".");
- else
- r.push_back(convert(*i));
- }
- return join(r, "/"); // TODO adjust for phased/unphased
- }
-
- bool isHet(map<int, int>& genotype) {
- return genotype.size() > 1;
- }
-
- bool isHom(map<int, int>& genotype) {
- return genotype.size() == 1;
- }
-
- bool hasNonRef(map<int, int>& genotype) {
- for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
- if (g->first != 0) {
- return true;
- }
- }
- return false;
- }
-
- bool isHomRef(map<int, int>& genotype) {
- return isHom(genotype) && !hasNonRef(genotype);
- }
-
- bool isHomNonRef(map<int, int>& genotype) {
- return isHom(genotype) && hasNonRef(genotype);
- }
-
- bool isNull(map<int, int>& genotype) {
- return genotype.find(NULL_ALLELE) != genotype.end();
- }
-
- int ploidy(map<int, int>& genotype) {
- int i = 0;
- for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
- i += g->second;
- }
- return i;
- }
-
- map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
- bool useMNPs,
- bool useEntropy,
- float matchScore,
- float mismatchScore,
- float gapOpenPenalty,
- float gapExtendPenalty,
- float repeatGapExtendPenalty,
- string flankingRefLeft,
- string flankingRefRight) {
-
- map<string, vector<VariantAllele> > variantAlleles;
-
- // add the reference allele
- variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
- // single SNP case, no ambiguity possible, no need to spend a lot of
- // compute aligning ref and alt fields
- if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
- variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
- return variantAlleles;
- }
-
- // padding is used to ensure a stable alignment of the alternates to the reference
- // without having to go back and look at the full reference sequence
- int paddingLen = max(10, (int)(ref.size())); // dynamically determine optimum padding length
- for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
- string& alternate = *a;
- paddingLen = max(paddingLen, (int)(alternate.size()));
- }
- char padChar = 'Z';
- char anchorChar = 'Q';
- string padding(paddingLen, padChar);
-
- // this 'anchored' string is done for stability
- // the assumption is that there should be a positional match in the first base
- // this is true for VCF 4.1, and standard best practices
- // using the anchor char ensures this without other kinds of realignment
- string reference_M;
- if (flankingRefLeft.empty() && flankingRefRight.empty()) {
- reference_M = padding + ref + padding;
- reference_M[paddingLen] = anchorChar;
- } else {
- reference_M = flankingRefLeft + ref + flankingRefRight;
- paddingLen = flankingRefLeft.size();
- }
-
- // passed to sw.Align
- unsigned int referencePos;
-
- string cigar;
-
- for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
- //cerr << " vcflib : alt = " << *a << endl;
- string& alternate = *a;
- vector<VariantAllele>& variants = variantAlleles[alternate];
- string alternateQuery_M;
- if (flankingRefLeft.empty() && flankingRefRight.empty()) {
- alternateQuery_M = padding + alternate + padding;
- alternateQuery_M[paddingLen] = anchorChar;
- } else {
- alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
- }
- //const unsigned int alternateLen = alternate.size();
-
- CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
- if (useEntropy)
- sw.EnableEntropyGapPenalty(1);
- if (repeatGapExtendPenalty != 0)
- sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
- sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
-
- // left-realign the alignment...
-
- vector<pair<int, string> > cigarData = splitCigar(cigar);
- if (cigarData.front().second != "M" || cigarData.back().second != "M"
- || cigarData.front().first < paddingLen || cigarData.back().first < paddingLen) {
- cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
- cerr << cigar << endl;
- cerr << reference_M << endl;
- cerr << alternateQuery_M << endl;
- exit(1);
- } else {
- cigarData.front().first -= paddingLen;
- cigarData.back().first -= paddingLen;;
- }
- cigar = joinCigar(cigarData);
-
- int altpos = 0;
- int refpos = 0;
-
- for (vector<pair<int, string> >::iterator e = cigarData.begin(); e != cigarData.end(); ++e) {
-
- int len = e->first;
- string type = e->second;
- // cerr << " vcflib : type = " << type.at(0) << " " << ref << " " << refpos << " " << alternate << " " << altpos << " " << len << endl;
- switch (type.at(0)) {
- case 'I':
- if (includePreviousBaseForIndels) {
- variants.push_back(VariantAllele(ref.substr(refpos - 1, 1), alternate.substr(altpos - 1, len + 1), refpos + position - 1));
- } else {
- variants.push_back(VariantAllele("", alternate.substr(altpos, len), refpos + position));
- }
- altpos += len;
- break;
- case 'D':
- if (includePreviousBaseForIndels) {
- variants.push_back(VariantAllele(ref.substr(refpos - 1, len + 1), alternate.substr(altpos - 1, 1), refpos + position - 1));
- } else {
- variants.push_back(VariantAllele(ref.substr(refpos, len), "", refpos + position));
- }
- refpos += len;
- break;
- case 'M': {
- for (int i = 0; i < len; ++i) {
- variants.push_back(VariantAllele(ref.substr(refpos + i, 1),
- alternate.substr(altpos + i, 1),
- refpos + i + position));
- }
- }
- refpos += len;
- altpos += len;
- break;
- case 'S':
- refpos += len;
- altpos += len;
- break;
- default:
- break;
- }
-
- // deal with MNP representation
- if (useMNPs) {
- vector<VariantAllele> adjustedVariants;
- for (vector<VariantAllele>::iterator v = variants.begin(); v != variants.end(); ++v) {
- if (adjustedVariants.empty()) {
- adjustedVariants.push_back(*v);
- } else {
- if (adjustedVariants.back().ref.size() == adjustedVariants.back().alt.size()
- && adjustedVariants.back().ref != adjustedVariants.back().alt
- && v->ref.size() == v->alt.size()
- && v->ref != v->alt) {
- adjustedVariants.back().ref += v->ref;
- adjustedVariants.back().alt += v->alt;
- } else {
- adjustedVariants.push_back(*v);
- }
- }
- }
- variants = adjustedVariants;
- }
-
- // if the last two variants have the same alt position,
- // take the one which covers more ref or alt sequence
- // this deals with things like ACG/TTCG, which decomposes to A/T, A/TT
- //REMOVE this section
- if (variants.size() > 1) {
- VariantAllele& varA = variants.at(variants.size() - 2);
- VariantAllele& varB = variants.back();
- if (varA.position == varB.position) {
- if (varA.ref.size() == varB.ref.size()) {
- if (varA.alt.size() >= varB.alt.size()) {
- variants.pop_back();
- } else {
- VariantAllele varB_copy = variants.back();
- variants.pop_back();
- variants.pop_back();
- variants.push_back(varB_copy);
- }
- } else
- if (varA.ref.size() > varB.ref.size()) {
- variants.pop_back();
- } else {
- VariantAllele varB_copy = variants.back();
- variants.pop_back();
- variants.pop_back();
- variants.push_back(varB_copy);
- }
- }
- }
- ///COMMENT out this SECTION
- }
- }
-
- return variantAlleles;
- }
-
-
- ostream& operator<<(ostream& out, VariantAllele& var) {
- out << var.position << " " << var.ref << " -> " << var.alt;
- return out;
- }
-
- bool operator<(const VariantAllele& a, const VariantAllele& b) {
- return a.repr < b.repr;
- }
-
- map<pair<int, int>, int> Variant::getGenotypeIndexesDiploid(void) {
-
- map<pair<int, int>, int> genotypeIndexes;
- //map<int, map<Genotype*, int> > vcfGenotypeOrder;
- vector<int> indexes;
- for (int i = 0; i < (int)alleles.size(); ++i) {
- indexes.push_back(i);
- }
- int ploidy = 2; // ONLY diploid
- vector<vector<int> > genotypes = multichoose(ploidy, indexes);
- for (vector<vector<int> >::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
- sort(g->begin(), g->end()); // enforce e.g. 0/1, 0/2, 1/2 ordering over reverse
- // XXX this does not handle non-diploid!!!!
- int j = g->front();
- int k = g->back();
- genotypeIndexes[make_pair(j, k)] = (k * (k + 1) / 2) + j;
- }
- return genotypeIndexes;
-
- }
-
- void Variant::updateAlleleIndexes(void) {
- // adjust the allele index
- altAlleleIndexes.clear();
- int m = 0;
- for (vector<string>::iterator a = alt.begin();
- a != alt.end(); ++a, ++m) {
- altAlleleIndexes[*a] = m;
- }
- }
-
-// TODO only works on "A"llele variant fields
- void Variant::removeAlt(string& altAllele) {
- //cout << "Func RemoveAlt " << " " << altAlleleIndexes.size() << altAllele << endl;
- int altIndex = getAltAlleleIndex(altAllele); // this is the alt-relative index, 0-based
- //cout << "RemoveAlt = " << altIndex << endl;
- //cout << "Infocounts = " << vcf->infoCounts.size() << endl;
- for (map<string, int>::iterator c = vcf->infoCounts.begin(); c != vcf->infoCounts.end(); ++c) {
- int count = c->second;
- if (count == ALLELE_NUMBER) {
- string key = c->first;
- map<string, vector<string> >::iterator v = info.find(key);
- if (v != info.end()) {
- vector<string>& vals = v->second;
- vector<string> tokeep;
- int i = 0;
- for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
- if (i != altIndex) {
- tokeep.push_back(*a);
- }
- }
- vals = tokeep;
- }
- }
- }
- for (map<string, int>::iterator c = vcf->formatCounts.begin(); c != vcf->formatCounts.end(); ++c) {
- int count = c->second;
- if (count == ALLELE_NUMBER) {
- string key = c->first;
- for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
- map<string, vector<string> >& sample = s->second;
- map<string, vector<string> >::iterator v = sample.find(key);
- if (v != sample.end()) {
- vector<string>& vals = v->second;
- vector<string> tokeep;
- int i = 0;
- for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
- if (i != altIndex) {
- tokeep.push_back(*a);
- }
- }
- vals = tokeep;
- }
- }
- }
- }
-
-
- int altSpecIndex = altIndex + 1; // this is the genotype-spec index, ref=0, 1-based for alts
-
- vector<string> newalt;
- map<int, int> alleleIndexMapping;
- // setup the new alt string
- alleleIndexMapping[0] = 0; // reference allele remains the same
- int i = 1; // current index
- int j = 1; // new index
- for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a, ++i) {
- if (i != altSpecIndex) {
- newalt.push_back(*a);
- // get the mapping between new and old allele indexes
- alleleIndexMapping[i] = j;
- ++j;
- } else {
- alleleIndexMapping[i] = NULL_ALLELE;
- }
- }
-
- // fix the sample genotypes, removing reference to the old allele
- map<string, int> samplePloidy;
- for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
- map<string, vector<string> >& sample = s->second;
- if (sample.find("GT") != sample.end()) {
- string& gt = sample["GT"].front();
- string splitter = "/";
- if (gt.find("|") != string::npos) {
- splitter = "|";
- }
- samplePloidy[s->first] = split(gt, splitter).size();
- map<int, int> genotype = decomposeGenotype(sample["GT"].front());
- map<int, int> newGenotype;
- for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
- newGenotype[alleleIndexMapping[g->first]] += g->second;
- }
- if (gt.compare(".") == 0 || gt.compare("./.") == 0 || gt.compare(".|.") == 0) {
- //do nothing
- } else {
- sample["GT"].clear();
- sample["GT"].push_back(genotypeToString(newGenotype));
- }
- //cout << "Remove Alt : Old genotype = " << gt << " New genotype = " << genotypeToString(newGenotype) << endl;
- }
- }
-
- set<int> ploidies;
- for (map<string, int>::iterator p = samplePloidy.begin(); p != samplePloidy.end(); ++p) {
- ploidies.insert(p->second);
- }
-
- // fix the sample genotype likelihoods, removing reference to the old allele
- // which GL fields should we remove?
- vector<int> toRemove;
- toRemove.push_back(altSpecIndex);
- map<int, map<int, int> > glMappingByPloidy;
- for (set<int>::iterator p = ploidies.begin(); p != ploidies.end(); ++p) {
- glMappingByPloidy[*p] = glReorder(*p, alt.size() + 1, alleleIndexMapping, toRemove);
- }
-
- for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
- map<string, vector<string> >& sample = s->second;
- map<string, vector<string> >::iterator glsit = sample.find("GL");
- if (glsit != sample.end()) {
- vector<string>& gls = glsit->second; // should be split already
- map<int, string> newgls;
- map<int, int>& newOrder = glMappingByPloidy[samplePloidy[s->first]];
- int i = 0;
- for (vector<string>::iterator g = gls.begin(); g != gls.end(); ++g, ++i) {
- int j = newOrder[i];
- if (j != -1) {
- newgls[i] = *g;
- }
- }
- // update the gls
- gls.clear();
- for (map<int, string>::iterator g = newgls.begin(); g != newgls.end(); ++g) {
- gls.push_back(g->second);
- }
- }
- }
-
- // reset the alt
- alt = newalt;
-
- // and the alleles
- alleles.clear();
- alleles.push_back(ref);
- alleles.insert(alleles.end(), alt.begin(), alt.end());
-
- updateAlleleIndexes();
-
- }
-
-// union of lines in headers of input files
- string unionInfoHeaderLines(string& s1, string& s2) {
- vector<string> lines1 = split(s1, "\n");
- vector<string> lines2 = split(s2, "\n");
- vector<string> result;
- set<string> l2;
- string lastHeaderLine; // this one needs to be at the end
- for (vector<string>::iterator s = lines2.begin(); s != lines2.end(); ++s) {
- if (s->substr(0,6) == "##INFO") {
- l2.insert(*s);
- }
- }
- for (vector<string>::iterator s = lines1.begin(); s != lines1.end(); ++s) {
- if (l2.count(*s)) {
- l2.erase(*s);
- }
- if (s->substr(0,6) == "#CHROM") {
- lastHeaderLine = *s;
- } else {
- result.push_back(*s);
- }
- }
- for (set<string>::iterator s = l2.begin(); s != l2.end(); ++s) {
- result.push_back(*s);
- }
- if (lastHeaderLine.empty()) {
- cerr << "could not find CHROM POS ... header line" << endl;
- exit(1);
- }
- result.push_back(lastHeaderLine);
- return join(result, "\n");
- }
-
- string mergeCigar(const string& c1, const string& c2) {
- vector<pair<int, string> > cigar1 = splitCigar(c1);
- vector<pair<int, string> > cigar2 = splitCigar(c2);
- // check if the middle elements are the same
- if (cigar1.back().second == cigar2.front().second) {
- cigar1.back().first += cigar2.front().first;
- cigar2.erase(cigar2.begin());
- }
- for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
- cigar1.push_back(*c);
- }
- return joinCigar(cigar1);
- }
-
- vector<pair<int, string> > splitCigar(const string& cigarStr) {
- vector<pair<int, string> > cigar;
- string number;
- string type;
- // strings go [Number][Type] ...
- for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
- char c = *s;
- if (isdigit(c)) {
- if (type.empty()) {
- number += c;
- } else {
- // signal for next token, push back the last pair, clean up
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- number.clear();
- type.clear();
- number += c;
- }
- } else {
- type += c;
- }
- }
- if (!number.empty() && !type.empty()) {
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- }
- return cigar;
- }
-
- list<pair<int, string> > splitCigarList(const string& cigarStr) {
- list<pair<int, string> > cigar;
- string number;
- string type;
- // strings go [Number][Type] ...
- for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
- char c = *s;
- if (isdigit(c)) {
- if (type.empty()) {
- number += c;
- } else {
- // signal for next token, push back the last pair, clean up
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- number.clear();
- type.clear();
- number += c;
- }
- } else {
- type += c;
- }
- }
- if (!number.empty() && !type.empty()) {
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- }
- return cigar;
- }
-
- string joinCigar(const vector<pair<int, string> >& cigar) {
- string cigarStr;
- for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->first) {
- cigarStr += convert(c->first) + c->second;
- }
- }
- return cigarStr;
- }
-
- string joinCigar(const vector<pair<int, char> >& cigar) {
- string cigarStr;
- for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->first) {
- cigarStr += convert(c->first) + string(1, c->second);
- }
- }
- return cigarStr;
- }
-
- string joinCigarList(const list<pair<int, string> >& cigar) {
- string cigarStr;
- for (list<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- cigarStr += convert(c->first) + c->second;
- }
- return cigarStr;
- }
-
- int cigarRefLen(const vector<pair<int, char> >& cigar) {
- int len = 0;
- for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->second == 'M' || c->second == 'D' || c->second == 'X') {
- len += c->first;
- }
- }
- return len;
- }
-
- int cigarRefLen(const vector<pair<int, string> >& cigar) {
- int len = 0;
- for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->second == "M" || c->second == "D" || c->second == "X") {
- len += c->first;
- }
- }
- return len;
- }
-
- bool isEmptyCigarElement(const pair<int, string>& elem) {
- return elem.first == 0;
- }
-
- list<list<int> > _glorder(int ploidy, int alts) {
- if (ploidy == 1) {
- list<list<int> > results;
- for (int n = 0; n < alts; ++n) {
- list<int> v;
- v.push_back(n);
- results.push_back(v);
- }
- return results;
- } else {
- list<list<int> > results;
- for (int n = 0; n < alts; ++n) {
- list<list<int> > x = _glorder(ploidy - 1, alts);
- for (list<list<int> >::iterator v = x.begin(); v != x.end(); ++v) {
- if (v->front() <= n) {
- v->push_front(n);
- results.push_back(*v);
- }
- }
- }
- return results;
- }
- }
-
-// genotype likelihood-ordering of genotypes, where each genotype is a
-// list of integers (as written in the GT field)
- list<list<int> > glorder(int ploidy, int alts) {
- list<list<int> > results = _glorder(ploidy, alts);
- for (list<list<int> >::iterator v = results.begin(); v != results.end(); ++v) {
- v->reverse();
- }
- return results;
- }
-
-// which genotype likelihoods would include this alternate allele
- list<int> glsWithAlt(int alt, int ploidy, int numalts) {
- list<int> gls;
- list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
- int i = 0;
- for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
- for (list<int>::iterator q = v->begin(); q != v->end(); ++q) {
- if (*q == alt) {
- gls.push_back(i);
- break;
- }
- }
- }
- return gls;
- }
-
-// describes the mapping between the old gl ordering and and a new
-// one in which the GLs including the old alt have been removed
-// a map to -1 means "remove"
- map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove) {
- map<int, int> mapping;
- list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
- for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v) {
- for (list<int>::iterator n = v->begin(); n != v->end(); ++n) {
- *n = alleleIndexMapping[*n];
- }
- }
- list<list<int> > newOrderedGenotypes = glorder(ploidy, numalts - altsToRemove.size());
- map<list<int>, int> newOrderedGenotypesMapping;
- int i = 0;
- // mapping is wrong...
- for (list<list<int> >::iterator v = newOrderedGenotypes.begin(); v != newOrderedGenotypes.end(); ++v, ++i) {
- newOrderedGenotypesMapping[*v] = i;
- }
- i = 0;
- for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
- map<list<int>, int>::iterator m = newOrderedGenotypesMapping.find(*v);
- if (m != newOrderedGenotypesMapping.end()) {
- //cout << "new gl order of " << i << " is " << m->second << endl;
- mapping[i] = m->second;
- } else {
- //cout << i << " will be removed" << endl;
- mapping[i] = -1;
- }
- }
- return mapping;
- }
-
-
-} // end namespace vcf
diff --git a/external/vcflib/Variant.h b/external/vcflib/Variant.h
deleted file mode 100644
index 3c0643d..0000000
--- a/external/vcflib/Variant.h
+++ /dev/null
@@ -1,480 +0,0 @@
-#ifndef __VARIANT_H
-#define __VARIANT_H
-
-#include <vector>
-#include <list>
-#include <map>
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <utility>
-#include <stdlib.h>
-#include <assert.h>
-#include <stack>
-#include <queue>
-#include <set>
-#include <algorithm>
-#include "split.h"
-#include "join.h"
-#include "tabixpp/tabix.hpp"
-#include "smithwaterman/SmithWatermanGotoh.h"
-#include "convert.h"
-#include "multichoose/multichoose.h"
-
-using namespace std;
-
-namespace vcf {
-
-class Variant;
-
-enum VariantFieldType { FIELD_FLOAT = 0
- , FIELD_INTEGER
- , FIELD_BOOL
- , FIELD_STRING
- , FIELD_UNKNOWN
- };
-
-enum VariantFieldNumber { ALLELE_NUMBER = -2
- , GENOTYPE_NUMBER = -1
- };
-
-const int INDEX_NONE = -1;
-const int NULL_ALLELE = -1;
-
-VariantFieldType typeStrToFieldType(string& typeStr);
-ostream& operator<<(ostream& out, VariantFieldType type);
-
-
-class VariantCallFile {
-
-public:
-
- istream* file;
- Tabix* tabixFile;
-
- bool usingTabix;
-
- string header;
- string line; // the current line
- string fileformat;
- string fileDate;
- string source;
- string reference;
- string phasing;
- map<string, VariantFieldType> infoTypes;
- map<string, int> infoCounts;
- map<string, VariantFieldType> formatTypes;
- map<string, int> formatCounts;
- vector<string> sampleNames;
- bool parseSamples;
- bool _done;
-
- void updateSamples(vector<string>& newSampleNames);
- void addHeaderLine(string line);
- void removeInfoHeaderLine(string line);
- void removeGenoHeaderLine(string line);
- vector<string> infoIds(void);
- vector<string> formatIds(void);
-
- bool open(string& filename) {
- vector<string> filenameParts = split(filename, ".");
- if (filenameParts.back() == "vcf") {
- return openFile(filename);
- } else if (filenameParts.back() == "gz" || filenameParts.back() == "bgz") {
- return openTabix(filename);
- } else {
- return false;
- }
- }
-
- bool openFile(string& filename) {
- file = &_file;
- _file.open(filename.c_str(), ifstream::in);
- parsedHeader = parseHeader();
- return parsedHeader;
- }
-
- bool openTabix(string& filename) {
- usingTabix = true;
- //cout << "Opending tabix file : " << filename << endl;
- tabixFile = new Tabix(filename);
- parsedHeader = parseHeader();
- return parsedHeader;
- }
-
-
- bool open(istream& stream) {
- file = &stream;
- parsedHeader = parseHeader();
- return parsedHeader;
- }
-
- bool open(ifstream& stream) {
- file = &stream;
- parsedHeader = parseHeader();
- return parsedHeader;
- }
-
- bool openForOutput(string& headerStr) {
- parsedHeader = parseHeader(headerStr);
- return parsedHeader;
- }
-
- VariantCallFile(void) : usingTabix(false), parseSamples(true), justSetRegion(false), parsedHeader(false) { }
- ~VariantCallFile(void) {
- if (usingTabix) {
- delete tabixFile;
- }
- }
-
- bool is_open(void) {
- return parsedHeader;
- }
-
- bool eof(void) { return _file.eof(); }
-
- bool done(void) { return _done; }
-
- bool parseHeader(string& headerStr);
-
- bool parseHeader(void);
-
- bool getNextVariant(Variant& var);
-
- bool setRegion(string region);
- bool setRegion(string seq, long int start, long int end = 0);
-
-private:
- bool firstRecord;
- bool justSetRegion;
- bool usingFile;
- ifstream _file;
- bool parsedHeader;
-
-};
-
-class VariantAllele {
- friend ostream& operator<<(ostream& out, VariantAllele& var);
- friend bool operator<(const VariantAllele& a, const VariantAllele& b);
-public:
- string ref;
- string alt;
- string repr;
- long position;
- VariantAllele(string r, string a, long p)
- : ref(r), alt(a), position(p)
- {
- stringstream s;
- s << position << ":" << ref << "/" << alt;
- repr = s.str();
- }
-};
-
-class Variant {
-
- friend ostream& operator<<(ostream& out, Variant& var);
-
-public:
-
- string sequenceName;
- long position;
- string id;
- string ref;
- vector<string> alt; // a list of all the alternate alleles present at this locus
- vector<string> alleles; // a list all alleles (ref + alt) at this locus
- // the indicies are organized such that the genotype codes (0,1,2,.etc.)
- // correspond to the correct offest into the allelese vector.
- // that is, alleles[0] = ref, alleles[1] = first alternate allele, etc.
- map<string, int> altAlleleIndexes; // reverse lookup for alleles
-
- map<string, vector<VariantAllele> > parsedAlternates(bool includePreviousBaseForIndels = false,
- bool useMNPs = false,
- bool useEntropy = false,
- float matchScore = 10.0f,
- float mismatchScore = -9.0f,
- float gapOpenPenalty = 15.0f,
- float gapExtendPenalty = 6.66f,
- float repeatGapExtendPenalty = 0.0f,
- string flankingRefLeft = "",
- string flankingRefRight = "");
-
-
-
- map<string, string> extendedAlternates(long int newPosition, long int length);
-
- string originalLine; // the literal of the record, as read
- // TODO
- // the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j
- // vector<pair<int, int> > genotypes; // indexes into the alleles, ordered as per the spec
- string filter;
- double quality;
- VariantFieldType infoType(string& key);
- map<string, vector<string> > info; // vector<string> allows for lists by Genotypes or Alternates
- map<string, bool> infoFlags;
- VariantFieldType formatType(string& key);
- vector<string> format;
- map<string, map<string, vector<string> > > samples; // vector<string> allows for lists by Genotypes or Alternates
- vector<string> sampleNames;
- vector<string> outputSampleNames;
- VariantCallFile* vcf;
- bool isFiltered;
- bool isHotSpot;
- //void addInfoInt(string& tag, int value);
- //void addInfoFloat(string& tag, double value);
- //void addInfoString(string& tag, string& value);
-
- void removeAlt(string& altallele);
-
-
-public:
-
- Variant() :isFiltered(false), isHotSpot(false) { }
-
- Variant(VariantCallFile& v)
- : sampleNames(v.sampleNames)
- , outputSampleNames(v.sampleNames)
- , vcf(&v)
- { }
-
- void setVariantCallFile(VariantCallFile& v);
- void setVariantCallFile(VariantCallFile* v);
-
- void parse(string& line, bool parseSamples = true);
- void addFilter(string& tag);
- bool getValueBool(string& key, string& sample, int index = INDEX_NONE);
- double getValueFloat(string& key, string& sample, int index = INDEX_NONE);
- string getValueString(string& key, string& sample, int index = INDEX_NONE);
- bool getSampleValueBool(string& key, string& sample, int index = INDEX_NONE);
- double getSampleValueFloat(string& key, string& sample, int index = INDEX_NONE);
- string getSampleValueString(string& key, string& sample, int index = INDEX_NONE);
- bool getInfoValueBool(string& key, int index = INDEX_NONE);
- double getInfoValueFloat(string& key, int index = INDEX_NONE);
- string getInfoValueString(string& key, int index = INDEX_NONE);
- void printAlt(ostream& out); // print a comma-sep list of alternate alleles to an ostream
- void printAlleles(ostream& out); // print a comma-sep list of *all* alleles to an ostream
- int getAltAlleleIndex(string& allele);
- void updateAlleleIndexes(void);
- void addFormatField(string& key);
- void setOutputSampleNames(vector<string>& outputSamples);
- map<pair<int, int>, int> getGenotypeIndexesDiploid(void);
- int getNumSamples(void);
- int getNumValidGenotypes(void);
- // TODO
- //void setInfoField(string& key, string& val);
-
-private:
- string lastFormat;
-
-};
-
-// from BamTools
-// RuleToken implementation
-
-class RuleToken {
-
-public:
-
- // enums
- enum RuleTokenType { OPERAND = 0
- , NUMBER
- , BOOLEAN_VARIABLE
- , NUMERIC_VARIABLE
- , STRING_VARIABLE
- , AND_OPERATOR
- , OR_OPERATOR
- , ADD_OPERATOR
- , SUBTRACT_OPERATOR
- , MULTIPLY_OPERATOR
- , DIVIDE_OPERATOR
- , NOT_OPERATOR
- , EQUAL_OPERATOR
- , GREATER_THAN_OPERATOR
- , LESS_THAN_OPERATOR
- , LEFT_PARENTHESIS
- , RIGHT_PARENTHESIS
- };
-
- // constructor
- RuleToken(string token, map<string, VariantFieldType>& variables);
- RuleToken(void)
- : type(BOOLEAN_VARIABLE)
- , state(false)
- { }
-
- // data members
- RuleTokenType type;
- string value;
-
- double number;
- string str;
- bool state;
-
- bool isVariable; // if this is a variable
- //bool isEvaluated; // when we evaluate variables
-
- RuleToken apply(RuleToken& other);
-
-};
-
-inline int priority(const RuleToken& token) {
- switch ( token.type ) {
- case ( RuleToken::MULTIPLY_OPERATOR ) : return 8;
- case ( RuleToken::DIVIDE_OPERATOR ) : return 8;
- case ( RuleToken::ADD_OPERATOR ) : return 7;
- case ( RuleToken::SUBTRACT_OPERATOR ) : return 7;
- case ( RuleToken::NOT_OPERATOR ) : return 6;
- case ( RuleToken::EQUAL_OPERATOR ) : return 5;
- case ( RuleToken::GREATER_THAN_OPERATOR ) : return 5;
- case ( RuleToken::LESS_THAN_OPERATOR ) : return 5;
- case ( RuleToken::AND_OPERATOR ) : return 4;
- case ( RuleToken::OR_OPERATOR ) : return 3;
- case ( RuleToken::LEFT_PARENTHESIS ) : return 0;
- case ( RuleToken::RIGHT_PARENTHESIS ) : return 0;
- default: cerr << "invalid token type" << endl; exit(1);
- }
-}
-
-inline bool isRightAssociative(const RuleToken& token) {
- return (token.type == RuleToken::NOT_OPERATOR ||
- token.type == RuleToken::LEFT_PARENTHESIS);
-}
-
-inline bool isLeftAssociative(const RuleToken& token) {
- return !isRightAssociative(token);
-}
-
-inline bool isLeftParenthesis(const RuleToken& token) {
- return ( token.type == RuleToken::LEFT_PARENTHESIS );
-}
-
-inline bool isRightParenthesis(const RuleToken& token) {
- return ( token.type == RuleToken::RIGHT_PARENTHESIS );
-}
-
-inline bool isOperand(const RuleToken& token) {
- return ( token.type == RuleToken::OPERAND ||
- token.type == RuleToken::NUMBER ||
- token.type == RuleToken::NUMERIC_VARIABLE ||
- token.type == RuleToken::STRING_VARIABLE ||
- token.type == RuleToken::BOOLEAN_VARIABLE
- );
-}
-
-inline bool isOperator(const RuleToken& token) {
- return ( token.type == RuleToken::AND_OPERATOR ||
- token.type == RuleToken::OR_OPERATOR ||
- token.type == RuleToken::NOT_OPERATOR ||
- token.type == RuleToken::EQUAL_OPERATOR ||
- token.type == RuleToken::GREATER_THAN_OPERATOR ||
- token.type == RuleToken::LESS_THAN_OPERATOR ||
- token.type == RuleToken::MULTIPLY_OPERATOR ||
- token.type == RuleToken::DIVIDE_OPERATOR ||
- token.type == RuleToken::ADD_OPERATOR ||
- token.type == RuleToken::SUBTRACT_OPERATOR
- );
-}
-
-inline bool isOperatorChar(const char& c) {
- return (c == '!' ||
- c == '&' ||
- c == '|' ||
- c == '=' ||
- c == '>' ||
- c == '<' ||
- c == '*' ||
- c == '/' ||
- c == '+' ||
- c == '-');
-}
-
-inline bool isParanChar(const char& c) {
- return (c == '(' || c == ')');
-}
-
-inline bool isNumeric(const RuleToken& token) {
- return token.type == RuleToken::NUMERIC_VARIABLE;
-}
-
-inline bool isString(const RuleToken& token) {
- return token.type == RuleToken::STRING_VARIABLE;
-}
-
-inline bool isBoolean(const RuleToken& token) {
- return token.type == RuleToken::BOOLEAN_VARIABLE;
-}
-
-inline bool isVariable(const RuleToken& token) {
- return isNumeric(token) || isString(token) || isBoolean(token);
-}
-
-void tokenizeFilterSpec(string& filterspec, stack<RuleToken>& tokens, map<string, VariantFieldType>& variables);
-
-
-class VariantFilter {
-
-public:
-
- enum VariantFilterType { SAMPLE = 0,
- RECORD };
-
- string spec;
- queue<RuleToken> tokens; // tokens, infix notation
- queue<RuleToken> rules; // tokens, prefix notation
- VariantFilterType type;
- VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables);
- bool passes(Variant& var, string& sample); // all alts pass
- bool passes(Variant& var, string& sample, string& allele);
- void removeFilteredGenotypes(Variant& var);
-
-};
-
-
-// genotype manipulation
-
-// TODO
-//map<string, int> decomposeGenotype(string& genotype);
-
-map<int, int> decomposeGenotype(string& genotype);
-
-string genotypeToString(map<int, int>& genotype);
-
-bool isHet(map<int, int>& genotype);
-
-bool isHom(map<int, int>& genotype);
-
-bool hasNonRef(map<int, int>& genotype);
-
-bool isHomRef(map<int, int>& genotype);
-
-bool isHomNonRef(map<int, int>& genotype);
-
-bool isNull(map<int, int>& genotype);
-
-int ploidy(map<int, int>& genotype);
-
-string unionInfoHeaderLines(string& s1, string& s2);
-
-
-// genotype likelihood ordering
-
-list<list<int> > glorder(int ploidy, int alleles);
-list<list<int> > _glorder(int ploidy, int alleles);
-list<int> glsWithAlt(int alt, int ploidy, int numalts);
-map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove);
-
-vector<string>& unique(vector<string>& strings);
-
-string mergeCigar(const string& c1, const string& c2);
-vector<pair<int, string> > splitCigar(const string& cigarStr);
-list<pair<int, string> > splitCigarList(const string& cigarStr);
-int cigarRefLen(const vector<pair<int, char> >& cigar);
-int cigarRefLen(const vector<pair<int, string> >& cigar);
-string joinCigar(const vector<pair<int, string> >& cigar);
-string joinCigar(const vector<pair<int, char> >& cigar);
-string joinCigarList(const list<pair<int, string> >& cigar);
-bool isEmptyCigarElement(const pair<int, string>& elem);
-
-
-} // end namespace VCF
-
-#endif
diff --git a/external/vcflib/convert.h b/external/vcflib/convert.h
deleted file mode 100644
index d73d518..0000000
--- a/external/vcflib/convert.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef __CONVERT_H
-#define __CONVERT_H
-
-#include <sstream>
-
-// converts the string into the specified type, setting r to the converted
-// value and returning true/false on success or failure
-template<typename T>
-bool convert(const std::string& s, T& r) {
- std::istringstream iss(s);
- iss >> r;
- return iss.eof() ? true : false;
-}
-
-template<typename T>
-std::string convert(const T& r) {
- std::ostringstream oss;
- oss << r;
- return oss.str();
-}
-
-#endif
diff --git a/external/vcflib/join.h b/external/vcflib/join.h
deleted file mode 100644
index c46a75f..0000000
--- a/external/vcflib/join.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __JOIN_H
-#define __JOIN_H
-
-// functions to split a string by a specific delimiter
-#include <string>
-#include <vector>
-#include <sstream>
-#include <string.h>
-
-// join a vector of elements by a delimiter object. ostream<< must be defined
-// for both class S and T and an ostream, as it is e.g. in the case of strings
-// and character arrays
-template<class S, class T>
-std::string join(std::vector<T>& elems, S& delim) {
- std::stringstream ss;
- typename std::vector<T>::iterator e = elems.begin();
- ss << *e++;
- for (; e != elems.end(); ++e) {
- ss << delim << *e;
- }
- return ss.str();
-}
-
-// same for lists
-template<class S, class T>
-std::string join(std::list<T>& elems, S& delim) {
- std::stringstream ss;
- typename std::list<T>::iterator e = elems.begin();
- ss << *e++;
- for (; e != elems.end(); ++e) {
- ss << delim << *e;
- }
- return ss.str();
-}
-
-#endif
diff --git a/external/vcflib/multichoose/Makefile b/external/vcflib/multichoose/Makefile
deleted file mode 100644
index 04e6cab..0000000
--- a/external/vcflib/multichoose/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-all: multichoose multipermute
-
-#multichoose_recursive: multichoose_recursive.cpp
-# g++ multichoose_recursive.cpp -o multichoose_recursive
-
-multichoose: multichoose.cpp multichoose.h
- g++ multichoose.cpp -o multichoose
-
-multipermute: multipermute.cpp multipermute.h
- g++ multipermute.cpp -o multipermute
-
-cmultichoose: multichoose.c
- gcc multichoose.c -o cmultichoose
-
-clean:
- rm cmultichoose
- rm multichoose
diff --git a/external/vcflib/multichoose/README b/external/vcflib/multichoose/README
deleted file mode 100644
index 605eeb4..0000000
--- a/external/vcflib/multichoose/README
+++ /dev/null
@@ -1,40 +0,0 @@
-Multiset combinations, n multichoose k
-
-Erik Garrison <erik.garrison at bc.edu>
-
-multichoose.cpp --
-
-This is a small C++ library/program which contains a generic function to
-generate multisets for vectors of any type of object. You can test out the
-program using strings input from the command line by typing:
-
- % make
-
-Running:
-
- % ./multichoose
-
-Prints usage information:
-
- usage:
- ./multichoose <k> <item1> <item2> ... <itemN> ~ n multichoose k
-
-Example usage:
-
- % ./multichoose 2 a t g c
- a a
- a t
- a g
- a c
- t t
- t g
- t c
- g g
- g c
- c c
-
-This example lists all the possible *unordered' genotypes at a given genetic
-loci of which there are two copies (e.g. chromosomes). 'k' (2 in this case)
-could be understood as the expected ploidy of the given locus. Applying
-multiset permutations to each of the results would generate all possible
-ordered multisets.
diff --git a/external/vcflib/multichoose/multichoose.c b/external/vcflib/multichoose/multichoose.c
deleted file mode 100644
index e29b540..0000000
--- a/external/vcflib/multichoose/multichoose.c
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-
-int main(int argc, char** argv){
-
- int *a, *b;
- char **m;
- int i,j,j_1,k,n,r;
-
- if (argc<3) {
- printf("usage: ./multi_erik k item_1 ... item_n\n");
- return 0;
- }
- n = atoi(argv[1]);
- r = argc - 2;
-
- m = malloc(r * sizeof(char*));
- a = malloc(n * sizeof(int));
- b = malloc(n * sizeof(int));
-
- for (i=2;i<argc;i++)
- m[i-1] = argv[i];
-
- for (i=1;i<=n;i++) {
- a[i] = 1; b[i] = r;
- }
-
- j=n;
- while(1){
- // emit multiset combination
- for(i=1;i<=n;i++)
- printf("%s ", m[a[i]]);
- printf("\n");
- j=n;
- while(a[j]==b[j])j--;
- if (j<=0) break;
- j_1=j;
- while(j_1<=n){
- a[j_1]=a[j_1]+1;
- k=j_1;
- while(k<n) {
- a[k+1]=a[k];
- k++;
- }
- k++;
- j_1=k;
- }
- }
-
- return 0;
-}
-
diff --git a/external/vcflib/multichoose/multichoose.cpp b/external/vcflib/multichoose/multichoose.cpp
deleted file mode 100644
index 455e7e6..0000000
--- a/external/vcflib/multichoose/multichoose.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-multichoose.cpp -- n multichoose k for generic vectors
-
-author: Erik Garrison <erik.garrison at bc.edu>
-last revised: 2010-04-16
-
-Copyright (c) 2010 by Erik Garrison
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
- */
-
-#include <iostream>
-#include <vector>
-#include <list>
-#include <stdlib.h>
-#include "multichoose.h"
-
-
-using namespace std;
-
-
-int main(int argc, char** argv) {
- if (argc < 3) {
- cerr << "usage: " << endl
- << argv[0] << " <k> <item1> <item2> ... <itemN> ~ n multichoose k" << endl;
- return 1;
- }
-
- int k = atoi(argv[1]);
- vector<string> items;
- for (int i = 2; i < argc; ++i) {
- items.push_back(string(argv[i]));
- }
-
- vector< vector<string> > results = multichoose(k, items);
-
- for (vector< vector<string> >::const_iterator i = results.begin(); i != results.end(); ++i) {
- for (vector<string>::const_iterator j = i->begin(); j != i->end(); ++j) {
- cout << *j << " ";
- }
- cout << endl;
- }
-
- return 0;
-}
diff --git a/external/vcflib/multichoose/multichoose.h b/external/vcflib/multichoose/multichoose.h
deleted file mode 100644
index 51acb24..0000000
--- a/external/vcflib/multichoose/multichoose.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef __MULTICHOOSE_H
-#define __MULTICHOOSE_H
-
-/*
-
-multichoose.h -- n multichoose k for generic vectors
-
-author: Erik Garrison <erik.garrison at bc.edu>
-last revised: 2010-04-16
-
-Copyright (c) 2010 by Erik Garrison
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-*/
-
-
-// provides multiset combinations out of the std::vector of objects
-template <class T>
-std::vector< std::vector<T> > multichoose(int k, std::vector<T>& objects) {
-
- std::vector< std::vector<T> > choices;
-
- int j,j_1,q,r;
-
- r = objects.size() - 1;
-
- // combination indexes
- std::vector<T*> a, b;
-
- for (int i=0;i<k;i++) {
- a.push_back(&objects[0]); b.push_back(&objects[r]);
- }
-
- j=k;
- while(1){
- std::vector<T> multiset;
- for(int i=0;i<k;i++)
- multiset.push_back(*a[i]);
- choices.push_back(multiset);
- j=k;
- do { j--; } while(a[j]==b[j]);
- if (j<0) break;
- j_1=j;
- while(j_1<=k-1){
- a[j_1]=a[j_1]+1;
- q=j_1;
- while(q<k-1) {
- a[q+1]=a[q];
- q++;
- }
- q++;
- j_1=q;
- }
- }
-
- return choices;
-}
-
-#endif
diff --git a/external/vcflib/multichoose/multichoose.py b/external/vcflib/multichoose/multichoose.py
deleted file mode 100644
index 62ae7ba..0000000
--- a/external/vcflib/multichoose/multichoose.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# encoding: utf-8
-
-"""
-
-multichoose.py -- non-recursive n multichoose k for python lists
-
-author: Erik Garrison <erik.garrison at bc.edu>
-last revised: 2010-04-30
-
-Copyright (c) 2010 by Erik Garrison
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-"""
-
-def multichoose(k, objects):
- """n multichoose k multisets from the list of objects. n is the size of
- the objects."""
- j,j_1,q = k,k,k # init here for scoping
- r = len(objects) - 1
- a = [0 for i in range(k)] # initial multiset indexes
- while True:
- yield [objects[a[i]] for i in range(0,k)] # emit result
- j = k - 1
- while j >= 0 and a[j] == r: j -= 1
- if j < 0: break # check for end condition
- j_1 = j
- while j_1 <= k - 1:
- a[j_1] = a[j_1] + 1 # increment
- q = j_1
- while q < k - 1:
- a[q+1] = a[q] # shift left
- q += 1
- q += 1
- j_1 = q
-
diff --git a/external/vcflib/multichoose/multipermute.cpp b/external/vcflib/multichoose/multipermute.cpp
deleted file mode 100644
index fc26a25..0000000
--- a/external/vcflib/multichoose/multipermute.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-multipermute.cpp -- multiset permutations for generic vectors
-
-author: Erik Garrison <erik.garrison at bc.edu>
-last revised: 2010-06-02
-
-Copyright (c) 2010 by Erik Garrison
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-*/
-
-
-#include <iostream>
-#include <vector>
-#include <list>
-#include <stdlib.h>
-#include "multipermute.h"
-
-
-using namespace std;
-
-
-int main(int argc, char** argv) {
- if (argc < 3) {
- cerr << "usage: " << endl
- << argv[0] << " [multiset, items delimited by space] ~ multiset permutations of " << endl;
- return 1;
- }
-
- vector<string> items;
- for (int i = 1; i < argc; ++i) {
- items.push_back(string(argv[i]));
- }
-
- vector< vector<string> > results = multipermute(items);
-
- for (vector< vector<string> >::const_iterator i = results.begin(); i != results.end(); ++i) {
- for (vector<string>::const_iterator j = i->begin(); j != i->end(); ++j) {
- cout << *j << " ";
- }
- cout << endl;
- }
-
- return 0;
-}
diff --git a/external/vcflib/multichoose/multipermute.h b/external/vcflib/multichoose/multipermute.h
deleted file mode 100644
index 6c270a3..0000000
--- a/external/vcflib/multichoose/multipermute.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-
-multipermute.h -- multiset permutations for generic vectors
-
-Follows 'Algorithm 1' from "Loopless Generation of Multiset Permutations using
-a Constant Number of Variables by Prefix Shifts." Aaron Williams, 2009
-
-author: Erik Garrison <erik.garrison at bc.edu>
-last revised: 2010-04-16
-
-Copyright (c) 2010 by Erik Garrison
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-*/
-
-
-#include <vector>
-#include <algorithm>
-
-template <class T>
-class ListElement {
-
-public:
- T value;
- ListElement<T>* next;
-
- ListElement<T>() { }
-
- ListElement<T>(T val, ListElement<T>* n) {
- value = val;
- next = n;
- }
-
- ListElement<T>* nth(int n) {
- ListElement<T>* o = this;
- int i = 0;
- while (i < n && o->next != NULL) {
- o = o->next;
- ++i;
- }
- return o;
- }
-
- ~ListElement<T>() {
- if (next != NULL) {
- delete next;
- }
- }
-
-};
-
-template <class T>
-ListElement<T>* list_init(std::vector<T>& multiset) {
- std::sort(multiset.begin(), multiset.end()); // ensures proper non-increasing order
- typename std::vector<T>::const_iterator item = multiset.begin();
- ListElement<T>* h = new ListElement<T>(*item, NULL);
- ++item;
- while (item != multiset.end()) {
- h = new ListElement<T>(*item, h);
- ++item;
- }
- return h;
-}
-
-template <class T>
-std::vector<T> linked_list_to_vector(ListElement<T>* h) {
- ListElement<T>* o = h;
- std::vector<T> l;
- while (o != NULL) {
- l.push_back(o->value);
- o = o->next;
- }
- return l;
-}
-
-// provides multiset permutations out of the std::vector multiset
-template <class T>
-std::vector< std::vector<T> > multipermute(std::vector<T>& multiset) {
-
- std::vector< std::vector<T> > results;
-
- ListElement<T>* h = list_init(multiset);
- ListElement<T>* i = h->nth(multiset.size() - 2);
- ListElement<T>* j = h->nth(multiset.size() - 1);
- ListElement<T>* s;
- ListElement<T>* t;
-
- results.push_back(linked_list_to_vector(h));
-
- while (j->next != NULL || j->value < h->value) {
- if (j->next != NULL && i->value >= j->next->value) {
- s = j;
- } else {
- s = i;
- }
- t = s->next;
- s->next = t->next;
- t->next = h;
- if (t->value < h->value) {
- i = t;
- }
- j = i->next;
- h = t;
- results.push_back(linked_list_to_vector(h));
- }
-
- delete h;
-
- return results;
-
-}
-
diff --git a/external/vcflib/multichoose/multipermute.py b/external/vcflib/multichoose/multipermute.py
deleted file mode 100644
index 10f7ccc..0000000
--- a/external/vcflib/multichoose/multipermute.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-# multipermute.py - permutations of a multiset
-# Erik Garrison <erik.garrison at bc.edu> 2010
-
-"""
-This module encodes functions to generate the permutations of a multiset
-following this algorithm:
-
-Algorithm 1
-Visits the permutations of multiset E. The permutations are stored
-in a singly-linked list pointed to by head pointer h. Each node in the linked
-list has a value field v and a next field n. The init(E) call creates a
-singly-linked list storing the elements of E in non-increasing order with h, i,
-and j pointing to its first, second-last, and last nodes, respectively. The
-null pointer is given by φ. Note: If E is empty, then init(E) should exit.
-Also, if E contains only one element, then init(E) does not need to provide a
-value for i.
-
-[h, i, j] ← init(E)
-visit(h)
-while j.n ≠ φ orj.v <h.v do
- if j.n ≠ φ and i.v ≥ j.n.v then
- s←j
- else
- s←i
- end if
- t←s.n
- s.n ← t.n
- t.n ← h
- if t.v < h.v then
- i←t
- end if
- j←i.n
- h←t
- visit(h)
-end while
-
-... from "Loopless Generation of Multiset Permutations using a Constant Number
-of Variables by Prefix Shifts." Aaron Williams, 2009
-"""
-
-class ListElement:
- def __init__(self, value, next):
- self.value = value
- self.next = next
- def nth(self, n):
- o = self
- i = 0
- while i < n and o.next is not None:
- o = o.next
- i += 1
- return o
-
-def init(multiset):
- multiset.sort() # ensures proper non-increasing order
- h = ListElement(multiset[0], None)
- for item in multiset[1:]:
- h = ListElement(item, h)
- return h, h.nth(len(multiset) - 2), h.nth(len(multiset) - 1)
-
-def visit(h):
- """Converts our bespoke linked list to a python list."""
- o = h
- l = []
- while o is not None:
- l.append(o.value)
- o = o.next
- return l
-
-def permutations(multiset):
- """Generator providing all multiset permutations of a multiset."""
- h, i, j = init(multiset)
- yield visit(h)
- while j.next is not None or j.value < h.value:
- if j.next is not None and i.value >= j.next.value:
- s = j
- else:
- s = i
- t = s.next
- s.next = t.next
- t.next = h
- if t.value < h.value:
- i = t
- j = i.next
- h = t
- yield visit(h)
-
-if __name__ == '__main__':
- import sys
- multiset = sys.argv[1:]
- if multiset != []:
- for permutation in permutations(multiset):
- for item in permutation:
- print item,
- print
- else:
- print "usage", sys.argv[0], "<multiset>"
diff --git a/external/vcflib/smithwaterman/BandedSmithWaterman.cpp b/external/vcflib/smithwaterman/BandedSmithWaterman.cpp
deleted file mode 100644
index a961f7c..0000000
--- a/external/vcflib/smithwaterman/BandedSmithWaterman.cpp
+++ /dev/null
@@ -1,670 +0,0 @@
-#include "BandedSmithWaterman.h"
-
-// Out-of-class definitions of the static constants declared in BandedSmithWaterman.h.
-// FLOAT_NEGATIVE_INFINITY is a large finite negative value (-1e30) used as a
-// "minus infinity" score sentinel; it is not an IEEE infinity.
-const float CBandedSmithWaterman::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
-
-// Backtrace direction codes stored in ElementInfo::Direction (a 2-bit field).
-const DirectionType CBandedSmithWaterman::Directions_STOP = 0;
-const DirectionType CBandedSmithWaterman::Directions_LEFT = 1;
-const DirectionType CBandedSmithWaterman::Directions_DIAGONAL = 2;
-const DirectionType CBandedSmithWaterman::Directions_UP = 3;
-
-// Hash-region position types, chosen in Align() according to which of the
-// reference / query start offsets are zero. "QUERO" (sic) matches the spelling
-// of the declaration in the header.
-const PositionType CBandedSmithWaterman::Position_REF_AND_QUERY_ZERO = 0;
-const PositionType CBandedSmithWaterman::Position_REF_ZERO = 1;
-const PositionType CBandedSmithWaterman::Position_QUERY_ZERO = 2;
-const PositionType CBandedSmithWaterman::Position_REF_AND_QUERO_NONZERO = 3;
-
-// Constructor: stores the scoring parameters and the band width, builds the
-// substitution matrix and allocates the two per-row score vectors
-// (bandWidth + 2 floats each). The backtrace matrix and the reversed-sequence
-// buffers start out NULL; ReinitializeMatrices() allocates them on demand.
-// Prints an error and exits with status 1 if the score vectors cannot be
-// allocated.
-CBandedSmithWaterman::CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth)
-    : mCurrentMatrixSize(0)
-    , mCurrentAnchorSize(0)
-    , mCurrentAQSumSize(0)
-    , mBandwidth(bandWidth)
-    , mPointers(NULL)
-    , mMatchScore(matchScore)
-    , mMismatchScore(mismatchScore)
-    , mGapOpenPenalty(gapOpenPenalty)
-    , mGapExtendPenalty(gapExtendPenalty)
-    , mAnchorGapScores(NULL)
-    , mBestScores(NULL)
-    , mReversedAnchor(NULL)
-    , mReversedQuery(NULL)
-    , mUseHomoPolymerGapOpenPenalty(false)
-    , mHomoPolymerGapOpenPenalty(0.0f)   // defined value even before EnableHomoPolymerGapPenalty() is called
-{
-    CreateScoringMatrix();
-
-    // The band width is not validated here, although a comment in Align()
-    // states that it is expected to be an odd number.
-
-    try {
-        mBestScores      = new float[bandWidth + 2];
-        mAnchorGapScores = new float[bandWidth + 2];
-    } catch(const bad_alloc&) {   // catch by const reference: no copy of the exception object
-        printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
-        exit(1);
-    }
-}
-
-// Destructor: releases every buffer owned by the aligner.
-CBandedSmithWaterman::~CBandedSmithWaterman(void) {
-    // delete [] on a null pointer is a no-op, so no explicit guards are needed
-    delete [] mReversedQuery;
-    delete [] mReversedAnchor;
-    delete [] mBestScores;
-    delete [] mAnchorGapScores;
-    delete [] mPointers;
-}
-
-// Aligns the query sequence (s2) to the anchor (s1) using a banded Smith Waterman Gotoh
-// forward pass followed by a traceback.
-//   referenceAl - out: handed to Traceback(), which sets it to the reference column where the alignment begins
-//   cigarAl     - out: CIGAR string produced by Traceback()
-//   s1          - anchor (reference) sequence
-//   s2          - query sequence
-//   hr          - hash region; hr.first.first is used as the starting reference column and
-//                 hr.second.first as the starting query row. Both are modified in place.
-void CBandedSmithWaterman::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr) {
-
-
-
- // shift both start coordinates down by their common minimum, so that at least one of them becomes zero
- unsigned int rowStart = min(hr.first.first, (unsigned int)hr.second.first);
- hr.first.first -= rowStart;
- hr.second.first -= rowStart;
-
- //bool isLegalBandWidth = (s2.length() - hr.QueryBegin) > (mBandwidth / 2);
- // isLegalBandWidth = isLegalBandWidth && ((s1.length() - hr.Begin) > (mBandwidth / 2));
-
-
-
- // check the lengths of the input sequences
- //if( (s1.length() <= 0) || (s2.length() <= 0) || (s1.length() < s2.length()) ) {
- // printf("ERROR: An unexpected sequence length was encountered during pairwise alignment.\n");
- // printf("Sequence lengths are listed as following:\n");
- // printf("1. Reference length: %u\n2. Query length: %u\n", s1.length(), s2.length());
- //printf("3. Hash region in reference:%4u-%4u\n", hr.Begin + rowStart, hr.End);
- //printf("4. Hash region in query: %4u-%4u\n", hr.QueryBegin + rowStart, hr.QueryEnd);
- // exit(1);
- //}
-
-
- // determine the hash region type
- // rowOffset / columnOffset translate (rowNum, columnNum) into band-matrix coordinates
- // in CalculateScore() and Traceback()
- unsigned int rowOffset;
- unsigned int columnOffset;
- PositionType positionType;
-
- if(hr.first.first == 0) {
- if(hr.second.first == 0) {
- rowOffset = 1;
- columnOffset = (mBandwidth / 2) + 1;
- positionType = Position_REF_AND_QUERY_ZERO;
- } else {
- rowOffset = 1 - hr.second.first;
- columnOffset = (mBandwidth / 2) + 1 + hr.second.first;
- positionType = Position_REF_ZERO;
- }
- } else {
- if(hr.second.first == 0) {
- rowOffset = 1;
- columnOffset = (mBandwidth / 2) + 1 - hr.first.first;
- positionType = Position_QUERY_ZERO;
- } else {
- rowOffset = 1 - hr.second.first;
- columnOffset = (mBandwidth / 2) + 1 + hr.second.first - hr.first.first;
- positionType = Position_REF_AND_QUERO_NONZERO;
- }
- }
-
- // =========================
- // Reinitialize the matrices
- // =========================
-
- ReinitializeMatrices(positionType, s1.length(), s2.length(), hr);
-
- // =======================================
- // Banded Smith-Waterman forward algorithm
- // =======================================
-
- unsigned int bestColumn = 0;
- unsigned int bestRow = 0;
- float bestScore = FLOAT_NEGATIVE_INFINITY;
- float currentQueryGapScore;
-
- // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively
- unsigned int rowNum = hr.second.first;
- unsigned int columnNum = hr.first.first;
-
- // indicates how many rows including blank elements in the Banded SmithWaterman
- // NOTE(review): the subtraction is evaluated in unsigned arithmetic before the conversion to int;
- // presumably it is meant to become negative (skipping the loop below) when columnNum > mBandwidth / 2 -- confirm
- int numBlankElements = (mBandwidth / 2) - columnNum;
-
- //cout << numBlankElements << endl;
- // upper triangle matrix in Banded Smith-Waterman
- for( ; numBlankElements > 0; numBlankElements--, rowNum++){
- // in the upper triangle matrix, we always start at the 0th column
- columnNum = 0;
-
- // columnEnd indicates how many columns which should be dealt with in the current row
- unsigned int columnEnd = min((mBandwidth - numBlankElements), ((unsigned int) s1.length() - columnNum + 1) );
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
- for( unsigned int j = 0; j < columnEnd; j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- columnNum = columnNum - (mBandwidth / 2);
- }
- // complete matrix in Banded Smith-Waterman
- // NOTE(review): both operands of min() are unsigned differences; verify they cannot wrap around
- // for short sequences
- unsigned int completeNum = min((s1.length() - columnNum - (mBandwidth / 2)), (s2.length() - rowNum));
- //cout << completeNum << endl;
- for(unsigned int i = 0; i < completeNum; i++, rowNum++){
- columnNum = columnNum - (mBandwidth / 2);
-
- // there are mBandwidth columns which should be dealt with in each row
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
-
- for(unsigned int j = 0; j < mBandwidth; j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1).
- columnNum = columnNum - (mBandwidth / 2);
- }
-
- // lower triangle matrix
- numBlankElements = min(mBandwidth, ((unsigned int) s2.length() - rowNum));
- columnNum = columnNum - (mBandwidth / 2);
- for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) {
-
- // invalidate the band column that has moved past the end of the reference for this row
- mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;;
- // columnEnd indicates how many columns which should be dealt with
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
-
- for( unsigned int j = columnNum; j < s1.length(); j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- columnNum = columnNum - mBandwidth + i + 2;
- }
-
- // =========================================
- // Banded Smith-Waterman backtrace algorithm
- // =========================================
-
- Traceback(referenceAl, cigarAl, s1, s2, bestRow, bestColumn, rowOffset, columnOffset);
-
-}
-
-// Calculates the score of one cell during the forward algorithm and records its backtrace entry.
-//   s1, s2               - anchor (reference) and query sequences
-//   rowNum, columnNum    - query row and reference column of the cell
-//   currentQueryGapScore - in/out: running score of a gap in the query along the current row
-//   rowOffset, columnOffset - translate (rowNum, columnNum) into band-matrix coordinates
-// Returns the best score for the cell; MaxFloats() never returns less than 0.
-// Updates mBestScores[column], mAnchorGapScores[column] and mPointers[position].
-float CBandedSmithWaterman::CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) {
-
- // initialize
- // each row of mPointers holds mBandwidth + 2 entries; the band column moves one step
- // to the left per row for a fixed columnNum
- const unsigned int row = rowNum + rowOffset;
- const unsigned int column = columnOffset - rowNum + columnNum;
- const unsigned int position = row * (mBandwidth + 2) + column;
-
- // retrieve the similarity scores
- // NOTE(review): the scoring matrix is indexed with (character - 'A'); this assumes both
- // sequences contain only upper-case letters A-Z -- confirm against callers
- const float similarityScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A'];
- // mBestScores[column] still holds the value written for this band column in the previous row,
- // i.e. the cell at (rowNum - 1, columnNum - 1)
- const float totalSimilarityScore = mBestScores[column] + similarityScore;
-
- // ================================
- // open a gap in the query sequence
- // ================================
-
- // mBestScores[column - 1] is the cell to the left in the current row
- float queryGapExtendScore = currentQueryGapScore - mGapExtendPenalty;
- float queryGapOpenScore = mBestScores[column - 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((rowNum > 1) && (s2[rowNum] == s2[rowNum - 1]))
- queryGapOpenScore = mBestScores[column - 1] - mHomoPolymerGapOpenPenalty;
-
- if(queryGapExtendScore > queryGapOpenScore) {
- currentQueryGapScore = queryGapExtendScore;
- mPointers[position].mSizeOfHorizontalGaps = mPointers[position - 1].mSizeOfHorizontalGaps + 1;
- } else currentQueryGapScore = queryGapOpenScore;
-
-
- // ====================================
- // open a gap in the reference sequence
- // ====================================
-
-
- // band column + 1 holds the previous row's values for the same reference column (rowNum - 1, columnNum)
- float anchorGapExtendScore = mAnchorGapScores[column + 1] - mGapExtendPenalty;
- float anchorGapOpenScore = mBestScores[column + 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((columnNum > 1) && (s1[columnNum] == s1[columnNum - 1]))
- anchorGapOpenScore = mBestScores[column + 1] - mHomoPolymerGapOpenPenalty;
-
- if(anchorGapExtendScore > anchorGapOpenScore) {
- mAnchorGapScores[column] = anchorGapExtendScore;
- // position - mBandwidth - 1 == position - (mBandwidth + 2) + 1: previous row, band column + 1
- mPointers[position].mSizeOfVerticalGaps = mPointers[position - mBandwidth - 1].mSizeOfVerticalGaps + 1;
- } else mAnchorGapScores[column] = anchorGapOpenScore;
-
- // ======================================
- // calculate the best score and direction
- // ======================================
-
- //mBestScores[column] = MaxFloats(totalSimilarityScore, mAnchorGapScores[column], currentQueryGapScore);
- mBestScores[column] = MaxFloats(totalSimilarityScore, currentQueryGapScore, mAnchorGapScores[column]);
-
- // determine the traceback direction
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- // Note the naming: a match/mismatch step is stored as Directions_UP and a gap in the reference
- // as Directions_DIAGONAL; Traceback() interprets the codes the same way.
- if(mBestScores[column] == 0) mPointers[position].Direction = Directions_STOP;
- else if(mBestScores[column] == totalSimilarityScore) mPointers[position].Direction = Directions_UP;
- else if(mBestScores[column] == currentQueryGapScore) mPointers[position].Direction = Directions_LEFT;
- else mPointers[position].Direction = Directions_DIAGONAL;
-
- return mBestScores[column];
-}
-
-// Corrects the homopolymer gap order for forward alignments.
-// Scans the aligned pair held in mReversedAnchor / mReversedQuery. Wherever a
-// gap lies inside a homopolymer run, the run in the gapped sequence is
-// rewritten so that its bases come first and its gap characters last.
-//   numBases      - number of alignment columns to scan
-//   numMismatches - mismatch/gap count of the alignment; nothing is done when it is 0
-// Prints an error and exits with status 1 if a column is gapped in both sequences.
-void CBandedSmithWaterman::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
-
-    // this is only required for alignments with mismatches
-    if ( numMismatches == 0 ) return;
-
-    // localize the alignment data
-    char* pReference = mReversedAnchor;
-    char* pQuery     = mReversedQuery;
-
-    // initialize
-    bool hasReferenceGap = false, hasQueryGap = false;
-    char* pNonGapSeq = NULL;
-    char* pGapSeq    = NULL;
-    char nonGapBase  = 'J';
-
-    // identify gapped regions
-    for(unsigned int i = 0; i < numBases; i++) {
-
-        // check if the current position is gapped
-        hasReferenceGap = false;
-        hasQueryGap     = false;
-
-        if(pReference[i] == GAP) {
-            hasReferenceGap = true;
-            pNonGapSeq      = pQuery;
-            pGapSeq         = pReference;
-            nonGapBase      = pQuery[i];
-        }
-
-        if(pQuery[i] == GAP) {
-            hasQueryGap = true;
-            pNonGapSeq  = pReference;
-            pGapSeq     = pQuery;
-            nonGapBase  = pReference[i];
-        }
-
-        // continue if we don't have any gaps
-        if(!hasReferenceGap && !hasQueryGap) continue;
-
-        // sanity check
-        if(hasReferenceGap && hasQueryGap) {
-            printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
-            exit(1);
-        }
-
-        // find the non-gapped length (forward)
-        // The counters and the scan index are unsigned int (they used to be
-        // unsigned short) so that columns beyond 65535 are not truncated.
-        unsigned int numGappedBases = 0;
-        unsigned int nonGapLength   = 0;
-        unsigned int testPos        = i;
-        while(testPos < numBases) {
-
-            const char gs  = pGapSeq[testPos];
-            const char ngs = pNonGapSeq[testPos];
-
-            // the run continues while the ungapped sequence repeats nonGapBase and
-            // the gapped sequence shows either that base or a gap
-            const bool isPartofHomopolymer = ((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase);
-            if(!isPartofHomopolymer) break;
-
-            if(gs == GAP) numGappedBases++;
-            else nonGapLength++;
-            testPos++;
-        }
-
-        // fix the gap order: bases first, then the gap characters
-        if(numGappedBases != 0) {
-            char* pCurrentSequence = pGapSeq + i;
-            memset(pCurrentSequence, nonGapBase, nonGapLength);
-            pCurrentSequence += nonGapLength;
-            memset(pCurrentSequence, GAP, numGappedBases);
-        }
-
-        // skip past the run just processed (the loop increment adds the final 1)
-        i += numGappedBases + nonGapLength - 1;
-    }
-}
-
-// Creates a simple scoring matrix to align the nucleotides and the ambiguity codes.
-// Rows and columns are indexed by (letter - 'A'). Identical letters score
-// mMatchScore, except N and X, which always score mMismatchScore; every other
-// pair scores mMismatchScore unless it is listed as an ambiguity-code pair below.
-void CBandedSmithWaterman::CreateScoringMatrix(void) {
-
-    const unsigned int nIndex = 13;   // 'N' - 'A'
-    const unsigned int xIndex = 23;   // 'X' - 'A'
-
-    // N.B. matching N to everything (while conceptually correct) leads to some
-    // bad alignments, so N (and X) are treated as mismatches instead.
-    for(unsigned int row = 0; row < MOSAIK_NUM_NUCLEOTIDES; row++) {
-        for(unsigned int col = 0; col < MOSAIK_NUM_NUCLEOTIDES; col++) {
-            const bool involvesNorX = (row == nIndex) || (col == nIndex) || (row == xIndex) || (col == xIndex);
-            mScoringMatrix[row][col] = (!involvesNorX && (row == col)) ? mMatchScore : mMismatchScore;
-        }
-    }
-
-    // ambiguity codes: each entry pairs a code with one base it matches; the
-    // score is stored symmetrically
-    static const char* const ambiguityPairs[] = {
-        "MA", "MC",
-        "RA", "RG",
-        "WA", "WT",
-        "SC", "SG",
-        "YC", "YT",
-        "KG", "KT",
-        "VA", "VC", "VG",
-        "HA", "HC", "HT",
-        "DA", "DG", "DT",
-        "BC", "BG", "BT"
-    };
-    const unsigned int numPairs = sizeof(ambiguityPairs) / sizeof(ambiguityPairs[0]);
-
-    for(unsigned int p = 0; p < numPairs; p++) {
-        const unsigned int code = ambiguityPairs[p][0] - 'A';
-        const unsigned int base = ambiguityPairs[p][1] - 'A';
-        mScoringMatrix[code][base] = mMatchScore;
-        mScoringMatrix[base][code] = mMatchScore;
-    }
-}
-
-// Enables homo-polymer scoring: CalculateScore() then uses hpGapOpenPenalty
-// instead of the regular gap open penalty when the current base equals the
-// previous base of the same sequence.
-void CBandedSmithWaterman::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
-    mHomoPolymerGapOpenPenalty    = hpGapOpenPenalty;
-    mUseHomoPolymerGapOpenPenalty = true;
-}
-
-// Reinitializes the matrices before an alignment.
-//   positionType - one of the Position_* constants; selects how the number of
-//                  backtrace rows is derived from the sequence lengths
-//   s1Length     - length of the reference (anchor) sequence
-//   s2Length     - length of the query sequence
-//   hr           - hash region; hr.first.first / hr.second.first are the
-//                  reference / query start offsets used in the row computation
-// The backtrace matrix and the reversed-sequence buffers only ever grow: they
-// are reallocated when the required size exceeds the current capacity.
-// Prints an error and exits with status 1 if an allocation fails.
-void CBandedSmithWaterman::ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr) {
-
-    const unsigned int numColumns = mBandwidth + 2;
-    unsigned int numRows = 0;
-
-    switch(positionType) {
-        case Position_REF_AND_QUERY_ZERO:
-            numRows = s2Length + 1;
-            break;
-        case Position_REF_ZERO:
-            numRows = s2Length - hr.second.first + 2;
-            break;
-        case Position_QUERY_ZERO:
-            numRows = min(s2Length + 1, s1Length - hr.first.first + 2);
-            break;
-        case Position_REF_AND_QUERO_NONZERO:
-            numRows = min(s1Length - hr.first.first + 2, s2Length - hr.second.first + 2);
-            break;
-    }
-
-    // update the size of the backtrace matrix
-    if((numColumns * numRows) > mCurrentMatrixSize) {
-
-        mCurrentMatrixSize = numColumns * numRows;
-        if(mPointers) delete [] mPointers;
-
-        try {
-            mPointers = new ElementInfo[mCurrentMatrixSize];
-        } catch(const bad_alloc&) {   // catch by const reference: no copy of the exception object
-            printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
-            exit(1);
-        }
-    }
-
-    // initialize our backtrace matrix: every cell stops the traceback and
-    // carries a gap run length of 1
-    ElementInfo defaultElement;
-    defaultElement.Direction             = Directions_STOP;
-    defaultElement.mSizeOfHorizontalGaps = 1;
-    defaultElement.mSizeOfVerticalGaps   = 1;
-
-    // the whole allocated capacity is reset, not only numColumns * numRows cells
-    uninitialized_fill(mPointers, mPointers + mCurrentMatrixSize, defaultElement);
-
-    // update the sequence character arrays
-    if((s1Length + s2Length) > mCurrentAQSumSize) {
-
-        mCurrentAQSumSize = s1Length + s2Length;
-        if(mReversedAnchor) delete [] mReversedAnchor;
-        if(mReversedQuery) delete [] mReversedQuery;
-
-        try {
-            mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
-            mReversedQuery  = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
-        } catch(const bad_alloc&) {
-            printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
-            exit(1);
-        }
-    }
-
-    // initialize the gap score and score vectors; the two outermost band
-    // columns of mBestScores act as "minus infinity" borders
-    uninitialized_fill(mAnchorGapScores, mAnchorGapScores + mBandwidth + 2, FLOAT_NEGATIVE_INFINITY);
-    memset((char*)mBestScores, 0, SIZEOF_FLOAT * (mBandwidth + 2));
-    mBestScores[0]              = FLOAT_NEGATIVE_INFINITY;
-    mBestScores[mBandwidth + 1] = FLOAT_NEGATIVE_INFINITY;
-}
-
-// Performs the backtrace algorithm: walks mPointers back from the best-scoring cell, writes the
-// gapped alignment into mReversedAnchor / mReversedQuery and builds the CIGAR string.
-//   referenceAl - out: reference column of the last alignment column visited by the walk,
-//                 i.e. where the alignment begins
-//   cigarAl     - out: CIGAR string, with 'S' operations for query bases before and after the alignment
-//   s1, s2      - anchor (reference) and query sequences
-//   bestRow, bestColumn     - cell at which the walk starts
-//   rowOffset, columnOffset - translate (row, column) into band-matrix coordinates
-void CBandedSmithWaterman::Traceback(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset){
-
-
- unsigned int currentRow = bestRow;
- unsigned int currentColumn = bestColumn;
- unsigned int currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
-
-
- // record the numbers of row and column before the current row and column
- unsigned int previousRow = bestRow;
- unsigned int previousColumn = bestColumn;
-
- unsigned int gappedAnchorLen = 0;
- unsigned int gappedQueryLen = 0;
- unsigned int numMismatches = 0;
-
- // NOTE(review): currentRow / currentColumn are unsigned and are decremented below; presumably a
- // Directions_STOP cell is always reached before either would drop below zero -- confirm
- bool keepProcessing = true;
- while(keepProcessing) {
- unsigned int nVerticalGap = 0;
- unsigned int nHorizontalGap = 0;
- switch(mPointers[currentPosition].Direction){
- // gap in the reference: emit mSizeOfVerticalGaps query bases against GAP
- case Directions_DIAGONAL:
- nVerticalGap = mPointers[currentPosition].mSizeOfVerticalGaps;
- for(unsigned int i = 0; i < nVerticalGap; i++){
- mReversedAnchor[gappedAnchorLen++] = GAP;
- mReversedQuery[gappedQueryLen++] = s2[currentRow];
-
- numMismatches++;
-
- previousRow = currentRow;
- previousColumn = currentColumn;
-
- currentRow--;
- }
- break;
-
- case Directions_STOP:
- keepProcessing = false;
- //mReversedAnchor[gappedAnchorLen+1]='\0';
- //mReversedQuery [gappedQueryLen+1]='\0';
- break;
-
- // match or mismatch: consume one base of each sequence
- case Directions_UP:
-
- mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
- mReversedQuery[gappedQueryLen++] = s2[currentRow];
-
- if(s1[currentColumn] != s2[currentRow]) numMismatches++;
- previousRow = currentRow;
- previousColumn = currentColumn;
-
- currentRow--;
- currentColumn--;
- break;
-
- // gap in the query: emit mSizeOfHorizontalGaps reference bases against GAP
- case Directions_LEFT:
- nHorizontalGap = mPointers[currentPosition].mSizeOfHorizontalGaps;
- for(unsigned int i = 0; i < nHorizontalGap; i++){
-
- mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
- mReversedQuery[gappedQueryLen++] = GAP;
-
- numMismatches++;
-
- previousRow = currentRow;
- previousColumn = currentColumn;
-
-
- currentColumn--;
- }
- break;
- }
- currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
- }
-
- // correct the reference and query sequence order
- // (the walk ran from the end of the alignment to its start, so both buffers are reversed in place)
- mReversedAnchor[gappedAnchorLen] = 0;
- mReversedQuery [gappedQueryLen] = 0;
- reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
- reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
-
- //alignment.Reference = mReversedAnchor;
- //alignment.Query = mReversedQuery;
-
- // assign the alignment endpoints
- //alignment.ReferenceBegin = previousColumn;
- //alignment.ReferenceEnd = bestColumn;
- referenceAl = previousColumn;
- /*
- if(alignment.IsReverseComplement){
- alignment.QueryBegin = s2.length() - bestRow - 1;
- alignment.QueryEnd = s2.length() - previousRow - 1;
- } else {
- alignment.QueryBegin = previousRow;
- alignment.QueryEnd = bestRow;
- }
- */
-
- //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
- //alignment.NumMismatches = numMismatches;
-
- // build the CIGAR string: m, d and i count the current run of aligned columns, columns with a
- // gap in the query and columns with a gap in the reference respectively
- const unsigned int alLength = strlen(mReversedAnchor);
- unsigned int m = 0, d = 0, i = 0;
- bool dashRegion = false;
- ostringstream oCigar;
-
- // query bases before the start of the alignment are reported as a soft clip
- if ( previousRow != 0 )
- oCigar << previousRow << 'S';
-
- for ( unsigned int j = 0; j < alLength; j++ ) {
- // m
- if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
- if ( dashRegion ) {
- if ( d != 0 ) oCigar << d << 'D';
- else oCigar << i << 'I';
- }
- dashRegion = false;
- m++;
- d = 0;
- i = 0;
- }
- // I or D
- else {
- if ( !dashRegion )
- oCigar << m << 'M';
- dashRegion = true;
- m = 0;
- if ( mReversedAnchor[j] == GAP ) {
- if ( d != 0 ) oCigar << d << 'D';
- i++;
- d = 0;
- }
- else {
- if ( i != 0 ) oCigar << i << 'I';
- d++;
- i = 0;
- }
- }
- }
-
- // flush the run that is still open at the end of the alignment
- if ( m != 0 ) oCigar << m << 'M';
- else if ( d != 0 ) oCigar << d << 'D';
- else if ( i != 0 ) oCigar << i << 'I';
-
- // query bases after the end of the alignment are reported as a soft clip
- if ( ( bestRow + 1 ) != s2.length() )
- oCigar << s2.length() - bestRow - 1 << 'S';
-
- cigarAl = oCigar.str();
-
-
- // correct the homopolymer gap order
- // (this runs after cigarAl has been assigned, so it only changes the internal buffers)
- CorrectHomopolymerGapOrder(alLength, numMismatches);
-
-}
diff --git a/external/vcflib/smithwaterman/BandedSmithWaterman.h b/external/vcflib/smithwaterman/BandedSmithWaterman.h
deleted file mode 100644
index ca78ac2..0000000
--- a/external/vcflib/smithwaterman/BandedSmithWaterman.h
+++ /dev/null
@@ -1,111 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <algorithm>
-#include <memory>
-//#include "Alignment.h"
-#include "Mosaik.h"
-//#include "HashRegion.h"
-#include <string.h>
-#include <stdio.h>
-#include <sstream>
-#include <string>
-
-using namespace std;
-
-#define MOSAIK_NUM_NUCLEOTIDES 26
-#define GAP '-'
-
-typedef unsigned char DirectionType;
-typedef unsigned char PositionType;
-
-// One cell of the backtrace matrix, stored as three bit-fields totalling 32 bits.
-// NOTE(review): the 15-bit gap counters can hold values up to 32767; confirm that
-// longer gap runs cannot occur.
-struct ElementInfo {
- unsigned int Direction : 2; // backtrace direction code (one of the Directions_* values, 0-3)
- unsigned int mSizeOfVerticalGaps : 15; // number of query bases Traceback() emits against a gap in the reference
- unsigned int mSizeOfHorizontalGaps : 15; // number of reference bases Traceback() emits against a gap in the query
-};
-
-// Banded Smith-Waterman-Gotoh aligner. The scoring parameters and band width are fixed at
-// construction; Align() may be called repeatedly and reuses the internal buffers.
-// NOTE(review): the class owns raw buffers released in its destructor but declares no copy
-// constructor or assignment operator, so copying an instance would presumably end in a double
-// delete [] -- confirm that instances are never copied.
-class CBandedSmithWaterman {
-public:
- // constructor
- CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth);
- // destructor
- ~CBandedSmithWaterman(void);
- // aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm
- // (referenceAl and stringAl are outputs; hr is modified in place)
- void Align(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr);
- // enables homo-polymer scoring
- void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
-private:
- // calculates the score during the forward algorithm
- float CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset);
- // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
- void CreateScoringMatrix(void);
- // corrects the homopolymer gap order for forward alignments
- void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
- // returns the maximum floating point number (never less than 0)
- static inline float MaxFloats(const float& a, const float& b, const float& c);
- // reinitializes the matrices
- void ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr);
- // performs the backtrace algorithm
- void Traceback(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset);
- // updates the best score during the forward algorithm
- inline void UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score);
- // our simple scoring matrix, indexed by (letter - 'A') for both sequences
- float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
- // keep track of maximum initialized sizes
- unsigned int mCurrentMatrixSize;
- // NOTE(review): mCurrentAnchorSize is set to 0 by the constructor and not read anywhere in BandedSmithWaterman.cpp
- unsigned int mCurrentAnchorSize;
- unsigned int mCurrentAQSumSize;
- unsigned int mBandwidth;
- // define our backtrace directions
- const static DirectionType Directions_STOP;
- const static DirectionType Directions_LEFT;
- const static DirectionType Directions_DIAGONAL;
- const static DirectionType Directions_UP;
- // store the backtrace pointers
- ElementInfo* mPointers;
- // define our position types
- const static PositionType Position_REF_AND_QUERY_ZERO;
- const static PositionType Position_REF_ZERO;
- const static PositionType Position_QUERY_ZERO;
- const static PositionType Position_REF_AND_QUERO_NONZERO;
- // define scoring constants
- const float mMatchScore;
- const float mMismatchScore;
- const float mGapOpenPenalty;
- const float mGapExtendPenalty;
- // score if xi aligns to a gap after yi
- float* mAnchorGapScores;
- // best score of alignment x1...xi to y1...yi
- float* mBestScores;
- // our reversed alignment
- char* mReversedAnchor;
- char* mReversedQuery;
- // define static constants
- static const float FLOAT_NEGATIVE_INFINITY;
- // toggles the use of the homo-polymer gap open penalty
- bool mUseHomoPolymerGapOpenPenalty;
- float mHomoPolymerGapOpenPenalty;
-};
-
-// Returns the largest of a, b and c, but never less than 0: the running
-// maximum starts at zero, so the result is clamped to be non-negative.
-inline float CBandedSmithWaterman::MaxFloats(const float& a, const float& b, const float& c) {
-    float best = (a > 0.0f) ? a : 0.0f;
-    best = (b > best) ? b : best;
-    return (c > best) ? c : best;
-}
-
-// Updates the best score during the forward algorithm: when score is strictly
-// greater than bestScore, the cell (rowNum, columnNum) and its score replace
-// the current best. Ties keep the earlier cell.
-inline void CBandedSmithWaterman::UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score) {
-
-    if(!(score > bestScore)) return;
-
-    bestScore  = score;
-    bestColumn = columnNum;
-    bestRow    = rowNum;
-}
diff --git a/external/vcflib/smithwaterman/IndelAllele.cpp b/external/vcflib/smithwaterman/IndelAllele.cpp
deleted file mode 100644
index 80b5fac..0000000
--- a/external/vcflib/smithwaterman/IndelAllele.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "IndelAllele.h"
-
-using namespace std;
-
-
-// Returns true when every base of the indel sequence is the same character.
-// An empty sequence contains no base and is reported as not being a
-// homopolymer; previously this case dereferenced the end iterator.
-bool IndelAllele::homopolymer(void) {
-    if (sequence.empty()) return false;
-    // no character differing from the first one means a single repeated base
-    return sequence.find_first_not_of(sequence[0]) == string::npos;
-}
-
-// Number of bases the indel occupies in the read: its length for an
-// insertion, 0 for a deletion.
-int IndelAllele::readLength(void) {
-    return insertion ? length : 0;
-}
-
-// Number of bases the indel occupies in the reference: 0 for an insertion,
-// its length for a deletion.
-int IndelAllele::referenceLength(void) {
-    return insertion ? 0 : length;
-}
-
-// Returns true when every character of sequence is the same.
-// An empty sequence contains no base and is reported as not being a
-// homopolymer; previously this case dereferenced the end iterator.
-bool homopolymer(std::string sequence) {
-    if (sequence.empty()) return false;
-    // no character differing from the first one means a single repeated base
-    return sequence.find_first_not_of(sequence[0]) == std::string::npos;
-}
-
-// Writes the indel as "<i|d>:<position>:<readPosition>:<length>:<sequence>",
-// where the first field is "i" for an insertion and "d" for a deletion.
-ostream& operator<<(ostream& out, const IndelAllele& indel) {
-    const string kind(indel.insertion ? "i" : "d");
-    out << kind
-        << ":" << indel.position
-        << ":" << indel.readPosition
-        << ":" << indel.length
-        << ":" << indel.sequence;
-    return out;
-}
-
-// Two indels are equal when type, length, reference position and sequence
-// all match; readPosition does not take part in the comparison.
-bool operator==(const IndelAllele& a, const IndelAllele& b) {
-    if (a.insertion != b.insertion) return false;
-    if (a.length != b.length) return false;
-    if (a.position != b.position) return false;
-    return a.sequence == b.sequence;
-}
-
-// Negation of operator== for IndelAllele.
-bool operator!=(const IndelAllele& a, const IndelAllele& b) {
-    const bool same = (a == b);
-    return !same;
-}
-
-// Orders indels by comparing the text that operator<< produces for each of
-// them, i.e. a lexicographic comparison of the rendered strings.
-bool operator<(const IndelAllele& a, const IndelAllele& b) {
-    ostringstream lhs;
-    lhs << a;
-    ostringstream rhs;
-    rhs << b;
-    const string left  = lhs.str();
-    const string right = rhs.str();
-    return left < right;
-}
diff --git a/external/vcflib/smithwaterman/IndelAllele.h b/external/vcflib/smithwaterman/IndelAllele.h
deleted file mode 100644
index 247c01f..0000000
--- a/external/vcflib/smithwaterman/IndelAllele.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __INDEL_ALLELE_H
-#define __INDEL_ALLELE_H
-
-#include <string>
-#include <iostream>
-#include <sstream>
-
-using namespace std;
-
-// Describes a single insertion or deletion found in an alignment.
-class IndelAllele {
- friend ostream& operator<<(ostream&, const IndelAllele&);
- friend bool operator==(const IndelAllele&, const IndelAllele&);
- friend bool operator!=(const IndelAllele&, const IndelAllele&);
- friend bool operator<(const IndelAllele&, const IndelAllele&);
-public:
- bool insertion; // true for an insertion, false for a deletion
- int length; // length of the indel
- int referenceLength(void); // 0 for an insertion, length for a deletion
- int readLength(void); // length for an insertion, 0 for a deletion
- int position; // presumably the indel's position in the reference -- TODO confirm
- int readPosition; // presumably the indel's position in the read -- TODO confirm
- string sequence; // the indel's bases
-
- // true when all bases of sequence are identical
- bool homopolymer(void);
-
- // i: insertion flag, l: length, p: position, rp: read position, s: sequence
- IndelAllele(bool i, int l, int p, int rp, string s)
- : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
- { }
-};
-
-bool homopolymer(string sequence);
-ostream& operator<<(ostream& out, const IndelAllele& indel);
-bool operator==(const IndelAllele& a, const IndelAllele& b);
-bool operator!=(const IndelAllele& a, const IndelAllele& b);
-bool operator<(const IndelAllele& a, const IndelAllele& b);
-
-#endif
diff --git a/external/vcflib/smithwaterman/LeftAlign.cpp b/external/vcflib/smithwaterman/LeftAlign.cpp
deleted file mode 100644
index bff36c6..0000000
--- a/external/vcflib/smithwaterman/LeftAlign.cpp
+++ /dev/null
@@ -1,853 +0,0 @@
-#include "LeftAlign.h"
-
-//bool debug;
-#define VERBOSE_DEBUG
-
-// Attempts to left-realign all the indels represented by the alignment cigar.
-//
-// This is done by shifting all indels as far left as they can go without
-// mismatch, then merging neighboring indels of the same class. leftAlign
-// updates the alignment cigar with changes, and returns true if realignment
-// changed the alignment cigar.
-//
-// To left-align, we move multi-base indels left by their own length as long as
-// the preceding bases match the inserted or deleted sequence. After this
-// step, we handle multi-base homopolymer indels by shifting them one base to
-// the left until they mismatch the reference.
-//
-// To merge neighboring indels, we iterate through the set of left-stabilized
-// indels. For each indel we add a new cigar element to the new cigar. If a
-// deletion follows a deletion, or an insertion occurs at the same place as
-// another insertion, we merge the events by extending the previous cigar
-// element.
-//
-// In practice, we must call this function until the alignment is stabilized.
-//
-bool leftAlign(string& querySequence, string& cigar, string& baseReferenceSequence, int& offset, bool debug) {
-
- debug = false;
-
- string referenceSequence = baseReferenceSequence.substr(offset);
-
- int arsOffset = 0; // pointer to insertion point in aligned reference sequence
- string alignedReferenceSequence, alignedQuerySequence;
- if (debug) alignedReferenceSequence = referenceSequence;
- if (debug) alignedQuerySequence = querySequence;
- int aabOffset = 0;
-
- // store information about the indels
- vector<IndelAllele> indels;
-
- int rp = 0; // read position, 0-based relative to read
- int sp = 0; // sequence position
-
- string softBegin;
- string softEnd;
-
- string cigarbefore = cigar;
-
- vector<pair<int, string> > cigarData = splitCIGAR(cigar);
- for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
- c != cigarData.end(); ++c) {
- unsigned int l = c->first;
- string t = c->second;
- if (debug) cerr << l << t << " " << sp << " " << rp << endl;
- if (t == "M") { // match or mismatch
- sp += l;
- rp += l;
- } else if (t == "D") { // deletion
- indels.push_back(IndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
- if (debug) { cerr << indels.back() << endl; alignedQuerySequence.insert(rp + aabOffset, string(l, '-')); }
- aabOffset += l;
- sp += l; // update reference sequence position
- } else if (t == "I") { // insertion
- indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- if (debug) { cerr << indels.back() << endl; alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-')); }
- arsOffset += l;
- rp += l;
- } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
- // remove these bases from the refseq and read seq, but don't modify the alignment sequence
- if (rp == 0) {
- alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
- //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- softBegin = querySequence.substr(0, l);
- } else {
- alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
- //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- softEnd = querySequence.substr(querySequence.size() - l, l);
- }
- rp += l;
- } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
- } else if (t == "N") { // skipped region in the reference not present in read, aka splice
- sp += l;
- }
- }
-
-
- if (debug) cerr << "| " << cigarbefore << endl
- << "| " << alignedReferenceSequence << endl
- << "| " << alignedQuerySequence << endl;
-
- // if no indels, return the alignment
- if (indels.empty()) { return false; }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- // for each indel, from left to right
- // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
- // move the indel left
-
- vector<IndelAllele>::iterator previous = indels.begin();
- for (vector<IndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
-
- // left shift by repeats
- //
- // from 1 base to the length of the indel, attempt to shift left
- // if the move would cause no change in alignment optimality (no
- // introduction of mismatches, and by definition no change in gap
- // length), move to the new position.
- // in practice this moves the indel left when we reach the size of
- // the repeat unit.
- //
- int steppos, readsteppos;
- IndelAllele& indel = *id;
- int i = 1;
-
- while (i <= indel.length) {
-
- int steppos = indel.position - i;
- int readsteppos = indel.readPosition - i;
-
- if (debug) {
- if (steppos >= 0 && readsteppos >= 0) {
- cerr << "refseq flank " << referenceSequence.substr(steppos, indel.length) << endl;
- cerr << "qryseq flank " << querySequence.substr(readsteppos, indel.length) << endl;
- cerr << "indelseq " << indel.sequence << endl;
- }
- }
-
- while (steppos >= 0 && readsteppos >= 0
- && indel.sequence == referenceSequence.substr(steppos, indel.length)
- && indel.sequence == querySequence.substr(readsteppos, indel.length)
- && (id == indels.begin()
- || (previous->insertion && steppos >= previous->position)
- || (!previous->insertion && steppos >= previous->position + previous->length))) {
- LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
- indel.position -= i;
- indel.readPosition -= i;
- steppos = indel.position - i;
- readsteppos = indel.readPosition - i;
- }
- do {
- ++i;
- } while (i <= indel.length && indel.length % i != 0);
- }
-
-
-
- // left shift indels with exchangeable flanking sequence
- //
- // for example:
- //
- // GTTACGTT GTTACGTT
- // GT-----T ----> G-----TT
- //
- // GTGTGACGTGT GTGTGACGTGT
- // GTGTG-----T ----> GTG-----TGT
- //
- // GTGTG-----T GTG-----TGT
- // GTGTGACGTGT ----> GTGTGACGTGT
- //
- //
-
- steppos = indel.position - 1;
- readsteppos = indel.readPosition - 1;
- while (steppos >= 0 && readsteppos >= 0
- && querySequence.at(readsteppos) == referenceSequence.at(steppos)
- && (int)referenceSequence.size() > steppos + indel.length
- && indel.sequence.at((int) indel.sequence.size() - 1) == referenceSequence.at(steppos + indel.length) // are the exchanged bases going to match wrt. the reference?
- && querySequence.at(readsteppos) == indel.sequence.at((int) indel.sequence.size() - 1)
- && (id == indels.begin()
- || (previous->insertion && indel.position - 1 >= previous->position)
- || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
- if (debug) cerr << (indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl;
- indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
- indel.position -= 1;
- indel.readPosition -= 1;
- if (debug) cerr << indel << endl;
- steppos = indel.position - 1;
- readsteppos = indel.readPosition - 1;
- //if (debug && steppos && readsteppos) cerr << querySequence.at(readsteppos) << " ==? " << referenceSequence.at(steppos) << endl;
- //if (debug && steppos && readsteppos) cerr << indel.sequence.at((int) indel.sequence.size() - 1) << " ==? " << referenceSequence.at(steppos + indel.length) << endl;
- }
- // tracks previous indel, so we don't run into it with the next shift
- previous = id;
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- if (debug) cerr << "bring together floating indels" << endl;
-
- // bring together floating indels
- // from left to right
- // check if we could merge with the next indel
- // if so, adjust so that we will merge in the next step
- if (indels.size() > 1) {
- previous = indels.begin();
- for (vector<IndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
- IndelAllele& indel = *id;
- // parsimony: could we shift right and merge with the previous indel?
- // if so, do it
- int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
- int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
- if (previous->insertion == indel.insertion
- && ((previous->insertion
- && (previous->position < indel.position
- && previous->readPosition < indel.readPosition))
- ||
- (!previous->insertion
- && (previous->position + previous->length < indel.position)
- && (previous->readPosition < indel.readPosition)
- ))) {
- if (previous->homopolymer()) {
- string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
- string readseq = querySequence.substr(prev_end_read, indel.position - prev_end_ref);
- if (debug) cerr << "seq: " << seq << endl << "readseq: " << readseq << endl;
- if (previous->sequence.at(0) == seq.at(0)
- && homopolymer(seq)
- && homopolymer(readseq)) {
- if (debug) cerr << "moving " << *previous << " right to "
- << (indel.insertion ? indel.position : indel.position - previous->length) << endl;
- previous->position = indel.insertion ? indel.position : indel.position - previous->length;
- previous->readPosition = !indel.insertion ? indel.readPosition : indel.readPosition - previous->readLength(); // should this be readLength?
- }
- }
- /*
- else {
- int pos = previous->position;
- int readpos = previous->readPosition;
- while (pos < (int) referenceSequence.length() &&
- ((previous->insertion && pos + previous->length <= indel.position)
- ||
- (!previous->insertion && pos + previous->length < indel.position))
- && previous->sequence == referenceSequence.substr(pos + previous->length, previous->length)
- && previous->sequence == querySequence.substr(readpos + previous->length, previous->length)
- ) {
- pos += previous->length;
- readpos += previous->length;
- }
- string seq = previous->sequence;
- if (pos > previous->position) {
- // wobble bases right to left as far as we can go
- int steppos = previous->position + seq.size();
- int readsteppos = previous->readPosition + seq.size();
-
- while (querySequence.at(readsteppos) == referenceSequence.at(steppos)
- && querySequence.at(readsteppos) == seq.at(0)
- && (id == indels.begin()
- || (indel.insertion && pos + seq.size() - 1 <= indel.position)
- || (!previous->insertion && indel.position - 1 >= pos + previous->length))) {
- seq = seq.substr(1) + seq.at(0);
- ++pos;
- ++readpos;
- steppos = pos + 1;
- readsteppos = readpos + 1;
- }
-
- if (((previous->insertion && pos + previous->length == indel.position)
- ||
- (!previous->insertion && pos == indel.position - previous->length))
- ) {
- if (debug) cerr << "right-merging tandem repeat: moving " << *previous << " right to " << pos << endl;
- previous->position = pos;
- previous->readPosition = readpos;
- previous->sequence = seq;
- }
- }
- }
- */
- }
- previous = id;
- }
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
-
- if (debug) cerr << "bring in indels at ends of read" << endl;
-
- // try to "bring in" repeat indels at the end, for maximum parsimony
- //
- // e.g.
- //
- // AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
- // AGAAAG------AAAGAAAAAGAACCAAGAAGAAAA
- //
- // has no information which distinguishes it from:
- //
- // AGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
- // AGAAAG--AAAGAAAAAGAACCAAGAAGAAAA
- //
- // here we take the parsimonious explanation
-
- if (!indels.empty()) {
- // deal with the first indel
- // the first deletion ... or the biggest deletion
- vector<IndelAllele>::iterator a = indels.begin();
- vector<IndelAllele>::iterator del = indels.begin();
- for (; a != indels.end(); ++a) {
- //if (!a->insertion && a->length > biggestDel->length) biggestDel = a;
- if (!a->insertion && a->length) del = a;
- if (!del->insertion) {
- //if (!indel.insertion) { // only handle deletions like this for now
- //if (!indel.insertion && !(indels.size() > 1 && indel.readPosition == indels.at(1).readPosition)) { // only handle deletions like this for now
- int insertedBpBefore = 0;
- int deletedBpBefore = 0;
- for (vector<IndelAllele>::iterator i = indels.begin(); i != del; ++i) {
- if (i->insertion) insertedBpBefore += i->length;
- else deletedBpBefore += i->length;
- }
- IndelAllele& indel = *del;
- int minsize = indel.length;
- int flankingLength = indel.readPosition;
- if (debug) cerr << indel << endl;
- string flanking = querySequence.substr(0, flankingLength);
- if (debug) cerr << flanking << endl;
-
- size_t p = referenceSequence.substr(0, indel.position + indel.length).rfind(flanking);
- if (p == string::npos) {
- if (debug) cerr << "flanking not found" << endl;
- } else {
- if (debug) {
- cerr << "flanking is at " << p << endl;
- cerr << "minsize would be " << (indel.position + indel.length) - ((int) p + flankingLength) << endl;
- }
- minsize = (indel.position + indel.length) - ((int) p + flankingLength);
- }
-
- if (debug) cerr << minsize << endl;
-
- if (minsize >= 0 && minsize < indel.length) {
-
- int softdiff = softBegin.size();
- if (!softBegin.empty()) { // remove soft clips if we can
- if (flankingLength < (int)softBegin.size()) {
- softBegin = softBegin.substr(0, flankingLength - softBegin.size());
- softdiff -= softBegin.size();
- } else {
- softBegin.clear();
- }
- }
-
- // the new read position == the current read position
- // the new reference position == the flanking length size
- // the positional offset of the reference sequence == the new position of the deletion - the flanking length
-
- int diff = indel.length - minsize - softdiff + deletedBpBefore - insertedBpBefore;
- //int querydiff = indel.length - minsize - softBegin.size() - insertedBpBefore + deletedBpBefore;
- if (debug) cerr << "adjusting " << indel.length <<" " << minsize << " " << softdiff << " " << diff << endl;
- offset += diff;
- ///
- indel.length = minsize;
- indel.sequence = indel.sequence.substr(indel.sequence.size() - minsize);
- indel.position = flankingLength;
- indel.readPosition = indel.position; // if we have removed all the sequence before, this should be ==
- referenceSequence = referenceSequence.substr(diff);
-
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i < del) {
- i->length = 0; // remove
- } else if (i > del) {
- i->position -= diff;
- }
- }
- }
- if (debug) cerr << indel << endl;
-
- // now, do the same for the reverse
- if (indel.length > 0) {
- int minsize = indel.length + 1;
- int flankingLength = querySequence.size() - indel.readPosition + indel.readLength();
- string flanking = querySequence.substr(indel.readPosition + indel.readLength(), flankingLength);
- //int indelRefEnd = indel.position + indel.referenceLength();
-
- size_t p = referenceSequence.find(flanking, indel.position);
- if (p == string::npos) {
- if (debug)
- cerr << "flanking not found" << endl;
- } else {
- if (debug) {
- cerr << "flanking is at " << p << endl;
- cerr << "minsize would be " << (int) p - indel.position << endl;
- }
- minsize = (int) p - indel.position;
- }
-
- if (debug) cerr << "minsize " << minsize << endl;
- if (minsize >= 0 && minsize <= indel.length) {
- //referenceSequence = referenceSequence.substr(0, referenceSequence.size() - (indel.length - minsize));
- if (debug) cerr << "adjusting " << indel << endl;
- indel.length = minsize;
- indel.sequence = indel.sequence.substr(0, minsize);
- //cerr << indel << endl;
- if (!softEnd.empty()) { // remove soft clips if we can
- if (flankingLength < (int)softEnd.size()) {
- softEnd = softEnd.substr(flankingLength - softEnd.size());
- } else {
- softEnd.clear();
- }
- }
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i > del) {
- i->length = 0; // remove
- }
- }
- }
- }
- }
- }
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- if (debug) cerr << "parsing indels" << endl;
-
- // if soft clipping can be reduced by adjusting the tailing indels in the read, do it
- // TODO
-
- /*
- int numEmptyIndels = 0;
-
- if (!indels.empty()) {
- vector<IndelAllele>::iterator a = indels.begin();
- while (a != indels.end()) {
- if (debug) cerr << "parsing " << *a << endl;
- if (!(a->length > 0 && a->position >= 0)) {
- ++numEmptyIndels;
- }
- ++a;
- }
- }
- */
-
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i->length == 0) continue;
- if (i->insertion) {
- if (querySequence.substr(i->readPosition, i->readLength()) != i->sequence) {
- cerr << "failure: " << *i << " should be " << querySequence.substr(i->readPosition, i->readLength()) << endl;
- cerr << baseReferenceSequence << endl;
- cerr << querySequence << endl;
- throw 1;
- }
- } else {
- if (referenceSequence.substr(i->position, i->length) != i->sequence) {
- cerr << "failure: " << *i << " should be " << referenceSequence.substr(i->position, i->length) << endl;
- cerr << baseReferenceSequence << endl;
- cerr << querySequence << endl;
- throw 1;
- }
- }
- }
-
- if (indels.size() > 1) {
- vector<IndelAllele>::iterator id = indels.begin();
- while ((id + 1) != indels.end()) {
- if (debug) {
- cerr << "indels: ";
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- //for (vector<IndelAllele>::iterator a = newIndels.begin(); a != newIndels.end(); ++a) cerr << *a << " ";
- //cerr << endl;
- }
-
- // get the indels to try to merge
- while (id->length == 0 && (id + 1) != indels.end()) ++id;
- vector<IndelAllele>::iterator idn = (id + 1);
- while (idn != indels.end() && idn->length == 0) ++idn;
- if (idn == indels.end()) break;
-
- IndelAllele& indel = *idn;
- IndelAllele& last = *id;
- if (debug) cerr << "trying " << last << " against " << indel << endl;
-
- int lastend = last.insertion ? last.position : (last.position + last.length);
- if (indel.position == lastend) {
- if (debug) cerr << "indel.position " << indel.position << " lastend " << lastend << endl;
- if (indel.insertion == last.insertion) {
- last.length += indel.length;
- last.sequence += indel.sequence;
- indel.length = 0;
- indel.sequence.clear();
- id = idn;
- } else if (last.length && indel.length) { // if the end of the previous == the start of the current, cut it off of both the ins and the del
- if (debug) cerr << "Merging " << last << " " << indel << endl;
- int matchsize = 1;
- int biggestmatchsize = 0;
-
- while (matchsize <= (int)last.sequence.size() && matchsize <= (int)indel.sequence.size()) {
- if (last.sequence.substr(last.sequence.size() - matchsize) == indel.sequence.substr(0, matchsize)) {
- biggestmatchsize = matchsize;
- }
- ++matchsize;
- }
- if (debug) cerr << "biggestmatchsize " << biggestmatchsize << endl;
-
- last.sequence = last.sequence.substr(0, last.sequence.size() - biggestmatchsize);
- last.length -= biggestmatchsize;
- indel.sequence = indel.sequence.substr(biggestmatchsize);
- indel.length -= biggestmatchsize;
- if (indel.insertion) indel.readPosition += biggestmatchsize;
- else indel.position += biggestmatchsize;
-
- if (indel.length > 0) {
- id = idn;
- }
- }
- } else {
- if (last.insertion != indel.insertion) {
- if (debug) cerr << "merging by overlap " << last << " " << indel << endl;
- // see if we can slide the sequence in between these two indels together
- string lastOverlapSeq;
- string indelOverlapSeq;
-
- if (last.insertion) {
- lastOverlapSeq =
- last.sequence
- + querySequence.substr(last.readPosition + last.readLength(),
- indel.readPosition - (last.readPosition + last.readLength()));
- indelOverlapSeq =
- referenceSequence.substr(last.position + last.referenceLength(),
- indel.position - (last.position + last.referenceLength()))
- + indel.sequence;
- } else {
- lastOverlapSeq =
- last.sequence
- + referenceSequence.substr(last.position + last.referenceLength(),
- indel.position - (last.position + last.referenceLength()));
- indelOverlapSeq =
- querySequence.substr(last.readPosition + last.readLength(),
- indel.readPosition - (last.readPosition + last.readLength()))
- + indel.sequence;
- }
-
- if (debug) {
- if (!last.insertion) {
- if (last.insertion) cerr << string(last.length, '-');
- cerr << lastOverlapSeq;
- if (indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- if (!last.insertion) cerr << string(last.length, '-');
- cerr << indelOverlapSeq;
- if (!indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- } else {
- if (last.insertion) cerr << string(last.length, '-');
- cerr << indelOverlapSeq;
- if (indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- if (!last.insertion) cerr << string(last.length, '-');
- cerr << lastOverlapSeq;
- if (!indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- }
- }
-
-
- int dist = min(last.length, indel.length);
- int matchingInBetween = indel.position - (last.position + last.referenceLength());
- int previousMatchingInBetween = matchingInBetween;
- //int matchingInBetween = indel.position - last.position;
- if (debug) cerr << "matchingInBetween " << matchingInBetween << endl;
- if (debug) cerr << "dist " << dist << endl;
- //int mindist = matchingInBetween - dist;
- if (lastOverlapSeq == indelOverlapSeq) {
- matchingInBetween = lastOverlapSeq.size();
- } else {
- // TODO change to use string::find()
- for (int i = dist; i > 0; --i) {
- if (debug) cerr << "lastoverlap: "
- << lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
- << " thisoverlap: "
- << indelOverlapSeq.substr(0, i + previousMatchingInBetween) << endl;
- if (lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
- == indelOverlapSeq.substr(0, i + previousMatchingInBetween)) {
- matchingInBetween = previousMatchingInBetween + i;
- break;
- }
- }
- }
- //cerr << last << " " << indel << endl;
- if (matchingInBetween > 0 && matchingInBetween > previousMatchingInBetween) {
- if (debug) cerr << "matching " << matchingInBetween << "bp between " << last << " " << indel
- << " was matching " << previousMatchingInBetween << endl;
- int diff = matchingInBetween - previousMatchingInBetween;
- last.length -= diff;
- last.sequence = last.sequence.substr(0, last.length);
- indel.length -= diff;
- indel.sequence = indel.sequence.substr(diff);
- if (!indel.insertion) indel.position += diff;
- else indel.readPosition += diff;
- if (debug) cerr << last << " " << indel << endl;
- }// else if (matchingInBetween == 0 || matchingInBetween == indel.position - last.position) {
- //if (!newIndels.empty()) newIndels.pop_back();
- //} //else { newIndels.push_back(indel); }
- id = idn;
- //newIndels.push_back(indel);
- } else {
- id = idn;
- //newIndels.push_back(indel);
- }
- }
- }
- }
-
- vector<IndelAllele> newIndels;
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (!i->insertion && i->position == 0) { offset += i->length;
- } else if (i->length > 0) newIndels.push_back(*i); // remove dels at front
- }
-
- // for each indel
- // if ( we're matched up to the previous insertion (or deletion)
- // and it's also an insertion or deletion )
- // merge the indels
- //
- // and simultaneously reconstruct the cigar
- //
- // if there are spurious deletions at the start and end of the read, remove them
- // if there are spurious insertions after soft-clipped bases, make them soft clips
-
- vector<pair<int, string> > newCigar;
-
- if (!softBegin.empty()) {
- newCigar.push_back(make_pair(softBegin.size(), "S"));
- }
-
- if (newIndels.empty()) {
-
- int remainingReadBp = querySequence.size() - softEnd.size() - softBegin.size();
- newCigar.push_back(make_pair(remainingReadBp, "M"));
-
- if (!softEnd.empty()) {
- newCigar.push_back(make_pair(softEnd.size(), "S"));
- }
-
- cigar = joinCIGAR(newCigar);
-
- // check if we're realigned
- if (cigar == cigarbefore) {
- return false;
- } else {
- return true;
- }
- }
-
- vector<IndelAllele>::iterator id = newIndels.begin();
- vector<IndelAllele>::iterator last = id++;
-
- if (last->position > 0) {
- newCigar.push_back(make_pair(last->position, "M"));
- newCigar.push_back(make_pair(last->length, (last->insertion ? "I" : "D")));
- } else if (last->position == 0) { // discard floating indels
- if (last->insertion) newCigar.push_back(make_pair(last->length, "S"));
- else newCigar.push_back(make_pair(last->length, "D"));
- } else {
- cerr << "negative indel position " << *last << endl;
- }
-
- int lastend = last->insertion ? last->position : (last->position + last->length);
- LEFTALIGN_DEBUG(*last << ",");
-
- for (; id != newIndels.end(); ++id) {
- IndelAllele& indel = *id;
- if (indel.length == 0) continue; // remove 0-length indels
- if (debug) cerr << indel << " " << *last << endl;
- LEFTALIGN_DEBUG(indel << ",");
- if ((((id + 1) == newIndels.end()
- && indel.insertion && indel.position == (int)referenceSequence.size())
- || (!indel.insertion && indel.position + indel.length == (int)referenceSequence.size()))
- ) {
- if (indel.insertion) {
- if (!newCigar.empty() && newCigar.back().second == "S") {
- newCigar.back().first += indel.length;
- } else {
- newCigar.push_back(make_pair(indel.length, "S"));
- }
- }
- } else if (indel.position < lastend) {
- cerr << "impossibility?: indel realigned left of another indel" << endl;
- return false;
- } else if (indel.position == lastend) {
- // how?
- if (indel.insertion == last->insertion) {
- pair<int, string>& op = newCigar.back();
- op.first += indel.length;
- } else {
- newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
- }
- } else if (indel.position > lastend) { // also catches differential indels, but with the same position
- if (!newCigar.empty() && newCigar.back().second == "M") newCigar.back().first += indel.position - lastend;
- else newCigar.push_back(make_pair(indel.position - lastend, "M"));
- newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
- }
-
- last = id;
- lastend = last->insertion ? last->position : (last->position + last->length);
-
- if (debug) {
- for (vector<pair<int, string> >::iterator c = newCigar.begin(); c != newCigar.end(); ++c)
- cerr << c->first << c->second;
- cerr << endl;
- }
-
- }
-
- int remainingReadBp = querySequence.size() - (last->readPosition + last->readLength()) - softEnd.size();
- if (remainingReadBp > 0) {
- if (debug) cerr << "bp remaining = " << remainingReadBp << endl;
- if (newCigar.back().second == "M") newCigar.back().first += remainingReadBp;
- else newCigar.push_back(make_pair(remainingReadBp, "M"));
- }
-
- if (newCigar.back().second == "D") newCigar.pop_back(); // remove trailing deletions
-
- if (!softEnd.empty()) {
- if (newCigar.back().second == "S") newCigar.back().first += softEnd.size();
- else newCigar.push_back(make_pair(softEnd.size(), "S"));
- }
-
- LEFTALIGN_DEBUG(endl);
-
- cigar = joinCIGAR(newCigar);
-
- LEFTALIGN_DEBUG(cigar << endl);
-
- // check if we're realigned
- if (cigar == cigarbefore) {
- return false;
- } else {
- return true;
- }
-
-}
-
-int countMismatches(string& querySequence, string& cigar, string referenceSequence) {
-
- int mismatches = 0;
- int sp = 0;
- int rp = 0;
- vector<pair<int, string> > cigarData = splitCIGAR(cigar);
- for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
- c != cigarData.end(); ++c) {
- unsigned int l = c->first;
- string t = c->second;
- if (t == "M") { // match or mismatch
- for (int i = 0; i < (int)l; ++i) {
- if (querySequence.at(rp) != referenceSequence.at(sp))
- ++mismatches;
- ++sp;
- ++rp;
- }
- } else if (t == "D") { // deletion
- sp += l; // update reference sequence position
- } else if (t == "I") { // insertion
- rp += l; // update read position
- } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
- rp += l;
- } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
- } else if (t == "N") { // skipped region in the reference not present in read, aka splice
- sp += l;
- }
- }
-
- return mismatches;
-
-}
-
-// Iteratively left-aligns the indels in the alignment until we have a stable
-// realignment. Returns true on realignment success or non-realignment.
-// Returns false if we exceed the maximum number of realignment iterations.
-//
-bool stablyLeftAlign(string querySequence, string& cigar, string referenceSequence, int& offset, int maxiterations, bool debug) {
-
- if (!leftAlign(querySequence, cigar, referenceSequence, offset)) {
-
- LEFTALIGN_DEBUG("did not realign" << endl);
- return true;
-
- } else {
-
- while (leftAlign(querySequence, cigar, referenceSequence, offset) && --maxiterations > 0) {
- LEFTALIGN_DEBUG("realigning ..." << endl);
- }
-
- if (maxiterations <= 0) {
- return false;
- } else {
- return true;
- }
- }
-}
-
-string mergeCIGAR(const string& c1, const string& c2) {
- vector<pair<int, string> > cigar1 = splitCIGAR(c1);
- vector<pair<int, string> > cigar2 = splitCIGAR(c2);
- // check if the middle elements are the same
- if (cigar1.back().second == cigar2.front().second) {
- cigar1.back().first += cigar2.front().first;
- cigar2.erase(cigar2.begin());
- }
- for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
- cigar1.push_back(*c);
- }
- return joinCIGAR(cigar1);
-}
-
-vector<pair<int, string> > splitCIGAR(const string& cigarStr) {
- vector<pair<int, string> > cigar;
- string number;
- string type;
- // strings go [Number][Type] ...
- for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
- char c = *s;
- if (isdigit(c)) {
- if (type.empty()) {
- number += c;
- } else {
- // signal for next token, push back the last pair, clean up
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- number.clear();
- type.clear();
- number += c;
- }
- } else {
- type += c;
- }
- }
- if (!number.empty() && !type.empty()) {
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- }
- return cigar;
-}
-
-string joinCIGAR(const vector<pair<int, string> >& cigar) {
- string cigarStr;
- for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->first) {
- cigarStr += convert(c->first) + c->second;
- }
- }
- return cigarStr;
-}
diff --git a/external/vcflib/smithwaterman/LeftAlign.h b/external/vcflib/smithwaterman/LeftAlign.h
deleted file mode 100644
index 7fb796e..0000000
--- a/external/vcflib/smithwaterman/LeftAlign.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __LEFTALIGN_H
-#define __LEFTALIGN_H
-
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <list>
-#include <utility>
-#include <sstream>
-
-#include "IndelAllele.h"
-#include "convert.h"
-
-#ifdef VERBOSE_DEBUG
-#define LEFTALIGN_DEBUG(msg) \
- if (debug) { cerr << msg; }
-#else
-#define LEFTALIGN_DEBUG(msg)
-#endif
-
-using namespace std;
-
-bool leftAlign(string& alternateQuery, string& cigar, string& referenceSequence, int& offset, bool debug = false);
-bool stablyLeftAlign(string alternateQuery, string& cigar, string referenceSequence, int& offset, int maxiterations = 20, bool debug = false);
-int countMismatches(string& alternateQuery, string& cigar, string& referenceSequence);
-
-string mergeCIGAR(const string& c1, const string& c2);
-vector<pair<int, string> > splitCIGAR(const string& cigarStr);
-string joinCIGAR(const vector<pair<int, string> >& cigar);
-
-
-#endif
diff --git a/external/vcflib/smithwaterman/Makefile b/external/vcflib/smithwaterman/Makefile
deleted file mode 100644
index 06e0a9d..0000000
--- a/external/vcflib/smithwaterman/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-# =========================================
-# MOSAIK Banded Smith-Waterman Makefile
-# (c) 2009 Michael Stromberg & Wan-Ping Lee
-# =========================================
-
-# ----------------------------------
-# define our source and object files
-# ----------------------------------
-SOURCES= smithwaterman.cpp BandedSmithWaterman.cpp SmithWatermanGotoh.cpp Repeats.cpp LeftAlign.cpp IndelAllele.cpp
-OBJECTS= $(SOURCES:.cpp=.o)
-
-# ----------------
-# compiler options
-# ----------------
-
-CFLAGS=-Wall -O3
-LDFLAGS=-Wl,-s
-#CFLAGS=-g
-PROGRAM=smithwaterman
-LIBS=
-
-all: $(PROGRAM)
-
-.PHONY: all
-
-$(PROGRAM): $(OBJECTS)
- @echo " * linking $(PROGRAM)"
- @$(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $^ $(LIBS)
-
-.PHONY: clean
-
-clean:
- @echo "Cleaning up."
- @rm -f *.o $(PROGRAM) *~
diff --git a/external/vcflib/smithwaterman/Mosaik.h b/external/vcflib/smithwaterman/Mosaik.h
deleted file mode 100644
index 24d70d0..0000000
--- a/external/vcflib/smithwaterman/Mosaik.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#pragma once
-
-#ifndef WIN32
-//#include "SafeFunctions.h"
-#endif
-
-// ==============
-// MOSAIK version
-// ==============
-
-#define MOSAIK_VERSION_DATE "2009-02-11"
-
-// adopt a major.minor.build version number [1].[1].[3]
-const unsigned char MOSAIK_MAJOR_VERSION = 0;
-const unsigned char MOSAIK_MINOR_VERSION = 9;
-const unsigned short MOSAIK_BUILD_VERSION = 899;
-
-// ================================
-// Platform specific variable sizes
-// ================================
-
-// Windows Vista 32-bit
-// Fedora Core 7 32-bit
-// Fedora Core 6 64-bit
-// Itanium2 64-bit
-#define SIZEOF_CHAR 1
-#define SIZEOF_WCHAR 2
-#define SIZEOF_SHORT 2
-#define SIZEOF_INT 4
-#define SIZEOF_FLOAT 4
-#define SIZEOF_DOUBLE 8
-#define SIZEOF_UINT64 8
-#define MOSAIK_LITTLE_ENDIAN 1
-
-#ifdef WIN32
-typedef signed long long int64_t;
-typedef unsigned long long uint64_t;
-#endif
-
-#define NEGATIVE_ONE_INT 0xffffffff
-#define NEGATIVE_TWO_INT 0xfffffffe
-#define NEGATIVE_THREE_INT 0xfffffffd
-#define NEGATIVE_FOUR_INT 0xfffffffc
-#define MAX_SHORT 0xffff
-
-// ==========================
-// Platform specific file I/O
-// ==========================
-
-#ifdef WIN32
-const char OS_DIRECTORY_SEPARATOR = '\\';
-#else
-const char OS_DIRECTORY_SEPARATOR = '/';
-#endif
-
-#define DIRECTORY_NAME_LENGTH 255
-
-// ====================================
-// Enable unit test diagnostic messages
-// ====================================
-
-#ifdef UNITTEST
-#define SILENTMODE if(0)
-#else
-#define SILENTMODE
-#endif
-
-// =================
-// Aligner constants
-// =================
-
-const double HASH_REGION_EXTENSION_PERCENT = 0.025;
-const unsigned char REFERENCE_SEQUENCE_QUALITY = 40;
diff --git a/external/vcflib/smithwaterman/Repeats.cpp b/external/vcflib/smithwaterman/Repeats.cpp
deleted file mode 100644
index 43c282f..0000000
--- a/external/vcflib/smithwaterman/Repeats.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "Repeats.h"
-
-map<string, int> repeatCounts(long int position, const string& sequence, int maxsize) {
- map<string, int> counts;
- for (int i = 1; i <= maxsize; ++i) {
- // subseq here i bases
- string seq = sequence.substr(position, i);
- // go left.
-
- int j = position - i;
- int leftsteps = 0;
- while (j >= 0 && seq == sequence.substr(j, i)) {
- j -= i;
- ++leftsteps;
- }
-
- // go right.
- j = position;
-
- int rightsteps = 0;
- while (j + i <= (int)sequence.size() && seq == sequence.substr(j, i)) {
- j += i;
- ++rightsteps;
- }
- // if we went left and right a non-zero number of times,
- if (leftsteps + rightsteps > 1) {
- counts[seq] = leftsteps + rightsteps;
- }
- }
-
- // filter out redundant repeat information
- if (counts.size() > 1) {
- map<string, int> filteredcounts;
- map<string, int>::iterator c = counts.begin();
- string prev = c->first;
- filteredcounts[prev] = c->second; // shortest sequence
- ++c;
- for (; c != counts.end(); ++c) {
- int i = 0;
- string seq = c->first;
- while (i + prev.length() <= seq.length() && seq.substr(i, prev.length()) == prev) {
- i += prev.length();
- }
- if (i < (int)seq.length()) {
- filteredcounts[seq] = c->second;
- prev = seq;
- }
- }
- return filteredcounts;
- } else {
- return counts;
- }
-}
-
-bool isRepeatUnit(const string& seq, const string& unit) {
-
- if (seq.size() % unit.size() != 0) {
- return false;
- } else {
- int maxrepeats = seq.size() / unit.size();
- for (int i = 0; i < maxrepeats; ++i) {
- if (seq.substr(i * unit.size(), unit.size()) != unit) {
- return false;
- }
- }
- return true;
- }
-
-}
diff --git a/external/vcflib/smithwaterman/Repeats.h b/external/vcflib/smithwaterman/Repeats.h
deleted file mode 100644
index 2efc0ea..0000000
--- a/external/vcflib/smithwaterman/Repeats.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <iostream>
-#include <string>
-#include <map>
-
-using namespace std;
-
-map<string, int> repeatCounts(long int pos, const string& seq, int maxsize);
-bool isRepeatUnit(const string& seq, const string& unit);
diff --git a/external/vcflib/smithwaterman/SWMain.cpp b/external/vcflib/smithwaterman/SWMain.cpp
deleted file mode 100644
index 6ef3421..0000000
--- a/external/vcflib/smithwaterman/SWMain.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include <iostream>
-#include <string.h>
-//#include "Alignment.h"
-//#include "Benchmark.h"
-//#include "HashRegion.h"
-#include "SmithWatermanGotoh.h"
-#include "BandedSmithWaterman.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-/*
- printf("------------------------------------------------------------------------------\n");
- printf("Banded Smith-Waterman Algorithm (worst case)\n");
- printf("Michael Stromberg & Wan-Ping Lee Marth Lab, Boston College Biology Department\n");
- printf("------------------------------------------------------------------------------\n\n");
-*/
- // this version simulates the worst case of only a fragment hashing to the
- // reference sequence. Basically a non-centered diagonal in the Smith-Waterman
- // dynamic programming matrix.
-
- // here we simulate a region on the reference that occurs between position 4001
- // and position 4136. During hashing, only the first 20 bases in the query
- // matched perfectly.
-
- // define the start and end coordinates of the entire reference region
- //const unsigned int start = 4001;
- //const unsigned int end = 4136;
-
- //const unsigned int testStart = atoi(argv[1]);
- //const unsigned int testEnd = atoi(argv[2]);
- //const unsigned int testQueryStart = atoi(argv[3]);
- //const unsigned int testQueryEnd = atoi(argv[4]);
-
- //cout << endl<< "=====================================================" << endl;
- //cout << testStart << "\t" << testQueryStart << endl;
-
- // define the 20 b:q
- // ases that matched perfectly
- //HashRegion hr;
-
- //=====================================================
- // defind the hash region
- // first.first: reference begin
- // first.second: reference end
- // second.first: query begin
- // second.second: query end
- //=====================================================
-
- pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
- hr.first.first = 5;
- hr.first.second = 13;
- hr.second.first = 0;
- hr.second.second = 8;
-
- //=====================================================
-
- // for 76 bp reads, we expect as much as 12 mismatches - however this does not
- // translate to a bandwidth of 12 * 2 + 1 since most of these will be
- // substitution errors
- const unsigned char bandwidth = 11;
-
- // initialize
- const char* pReference = "ATGGCGGGGATCGGGACACTCGCCGGTGCGGGTACCCTA";
- const char* pQuery = "GGGGATCGGGACACTCGCTCTCCGGTGCGGGTA";
-
- const unsigned int referenceLen = strlen(pReference);
- const unsigned int queryLen = strlen(pQuery);
-
- // ==============================================================================================
- // benchmarking reference on koi.bc.edu when NUM_ITERATIONS = 38000 on 76 bp read (1 try):
- // CPU time: 23.920 s, wall time: 24.012 s (1582.6 alignments/s)
- // ==============================================================================================
- //const unsigned int NUM_ITERATIONS = 38000;
- //unsigned int NUM_ITERATIONS = 1;
-
- // create a new Smith-Waterman alignment object
- CSmithWatermanGotoh sw(10.0f, -9.0f, 15.0f, 6.66f);
- CBandedSmithWaterman bsw(10.0f, -9.0f, 15.0f, 6.66f, bandwidth);
-
- // start timing the algorithm
- //CBenchmark bench;
- //bench.Start();
-
- // perform NUM_ITERATIONS alignments
- //Alignment bswAl;
- //Alignment swAl;
- // referenceBegin, referenceEnd
- unsigned int referenceSW, referenceBSW;
- string cigarSW, cigarBSW;
- //for(unsigned int i = 0; i < NUM_ITERATIONS; i++) {
- sw.Align(referenceSW, cigarSW, pReference, referenceLen, pQuery, queryLen);
- bsw.Align(referenceBSW, cigarBSW, pReference, referenceLen, pQuery, queryLen, hr);
- //}
-
- // stop timing the algorithm
- //bench.Stop();
-
- // calculate the alignments per second
- //double elapsedWallTime = bench.GetElapsedWallTime();
- //double alignmentsPerSecond = (double)NUM_ITERATIONS / elapsedWallTime;
-
- // show our results
- //printf("%d\t%d\n", al.ReferenceBegin,al.QueryBegin);
-
- printf("Smith-Waterman\n");
- printf("reference: %s %3u\n", cigarSW.c_str(), referenceSW);
- printf("Banded Smith-Waterman\n");
- printf("reference: %s %3u\n", cigarBSW.c_str(), referenceBSW);
- /*
- printf("Smith-Waterman\n");
- printf("reference: %s %3u %3u\n", swAl.Reference.CData(), swAl.ReferenceBegin, swAl.ReferenceEnd);
- printf("query: %s %3u %3u\n", swAl.Query.CData(), swAl.QueryBegin, swAl.QueryEnd);
- printf("mismatches: %u\n", swAl.NumMismatches);
- printf("\n");
- printf("Banded Smith-Waterman\n");
- printf("reference: %s %3u %3u\n", bswAl.Reference.CData(), bswAl.ReferenceBegin, bswAl.ReferenceEnd);
- printf("query: %s %3u %3u\n", bswAl.Query.CData(), bswAl.QueryBegin, bswAl.QueryEnd);
- printf("mismatches: %u\n", bswAl.NumMismatches);
- */
- //printf("alignments/s: %.1f\n\n", alignmentsPerSecond);
-
- //bench.DisplayTime("BandedSmithWaterman");
-
- return 0;
-}
diff --git a/external/vcflib/smithwaterman/SmithWatermanGotoh.cpp b/external/vcflib/smithwaterman/SmithWatermanGotoh.cpp
deleted file mode 100644
index 0d8b5eb..0000000
--- a/external/vcflib/smithwaterman/SmithWatermanGotoh.cpp
+++ /dev/null
@@ -1,741 +0,0 @@
-#include "SmithWatermanGotoh.h"
-
-const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
-
-const char CSmithWatermanGotoh::Directions_STOP = 0;
-const char CSmithWatermanGotoh::Directions_LEFT = 1;
-const char CSmithWatermanGotoh::Directions_DIAGONAL = 2;
-const char CSmithWatermanGotoh::Directions_UP = 3;
-
-const int CSmithWatermanGotoh::repeat_size_max = 12;
-
-CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty)
- : mCurrentMatrixSize(0)
- , mCurrentAnchorSize(0)
- , mCurrentQuerySize(0)
- , mCurrentAQSumSize(0)
- , mMatchScore(matchScore)
- , mMismatchScore(mismatchScore)
- , mGapOpenPenalty(gapOpenPenalty)
- , mGapExtendPenalty(gapExtendPenalty)
- , mPointers(NULL)
- , mSizesOfVerticalGaps(NULL)
- , mSizesOfHorizontalGaps(NULL)
- , mQueryGapScores(NULL)
- , mBestScores(NULL)
- , mReversedAnchor(NULL)
- , mReversedQuery(NULL)
- , mUseHomoPolymerGapOpenPenalty(false)
- , mUseEntropyGapOpenPenalty(false)
- , mUseRepeatGapExtensionPenalty(false)
-{
- CreateScoringMatrix();
-}
-
-CSmithWatermanGotoh::~CSmithWatermanGotoh(void) {
- if(mPointers) delete [] mPointers;
- if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
- if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
- if(mQueryGapScores) delete [] mQueryGapScores;
- if(mBestScores) delete [] mBestScores;
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-}
-
-// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
-void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {
-
- if((s1.length() == 0) || (s2.length() == 0)) {
- cout << "ERROR: Found a read with a zero length." << endl;
- exit(1);
- }
-
- unsigned int referenceLen = s1.length() + 1;
- unsigned int queryLen = s2.length() + 1;
- unsigned int sequenceSumLength = s1.length() + s2.length();
-
- // reinitialize our matrices
-
- if((referenceLen * queryLen) > mCurrentMatrixSize) {
-
- // calculate the new matrix size
- mCurrentMatrixSize = referenceLen * queryLen;
-
- // delete the old arrays
- if(mPointers) delete [] mPointers;
- if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
- if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
-
- try {
-
- // initialize the arrays
- mPointers = new char[mCurrentMatrixSize];
- mSizesOfVerticalGaps = new short[mCurrentMatrixSize];
- mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // initialize the traceback matrix to STOP
- memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
- for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;
-
- // initialize the gap matrices to 1
- uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
- uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);
-
-
- // initialize our repeat counts if they are needed
- vector<map<string, int> > referenceRepeats;
- vector<map<string, int> > queryRepeats;
- //int queryBeginRepeatBases = 0;
- //int queryEndRepeatBases = 0;
- if (mUseRepeatGapExtensionPenalty) {
- for (unsigned int i = 0; i < queryLen; ++i)
- queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
- for (unsigned int i = 0; i < referenceLen; ++i)
- referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));
-
- // keep only the biggest repeat
- vector<map<string, int> >::iterator q = queryRepeats.begin();
- for (; q != queryRepeats.end(); ++q) {
- map<string, int>::iterator biggest = q->begin();
- map<string, int>::iterator z = q->begin();
- for (; z != q->end(); ++z)
- if (z->first.size() > biggest->first.size()) biggest = z;
- z = q->begin();
- while (z != q->end()) {
- if (z != biggest)
- q->erase(z++);
- else ++z;
- }
- }
-
- q = referenceRepeats.begin();
- for (; q != referenceRepeats.end(); ++q) {
- map<string, int>::iterator biggest = q->begin();
- map<string, int>::iterator z = q->begin();
- for (; z != q->end(); ++z)
- if (z->first.size() > biggest->first.size()) biggest = z;
- z = q->begin();
- while (z != q->end()) {
- if (z != biggest)
- q->erase(z++);
- else ++z;
- }
- }
-
- // remove repeat information from ends of queries
- // this results in the addition of spurious flanking deletions in repeats
- map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
- if (!qrend.empty()) {
- int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
- for (int i = 0; i < queryEndRepeatBases; ++i)
- queryRepeats.at(queryRepeats.size() - 2 - i).clear();
- }
-
- map<string, int>& qrbegin = queryRepeats.front();
- if (!qrbegin.empty()) {
- int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
- for (int i = 0; i < queryBeginRepeatBases; ++i)
- queryRepeats.at(i).clear();
- }
-
- }
-
- //int entropyWindowSize = 8;
- vector<float> referenceEntropies;
- vector<float> queryEntropies;
- /*if (mUseEntropyGapOpenPenalty) {
- for (unsigned int i = 0; i < queryLen; ++i)
- queryEntropies.push_back(
- shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
- entropyWindowSize));
- for (unsigned int i = 0; i < referenceLen; ++i)
- referenceEntropies.push_back(
- shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
- entropyWindowSize));
- }
- */
- // normalize entropies
- /*
- float qsum = 0;
- float qnorm = 0;
- float qmax = 0;
- for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
- qsum += *q;
- if (*q > qmax) qmax = *q;
- }
- qnorm = qsum / queryEntropies.size();
- for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
- *q = *q / qsum + qmax;
-
- float rsum = 0;
- float rnorm = 0;
- float rmax = 0;
- for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
- rsum += *r;
- if (*r > rmax) rmax = *r;
- }
- rnorm = rsum / referenceEntropies.size();
- for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
- *r = *r / rsum + rmax;
- */
-
- //
- // construct
- //
-
- // reinitialize our query-dependent arrays
- if(s2.length() > mCurrentQuerySize) {
-
- // calculate the new query array size
- mCurrentQuerySize = s2.length();
-
- // delete the old arrays
- if(mQueryGapScores) delete [] mQueryGapScores;
- if(mBestScores) delete [] mBestScores;
-
- // initialize the arrays
- try {
-
- mQueryGapScores = new float[mCurrentQuerySize + 1];
- mBestScores = new float[mCurrentQuerySize + 1];
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // reinitialize our reference+query-dependent arrays
- if(sequenceSumLength > mCurrentAQSumSize) {
-
- // calculate the new reference array size
- mCurrentAQSumSize = sequenceSumLength;
-
- // delete the old arrays
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-
- // initialize the arrays
- try {
-
- mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
- mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // initialize the gap score and score vectors
- uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
- memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);
-
- float similarityScore, totalSimilarityScore, bestScoreDiagonal;
- float queryGapExtendScore, queryGapOpenScore;
- float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;
-
- unsigned int BestColumn = 0;
- unsigned int BestRow = 0;
- float BestScore = FLOAT_NEGATIVE_INFINITY;
-
- for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {
-
- currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
- bestScoreDiagonal = mBestScores[0];
-
- for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {
-
- // calculate our similarity score
- similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];
-
- // fill the matrices
- totalSimilarityScore = bestScoreDiagonal + similarityScore;
-
- //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;
-
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- queryGapOpenScore = mBestScores[j] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((j > 1) && (s2[j - 1] == s2[j - 2]))
- queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
-
- // compute the entropy gap score if enabled
- if (mUseEntropyGapOpenPenalty) {
- queryGapOpenScore =
- mBestScores[j] - mGapOpenPenalty
- * max(queryEntropies.at(j), referenceEntropies.at(i))
- * mEntropyGapOpenPenalty;
- }
-
- int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;
-
- if (mUseRepeatGapExtensionPenalty) {
- map<string, int>& repeats = queryRepeats[j];
- // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
- if (!repeats.empty()) {
-
- const pair<string, int>& repeat = *repeats.begin();
- int repeatsize = repeat.first.size();
- if (gaplen != repeatsize && gaplen % repeatsize != 0) {
- gaplen = gaplen / repeatsize + repeatsize;
- }
-
- if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
- string gapseq = string(&s1[i], gaplen);
- if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
- queryGapExtendScore = mQueryGapScores[j]
- + mRepeatGapExtensionPenalty / (float) gaplen;
- // mMaxRepeatGapExtensionPenalty)
- } else {
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- }
- }
- } else {
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- }
- }
-
- if(queryGapExtendScore > queryGapOpenScore) {
- mQueryGapScores[j] = queryGapExtendScore;
- mSizesOfVerticalGaps[l] = gaplen;
- } else mQueryGapScores[j] = queryGapOpenScore;
-
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((i > 1) && (s1[i - 1] == s1[i - 2]))
- referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
-
- // compute the entropy gap score if enabled
- if (mUseEntropyGapOpenPenalty) {
- referenceGapOpenScore =
- mBestScores[j - 1] - mGapOpenPenalty
- * max(queryEntropies.at(j), referenceEntropies.at(i))
- * mEntropyGapOpenPenalty;
- }
-
- gaplen = mSizesOfHorizontalGaps[l - 1] + 1;
-
- if (mUseRepeatGapExtensionPenalty) {
- map<string, int>& repeats = referenceRepeats[i];
- // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
- if (!repeats.empty()) {
-
- const pair<string, int>& repeat = *repeats.begin();
- int repeatsize = repeat.first.size();
- if (gaplen != repeatsize && gaplen % repeatsize != 0) {
- gaplen = gaplen / repeatsize + repeatsize;
- }
-
- if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
- string gapseq = string(&s2[j], gaplen);
- if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
- referenceGapExtendScore = currentAnchorGapScore
- + mRepeatGapExtensionPenalty / (float) gaplen;
- //mMaxRepeatGapExtensionPenalty)
- } else {
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- }
- }
- } else {
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- }
- }
-
- if(referenceGapExtendScore > referenceGapOpenScore) {
- currentAnchorGapScore = referenceGapExtendScore;
- mSizesOfHorizontalGaps[l] = gaplen;
- } else currentAnchorGapScore = referenceGapOpenScore;
-
- bestScoreDiagonal = mBestScores[j];
- mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
-
-
- // determine the traceback direction
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- if(mBestScores[j] == 0) mPointers[l] = Directions_STOP;
- else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
- else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP;
- else mPointers[l] = Directions_LEFT;
-
- // set the traceback start at the current cell i, j and score
- if(mBestScores[j] > BestScore) {
- BestRow = i;
- BestColumn = j;
- BestScore = mBestScores[j];
- }
- }
- }
-
- //
- // traceback
- //
-
- // aligned sequences
- int gappedAnchorLen = 0; // length of sequence #1 after alignment
- int gappedQueryLen = 0; // length of sequence #2 after alignment
- int numMismatches = 0; // the mismatched nucleotide count
-
- char c1, c2;
-
- int ci = BestRow;
- int cj = BestColumn;
- int ck = ci * queryLen;
-
- // traceback flag
- bool keepProcessing = true;
-
- while(keepProcessing) {
- //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl;
-
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- switch(mPointers[ck + cj]) {
-
- case Directions_DIAGONAL:
- c1 = s1[--ci];
- c2 = s2[--cj];
- ck -= queryLen;
-
- mReversedAnchor[gappedAnchorLen++] = c1;
- mReversedQuery[gappedQueryLen++] = c2;
-
- // increment our mismatch counter
- if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
- break;
-
- case Directions_STOP:
- keepProcessing = false;
- break;
-
- case Directions_UP:
- for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
- if (ci <= 0) {
- keepProcessing = false;
- break;
- }
- mReversedAnchor[gappedAnchorLen++] = s1[--ci];
- mReversedQuery[gappedQueryLen++] = GAP;
- ck -= queryLen;
- numMismatches++;
- }
- break;
-
- case Directions_LEFT:
- for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
- if (cj <= 0) {
- keepProcessing = false;
- break;
- }
- mReversedAnchor[gappedAnchorLen++] = GAP;
- mReversedQuery[gappedQueryLen++] = s2[--cj];
- numMismatches++;
- }
- break;
- }
- }
-
- // define the reference and query sequences
- mReversedAnchor[gappedAnchorLen] = 0;
- mReversedQuery[gappedQueryLen] = 0;
-
- // catch sequences with different lengths
- if(gappedAnchorLen != gappedQueryLen) {
- cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
- exit(1);
- }
-
- // reverse the strings and assign them to our alignment structure
- reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
- reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
-
- //alignment.Reference = mReversedAnchor;
- //alignment.Query = mReversedQuery;
-
- // set the reference endpoints
- //alignment.ReferenceBegin = ci;
- //alignment.ReferenceEnd = BestRow - 1;
- referenceAl = ci;
-
- // set the query endpoints
- /*
- if(alignment.IsReverseComplement) {
- alignment.QueryBegin = s2Length - BestColumn;
- alignment.QueryEnd = s2Length - cj - 1;
- // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
- } else {
- alignment.QueryBegin = cj;
- alignment.QueryEnd = BestColumn - 1;
- // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
- }
- */
-
- // set the query length and number of mismatches
- //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
- //alignment.NumMismatches = numMismatches;
-
- unsigned int alLength = strlen(mReversedAnchor);
- unsigned int m = 0, d = 0, i = 0;
- bool dashRegion = false;
- ostringstream oCigar (ostringstream::out);
- int insertedBases = 0;
-
- if ( cj != 0 ) {
- if ( cj > 0 ) {
- oCigar << cj << 'S';
- } else { // how do we get negative cj's?
- referenceAl -= cj;
- alLength += cj;
- }
- }
-
- for ( unsigned int j = 0; j < alLength; j++ ) {
- // m
- if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
- if ( dashRegion ) {
- if ( d != 0 ) oCigar << d << 'D';
- else { oCigar << i << 'I'; insertedBases += i; }
- }
- dashRegion = false;
- m++;
- d = 0;
- i = 0;
- }
- else {
- if ( !dashRegion && m )
- oCigar << m << 'M';
- dashRegion = true;
- m = 0;
- if ( mReversedAnchor[j] == GAP ) {
- if ( d != 0 ) oCigar << d << 'D';
- i++;
- d = 0;
- }
- else {
- if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
- d++;
- i = 0;
- }
- }
- }
- if ( m != 0 ) oCigar << m << 'M';
- else if ( d != 0 ) oCigar << d << 'D';
- else if ( i != 0 ) oCigar << i << 'I';
-
- if ( BestColumn != s2.length() )
- oCigar << s2.length() - BestColumn << 'S';
-
- cigarAl = oCigar.str();
-
- // fix the gap order
- CorrectHomopolymerGapOrder(alLength, numMismatches);
-
- if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
- int offset = 0;
- string oldCigar;
- try {
- oldCigar = cigarAl;
- stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
- } catch (...) {
- cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
- cigarAl = oldCigar; // undo the failed left-realignment attempt
- offset = 0;
- }
- referenceAl += offset;
- }
-
-}
-
-// creates a simple scoring matrix to align the nucleotides and the ambiguity code N
-void CSmithWatermanGotoh::CreateScoringMatrix(void) {
-
- unsigned int nIndex = 13;
- unsigned int xIndex = 23;
-
- // define the N score to be 1/4 of the span between mismatch and match
- //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
-
- // calculate the scoring matrix
- for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
- for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
-
- // N.B. matching N to everything (while conceptually correct) leads to some
- // bad alignments, lets make N be a mismatch instead.
-
- // add the matches or mismatches to the hashtable (N is a mismatch)
- if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if(i == j) mScoringMatrix[i][j] = mMatchScore;
- else mScoringMatrix[i][j] = mMismatchScore;
- }
- }
-
- // add ambiguity codes
- mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
- mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
- mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
- mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
-
- mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
- mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
- mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
- mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
-
- mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
- mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
- mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
- mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
-
- mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
- mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
- mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
- mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
-
- mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
- mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
- mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
- mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
-
- mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
- mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
- mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
- mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
-
- mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
- mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
- mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
- mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
-
- mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
- mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
- mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
- mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
-
- mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
- mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
- mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
- mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
-
- mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
- mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
- mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
- mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
-}
-
-// enables homo-polymer scoring
-void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
- mUseHomoPolymerGapOpenPenalty = true;
- mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
-}
-
-// enables entropy-based gap open penalty
-void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) {
- mUseEntropyGapOpenPenalty = true;
- mEntropyGapOpenPenalty = enGapOpenPenalty;
-}
-
-// enables repeat-aware gap extension penalty
-void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) {
- mUseRepeatGapExtensionPenalty = true;
- mRepeatGapExtensionPenalty = rGapExtensionPenalty;
- mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty;
-}
-
-// corrects the homopolymer gap order for forward alignments
-void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
-
- // this is only required for alignments with mismatches
- //if(al.NumMismatches == 0) return;
- if ( numMismatches == 0 ) return;
-
- // localize the alignment data
- //char* pReference = al.Reference.Data();
- //char* pQuery = al.Query.Data();
- //const unsigned int numBases = al.Reference.Length();
- char* pReference = mReversedAnchor;
- char* pQuery = mReversedQuery;
-
- // initialize
- bool hasReferenceGap = false, hasQueryGap = false;
- char* pNonGapSeq = NULL;
- char* pGapSeq = NULL;
- char nonGapBase = 'J';
-
- // identify gapped regions
- for(unsigned int i = 0; i < numBases; i++) {
-
- // check if the current position is gapped
- hasReferenceGap = false;
- hasQueryGap = false;
-
- if(pReference[i] == GAP) {
- hasReferenceGap = true;
- pNonGapSeq = pQuery;
- pGapSeq = pReference;
- nonGapBase = pQuery[i];
- }
-
- if(pQuery[i] == GAP) {
- hasQueryGap = true;
- pNonGapSeq = pReference;
- pGapSeq = pQuery;
- nonGapBase = pReference[i];
- }
-
- // continue if we don't have any gaps
- if(!hasReferenceGap && !hasQueryGap) continue;
-
- // sanity check
- if(hasReferenceGap && hasQueryGap) {
- printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
- exit(1);
- }
-
- // find the non-gapped length (forward)
- unsigned short numGappedBases = 0;
- unsigned short nonGapLength = 0;
- unsigned short testPos = i;
- while(testPos < numBases) {
-
- const char gs = pGapSeq[testPos];
- const char ngs = pNonGapSeq[testPos];
-
- bool isPartofHomopolymer = false;
- if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
- if(!isPartofHomopolymer) break;
-
- if(gs == GAP) numGappedBases++;
- else nonGapLength++;
- testPos++;
- }
-
- // fix the gap order
- if(numGappedBases != 0) {
- char* pCurrentSequence = pGapSeq + i;
- memset(pCurrentSequence, nonGapBase, nonGapLength);
- pCurrentSequence += nonGapLength;
- memset(pCurrentSequence, GAP, numGappedBases);
- }
-
- // increment
- i += numGappedBases + nonGapLength - 1;
- }
-}
diff --git a/external/vcflib/smithwaterman/SmithWatermanGotoh.h b/external/vcflib/smithwaterman/SmithWatermanGotoh.h
deleted file mode 100644
index 80ab402..0000000
--- a/external/vcflib/smithwaterman/SmithWatermanGotoh.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <algorithm>
-#include <memory>
-//#include "Alignment.h"
-#include "Mosaik.h"
-#include <stdio.h>
-#include <string.h>
-#include <sstream>
-#include <string>
-#include "Repeats.h"
-#include "LeftAlign.h"
-
-using namespace std;
-
-#define MOSAIK_NUM_NUCLEOTIDES 26
-#define GAP '-'
-
-class CSmithWatermanGotoh {
-public:
- // constructor
- CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty);
- // destructor
- ~CSmithWatermanGotoh(void);
- // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
- void Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2);
- // enables homo-polymer scoring
- void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
- // enables non-repeat gap open penalty
- void EnableEntropyGapPenalty(float enGapOpenPenalty);
- // enables repeat gap extension penalty
- void EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor = 10);
-private:
- // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
- void CreateScoringMatrix(void);
- // corrects the homopolymer gap order for forward alignments
- void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
- // returns the maximum floating point number
- static inline float MaxFloats(const float& a, const float& b, const float& c);
- // our simple scoring matrix
- float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
- // keep track of maximum initialized sizes
- unsigned int mCurrentMatrixSize;
- unsigned int mCurrentAnchorSize;
- unsigned int mCurrentQuerySize;
- unsigned int mCurrentAQSumSize;
- // define our traceback directions
- // N.B. This used to be defined as an enum, but gcc doesn't like being told
- // which storage class to use
- const static char Directions_STOP;
- const static char Directions_LEFT;
- const static char Directions_DIAGONAL;
- const static char Directions_UP;
- // repeat structure determination
- const static int repeat_size_max;
- // define scoring constants
- const float mMatchScore;
- const float mMismatchScore;
- const float mGapOpenPenalty;
- const float mGapExtendPenalty;
- // store the backtrace pointers
- char* mPointers;
- // store the vertical gap sizes - assuming gaps are not longer than 32768 bases long
- short* mSizesOfVerticalGaps;
- // store the horizontal gap sizes - assuming gaps are not longer than 32768 bases long
- short* mSizesOfHorizontalGaps;
- // score if xi aligns to a gap after yi
- float* mQueryGapScores;
- // best score of alignment x1...xi to y1...yi
- float* mBestScores;
- // our reversed alignment
- char* mReversedAnchor;
- char* mReversedQuery;
- // define static constants
- static const float FLOAT_NEGATIVE_INFINITY;
- // toggles the use of the homo-polymer gap open penalty
- bool mUseHomoPolymerGapOpenPenalty;
- // specifies the homo-polymer gap open penalty
- float mHomoPolymerGapOpenPenalty;
- // toggles the use of the entropy gap open penalty
- bool mUseEntropyGapOpenPenalty;
- // specifies the entropy gap open penalty (multiplier)
- float mEntropyGapOpenPenalty;
- // toggles the use of the repeat gap extension penalty
- bool mUseRepeatGapExtensionPenalty;
- // specifies the repeat gap extension penalty
- float mRepeatGapExtensionPenalty;
- // specifies the max repeat gap extension penalty
- float mMaxRepeatGapExtensionPenalty;
-};
-
-// returns the maximum floating point number
-inline float CSmithWatermanGotoh::MaxFloats(const float& a, const float& b, const float& c) {
- float max = 0.0f;
- if(a > max) max = a;
- if(b > max) max = b;
- if(c > max) max = c;
- return max;
-}
diff --git a/external/vcflib/smithwaterman/convert.h b/external/vcflib/smithwaterman/convert.h
deleted file mode 100644
index 399bcea..0000000
--- a/external/vcflib/smithwaterman/convert.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef __CONVERT_H
-#define __CONVERT_H
-
-#include <sstream>
-
-// converts the string into the specified type, setting r to the converted
-// value and returning true/false on success or failure
-template<typename T>
-bool convert(const std::string& s, T& r) {
- std::istringstream iss(s);
- iss >> r;
- return iss.eof() ? true : false;
-}
-
-template<typename T>
-std::string convert(const T& r) {
- std::ostringstream iss;
- iss << r;
- return iss.str();
-}
-
-#endif
diff --git a/external/vcflib/smithwaterman/smithwaterman.cpp b/external/vcflib/smithwaterman/smithwaterman.cpp
deleted file mode 100644
index 489dc1c..0000000
--- a/external/vcflib/smithwaterman/smithwaterman.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <iostream>
-#include <string.h>
-#include <string>
-#include <sstream>
-#include <getopt.h>
-#include <utility>
-#include <vector>
-#include <stdlib.h>
-#include "SmithWatermanGotoh.h"
-#include "BandedSmithWaterman.h"
-
-using namespace std;
-
-void printSummary(void) {
- cerr << "usage: smithwaterman [options] <reference sequence> <query sequence>" << endl
- << endl
- << "options:" << endl
- << " -m, --match-score the match score (default 10.0)" << endl
- << " -n, --mismatch-score the mismatch score (default -9.0)" << endl
- << " -g, --gap-open-penalty the gap open penalty (default 15.0)" << endl
- << " -z, --entropy-gap-open-penalty enable entropy scaling of the gap open penalty" << endl
- << " -e, --gap-extend-penalty the gap extend penalty (default 6.66)" << endl
- << " -r, --repeat-gap-extend-penalty use repeat information when generating gap extension penalties" << endl
- << " -b, --bandwidth bandwidth to use (default 0, or non-banded algorithm)" << endl
- << " -p, --print-alignment print out the alignment" << endl
- << endl
- << "When called with literal reference and query sequences, smithwaterman" << endl
- << "prints the cigar match positional string and the match position for the" << endl
- << "query sequence against the reference sequence." << endl;
-}
-
-
-int main (int argc, char** argv) {
-
- int c;
-
- string reference;
- string query;
-
- int bandwidth = 0;
-
- float matchScore = 10.0f;
- float mismatchScore = -9.0f;
- float gapOpenPenalty = 15.0f;
- float gapExtendPenalty = 6.66f;
- float entropyGapOpenPenalty = 0.0f;
- bool useRepeatGapExtendPenalty = false;
- float repeatGapExtendPenalty = 1.0f;
-
- bool print_alignment = false;
-
- while (true) {
- static struct option long_options[] =
- {
- {"help", no_argument, 0, 'h'},
- {"match-score", required_argument, 0, 'm'},
- {"mismatch-score", required_argument, 0, 'n'},
- {"gap-open-penalty", required_argument, 0, 'g'},
- {"entropy-gap-open-penalty", required_argument, 0, 'z'},
- {"gap-extend-penalty", required_argument, 0, 'e'},
- {"repeat-gap-extend-penalty", required_argument, 0, 'r'},
- {"print-alignment", required_argument, 0, 'p'},
- {"bandwidth", required_argument, 0, 'b'},
- {0, 0, 0, 0}
- };
- int option_index = 0;
-
- c = getopt_long (argc, argv, "hpzm:n:g:r:e:b:r:",
- long_options, &option_index);
-
- if (c == -1)
- break;
-
- switch (c)
- {
- case 0:
- /* If this option set a flag, do nothing else now. */
- if (long_options[option_index].flag != 0)
- break;
- printf ("option %s", long_options[option_index].name);
- if (optarg)
- printf (" with arg %s", optarg);
- printf ("\n");
- break;
-
- case 'm':
- matchScore = atof(optarg);
- break;
-
- case 'n':
- mismatchScore = atof(optarg);
- break;
-
- case 'g':
- gapOpenPenalty = atof(optarg);
- break;
-
- case 'z':
- entropyGapOpenPenalty = 1;
- break;
-
- case 'r':
- useRepeatGapExtendPenalty = true;
- repeatGapExtendPenalty = atof(optarg);
- break;
-
- case 'e':
- gapExtendPenalty = atof(optarg);
- break;
-
- case 'b':
- bandwidth = atoi(optarg);
- break;
-
- case 'p':
- print_alignment = true;
- break;
-
- case 'h':
- printSummary();
- exit(0);
- break;
-
- case '?':
- /* getopt_long already printed an error message. */
- printSummary();
- exit(1);
- break;
-
- default:
- abort ();
- }
- }
-
- /* Print any remaining command line arguments (not options). */
- if (optind == argc - 2) {
- //cerr << "fasta file: " << argv[optind] << endl;
- reference = string(argv[optind]);
- ++optind;
- query = string(argv[optind]);
- } else {
- cerr << "please specify a reference and query sequence" << endl
- << "execute " << argv[0] << " --help for command-line usage" << endl;
- exit(1);
- }
-
- // initialize
-
- unsigned int referencePos;
- string cigar;
-
- // create a new Smith-Waterman alignment object
- if (bandwidth > 0) {
- pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
- hr.first.first = 2;
- hr.first.second = 18;
- hr.second.first = 1;
- hr.second.second = 17;
- CBandedSmithWaterman bsw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty, bandwidth);
- bsw.Align(referencePos, cigar, reference, query, hr);
- } else {
- CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
- if (useRepeatGapExtendPenalty)
- sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
- if (entropyGapOpenPenalty > 0)
- sw.EnableEntropyGapPenalty(entropyGapOpenPenalty);
- sw.Align(referencePos, cigar, reference, query);
- }
-
- printf("%s %3u\n", cigar.c_str(), referencePos);
-
- // optionally print out the alignment
- if (print_alignment) {
- int alignmentLength = 0;
- int len;
- string slen;
- vector<pair<int, char> > cigarData;
- for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
- switch (*c) {
- case 'I':
- len = atoi(slen.c_str());
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'D':
- len = atoi(slen.c_str());
- alignmentLength += len;
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'M':
- len = atoi(slen.c_str());
- alignmentLength += len;
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'S':
- len = atoi(slen.c_str());
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- default:
- len = 0;
- slen += *c;
- break;
- }
- }
-
- string gapped_ref = string(reference).substr(referencePos, alignmentLength);
- string gapped_query = string(query);
-
- int refpos = 0;
- int readpos = 0;
- for (vector<pair<int, char> >::iterator c = cigarData.begin(); c != cigarData.end(); ++c) {
- int len = c->first;
- switch (c->second) {
- case 'I':
- gapped_ref.insert(refpos, string(len, '-'));
- readpos += len;
- refpos += len;
- break;
- case 'D':
- gapped_query.insert(readpos, string(len, '-'));
- refpos += len;
- readpos += len;
- break;
- case 'M':
- readpos += len;
- refpos += len;
- break;
- case 'S':
- readpos += len;
- gapped_ref.insert(refpos, string(len, '*'));
- refpos += len;
- break;
- default:
- break;
- }
- }
-
- cout << gapped_ref << endl << gapped_query << endl;
- }
-
- return 0;
-
-}
diff --git a/external/vcflib/split.cpp b/external/vcflib/split.cpp
deleted file mode 100644
index 831dfcd..0000000
--- a/external/vcflib/split.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "split.h"
-
-
-std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
- std::string delims = std::string(1, delim);
- tokenize(s, elems, delims);
- return elems;
-}
-
-std::vector<std::string> split(const std::string &s, char delim) {
- std::vector<std::string> elems;
- return split(s, delim, elems);
-}
-
-std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
- tokenize(s, elems, delims);
- return elems;
-}
-
-std::vector<std::string> split(const std::string &s, const std::string& delims) {
- std::vector<std::string> elems;
- return split(s, delims, elems);
-}
diff --git a/external/vcflib/split.h b/external/vcflib/split.h
deleted file mode 100644
index e10ba78..0000000
--- a/external/vcflib/split.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef __SPLIT_H
-#define __SPLIT_H
-
-// functions to split a string by a specific delimiter
-#include <string>
-#include <vector>
-#include <sstream>
-#include <string.h>
-
-// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
-
-// split a string on a single delimiter character (delim)
-std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
-std::vector<std::string> split(const std::string &s, char delim);
-
-// split a string on any character found in the string of delimiters (delims)
-std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
-std::vector<std::string> split(const std::string &s, const std::string& delims);
-
-// from Marius, http://stackoverflow.com/a/1493195/238609
-template < class ContainerT >
-void tokenize(const std::string& str, ContainerT& tokens,
- const std::string& delimiters = " ", const bool trimEmpty = false)
-{
-
- std::string::size_type pos, lastPos = 0;
- while(true)
- {
- pos = str.find_first_of(delimiters, lastPos);
- if(pos == std::string::npos)
- {
-
- pos = str.length();
-
- if(pos != lastPos || !trimEmpty) {
- tokens.push_back(typename ContainerT::value_type(str.data()+lastPos, (typename ContainerT::value_type::size_type)pos-lastPos));
- }
-
- break;
- }
- else
- {
- if(pos != lastPos || !trimEmpty) {
- tokens.push_back(typename ContainerT::value_type(str.data()+lastPos, (typename ContainerT::value_type::size_type)pos-lastPos));
- }
- }
-
- lastPos = pos + 1;
- }
-};
-
-
-#endif
diff --git a/external/vcflib/tabixpp/ChangeLog b/external/vcflib/tabixpp/ChangeLog
deleted file mode 100644
index fd335b8..0000000
--- a/external/vcflib/tabixpp/ChangeLog
+++ /dev/null
@@ -1,593 +0,0 @@
-------------------------------------------------------------------------
-r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines
-Changed paths:
- M /trunk/tabix/main.c
-
-update version number
-
-------------------------------------------------------------------------
-r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines
-Changed paths:
- M /trunk/tabix/bedidx.c
- M /trunk/tabix/main.c
-
-fixed two bugs due to recent changes
-
-------------------------------------------------------------------------
-r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines
-Changed paths:
- M /trunk/tabix/bgzf.c
- M /trunk/tabix/bgzf.h
- M /trunk/tabix/main.c
-
-update to the latest bgzf.*
-
-------------------------------------------------------------------------
-r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.h
-
-BED support
-
-------------------------------------------------------------------------
-r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines
-Changed paths:
- M /trunk/tabix/Makefile
- A /trunk/tabix/bedidx.c
- M /trunk/tabix/example.gtf.gz.tbi
- M /trunk/tabix/index.c
- A /trunk/tabix/kseq.h
- M /trunk/tabix/tabix.h
-
-restructure get_intv() for BED support
-
-------------------------------------------------------------------------
-r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line
-Changed paths:
- M /trunk/tabix/bgzf.c
- M /trunk/tabix/bgzf.h
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
-New -r (reheader) option for efficient header replacement.
-------------------------------------------------------------------------
-r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines
-Changed paths:
- A /trunk/tabix/python
- A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914)
- A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914)
- A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914)
- D /trunk/tabix/setup.py
- D /trunk/tabix/tabixmodule.c
- D /trunk/tabix/test.py
-
-move to a new python/ directory
-
-------------------------------------------------------------------------
-r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines
-Changed paths:
- A /trunk/tabix/setup.py
- A /trunk/tabix/tabixmodule.c
- A /trunk/tabix/test.py
-
-CPython C-API by Hyeshik Chang
-
-------------------------------------------------------------------------
-r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line
-Changed paths:
- M /trunk/tabix/index.c
-
-Check the number of fields on each line and exit nicely without segfault
-------------------------------------------------------------------------
-r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line
-Changed paths:
- M /trunk/tabix/main.c
-
-Fix: Complain only when VCF is newer, not newer or same mtime
-------------------------------------------------------------------------
-r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line
-Changed paths:
- M /trunk/tabix/main.c
-
-Prevent the common user mistake and check the timestamps of the vcf and index file
-------------------------------------------------------------------------
-r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/ChangeLog
- M /trunk/tabix/NEWS
- M /trunk/tabix/main.c
-
-Release tabix-0.2.3
-
-------------------------------------------------------------------------
-r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/ChangeLog
- M /trunk/tabix/index.c
-
-Fixed a minor bug in generating index
-
-------------------------------------------------------------------------
-r855 | petulda | 2010-11-25 11:50:13 -0500 (Thu, 25 Nov 2010) | 1 line
-Changed paths:
- M /trunk/tabix/main.c
-
-Disable "unknown target name or minus interval" warning.
-------------------------------------------------------------------------
-r775 | petulda | 2010-10-26 15:02:30 -0400 (Tue, 26 Oct 2010) | 1 line
-Changed paths:
- M /trunk/tabix/main.c
-
-Added -h option to print header lines
-------------------------------------------------------------------------
-r742 | jmarshall | 2010-09-27 06:47:23 -0400 (Mon, 27 Sep 2010) | 2 lines
-Changed paths:
- M /trunk/tabix
-
-Add svn:ignore properties for intermediate and generated files.
-
-------------------------------------------------------------------------
-r725 | lh3lh3 | 2010-09-15 13:01:53 -0400 (Wed, 15 Sep 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/bgzip.c
-
-patches by Peter Chines
-
-------------------------------------------------------------------------
-r714 | lh3lh3 | 2010-09-07 10:13:25 -0400 (Tue, 07 Sep 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/TabixReader.java
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
-fixed a bug in C/Java when n_off == 0
-
-------------------------------------------------------------------------
-r712 | lh3lh3 | 2010-09-03 09:21:23 -0400 (Fri, 03 Sep 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/TabixReader.java
-
-fixed a bug in parsing region strings
-
-------------------------------------------------------------------------
-r700 | petulda | 2010-08-25 10:42:37 -0400 (Wed, 25 Aug 2010) | 1 line
-Changed paths:
- M /trunk/tabix/main.c
-
-Fix: Exit with an error rather than segfault when index is not present and region is queried
-------------------------------------------------------------------------
-r696 | petulda | 2010-08-24 10:24:12 -0400 (Tue, 24 Aug 2010) | 1 line
-Changed paths:
- M /trunk/tabix/bgzf.c
- M /trunk/tabix/bgzf.h
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
-Complain about not-bgzipped files and check for noncontinuous chromosome blocks
-------------------------------------------------------------------------
-r603 | lh3lh3 | 2010-06-28 10:49:39 -0400 (Mon, 28 Jun 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/NEWS
- M /trunk/tabix/TabixReader.java
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
-Release tabix-0.2.2
-
-------------------------------------------------------------------------
-r597 | lh3lh3 | 2010-06-13 21:08:29 -0400 (Sun, 13 Jun 2010) | 3 lines
-Changed paths:
- M /trunk/tabix/index.c
-
-Change the namespace of sorting, to avoid function name collision with samtools.
-
-
-------------------------------------------------------------------------
-r582 | lh3lh3 | 2010-06-03 10:40:25 -0400 (Thu, 03 Jun 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/NEWS
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.py
-
-Release tabix-0.2.1
-
-------------------------------------------------------------------------
-r581 | lh3lh3 | 2010-05-24 14:24:24 -0400 (Mon, 24 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/tabix.py
-
-OOP interface with the help from Aaron Quinlan
-
-------------------------------------------------------------------------
-r580 | lh3lh3 | 2010-05-23 23:36:05 -0400 (Sun, 23 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/tabix.py
-
-minor change
-
-------------------------------------------------------------------------
-r579 | lh3lh3 | 2010-05-23 23:25:24 -0400 (Sun, 23 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/tabix.py
-
-For Snow Leopard compatibility
-
-------------------------------------------------------------------------
-r575 | lh3lh3 | 2010-05-12 19:31:27 -0400 (Wed, 12 May 2010) | 4 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/index.c
- M /trunk/tabix/tabix.h
- A /trunk/tabix/tabix.py
-
- * optionally generate shared library for Mac and Linux
- * added a python script that directly calls the shared library
- * added a new API for easy python access
-
-------------------------------------------------------------------------
-r574 | lh3lh3 | 2010-05-11 12:14:27 -0400 (Tue, 11 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/ChangeLog
- M /trunk/tabix/NEWS
- M /trunk/tabix/perl/Tabix.pm
- M /trunk/tabix/perl/TabixIterator.pm
- M /trunk/tabix/tabix.1
-
-Release tabix-0.2.0
-
-------------------------------------------------------------------------
-r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/Makefile
-
-Added -fPIC
-
-------------------------------------------------------------------------
-r572 | lh3lh3 | 2010-05-11 11:59:07 -0400 (Tue, 11 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/perl/MANIFEST
-
-update
-
-------------------------------------------------------------------------
-r571 | lh3lh3 | 2010-05-11 11:56:54 -0400 (Tue, 11 May 2010) | 4 lines
-Changed paths:
- A /trunk/tabix/example.gtf.gz
- A /trunk/tabix/example.gtf.gz.tbi
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/perl/MANIFEST
- M /trunk/tabix/perl/Tabix.pm
- M /trunk/tabix/perl/Tabix.xs
- A /trunk/tabix/perl/TabixIterator.pm
- A /trunk/tabix/perl/t
- A /trunk/tabix/perl/t/01local.t
- A /trunk/tabix/perl/t/02remote.t
- M /trunk/tabix/tabix.1
- M /trunk/tabix/tabix.h
-
- * improved C/Perl APIs
- * added test for Perl
- * added an tiny example
-
-------------------------------------------------------------------------
-r570 | lh3lh3 | 2010-05-11 01:04:21 -0400 (Tue, 11 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/TabixReader.java
-
-fixed the same issue in java
-
-------------------------------------------------------------------------
-r569 | lh3lh3 | 2010-05-11 01:03:24 -0400 (Tue, 11 May 2010) | 3 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/perl/Tabix.pm
- M /trunk/tabix/perl/Tabix.xs
-
- * fixed a potential issue in index.c
- * improve perl APIs
-
-------------------------------------------------------------------------
-r568 | lh3lh3 | 2010-05-10 23:46:21 -0400 (Mon, 10 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/perl/Tabix.xs
-
-return an array from get_names()
-
-------------------------------------------------------------------------
-r567 | lh3lh3 | 2010-05-10 23:38:46 -0400 (Mon, 10 May 2010) | 4 lines
-Changed paths:
- M /trunk/tabix/TabixReader.java
- M /trunk/tabix/index.c
- A /trunk/tabix/perl
- A /trunk/tabix/perl/MANIFEST
- A /trunk/tabix/perl/Makefile.PL
- A /trunk/tabix/perl/Tabix.pm
- A /trunk/tabix/perl/Tabix.xs
- A /trunk/tabix/perl/typemap
- M /trunk/tabix/tabix.h
-
- * added the initial perl binding. The interface needs to be improved.
- * added a new API for perl binding
- * fixed a potential bug in java.
-
-------------------------------------------------------------------------
-r565 | lh3lh3 | 2010-05-09 23:24:35 -0400 (Sun, 09 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/main.c
-
-Release tabix-0.1.6
-
-------------------------------------------------------------------------
-r564 | lh3lh3 | 2010-05-09 23:01:49 -0400 (Sun, 09 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/index.c
-
-fixed a typo
-
-------------------------------------------------------------------------
-r563 | lh3lh3 | 2010-05-09 22:58:26 -0400 (Sun, 09 May 2010) | 2 lines
-Changed paths:
- A /trunk/tabix/ChangeLog
- M /trunk/tabix/NEWS
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.h
-
-If nothing bad happens, this will become 0.1.6
-
-------------------------------------------------------------------------
-r562 | lh3lh3 | 2010-05-09 19:43:56 -0400 (Sun, 09 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/index.c
-
-Fixed a bug
-
-------------------------------------------------------------------------
-r560 | lh3lh3 | 2010-05-05 10:59:09 -0400 (Wed, 05 May 2010) | 3 lines
-Changed paths:
- A /trunk/tabix/NEWS
- M /trunk/tabix/TabixReader.java
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.1
- M /trunk/tabix/tabix.h
-
- * Release tabix-0.1.5 (r560)
- * Improve seeking efficiency. Index file needs to be rebuilt.
-
-------------------------------------------------------------------------
-r559 | lh3lh3 | 2010-05-04 23:11:42 -0400 (Tue, 04 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/main.c
-
-Release tabix-0.1.4 (r559)
-
-------------------------------------------------------------------------
-r558 | lh3lh3 | 2010-05-01 12:48:01 -0400 (Sat, 01 May 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/TabixReader.java
-
-implement SAM/VCF support; NOT tested yet
-
-------------------------------------------------------------------------
-r557 | lh3lh3 | 2010-05-01 00:42:34 -0400 (Sat, 01 May 2010) | 2 lines
-Changed paths:
- A /trunk/tabix/TabixReader.java
-
-The Java implementation of tabix.
-
-------------------------------------------------------------------------
-r556 | lh3lh3 | 2010-04-30 22:34:07 -0400 (Fri, 30 Apr 2010) | 4 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/knetfile.c
- M /trunk/tabix/main.c
-
- * tabix-0.1.3-3 (r556)
- * fixed a small memory leak in knetfile
- * fixed a minor bug for remote downloading
-
-------------------------------------------------------------------------
-r555 | lh3lh3 | 2010-04-30 22:15:12 -0400 (Fri, 30 Apr 2010) | 4 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
- * tabix-0.1.3-2 (r555)
- * do not overwrite index file by default
- * a little code cleanup
-
-------------------------------------------------------------------------
-r554 | lh3lh3 | 2010-04-30 21:44:31 -0400 (Fri, 30 Apr 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/index.c
-
-fixed a potential bug for UCSC-like coordinate
-
-------------------------------------------------------------------------
-r553 | lh3lh3 | 2010-04-28 17:43:41 -0400 (Wed, 28 Apr 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/tabix.tex
-
-minor clarification to the format spec
-
-------------------------------------------------------------------------
-r552 | lh3lh3 | 2010-04-28 16:33:07 -0400 (Wed, 28 Apr 2010) | 3 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/bgzip.c
- A /trunk/tabix/tabix.tex
-
- * added the format specification
- * fixed a typo in bgzip
-
-------------------------------------------------------------------------
-r550 | petulda | 2010-04-22 11:03:24 -0400 (Thu, 22 Apr 2010) | 1 line
-Changed paths:
- M /trunk/tabix/bgzip.c
-
-The behaviour changed slightly to mimic gzip. Detect if std descriptors are connected to the terminal.
-------------------------------------------------------------------------
-r549 | petulda | 2010-04-22 09:46:10 -0400 (Thu, 22 Apr 2010) | 1 line
-Changed paths:
- M /trunk/tabix/bgzip.c
-
-Fix in src/dst file detection and slight change of behaviour
-------------------------------------------------------------------------
-r548 | petulda | 2010-04-19 04:39:46 -0400 (Mon, 19 Apr 2010) | 1 line
-Changed paths:
- M /trunk/tabix/index.c
-
-Close file descriptor in ti_list_chromosomes
-------------------------------------------------------------------------
-r547 | petulda | 2010-04-16 09:27:11 -0400 (Fri, 16 Apr 2010) | 1 line
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.h
-
-Added the -l option for listing chromosomes
-------------------------------------------------------------------------
-r544 | lh3lh3 | 2010-03-29 10:58:48 -0400 (Mon, 29 Mar 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/main.c
-
-removed a line of debugging code
-
-------------------------------------------------------------------------
-r543 | lh3lh3 | 2010-03-19 12:29:16 -0400 (Fri, 19 Mar 2010) | 3 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.1
-
- * tabix-0.1.3 (r543)
- * fixed another off-by-one bug
-
-------------------------------------------------------------------------
-r542 | lh3lh3 | 2010-03-16 22:35:52 -0400 (Tue, 16 Mar 2010) | 2 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.1
-
-Release tabix-0.1.1
-
-------------------------------------------------------------------------
-r506 | lh3lh3 | 2009-11-02 23:20:12 -0500 (Mon, 02 Nov 2009) | 2 lines
-Changed paths:
- M /trunk/tabix/main.c
-
-Release tabix-0.1.0
-
-------------------------------------------------------------------------
-r505 | lh3lh3 | 2009-11-02 23:15:49 -0500 (Mon, 02 Nov 2009) | 2 lines
-Changed paths:
- A /trunk/tabix/tabix.1
-
-documentation
-
-------------------------------------------------------------------------
-r504 | lh3lh3 | 2009-11-02 11:08:18 -0500 (Mon, 02 Nov 2009) | 5 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/bgzip.c
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.h
-
- * tabix-0.0.0-5 (r504)
- * fixed a critical bug in fetching data (a typo in fact)
- * support SAM (tested on ex1.sam) and VCF (not tested)
- * improve the command-line interface
-
-------------------------------------------------------------------------
-r503 | lh3lh3 | 2009-11-02 10:04:43 -0500 (Mon, 02 Nov 2009) | 3 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
- * tabix-0.0.0-4 (r503)
- * index files are bgzf compressed
-
-------------------------------------------------------------------------
-r502 | lh3lh3 | 2009-11-02 09:47:25 -0500 (Mon, 02 Nov 2009) | 4 lines
-Changed paths:
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
- M /trunk/tabix/tabix.h
-
- * tabix-0.0.0-3 (r502)
- * support meta lines (not tested)
- * I am going to make the index file in the BGZF format
-
-------------------------------------------------------------------------
-r501 | lh3lh3 | 2009-11-01 22:03:07 -0500 (Sun, 01 Nov 2009) | 3 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/bgzf.h
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
- * tabix-0.0.0-2 (r501)
- * accelerate ti_readline()
-
-------------------------------------------------------------------------
-r500 | lh3lh3 | 2009-11-01 20:49:52 -0500 (Sun, 01 Nov 2009) | 3 lines
-Changed paths:
- M /trunk/tabix/Makefile
- M /trunk/tabix/bgzip.c
- M /trunk/tabix/index.c
- M /trunk/tabix/main.c
-
- * tabix-0.0.0-1 (r500)
- * apparently working
-
-------------------------------------------------------------------------
-r499 | lh3lh3 | 2009-11-01 14:04:52 -0500 (Sun, 01 Nov 2009) | 2 lines
-Changed paths:
- D /trunk/tabix/parser.c
-
-obsolete file
-
-------------------------------------------------------------------------
-r498 | lh3lh3 | 2009-11-01 14:04:08 -0500 (Sun, 01 Nov 2009) | 2 lines
-Changed paths:
- M /trunk/tabix/bgzip.c
-
-bgzip is more like gzip in its command-line interface
-
-------------------------------------------------------------------------
-r497 | lh3lh3 | 2009-11-01 13:43:35 -0500 (Sun, 01 Nov 2009) | 2 lines
-Changed paths:
- A /trunk/tabix/Makefile
- A /trunk/tabix/bam_endian.h
- A /trunk/tabix/bgzf.c
- A /trunk/tabix/bgzf.h
- A /trunk/tabix/bgzip.c
- A /trunk/tabix/index.c
- A /trunk/tabix/khash.h
- A /trunk/tabix/knetfile.c
- A /trunk/tabix/knetfile.h
- A /trunk/tabix/ksort.h
- A /trunk/tabix/kstring.c
- A /trunk/tabix/kstring.h
- A /trunk/tabix/main.c
- A /trunk/tabix/parser.c
- A /trunk/tabix/tabix.h
-
-initial source code. It is BUGGY!
-
-------------------------------------------------------------------------
-r496 | lh3lh3 | 2009-11-01 13:42:39 -0500 (Sun, 01 Nov 2009) | 2 lines
-Changed paths:
- A /trunk/tabix
-
-A generic indexer for TAB-delimited genome position files
-
-------------------------------------------------------------------------
diff --git a/external/vcflib/tabixpp/NEWS b/external/vcflib/tabixpp/NEWS
deleted file mode 100644
index d230541..0000000
--- a/external/vcflib/tabixpp/NEWS
+++ /dev/null
@@ -1,126 +0,0 @@
-Release 0.2.4 (10 April, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Give an error if the index file is older than the data file.
-
- * Avoid a segfault given flawed input.
-
- * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
- the dynamic library and are reported to be faster. Pysam also comes with a
- tabix binding.
-
- * Added option "-r" for efficient header replacement.
-
- * Added BED support.
-
- * Synchronized the BGZF library between tabix and samtools.
-
-(0.2.4: 10 April 2011, r949)
-
-
-
-Beta Release 0.2.3 (8 December, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Fixed a minor bug where the first record in a headerless file may be
- missed.
-
- * Added an option to print header lines.
-
- * Fixed a rare bug which may occasionally happen when retrieving data
- from a region without any records.
-
- * Enhanced error reporting.
-
- * Fixed a bug in bgzip which may delete the original file even if not
- intended.
-
-(0.2.3: 8 December 2010, r876)
-
-
-
-Beta Release 0.2.2 (28 June, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Dropped the VCF3 support. Added VCF4 support.
-
- * Avoided the function name collision with samtools.
-
-(0.2.2: 28 June 2010, r603)
-
-
-
-Beta Release 0.2.1 (3 June, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Allow shared library to be compiled. Added python binding to the
- shared library.
-
-(0.2.1: 3 June 2010, r582)
-
-
-
-Beta Release 0.2.0 (11 May, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Fixed an issue for random access given an interval end larger than
- 2^29.
-
- * Updated the Java binding.
-
- * Added a Perl module using XS.
-
- * Improved the C APIs.
-
-(0.2.0: 11 May 2010, r574)
-
-
-
-Beta Release 0.1.6 (9 May, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Improved backward compatibility. Release 0.1.5 does not work with the
- buggy index file generated by 0.1.2.
-
- * Fixed a bug in building linear index. The bug does not affect the
- results, only affects efficiency in rare cases.
-
- * Reduced the number of seek calls given an index generated by an old
- version of tabix.
-
- * Added new APIs for retrieving data via an iterator. The old callback
- APIs are not changed, although internally it uses iterator to
- retrieve data.
-
-I am trying to freeze tabix. I just hope I am not committing new bugs.
-
-(0.1.6: 9 May 2010, r563)
-
-
-
-Beta Release 0.1.5 (5 May, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Clarified that tabix is released under MIT/X11.
-
- * Improved the robustness of indexing and retrieval.
-
- * Reduced the number of seek calls when the specified region starts
- from a 16kb block with no data. The index format is the same, but the
- content is changed a little.
-
-(0.1.5: 5 May 2010, r560)
diff --git a/external/vcflib/tabixpp/README b/external/vcflib/tabixpp/README
deleted file mode 100644
index 966660e..0000000
--- a/external/vcflib/tabixpp/README
+++ /dev/null
@@ -1,6 +0,0 @@
-This is a fork of the tabix project [1] which includes a C++ class wrapper for
-reading tabix-indexed files.
-
-Author: Erik Garrison <erik.garrison at bc.edu>
-
-[1] http://samtools.sourceforge.net/tabix.shtml
diff --git a/external/vcflib/tabixpp/bam_endian.h b/external/vcflib/tabixpp/bam_endian.h
deleted file mode 100644
index 0fc74a8..0000000
--- a/external/vcflib/tabixpp/bam_endian.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef BAM_ENDIAN_H
-#define BAM_ENDIAN_H
-
-#include <stdint.h>
-
-static inline int bam_is_big_endian()
-{
- long one= 1;
- return !(*((char *)(&one)));
-}
-static inline uint16_t bam_swap_endian_2(uint16_t v)
-{
- return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
-}
-static inline void *bam_swap_endian_2p(void *x)
-{
- *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
- return x;
-}
-static inline uint32_t bam_swap_endian_4(uint32_t v)
-{
- v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
- return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
-}
-static inline void *bam_swap_endian_4p(void *x)
-{
- *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
- return x;
-}
-static inline uint64_t bam_swap_endian_8(uint64_t v)
-{
- v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
- v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
- return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
-}
-static inline void *bam_swap_endian_8p(void *x)
-{
- *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
- return x;
-}
-
-#endif
diff --git a/external/vcflib/tabixpp/bedidx.c b/external/vcflib/tabixpp/bedidx.c
deleted file mode 100644
index 722877d..0000000
--- a/external/vcflib/tabixpp/bedidx.c
+++ /dev/null
@@ -1,156 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-#include <zlib.h>
-
-#include "ksort.h"
-KSORT_INIT_GENERIC(uint64_t)
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 8192)
-
-typedef struct {
- int n, m;
- uint64_t *a;
- int *idx;
-} bed_reglist_t;
-
-#include "khash.h"
-KHASH_MAP_INIT_STR(reg, bed_reglist_t)
-
-#define LIDX_SHIFT 13
-
-typedef kh_reg_t reghash_t;
-
-int *bed_index_core(int n, uint64_t *a, int *n_idx)
-{
- int i, j, m, *idx;
- m = *n_idx = 0; idx = 0;
- for (i = 0; i < n; ++i) {
- int beg, end;
- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
- if (m < end + 1) {
- int oldm = m;
- m = end + 1;
- kroundup32(m);
- idx = realloc(idx, m * sizeof(int));
- for (j = oldm; j < m; ++j) idx[j] = -1;
- }
- if (beg == end) {
- if (idx[beg] < 0) idx[beg] = i;
- } else {
- for (j = beg; j <= end; ++j)
- if (idx[j] < 0) idx[j] = i;
- }
- *n_idx = end + 1;
- }
- return idx;
-}
-
-void bed_index(void *_h)
-{
- reghash_t *h = (reghash_t*)_h;
- khint_t k;
- for (k = 0; k < kh_end(h); ++k) {
- if (kh_exist(h, k)) {
- bed_reglist_t *p = &kh_val(h, k);
- if (p->idx) free(p->idx);
- ks_introsort(uint64_t, p->n, p->a);
- p->idx = bed_index_core(p->n, p->a, &p->m);
- }
- }
-}
-
-int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
-{
- int i, min_off;
- if (p->n == 0) return 0;
- min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
- if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
- int n = beg>>LIDX_SHIFT;
- if (n > p->n) n = p->n;
- for (i = n - 1; i >= 0; --i)
- if (p->idx[i] >= 0) break;
- min_off = i >= 0? p->idx[i] : 0;
- }
- for (i = min_off; i < p->n; ++i) {
- if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
- if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
- return 1; // find the overlap; return
- }
- return 0;
-}
-
-int bed_overlap(const void *_h, const char *chr, int beg, int end)
-{
- const reghash_t *h = (const reghash_t*)_h;
- khint_t k;
- if (!h) return 0;
- k = kh_get(reg, h, chr);
- if (k == kh_end(h)) return 0;
- return bed_overlap_core(&kh_val(h, k), beg, end);
-}
-
-void *bed_read(const char *fn)
-{
- reghash_t *h = kh_init(reg);
- gzFile fp;
- kstream_t *ks;
- int dret;
- kstring_t *str;
- // read the list
- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- if (fp == 0) return 0;
- str = calloc(1, sizeof(kstring_t));
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
- int beg = -1, end = -1;
- bed_reglist_t *p;
- khint_t k = kh_get(reg, h, str->s);
- if (k == kh_end(h)) { // absent from the hash table
- int ret;
- char *s = strdup(str->s);
- k = kh_put(reg, h, s, &ret);
- memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
- }
- p = &kh_val(h, k);
- if (dret != '\n') { // if the lines has other characters
- if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
- beg = atoi(str->s); // begin
- if (dret != '\n') {
- if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
- end = atoi(str->s); // end
- }
- }
- }
- if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
- if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
- if (beg >= 0 && end > beg) {
- if (p->n == p->m) {
- p->m = p->m? p->m<<1 : 4;
- p->a = realloc(p->a, p->m * 8);
- }
- p->a[p->n++] = (uint64_t)beg<<32 | end;
- }
- }
- ks_destroy(ks);
- gzclose(fp);
- free(str->s); free(str);
- bed_index(h);
- return h;
-}
-
-void bed_destroy(void *_h)
-{
- reghash_t *h = (reghash_t*)_h;
- khint_t k;
- for (k = 0; k < kh_end(h); ++k) {
- if (kh_exist(h, k)) {
- free(kh_val(h, k).a);
- free(kh_val(h, k).idx);
- free((char*)kh_key(h, k));
- }
- }
- kh_destroy(reg, h);
-}
diff --git a/external/vcflib/tabixpp/bgzf.c b/external/vcflib/tabixpp/bgzf.c
deleted file mode 100644
index 29df53b..0000000
--- a/external/vcflib/tabixpp/bgzf.c
+++ /dev/null
@@ -1,711 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-/*
- 2009-06-29 by lh3: cache recent uncompressed blocks.
- 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
- 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-
-#include "khash.h"
-typedef struct {
- int size;
- uint8_t *block;
- int64_t end_offset;
-} cache_t;
-KHASH_MAP_INIT_INT64(cache, cache_t)
-
-#if defined(_WIN32) || defined(_MSC_VER)
-#define ftello(fp) ftell(fp)
-#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
-#else
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
-#endif
-
-typedef int8_t bgzf_byte_t;
-
-static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
-static const int MAX_BLOCK_SIZE = 64 * 1024;
-
-static const int BLOCK_HEADER_LENGTH = 18;
-static const int BLOCK_FOOTER_LENGTH = 8;
-
-static const int GZIP_ID1 = 31;
-static const int GZIP_ID2 = 139;
-static const int CM_DEFLATE = 8;
-static const int FLG_FEXTRA = 4;
-static const int OS_UNKNOWN = 255;
-static const int BGZF_ID1 = 66; // 'B'
-static const int BGZF_ID2 = 67; // 'C'
-static const int BGZF_LEN = 2;
-static const int BGZF_XLEN = 6; // BGZF_LEN+4
-
-static const int GZIP_WINDOW_BITS = -15; // no zlib header
-static const int Z_DEFAULT_MEM_LEVEL = 8;
-
-
-void
-_packInt16(uint8_t* buffer, uint16_t value)
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
-}
-
-int
-_unpackInt16(const uint8_t* buffer)
-{
- return (buffer[0] | (buffer[1] << 8));
-}
-
-void
-_packInt32(uint8_t* buffer, uint32_t value)
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
- buffer[2] = value >> 16;
- buffer[3] = value >> 24;
-}
-
-static inline
-int
-_bgzf_min(int x, int y)
-{
- return (x < y) ? x : y;
-}
-
-static
-void
-report_error(BGZF* fp, const char* message) {
- fp->error = message;
-}
-
-int bgzf_check_bgzf(const char *fn)
-{
- BGZF *fp;
- uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
- int n;
-
- if ((fp = _bgzf_open(fn, "r")) == 0)
- {
- fprintf(stderr, "[_bgzf_check_bgzf] failed to open the file: %s\n",fn);
- return -1;
- }
-
-#ifdef _USE_KNETFILE
- n = knet_read(fp->x.fpr, buf, 10);
-#else
- n = fread(buf, 1, 10, fp->file);
-#endif
- _bgzf_close(fp);
-
- if ( n!=10 )
- return -1;
-
- if ( !memcmp(magic, buf, 10) ) return 1;
- return 0;
-}
-
-static BGZF *_bgzf_read_init()
-{
- BGZF *fp;
- fp = calloc(1, sizeof(BGZF));
- fp->uncompressed_block_size = MAX_BLOCK_SIZE;
- fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->cache_size = 0;
- fp->cache = kh_init(cache);
- return fp;
-}
-
-static
-BGZF*
-open_read(int fd)
-{
-#ifdef _USE_KNETFILE
- knetFile *file = knet_dopen(fd, "r");
-#else
- FILE* file = fdopen(fd, "r");
-#endif
- BGZF* fp;
- if (file == 0) return 0;
- fp = _bgzf_read_init();
- fp->file_descriptor = fd;
- fp->open_mode = 'r';
-#ifdef _USE_KNETFILE
- fp->x.fpr = file;
-#else
- fp->file = file;
-#endif
- return fp;
-}
-
-static
-BGZF*
-open_write(int fd, int compress_level) // compress_level==-1 for the default level
-{
- FILE* file = fdopen(fd, "w");
- BGZF* fp;
- if (file == 0) return 0;
- fp = malloc(sizeof(BGZF));
- fp->file_descriptor = fd;
- fp->open_mode = 'w';
- fp->owned_file = 0;
- fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
- if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
-#ifdef _USE_KNETFILE
- fp->x.fpw = file;
-#else
- fp->file = file;
-#endif
- fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
- fp->uncompressed_block = NULL;
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->block_address = 0;
- fp->block_offset = 0;
- fp->block_length = 0;
- fp->error = NULL;
- return fp;
-}
-
-BGZF*
-_bgzf_open(const char* __restrict path, const char* __restrict mode)
-{
- BGZF* fp = NULL;
- if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
-#ifdef _USE_KNETFILE
- knetFile *file = knet_open(path, mode);
- if (file == 0) return 0;
- fp = _bgzf_read_init();
- fp->file_descriptor = -1;
- fp->open_mode = 'r';
- fp->x.fpr = file;
-#else
- int fd, oflag = O_RDONLY;
-#ifdef _WIN32
- oflag |= O_BINARY;
-#endif
- fd = open(path, oflag);
- if (fd == -1) return 0;
- fp = open_read(fd);
-#endif
- } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
-#ifdef _WIN32
- oflag |= O_BINARY;
-#endif
- fd = open(path, oflag, 0666);
- if (fd == -1) return 0;
- { // set compress_level
- int i;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0;
- }
- fp = open_write(fd, compress_level);
- }
- if (fp != NULL) fp->owned_file = 1;
- return fp;
-}
-
-BGZF*
-_bgzf_fdopen(int fd, const char * __restrict mode)
-{
- if (fd == -1) return 0;
- if (mode[0] == 'r' || mode[0] == 'R') {
- return open_read(fd);
- } else if (mode[0] == 'w' || mode[0] == 'W') {
- int i, compress_level = -1;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0;
- return open_write(fd, compress_level);
- } else {
- return NULL;
- }
-}
-
-static
-int
-_deflate_block(BGZF* fp, int block_length)
-{
- // Deflate the block in fp->uncompressed_block into fp->compressed_block.
- // Also adds an extra field that stores the compressed block length.
-
- bgzf_byte_t* buffer = fp->compressed_block;
- int buffer_size = fp->compressed_block_size;
-
- // Init gzip header
- buffer[0] = GZIP_ID1;
- buffer[1] = GZIP_ID2;
- buffer[2] = CM_DEFLATE;
- buffer[3] = FLG_FEXTRA;
- buffer[4] = 0; // mtime
- buffer[5] = 0;
- buffer[6] = 0;
- buffer[7] = 0;
- buffer[8] = 0;
- buffer[9] = OS_UNKNOWN;
- buffer[10] = BGZF_XLEN;
- buffer[11] = 0;
- buffer[12] = BGZF_ID1;
- buffer[13] = BGZF_ID2;
- buffer[14] = BGZF_LEN;
- buffer[15] = 0;
- buffer[16] = 0; // placeholder for block length
- buffer[17] = 0;
-
- // loop to retry for blocks that do not compress enough
- int input_length = block_length;
- int compressed_length = 0;
- while (1) {
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = fp->uncompressed_block;
- zs.avail_in = input_length;
- zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
- zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
- int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
- GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
- if (status != Z_OK) {
- report_error(fp, "deflate init failed");
- return -1;
- }
- status = deflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- deflateEnd(&zs);
- if (status == Z_OK) {
- // Not enough space in buffer.
- // Can happen in the rare case the input doesn't compress enough.
- // Reduce the amount of input until it fits.
- input_length -= 1024;
- if (input_length <= 0) {
- // should never happen
- report_error(fp, "input reduction failed");
- return -1;
- }
- continue;
- }
- report_error(fp, "deflate failed");
- return -1;
- }
- status = deflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "deflate end failed");
- return -1;
- }
- compressed_length = zs.total_out;
- compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
- if (compressed_length > MAX_BLOCK_SIZE) {
- // should never happen
- report_error(fp, "deflate overflow");
- return -1;
- }
- break;
- }
-
- _packInt16((uint8_t*)&buffer[16], compressed_length-1);
- uint32_t crc = crc32(0L, NULL, 0L);
- crc = crc32(crc, fp->uncompressed_block, input_length);
- _packInt32((uint8_t*)&buffer[compressed_length-8], crc);
- _packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
-
- int remaining = block_length - input_length;
- if (remaining > 0) {
- if (remaining > input_length) {
- // should never happen (check so we can use memcpy)
- report_error(fp, "remainder too large");
- return -1;
- }
- memcpy(fp->uncompressed_block,
- fp->uncompressed_block + input_length,
- remaining);
- }
- fp->block_offset = remaining;
- return compressed_length;
-}
-
-static
-int
-inflate_block(BGZF* fp, int block_length)
-{
- // Inflate the block in fp->compressed_block into fp->uncompressed_block
-
- z_stream zs;
- int status;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = fp->compressed_block + 18;
- zs.avail_in = block_length - 16;
- zs.next_out = fp->uncompressed_block;
- zs.avail_out = fp->uncompressed_block_size;
-
- status = inflateInit2(&zs, GZIP_WINDOW_BITS);
- if (status != Z_OK) {
- report_error(fp, "inflate init failed");
- return -1;
- }
- status = inflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- inflateEnd(&zs);
- report_error(fp, "inflate failed");
- return -1;
- }
- status = inflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "inflate failed");
- return -1;
- }
- return zs.total_out;
-}
-
-static
-int
-check_header(const bgzf_byte_t* header)
-{
- return (header[0] == GZIP_ID1 &&
- header[1] == (bgzf_byte_t) GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & FLG_FEXTRA) != 0 &&
- _unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
- header[12] == BGZF_ID1 &&
- header[13] == BGZF_ID2 &&
- _unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
-}
-
-static void free_cache(BGZF *fp)
-{
- khint_t k;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- if (fp->open_mode != 'r') return;
- for (k = kh_begin(h); k < kh_end(h); ++k)
- if (kh_exist(h, k)) free(kh_val(h, k).block);
- kh_destroy(cache, h);
-}
-
-static int load_block_from_cache(BGZF *fp, int64_t block_address)
-{
- khint_t k;
- cache_t *p;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- k = kh_get(cache, h, block_address);
- if (k == kh_end(h)) return 0;
- p = &kh_val(h, k);
- if (fp->block_length != 0) fp->block_offset = 0;
- fp->block_address = block_address;
- fp->block_length = p->size;
- memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
-#ifdef _USE_KNETFILE
- knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
-#else
- fseeko(fp->file, p->end_offset, SEEK_SET);
-#endif
- return p->size;
-}
-
-static void cache_block(BGZF *fp, int size)
-{
- int ret;
- khint_t k;
- cache_t *p;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- if (MAX_BLOCK_SIZE >= fp->cache_size) return;
- if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
- /* A better way would be to remove the oldest block in the
- * cache, but here we remove a random one for simplicity. This
- * should not have a big impact on performance. */
- for (k = kh_begin(h); k < kh_end(h); ++k)
- if (kh_exist(h, k)) break;
- if (k < kh_end(h)) {
- free(kh_val(h, k).block);
- kh_del(cache, h, k);
- }
- }
- k = kh_put(cache, h, fp->block_address, &ret);
- if (ret == 0) return; // if this happens, a bug!
- p = &kh_val(h, k);
- p->size = fp->block_length;
- p->end_offset = fp->block_address + size;
- p->block = malloc(MAX_BLOCK_SIZE);
- memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
-}
-
-int
-_bgzf_read_block(BGZF* fp)
-{
- bgzf_byte_t header[BLOCK_HEADER_LENGTH];
- int count, size = 0, block_length, remaining;
-#ifdef _USE_KNETFILE
- int64_t block_address = knet_tell(fp->x.fpr);
- if (load_block_from_cache(fp, block_address)) return 0;
- count = knet_read(fp->x.fpr, header, sizeof(header));
-#else
- int64_t block_address = ftello(fp->file);
- if (load_block_from_cache(fp, block_address)) return 0;
- count = fread(header, 1, sizeof(header), fp->file);
-#endif
- if (count == 0) {
- fp->block_length = 0;
- return 0;
- }
- size = count;
- if (count != sizeof(header)) {
- report_error(fp, "read failed");
- return -1;
- }
- if (!check_header(header)) {
- report_error(fp, "invalid block header");
- return -1;
- }
- block_length = _unpackInt16((uint8_t*)&header[16]) + 1;
- bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
- memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- remaining = block_length - BLOCK_HEADER_LENGTH;
-#ifdef _USE_KNETFILE
- count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
-#else
- count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
-#endif
- if (count != remaining) {
- report_error(fp, "read failed");
- return -1;
- }
- size += count;
- count = inflate_block(fp, block_length);
- if (count < 0) return -1;
- if (fp->block_length != 0) {
- // Do not reset offset if this read follows a seek.
- fp->block_offset = 0;
- }
- fp->block_address = block_address;
- fp->block_length = count;
- cache_block(fp, size);
- return 0;
-}
-
-int
-_bgzf_read(BGZF* fp, void* data, int length)
-{
- if (length <= 0) {
- return 0;
- }
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for reading");
- return -1;
- }
-
- int bytes_read = 0;
- bgzf_byte_t* output = data;
- while (bytes_read < length) {
- int copy_length, available = fp->block_length - fp->block_offset;
- bgzf_byte_t *buffer;
- if (available <= 0) {
- if (_bgzf_read_block(fp) != 0) {
- return -1;
- }
- available = fp->block_length - fp->block_offset;
- if (available <= 0) {
- break;
- }
- }
- copy_length = _bgzf_min(length-bytes_read, available);
- buffer = fp->uncompressed_block;
- memcpy(output, buffer + fp->block_offset, copy_length);
- fp->block_offset += copy_length;
- output += copy_length;
- bytes_read += copy_length;
- }
- if (fp->block_offset == fp->block_length) {
-#ifdef _USE_KNETFILE
- fp->block_address = knet_tell(fp->x.fpr);
-#else
- fp->block_address = ftello(fp->file);
-#endif
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return bytes_read;
-}
-
-int _bgzf_flush(BGZF* fp)
-{
- while (fp->block_offset > 0) {
- int count, block_length;
- block_length = _deflate_block(fp, fp->block_offset);
- if (block_length < 0) return -1;
-#ifdef _USE_KNETFILE
- count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
-#else
- count = fwrite(fp->compressed_block, 1, block_length, fp->file);
-#endif
- if (count != block_length) {
- report_error(fp, "write failed");
- return -1;
- }
- fp->block_address += block_length;
- }
- return 0;
-}
-
-int _bgzf_flush_try(BGZF *fp, int size)
-{
- if (fp->block_offset + size > fp->uncompressed_block_size)
- return _bgzf_flush(fp);
- return -1;
-}
-
-int _bgzf_write(BGZF* fp, const void* data, int length)
-{
- const bgzf_byte_t *input = data;
- int block_length, bytes_written;
- if (fp->open_mode != 'w') {
- report_error(fp, "file not open for writing");
- return -1;
- }
-
- if (fp->uncompressed_block == NULL)
- fp->uncompressed_block = malloc(fp->uncompressed_block_size);
-
- input = data;
- block_length = fp->uncompressed_block_size;
- bytes_written = 0;
- while (bytes_written < length) {
- int copy_length = _bgzf_min(block_length - fp->block_offset, length - bytes_written);
- bgzf_byte_t* buffer = fp->uncompressed_block;
- memcpy(buffer + fp->block_offset, input, copy_length);
- fp->block_offset += copy_length;
- input += copy_length;
- bytes_written += copy_length;
- if (fp->block_offset == block_length) {
- if (_bgzf_flush(fp) != 0) {
- break;
- }
- }
- }
- return bytes_written;
-}
-
-int _bgzf_close(BGZF* fp)
-{
- if (fp->open_mode == 'w') {
- if (_bgzf_flush(fp) != 0) return -1;
- { // add an empty block
- int block_length = _deflate_block(fp, 0);
-#ifdef _USE_KNETFILE
- fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
-#else
- fwrite(fp->compressed_block, 1, block_length, fp->file);
-#endif
- }
-#ifdef _USE_KNETFILE
- if (fflush(fp->x.fpw) != 0) {
-#else
- if (fflush(fp->file) != 0) {
-#endif
- report_error(fp, "flush failed");
- return -1;
- }
- }
- if (fp->owned_file) {
-#ifdef _USE_KNETFILE
- int ret;
- if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
- else ret = knet_close(fp->x.fpr);
- if (ret != 0) return -1;
-#else
- if (fclose(fp->file) != 0) return -1;
-#endif
- }
- free(fp->uncompressed_block);
- free(fp->compressed_block);
- free_cache(fp);
- free(fp);
- return 0;
-}
-
-void _bgzf_set_cache_size(BGZF *fp, int cache_size)
-{
- if (fp) fp->cache_size = cache_size;
-}
-
-int _bgzf_check_EOF(BGZF *fp)
-{
- static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
- uint8_t buf[28];
- off_t offset;
-#ifdef _USE_KNETFILE
- offset = knet_tell(fp->x.fpr);
- if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
- knet_read(fp->x.fpr, buf, 28);
- knet_seek(fp->x.fpr, offset, SEEK_SET);
-#else
- offset = ftello(fp->file);
- if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
- fread(buf, 1, 28, fp->file);
- fseeko(fp->file, offset, SEEK_SET);
-#endif
- return (memcmp(magic, buf, 28) == 0)? 1 : 0;
-}
-
-int64_t _bgzf_seek(BGZF* fp, int64_t pos, int where)
-{
- int block_offset;
- int64_t block_address;
-
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for read");
- return -1;
- }
- if (where != SEEK_SET) {
- report_error(fp, "unimplemented seek option");
- return -1;
- }
- block_offset = pos & 0xFFFF;
- block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
-#ifdef _USE_KNETFILE
- if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
-#else
- if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
-#endif
- report_error(fp, "seek failed");
- return -1;
- }
- fp->block_length = 0; // indicates current block is not loaded
- fp->block_address = block_address;
- fp->block_offset = block_offset;
- return 0;
-}
diff --git a/external/vcflib/tabixpp/bgzf.h b/external/vcflib/tabixpp/bgzf.h
deleted file mode 100644
index a69ea51..0000000
--- a/external/vcflib/tabixpp/bgzf.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#ifndef __BGZF_H
-#define __BGZF_H
-
-#include <stdint.h>
-#include <stdio.h>
-#include <zlib.h>
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-//typedef int8_t bool;
-
-typedef struct {
- int file_descriptor;
- char open_mode; // 'r' or 'w'
- int16_t owned_file, compress_level;
-#ifdef _USE_KNETFILE
- union {
- knetFile *fpr;
- FILE *fpw;
- } x;
-#else
- FILE* file;
-#endif
- int uncompressed_block_size;
- int compressed_block_size;
- void* uncompressed_block;
- void* compressed_block;
- int64_t block_address;
- int block_length;
- int block_offset;
- int cache_size;
- const char* error;
- void *cache; // a pointer to a hash table
-} BGZF;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Open an existing file descriptor for reading or writing.
- * Mode must be either "r" or "w".
- * A subsequent _bgzf_close will not close the file descriptor.
- * Returns null on error.
- */
-BGZF* _bgzf_fdopen(int fd, const char* __restrict mode);
-
-/*
- * Open the specified file for reading or writing.
- * Mode must be either "r" or "w".
- * Returns null on error.
- */
-BGZF* _bgzf_open(const char* path, const char* __restrict mode);
-
-/*
- * Close the BGZ file and free all associated resources.
- * Does not close the underlying file descriptor if created with _bgzf_fdopen.
- * Returns zero on success, -1 on error.
- */
-int _bgzf_close(BGZF* fp);
-
-/*
- * Read up to length bytes from the file storing into data.
- * Returns the number of bytes actually read.
- * Returns zero on end of file.
- * Returns -1 on error.
- */
-int _bgzf_read(BGZF* fp, void* data, int length);
-
-/*
- * Write length bytes from data to the file.
- * Returns the number of bytes written.
- * Returns -1 on error.
- */
-int _bgzf_write(BGZF* fp, const void* data, int length);
-
-/*
- * Return a virtual file pointer to the current location in the file.
- * No interpretation of the value should be made, other than a subsequent
- * call to _bgzf_seek can be used to position the file at the same point.
- * Return value is non-negative on success.
- * Returns -1 on error.
- */
-#define _bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
-
-/*
- * Set the file to read from the location specified by pos, which must
- * be a value previously returned by _bgzf_tell for this file (but not
- * necessarily one returned by this file handle).
- * The where argument must be SEEK_SET.
- * Seeking on a file opened for write is not supported.
- * Returns zero on success, -1 on error.
- */
-int64_t _bgzf_seek(BGZF* fp, int64_t pos, int where);
-
-/*
- * Set the cache size. Zero to disable. By default, caching is
- * disabled. The recommended cache size for frequent random access is
- * about 8M bytes.
- */
-void _bgzf_set_cache_size(BGZF *fp, int cache_size);
-
-int _bgzf_check_EOF(BGZF *fp);
-int _bgzf_read_block(BGZF* fp);
-int _bgzf_flush(BGZF* fp);
-int _bgzf_flush_try(BGZF *fp, int size);
-int bgzf_check_bgzf(const char *fn);
-
-#ifdef __cplusplus
-}
-#endif
-
-static inline int _bgzf_getc(BGZF *fp)
-{
- int c;
- if (fp->block_offset >= fp->block_length) {
- if (_bgzf_read_block(fp) != 0) return -2; /* error */
- if (fp->block_length == 0) return -1; /* end-of-file */
- }
- c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
- if (fp->block_offset == fp->block_length) {
-#ifdef _USE_KNETFILE
- fp->block_address = knet_tell(fp->x.fpr);
-#else
- fp->block_address = ftello(fp->file);
-#endif
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return c;
-}
-
-#endif
diff --git a/external/vcflib/tabixpp/bgzip.c b/external/vcflib/tabixpp/bgzip.c
deleted file mode 100644
index f0a5b4c..0000000
--- a/external/vcflib/tabixpp/bgzip.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/select.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-
-static const int WINDOW_SIZE = 64 * 1024;
-
-static int bgzip_main_usage()
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n");
- fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n");
- fprintf(stderr, " -d decompress\n");
- fprintf(stderr, " -f overwrite files without asking\n");
- fprintf(stderr, " -b INT decompress at virtual file pointer INT\n");
- fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n");
- fprintf(stderr, " -h give this help\n");
- fprintf(stderr, "\n");
- return 1;
-}
-
-static int write_open(const char *fn, int is_forced)
-{
- int fd = -1;
- char c;
- if (!is_forced) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
- fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
- scanf("%c", &c);
- if (c != 'Y' && c != 'y') {
- fprintf(stderr, "[bgzip] not overwritten\n");
- exit(1);
- }
- }
- }
- if (fd < 0) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
- fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
- exit(1);
- }
- }
- return fd;
-}
-
-static void fail(BGZF* fp)
-{
- fprintf(stderr, "Error: %s\n", fp->error);
- exit(1);
-}
-
-int main(int argc, char **argv)
-{
- int c, compress, pstdout, is_forced;
- BGZF *fp;
- void *buffer;
- long start, end, size;
-
- compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
- while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){
- switch(c){
- case 'h': return bgzip_main_usage();
- case 'd': compress = 0; break;
- case 'c': pstdout = 1; break;
- case 'b': start = atol(optarg); break;
- case 's': size = atol(optarg); break;
- case 'f': is_forced = 1; break;
- }
- }
- if (size >= 0) end = start + size;
- if (end >= 0 && end < start) {
- fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
- return 1;
- }
- if (compress == 1) {
- struct stat sbuf;
- int f_src = fileno(stdin);
- int f_dst = fileno(stdout);
-
- if ( argc>optind )
- {
- if ( stat(argv[optind],&sbuf)<0 )
- {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
-
- if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
-
- if (pstdout)
- f_dst = fileno(stdout);
- else
- {
- char *name = malloc(strlen(argv[optind]) + 5);
- strcpy(name, argv[optind]);
- strcat(name, ".gz");
- f_dst = write_open(name, is_forced);
- if (f_dst < 0) return 1;
- free(name);
- }
- }
- else if (!pstdout && isatty(fileno((FILE *)stdout)) )
- return bgzip_main_usage();
-
- fp = _bgzf_fdopen(f_dst, "w");
- buffer = malloc(WINDOW_SIZE);
- while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
- if (_bgzf_write(fp, buffer, c) < 0) fail(fp);
- // f_dst will be closed here
- if (_bgzf_close(fp) < 0) fail(fp);
- if (argc > optind && !pstdout) unlink(argv[optind]);
- free(buffer);
- close(f_src);
- return 0;
- } else {
- struct stat sbuf;
- int f_dst;
-
- if ( argc>optind )
- {
- if ( stat(argv[optind],&sbuf)<0 )
- {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
- char *name;
- int len = strlen(argv[optind]);
- if ( strcmp(argv[optind]+len-3,".gz") )
- {
- fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
- return 1;
- }
- fp = _bgzf_open(argv[optind], "r");
- if (fp == NULL) {
- fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
- return 1;
- }
-
- if (pstdout) {
- f_dst = fileno(stdout);
- }
- else {
- name = strdup(argv[optind]);
- name[strlen(name) - 3] = '\0';
- f_dst = write_open(name, is_forced);
- free(name);
- }
- }
- else if (!pstdout && isatty(fileno((FILE *)stdin)) )
- return bgzip_main_usage();
- else
- {
- f_dst = fileno(stdout);
- fp = _bgzf_fdopen(fileno(stdin), "r");
- if (fp == NULL) {
- fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
- return 1;
- }
- }
- buffer = malloc(WINDOW_SIZE);
- if (_bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
- while (1) {
- if (end < 0) c = _bgzf_read(fp, buffer, WINDOW_SIZE);
- else c = _bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
- if (c == 0) break;
- if (c < 0) fail(fp);
- start += c;
- write(f_dst, buffer, c);
- if (end >= 0 && start >= end) break;
- }
- free(buffer);
- if (_bgzf_close(fp) < 0) fail(fp);
- if (!pstdout) unlink(argv[optind]);
- return 0;
- }
-}
diff --git a/external/vcflib/tabixpp/index.c b/external/vcflib/tabixpp/index.c
deleted file mode 100644
index 6c2db77..0000000
--- a/external/vcflib/tabixpp/index.c
+++ /dev/null
@@ -1,998 +0,0 @@
-#include <ctype.h>
-#include <assert.h>
-#include <sys/stat.h>
-#include "khash.h"
-#include "ksort.h"
-#include "kstring.h"
-#include "bam_endian.h"
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-#include "tabix.h"
-
-#define TAD_MIN_CHUNK_GAP 32768
-// 1<<14 is the size of minimum bin.
-#define TAD_LIDX_SHIFT 14
-
-typedef struct {
- uint64_t u, v;
-} pair64_t;
-
-#define pair64_lt(a,b) ((a).u < (b).u)
-KSORT_INIT(offt, pair64_t, pair64_lt)
-
-typedef struct {
- uint32_t m, n;
- pair64_t *list;
-} ti_binlist_t;
-
-typedef struct {
- int32_t n, m;
- uint64_t *offset;
-} ti_lidx_t;
-
-KHASH_MAP_INIT_INT(i, ti_binlist_t)
-KHASH_MAP_INIT_STR(s, int)
-
-struct __ti_index_t {
- ti_conf_t conf;
- int32_t n, max;
- khash_t(s) *tname;
- khash_t(i) **index;
- ti_lidx_t *index2;
-};
-
-struct __ti_iter_t {
- int from_first; // read from the first record; no random access
- int tid, beg, end, n_off, i, finished;
- uint64_t curr_off;
- kstring_t str;
- const ti_index_t *idx;
- pair64_t *off;
-};
-
-typedef struct {
- int tid, beg, end, bin;
-} ti_intv_t;
-
-ti_conf_t ti_conf_gff = { 0, 1, 4, 5, '#', 0 };
-ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, '#', 0 };
-ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, '#', 0 };
-ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, '@', 0 };
-ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, '#', 0 };
-
-/***************
- * read a line *
- ***************/
-
-/*
-int ti_readline(BGZF *fp, kstring_t *str)
-{
- int c, l = 0;
- str->l = 0;
- while ((c = bgzf_getc(fp)) >= 0 && c != '\n') {
- ++l;
- if (c != '\r') kputc(c, str);
- }
- if (c < 0 && l == 0) return -1; // end of file
- return str->l;
-}
-*/
-
-/* Below is a faster implementation largely equivalent to the one
- * commented out above. */
-int ti_readline(BGZF *fp, kstring_t *str)
-{
- int l, state = 0;
- unsigned char *buf = (unsigned char*)fp->uncompressed_block;
- str->l = 0;
- do {
- if (fp->block_offset >= fp->block_length) {
- if (_bgzf_read_block(fp) != 0) { state = -2; break; }
- if (fp->block_length == 0) { state = -1; break; }
- }
- for (l = fp->block_offset; l < fp->block_length && buf[l] != '\n'; ++l);
- if (l < fp->block_length) state = 1;
- l -= fp->block_offset;
- if (str->l + l + 1 >= str->m) {
- str->m = str->l + l + 2;
- kroundup32(str->m);
- str->s = (char*)realloc(str->s, str->m);
- }
- memcpy(str->s + str->l, buf + fp->block_offset, l);
- str->l += l;
- fp->block_offset += l + 1;
- if (fp->block_offset >= fp->block_length) {
-#ifdef _USE_KNETFILE
- fp->block_address = knet_tell(fp->x.fpr);
-#else
- fp->block_address = ftello(fp->file);
-#endif
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- } while (state == 0);
- if (str->l == 0 && state < 0) return state;
- str->s[str->l] = 0;
- return str->l;
-}
-
-/*************************************
- * get the interval from a data line *
- *************************************/
-
-static inline int ti_reg2bin(uint32_t beg, uint32_t end)
-{
- --end;
- if (beg>>14 == end>>14) return 4681 + (beg>>14);
- if (beg>>17 == end>>17) return 585 + (beg>>17);
- if (beg>>20 == end>>20) return 73 + (beg>>20);
- if (beg>>23 == end>>23) return 9 + (beg>>23);
- if (beg>>26 == end>>26) return 1 + (beg>>26);
- return 0;
-}
-
-static int get_tid(ti_index_t *idx, const char *ss)
-{
- khint_t k;
- int tid;
- k = kh_get(s, idx->tname, ss);
- if (k == kh_end(idx->tname)) { // a new target sequence
- int ret, size;
- // update idx->n, ->max, ->index and ->index2
- if (idx->n == idx->max) {
- idx->max = idx->max? idx->max<<1 : 8;
- idx->index = realloc(idx->index, idx->max * sizeof(void*));
- idx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t));
- }
- memset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t));
- idx->index[idx->n++] = kh_init(i);
- // update ->tname
- tid = size = kh_size(idx->tname);
- k = kh_put(s, idx->tname, strdup(ss), &ret);
- kh_value(idx->tname, k) = size;
- assert(idx->n == kh_size(idx->tname));
- } else tid = kh_value(idx->tname, k);
- return tid;
-}
-
-int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv)
-{
- int i, b = 0, id = 1, ncols = 0;
- char *s;
- intv->ss = intv->se = 0; intv->beg = intv->end = -1;
- for (i = 0; i <= len; ++i) {
- if (line[i] == '\t' || line[i] == 0) {
- ++ncols;
- if (id == conf->sc) {
- intv->ss = line + b; intv->se = line + i;
- } else if (id == conf->bc) {
- // here ->beg is 0-based.
- intv->beg = intv->end = strtol(line + b, &s, 0);
- if (!(conf->preset&TI_FLAG_UCSC)) --intv->beg;
- else ++intv->end;
- if (intv->beg < 0) intv->beg = 0;
- if (intv->end < 1) intv->end = 1;
- } else {
- if ((conf->preset&0xffff) == TI_PRESET_GENERIC) {
- if (id == conf->ec) intv->end = strtol(line + b, &s, 0);
- } else if ((conf->preset&0xffff) == TI_PRESET_SAM) {
- if (id == 6) { // CIGAR
- int l = 0, op;
- char *t;
- for (s = line + b; s < line + i;) {
- long x = strtol(s, &t, 10);
- op = toupper(*t);
- if (op == 'M' || op == 'D' || op == 'N') l += x;
- s = t + 1;
- }
- if (l == 0) l = 1;
- intv->end = intv->beg + l;
- }
- } else if ((conf->preset&0xffff) == TI_PRESET_VCF) {
- // FIXME: the following is NOT tested and is likely to be buggy
- if (id == 4) {
- if (b < i) intv->end = intv->beg + (i - b);
- } else if (id == 8) { // look for "END="
- int c = line[i];
- line[i] = 0;
- s = strstr(line + b, "END=");
- if (s == line + b) s += 4;
- else if (s) {
- s = strstr(line + b, ";END=");
- if (s) s += 5;
- }
- if (s) intv->end = strtol(s, &s, 0);
- line[i] = c;
- }
- }
- }
- b = i + 1;
- ++id;
- }
- }
-/*
- if (ncols < conf->sc || ncols < conf->bc || ncols < conf->ec) {
- if (ncols == 1) fprintf(stderr,"[get_intv] Is the file tab-delimited? The line has %d field only: %s\n", ncols, line);
- else fprintf(stderr,"[get_intv] The line has %d field(s) only: %s\n", ncols, line);
- exit(1);
- }
-*/
- if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
- return 0;
-}
-
-static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
-{
- ti_interval_t x;
- intv->tid = intv->beg = intv->end = intv->bin = -1;
- if (ti_get_intv(&idx->conf, str->l, str->s, &x) == 0) {
- int c = *x.se;
- *x.se = '\0'; intv->tid = get_tid(idx, x.ss); *x.se = c;
- intv->beg = x.beg; intv->end = x.end;
- intv->bin = ti_reg2bin(intv->beg, intv->end);
- return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
- } else {
- fprintf(stderr, "[%s] the following line cannot be parsed and skipped: %s\n", __func__, str->s);
- return -1;
- }
-}
-
-/************
- * indexing *
- ************/
-
-// requirement: len <= LEN_MASK
-static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
-{
- khint_t k;
- ti_binlist_t *l;
- int ret;
- k = kh_put(i, h, bin, &ret);
- l = &kh_value(h, k);
- if (ret) { // not present
- l->m = 1; l->n = 0;
- l->list = (pair64_t*)calloc(l->m, 16);
- }
- if (l->n == l->m) {
- l->m <<= 1;
- l->list = (pair64_t*)realloc(l->list, l->m * 16);
- }
- l->list[l->n].u = beg; l->list[l->n++].v = end;
-}
-
-static inline uint64_t insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset)
-{
- int i, beg, end;
- beg = _beg >> TAD_LIDX_SHIFT;
- end = (_end - 1) >> TAD_LIDX_SHIFT;
- if (index2->m < end + 1) {
- int old_m = index2->m;
- index2->m = end + 1;
- kroundup32(index2->m);
- index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
- memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
- }
- if (beg == end) {
- if (index2->offset[beg] == 0) index2->offset[beg] = offset;
- } else {
- for (i = beg; i <= end; ++i)
- if (index2->offset[i] == 0) index2->offset[i] = offset;
- }
- if (index2->n < end + 1) index2->n = end + 1;
- return (uint64_t)beg<<32 | end;
-}
-
-static void merge_chunks(ti_index_t *idx)
-{
- khash_t(i) *index;
- int i, l, m;
- khint_t k;
- for (i = 0; i < idx->n; ++i) {
- index = idx->index[i];
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- ti_binlist_t *p;
- if (!kh_exist(index, k)) continue;
- p = &kh_value(index, k);
- m = 0;
- for (l = 1; l < p->n; ++l) {
- if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
- else p->list[++m] = p->list[l];
- } // ~for(l)
- p->n = m + 1;
- } // ~for(k)
- } // ~for(i)
-}
-
-static void fill_missing(ti_index_t *idx)
-{
- int i, j;
- for (i = 0; i < idx->n; ++i) {
- ti_lidx_t *idx2 = &idx->index2[i];
- for (j = 1; j < idx2->n; ++j)
- if (idx2->offset[j] == 0)
- idx2->offset[j] = idx2->offset[j-1];
- }
-}
-
-ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
-{
- int ret;
- ti_index_t *idx;
- uint32_t last_bin, save_bin;
- int32_t last_coor, last_tid, save_tid;
- uint64_t save_off, last_off, lineno = 0, offset0 = (uint64_t)-1, tmp;
- kstring_t *str;
-
- str = calloc(1, sizeof(kstring_t));
-
- idx = (ti_index_t*)calloc(1, sizeof(ti_index_t));
- idx->conf = *conf;
- idx->n = idx->max = 0;
- idx->tname = kh_init(s);
- idx->index = 0;
- idx->index2 = 0;
-
- save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
- save_off = last_off = _bgzf_tell(fp); last_coor = 0xffffffffu;
- while ((ret = ti_readline(fp, str)) >= 0) {
- ti_intv_t intv;
- ++lineno;
- if (lineno <= idx->conf.line_skip || str->s[0] == idx->conf.meta_char) {
- last_off = _bgzf_tell(fp);
- continue;
- }
- get_intv(idx, str, &intv);
- if ( intv.beg<0 || intv.end<0 )
- {
- fprintf(stderr,"[ti_index_core] the indexes overlap or are out of bounds\n");
- exit(1);
- }
- if (last_tid != intv.tid) { // change of chromosomes
- if (last_tid>intv.tid )
- {
- fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted? [pos %d]\n",(unsigned long long)lineno,intv.beg+1);
- exit(1);
- }
- last_tid = intv.tid;
- last_bin = 0xffffffffu;
- } else if (last_coor > intv.beg) {
- fprintf(stderr, "[ti_index_core] the file out of order at line %llu\n", (unsigned long long)lineno);
- exit(1);
- }
- tmp = insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off);
- if (last_off == 0) offset0 = tmp;
- if (intv.bin != last_bin) { // then possibly write the binning index
- if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
- insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
- save_off = last_off;
- save_bin = last_bin = intv.bin;
- save_tid = intv.tid;
- if (save_tid < 0) break;
- }
- if (_bgzf_tell(fp) <= last_off) {
- fprintf(stderr, "[ti_index_core] bug in BGZF: %llx < %llx\n",
- (unsigned long long)_bgzf_tell(fp), (unsigned long long)last_off);
- exit(1);
- }
- last_off = _bgzf_tell(fp);
- last_coor = intv.beg;
- }
- if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, _bgzf_tell(fp));
- merge_chunks(idx);
- fill_missing(idx);
- if (offset0 != (uint64_t)-1 && idx->n && idx->index2[0].offset) {
- int i, beg = offset0>>32, end = offset0&0xffffffffu;
- for (i = beg; i <= end; ++i) idx->index2[0].offset[i] = 0;
- }
-
- free(str->s); free(str);
- return idx;
-}
-
-void ti_index_destroy(ti_index_t *idx)
-{
- khint_t k;
- int i;
- if (idx == 0) return;
- // destroy the name hash table
- for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) {
- if (kh_exist(idx->tname, k))
- free((char*)kh_key(idx->tname, k));
- }
- kh_destroy(s, idx->tname);
- // destroy the binning index
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index = idx->index[i];
- ti_lidx_t *index2 = idx->index2 + i;
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- if (kh_exist(index, k))
- free(kh_value(index, k).list);
- }
- kh_destroy(i, index);
- free(index2->offset);
- }
- free(idx->index);
- // destroy the linear index
- free(idx->index2);
- free(idx);
-}
-
-/******************
- * index file I/O *
- ******************/
-
-void ti_index_save(const ti_index_t *idx, BGZF *fp)
-{
- int32_t i, size, ti_is_be;
- khint_t k;
- ti_is_be = bam_is_big_endian();
- _bgzf_write(fp, "TBI\1", 4);
- if (ti_is_be) {
- uint32_t x = idx->n;
- _bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- } else _bgzf_write(fp, &idx->n, 4);
- assert(sizeof(ti_conf_t) == 24);
- if (ti_is_be) { // write ti_conf_t;
- uint32_t x[6];
- memcpy(x, &idx->conf, 24);
- for (i = 0; i < 6; ++i) _bgzf_write(fp, bam_swap_endian_4p(&x[i]), 4);
- } else _bgzf_write(fp, &idx->conf, sizeof(ti_conf_t));
- { // write target names
- char **name;
- int32_t l = 0;
- name = calloc(kh_size(idx->tname), sizeof(void*));
- for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k)
- if (kh_exist(idx->tname, k))
- name[kh_value(idx->tname, k)] = (char*)kh_key(idx->tname, k);
- for (i = 0; i < kh_size(idx->tname); ++i)
- l += strlen(name[i]) + 1;
- if (ti_is_be) _bgzf_write(fp, bam_swap_endian_4p(&l), 4);
- else _bgzf_write(fp, &l, 4);
- for (i = 0; i < kh_size(idx->tname); ++i)
- _bgzf_write(fp, name[i], strlen(name[i]) + 1);
- free(name);
- }
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index = idx->index[i];
- ti_lidx_t *index2 = idx->index2 + i;
- // write binning index
- size = kh_size(index);
- if (ti_is_be) { // big endian
- uint32_t x = size;
- _bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- } else _bgzf_write(fp, &size, 4);
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- if (kh_exist(index, k)) {
- ti_binlist_t *p = &kh_value(index, k);
- if (ti_is_be) { // big endian
- uint32_t x;
- x = kh_key(index, k); _bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- x = p->n; _bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- for (x = 0; (int)x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- _bgzf_write(fp, p->list, 16 * p->n);
- for (x = 0; (int)x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- } else {
- _bgzf_write(fp, &kh_key(index, k), 4);
- _bgzf_write(fp, &p->n, 4);
- _bgzf_write(fp, p->list, 16 * p->n);
- }
- }
- }
- // write linear index (index2)
- if (ti_is_be) {
- int x = index2->n;
- _bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- } else _bgzf_write(fp, &index2->n, 4);
- if (ti_is_be) { // big endian
- int x;
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- _bgzf_write(fp, index2->offset, 8 * index2->n);
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- } else _bgzf_write(fp, index2->offset, 8 * index2->n);
- }
-}
-
-static ti_index_t *ti_index_load_core(BGZF *fp)
-{
- int i, ti_is_be;
- char magic[4];
- ti_index_t *idx;
- ti_is_be = bam_is_big_endian();
- if (fp == 0) {
- fprintf(stderr, "[ti_index_load_core] fail to load index.\n");
- return 0;
- }
- _bgzf_read(fp, magic, 4);
- if (strncmp(magic, "TBI\1", 4)) {
- fprintf(stderr, "[ti_index_load] wrong magic number.\n");
- return 0;
- }
- idx = (ti_index_t*)calloc(1, sizeof(ti_index_t));
- _bgzf_read(fp, &idx->n, 4);
- if (ti_is_be) bam_swap_endian_4p(&idx->n);
- idx->tname = kh_init(s);
- idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
- idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t));
- // read idx->conf
- _bgzf_read(fp, &idx->conf, sizeof(ti_conf_t));
- if (ti_is_be) {
- bam_swap_endian_4p(&idx->conf.preset);
- bam_swap_endian_4p(&idx->conf.sc);
- bam_swap_endian_4p(&idx->conf.bc);
- bam_swap_endian_4p(&idx->conf.ec);
- bam_swap_endian_4p(&idx->conf.meta_char);
- bam_swap_endian_4p(&idx->conf.line_skip);
- }
- { // read target names
- int j, ret;
- kstring_t *str;
- int32_t l;
- uint8_t *buf;
- _bgzf_read(fp, &l, 4);
- if (ti_is_be) bam_swap_endian_4p(&l);
- buf = calloc(l, 1);
- _bgzf_read(fp, buf, l);
- str = calloc(1, sizeof(kstring_t));
- for (i = j = 0; i < l; ++i) {
- if (buf[i] == 0) {
- khint_t k = kh_put(s, idx->tname, strdup(str->s), &ret);
- kh_value(idx->tname, k) = j++;
- str->l = 0;
- } else kputc(buf[i], str);
- }
- free(str->s); free(str); free(buf);
- }
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index;
- ti_lidx_t *index2 = idx->index2 + i;
- uint32_t key, size;
- khint_t k;
- int j, ret;
- ti_binlist_t *p;
- index = idx->index[i] = kh_init(i);
- // load binning index
- _bgzf_read(fp, &size, 4);
- if (ti_is_be) bam_swap_endian_4p(&size);
- for (j = 0; j < (int)size; ++j) {
- _bgzf_read(fp, &key, 4);
- if (ti_is_be) bam_swap_endian_4p(&key);
- k = kh_put(i, index, key, &ret);
- p = &kh_value(index, k);
- _bgzf_read(fp, &p->n, 4);
- if (ti_is_be) bam_swap_endian_4p(&p->n);
- p->m = p->n;
- p->list = (pair64_t*)malloc(p->m * 16);
- _bgzf_read(fp, p->list, 16 * p->n);
- if (ti_is_be) {
- int x;
- for (x = 0; x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- }
- }
- // load linear index
- _bgzf_read(fp, &index2->n, 4);
- if (ti_is_be) bam_swap_endian_4p(&index2->n);
- index2->m = index2->n;
- index2->offset = (uint64_t*)calloc(index2->m, 8);
- _bgzf_read(fp, index2->offset, index2->n * 8);
- if (ti_is_be)
- for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
- }
- return idx;
-}
-
-ti_index_t *ti_index_load_local(const char *fnidx)
-{
- BGZF *fp;
- fp = _bgzf_open(fnidx, "r");
- if (fp) {
- ti_index_t *idx = ti_index_load_core(fp);
- _bgzf_close(fp);
- return idx;
- } else return 0;
-}
-
-#ifdef _USE_KNETFILE
-static void download_from_remote(const char *url)
-{
- const int buf_size = 1 * 1024 * 1024;
- char *fn;
- FILE *fp;
- uint8_t *buf;
- knetFile *fp_remote;
- int l;
- if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
- l = strlen(url);
- for (fn = (char*)url + l - 1; fn >= url; --fn)
- if (*fn == '/') break;
- ++fn; // fn now points to the file name
- fp_remote = knet_open(url, "r");
- if (fp_remote == 0) {
- fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
- return;
- }
- if ((fp = fopen(fn, "w")) == 0) {
- fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
- knet_close(fp_remote);
- return;
- }
- buf = (uint8_t*)calloc(buf_size, 1);
- while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
- fwrite(buf, 1, l, fp);
- free(buf);
- fclose(fp);
- knet_close(fp_remote);
-}
-#else
-static void download_from_remote(const char *url)
-{
- return;
-}
-#endif
-
-static char *get_local_version(const char *fn)
-{
- struct stat sbuf;
- char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcat(strcpy(fnidx, fn), ".tbi");
- if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
- char *p, *url;
- int l = strlen(fnidx);
- for (p = fnidx + l - 1; p >= fnidx; --p)
- if (*p == '/') break;
- url = fnidx; fnidx = strdup(p + 1);
- if (stat(fnidx, &sbuf) == 0) {
- free(url);
- return fnidx;
- }
- fprintf(stderr, "[%s] downloading the index file...\n", __func__);
- download_from_remote(url);
- free(url);
- }
- if (stat(fnidx, &sbuf) == 0) return fnidx;
- free(fnidx); return 0;
-}
-
-const char **ti_seqname(const ti_index_t *idx, int *n)
-{
- const char **names;
- khint_t k;
- *n = idx->n;
- names = calloc(idx->n, sizeof(void*));
- for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k)
- if (kh_exist(idx->tname, k))
- names[kh_val(idx->tname, k)] = kh_key(idx->tname, k);
- return names;
-}
-
-ti_index_t *ti_index_load(const char *fn)
-{
- ti_index_t *idx;
- char *fname = get_local_version(fn);
- if (fname == 0) return 0;
- idx = ti_index_load_local(fname);
- if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load the index: %s\n", fname);
- free(fname);
- return idx;
-}
-
-int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx)
-{
- char *fnidx;
- BGZF *fp, *fpidx;
- ti_index_t *idx;
- if ((fp = _bgzf_open(fn, "r")) == 0) {
- fprintf(stderr, "[ti_index_build2] fail to open the file: %s\n", fn);
- return -1;
- }
- idx = ti_index_core(fp, conf);
- _bgzf_close(fp);
- if (_fnidx == 0) {
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".tbi");
- } else fnidx = strdup(_fnidx);
- fpidx = _bgzf_open(fnidx, "w");
- if (fpidx == 0) {
- fprintf(stderr, "[ti_index_build2] fail to create the index file.\n");
- free(fnidx);
- return -1;
- }
- ti_index_save(idx, fpidx);
- ti_index_destroy(idx);
- _bgzf_close(fpidx);
- free(fnidx);
- return 0;
-}
-
-int ti_index_build(const char *fn, const ti_conf_t *conf)
-{
- return ti_index_build2(fn, conf, 0);
-}
-
-/********************************************
- * parse a region in the format chr:beg-end *
- ********************************************/
-
-int ti_get_tid(const ti_index_t *idx, const char *name)
-{
- khiter_t iter;
- const khash_t(s) *h = idx->tname;
- iter = kh_get(s, h, name); /* get the tid */
- if (iter == kh_end(h)) return -1;
- return kh_value(h, iter);
-}
-
-int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end)
-{
- char *s, *p;
- int i, l, k;
- l = strlen(str);
- p = s = (char*)malloc(l+1);
- /* squeeze out "," */
- for (i = k = 0; i != l; ++i)
- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
- s[k] = 0;
- for (i = 0; i != k; ++i) if (s[i] == ':') break;
- s[i] = 0;
- if ((*tid = ti_get_tid(idx, s)) < 0) {
- free(s);
- return -1;
- }
- if (i == k) { /* dump the whole sequence */
- *begin = 0; *end = 1<<29; free(s);
- return 0;
- }
- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
- *begin = atoi(p);
- if (i < k) {
- p = s + i + 1;
- *end = atoi(p);
- } else *end = 1<<29;
- if (*begin > 0) --*begin;
- free(s);
- if (*begin > *end) return -1;
- return 0;
-}
-
-/*******************************
- * retrieve a specified region *
- *******************************/
-
-#define MAX_BIN 37450 // =(8^6-1)/7+1
-
-static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
-{
- int i = 0, k;
- if (beg >= end) return 0;
- if (end >= 1u<<29) end = 1u<<29;
- --end;
- list[i++] = 0;
- for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
- for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
- for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
- for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
- for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
- return i;
-}
-
-ti_iter_t ti_iter_first()
-{
- ti_iter_t iter;
- iter = calloc(1, sizeof(struct __ti_iter_t));
- iter->from_first = 1;
- return iter;
-}
-
-ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end)
-{
- uint16_t *bins;
- int i, n_bins, n_off;
- pair64_t *off;
- khint_t k;
- khash_t(i) *index;
- uint64_t min_off;
- ti_iter_t iter = 0;
-
- if (beg < 0) beg = 0;
- if (end < beg) return 0;
- // initialize the iterator
- iter = calloc(1, sizeof(struct __ti_iter_t));
- iter->idx = idx; iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1;
- // random access
- bins = (uint16_t*)calloc(MAX_BIN, 2);
- n_bins = reg2bins(beg, end, bins);
- index = idx->index[tid];
- if (idx->index2[tid].n > 0) {
- min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
- : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
- if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
- int n = beg>>TAD_LIDX_SHIFT;
- if (n > idx->index2[tid].n) n = idx->index2[tid].n;
- for (i = n - 1; i >= 0; --i)
- if (idx->index2[tid].offset[i] != 0) break;
- if (i >= 0) min_off = idx->index2[tid].offset[i];
- }
- } else min_off = 0; // tabix 0.1.2 may produce such index files
- for (i = n_off = 0; i < n_bins; ++i) {
- if ((k = kh_get(i, index, bins[i])) != kh_end(index))
- n_off += kh_value(index, k).n;
- }
- if (n_off == 0) {
- free(bins); return iter;
- }
- off = (pair64_t*)calloc(n_off, 16);
- for (i = n_off = 0; i < n_bins; ++i) {
- if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
- int j;
- ti_binlist_t *p = &kh_value(index, k);
- for (j = 0; j < p->n; ++j)
- if (p->list[j].v > min_off) off[n_off++] = p->list[j];
- }
- }
- if (n_off == 0) {
- free(bins); free(off); return iter;
- }
- free(bins);
- {
- int l;
- ks_introsort(offt, n_off, off);
- // resolve completely contained adjacent blocks
- for (i = 1, l = 0; i < n_off; ++i)
- if (off[l].v < off[i].v)
- off[++l] = off[i];
- n_off = l + 1;
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
- for (i = 1; i < n_off; ++i)
- if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
- { // merge adjacent blocks
- for (i = 1, l = 0; i < n_off; ++i) {
- if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
- else off[++l] = off[i];
- }
- n_off = l + 1;
- }
- }
- iter->n_off = n_off; iter->off = off;
- return iter;
-}
-
-const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len)
-{
- if (iter->finished) return 0;
- if (iter->from_first) {
- int ret;
- if ((ret = ti_readline(fp, &iter->str)) < 0) {
- iter->finished = 1;
- return 0;
- } else {
- if (len) *len = iter->str.l;
- return iter->str.s;
- }
- }
- if (iter->n_off == 0) return 0;
- while (1) {
- int ret;
- if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
- if (iter->i == iter->n_off - 1) break; // no more chunks
- if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
- if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
- _bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
- iter->curr_off = _bgzf_tell(fp);
- }
- ++iter->i;
- }
- if ((ret = ti_readline(fp, &iter->str)) >= 0) {
- ti_intv_t intv;
- iter->curr_off = _bgzf_tell(fp);
- if (iter->str.s[0] == iter->idx->conf.meta_char) continue;
- get_intv((ti_index_t*)iter->idx, &iter->str, &intv);
- if (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed
- else if (intv.end > iter->beg && iter->end > intv.beg) {
- if (len) *len = iter->str.l;
- return iter->str.s;
- }
- } else break; // end of file
- }
- iter->finished = 1;
- return 0;
-}
-
-void ti_iter_destroy(ti_iter_t iter)
-{
- if (iter) {
- free(iter->str.s); free(iter->off);
- free(iter);
- }
-}
-
-int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func)
-{
- ti_iter_t iter;
- const char *s;
- int len;
- iter = ti_iter_query(idx, tid, beg, end);
- while ((s = ti_iter_read(fp, iter, &len)) != 0)
- func(len, s, data);
- ti_iter_destroy(iter);
- return 0;
-}
-
-const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; }
-
-/*******************
- * High-level APIs *
- *******************/
-
-tabix_t *ti_open(const char *fn, const char *fnidx)
-{
- tabix_t *t;
- BGZF *fp;
- if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
- t = calloc(1, sizeof(tabix_t));
- t->fn = strdup(fn);
- if (fnidx) t->fnidx = strdup(fnidx);
- t->fp = fp;
- return t;
-}
-
-void ti_close(tabix_t *t)
-{
- if (t) {
- _bgzf_close(t->fp);
- if (t->idx) ti_index_destroy(t->idx);
- free(t->fn); free(t->fnidx);
- free(t);
- }
-}
-
-int ti_lazy_index_load(tabix_t *t)
-{
- if (t->idx == 0) { // load index
- if (t->fnidx) t->idx = ti_index_load_local(t->fnidx);
- else t->idx = ti_index_load(t->fn);
- if (t->idx == 0) return -1; // fail to load index
- }
- return 0;
-}
-
-ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end)
-{
- if (tid < 0) return ti_iter_first();
- if (ti_lazy_index_load(t) != 0) return 0;
- return ti_iter_query(t->idx, tid, beg, end);
-}
-
-ti_iter_t ti_querys(tabix_t *t, const char *reg)
-{
- int tid, beg, end;
- if (reg == 0) return ti_iter_first();
- if (ti_lazy_index_load(t) != 0) return 0;
- if (ti_parse_region(t->idx, reg, &tid, &beg, &end) < 0) return 0;
- return ti_iter_query(t->idx, tid, beg, end);
-}
-
-ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end)
-{
- int tid;
- if (name == 0) return ti_iter_first();
- // then need to load the index
- if (ti_lazy_index_load(t) != 0) return 0;
- if ((tid = ti_get_tid(t->idx, name)) < 0) return 0;
- return ti_iter_query(t->idx, tid, beg, end);
-}
-
-const char *ti_read(tabix_t *t, ti_iter_t iter, int *len)
-{
- return ti_iter_read(t->fp, iter, len);
-}
diff --git a/external/vcflib/tabixpp/khash.h b/external/vcflib/tabixpp/khash.h
deleted file mode 100644
index 1d583ef..0000000
--- a/external/vcflib/tabixpp/khash.h
+++ /dev/null
@@ -1,486 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-/*
- An example:
-
-#include "khash.h"
-KHASH_MAP_INIT_INT(32, char)
-int main() {
- int ret, is_missing;
- khiter_t k;
- khash_t(32) *h = kh_init(32);
- k = kh_put(32, h, 5, &ret);
- if (!ret) kh_del(32, h, k);
- kh_value(h, k) = 10;
- k = kh_get(32, h, 10);
- is_missing = (k == kh_end(h));
- k = kh_get(32, h, 5);
- kh_del(32, h, k);
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k)) kh_value(h, k) = 1;
- kh_destroy(32, h);
- return 0;
-}
-*/
-
-/*
- 2008-09-19 (0.2.3):
-
- * Corrected the example
- * Improved interfaces
-
- 2008-09-11 (0.2.2):
-
- * Improved speed a little in kh_put()
-
- 2008-09-10 (0.2.1):
-
- * Added kh_clear()
- * Fixed a compiling error
-
- 2008-09-02 (0.2.0):
-
- * Changed to token concatenation which increases flexibility.
-
- 2008-08-31 (0.1.2):
-
- * Fixed a bug in kh_get(), which has not been tested previously.
-
- 2008-08-31 (0.1.1):
-
- * Added destructor
-*/
-
-
-#ifndef __AC_KHASH_H
-#define __AC_KHASH_H
-
-/*!
- @header
-
- Generic hash table library.
-
- @copyright Heng Li
- */
-
-#define AC_VERSION_KHASH_H "0.2.2"
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-typedef uint32_t khint_t;
-typedef khint_t khiter_t;
-
-#define __ac_HASH_PRIME_SIZE 32
-static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
-{
- 0ul, 3ul, 11ul, 23ul, 53ul,
- 97ul, 193ul, 389ul, 769ul, 1543ul,
- 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
- 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
- 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
- 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
- 3221225473ul, 4294967291ul
-};
-
-#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
-#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
-#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
-#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
-#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
-#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
-#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
-
-static const double __ac_HASH_UPPER = 0.77;
-
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- typedef struct { \
- khint_t n_buckets, size, n_occupied, upper_bound; \
- uint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- static inline kh_##name##_t *kh_init_##name() { \
- return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
- } \
- static inline void kh_destroy_##name(kh_##name##_t *h) \
- { \
- if (h) { \
- free(h->keys); free(h->flags); \
- free(h->vals); \
- free(h); \
- } \
- } \
- static inline void kh_clear_##name(kh_##name##_t *h) \
- { \
- if (h && h->flags) { \
- memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
- h->size = h->n_occupied = 0; \
- } \
- } \
- static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
- { \
- if (h->n_buckets) { \
- khint_t inc, k, i, last; \
- k = __hash_func(key); i = k % h->n_buckets; \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
- if (i == last) return h->n_buckets; \
- } \
- return __ac_iseither(h->flags, i)? h->n_buckets : i; \
- } else return 0; \
- } \
- static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
- { \
- uint32_t *new_flags = 0; \
- khint_t j = 1; \
- { \
- khint_t t = __ac_HASH_PRIME_SIZE - 1; \
- while (__ac_prime_list[t] > new_n_buckets) --t; \
- new_n_buckets = __ac_prime_list[t+1]; \
- if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
- else { \
- new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
- memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
- if (h->n_buckets < new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- } \
- } \
- if (j) { \
- for (j = 0; j != h->n_buckets; ++j) { \
- if (__ac_iseither(h->flags, j) == 0) { \
- khkey_t key = h->keys[j]; \
- khval_t val; \
- if (kh_is_map) val = h->vals[j]; \
- __ac_set_isdel_true(h->flags, j); \
- while (1) { \
- khint_t inc, k, i; \
- k = __hash_func(key); \
- i = k % new_n_buckets; \
- inc = 1 + k % (new_n_buckets - 1); \
- while (!__ac_isempty(new_flags, i)) { \
- if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
- else i += inc; \
- } \
- __ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
- { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
- if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isdel_true(h->flags, i); \
- } else { \
- h->keys[i] = key; \
- if (kh_is_map) h->vals[i] = val; \
- break; \
- } \
- } \
- } \
- } \
- if (h->n_buckets > new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- free(h->flags); \
- h->flags = new_flags; \
- h->n_buckets = new_n_buckets; \
- h->n_occupied = h->size; \
- h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
- } \
- } \
- static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
- { \
- khint_t x; \
- if (h->n_occupied >= h->upper_bound) { \
- if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
- else kh_resize_##name(h, h->n_buckets + 1); \
- } \
- { \
- khint_t inc, k, i, site, last; \
- x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
- if (__ac_isempty(h->flags, i)) x = i; \
- else { \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (__ac_isdel(h->flags, i)) site = i; \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
- if (i == last) { x = site; break; } \
- } \
- if (x == h->n_buckets) { \
- if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
- else x = i; \
- } \
- } \
- } \
- if (__ac_isempty(h->flags, x)) { \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; ++h->n_occupied; \
- *ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; \
- *ret = 2; \
- } else *ret = 0; \
- return x; \
- } \
- static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
- { \
- if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
- __ac_set_isdel_true(h->flags, x); \
- --h->size; \
- } \
- }
-
-/* --- BEGIN OF HASH FUNCTIONS --- */
-
-/*! @function
- @abstract Integer hash function
- @param key The integer [uint32_t]
- @return The hash value [khint_t]
- */
-#define kh_int_hash_func(key) (uint32_t)(key)
-/*! @function
- @abstract Integer comparison function
- */
-#define kh_int_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract 64-bit integer hash function
- @param key The integer [uint64_t]
- @return The hash value [khint_t]
- */
-#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
-/*! @function
- @abstract 64-bit integer comparison function
- */
-#define kh_int64_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract const char* hash function
- @param s Pointer to a null terminated string
- @return The hash value
- */
-static inline khint_t __ac_X31_hash_string(const char *s)
-{
- khint_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return h;
-}
-/*! @function
- @abstract Another interface to const char* hash function
- @param key Pointer to a null terminated string [const char*]
- @return The hash value [khint_t]
- */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
-/*! @function
- @abstract Const char* comparison function
- */
-#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
-
-/* --- END OF HASH FUNCTIONS --- */
-
-/* Other necessary macros... */
-
-/*!
- @abstract Type of the hash table.
- @param name Name of the hash table [symbol]
- */
-#define khash_t(name) kh_##name##_t
-
-/*! @function
- @abstract Initiate a hash table.
- @param name Name of the hash table [symbol]
- @return Pointer to the hash table [khash_t(name)*]
- */
-#define kh_init(name) kh_init_##name()
-
-/*! @function
- @abstract Destroy a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_destroy(name, h) kh_destroy_##name(h)
-
-/*! @function
- @abstract Reset a hash table without deallocating memory.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_clear(name, h) kh_clear_##name(h)
-
-/*! @function
- @abstract Resize a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param s New size [khint_t]
- */
-#define kh_resize(name, h, s) kh_resize_##name(h, s)
-
-/*! @function
- @abstract Insert a key to the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @param r Extra return code: 0 if the key is present in the hash table;
- 1 if the bucket is empty (never used); 2 if the element in
- the bucket has been deleted [int*]
- @return Iterator to the inserted element [khint_t]
- */
-#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
-
-/*! @function
- @abstract Retrieve a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t]
- */
-#define kh_get(name, h, k) kh_get_##name(h, k)
-
-/*! @function
- @abstract Remove a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Iterator to the element to be deleted [khint_t]
- */
-#define kh_del(name, h, k) kh_del_##name(h, k)
-
-
-/*! @function
- @abstract Test whether a bucket contains data.
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return 1 if containing data; 0 otherwise [int]
- */
-#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
-
-/*! @function
- @abstract Get key given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Key [type of keys]
- */
-#define kh_key(h, x) ((h)->keys[x])
-
-/*! @function
- @abstract Get value given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Value [type of values]
- @discussion For hash sets, calling this results in segfault.
- */
-#define kh_val(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Alias of kh_val()
- */
-#define kh_value(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Get the start iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The start iterator [khint_t]
- */
-#define kh_begin(h) (khint_t)(0)
-
-/*! @function
- @abstract Get the end iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The end iterator [khint_t]
- */
-#define kh_end(h) ((h)->n_buckets)
-
-/*! @function
- @abstract Get the number of elements in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of elements in the hash table [khint_t]
- */
-#define kh_size(h) ((h)->size)
-
-/*! @function
- @abstract Get the number of buckets in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of buckets in the hash table [khint_t]
- */
-#define kh_n_buckets(h) ((h)->n_buckets)
-
-/* More conenient interfaces */
-
-/*! @function
- @abstract Instantiate a hash set containing integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
-
-typedef const char *kh_cstr_t;
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_STR(name) \
- KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_STR(name, khval_t) \
- KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
-
-#endif /* __AC_KHASH_H */
diff --git a/external/vcflib/tabixpp/knetfile.c b/external/vcflib/tabixpp/knetfile.c
deleted file mode 100644
index 7c96a3e..0000000
--- a/external/vcflib/tabixpp/knetfile.c
+++ /dev/null
@@ -1,632 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-/* Probably I will not do socket programming in the next few years and
- therefore I decide to heavily annotate this file, for Linux and
- Windows as well. -lh3 */
-
-#include <time.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/types.h>
-
-#ifdef _WIN32
-#include <winsock.h>
-#else
-#include <netdb.h>
-#include <arpa/inet.h>
-#include <sys/socket.h>
-#endif
-
-#include "knetfile.h"
-
-/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
- * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
- * integer -1. In knetfile.c, I use "int" for socket type
- * throughout. This should be improved to avoid confusion.
- *
- * In Linux/Mac, recv() and read() do almost the same thing. You can see
- * in the header file that netread() is simply an alias of read(). In
- * Windows, however, they are different and using recv() is mandatory.
- */
-
-/* This function tests if the file handler is ready for reading (or
- * writing if is_read==0). */
-static int socket_wait(int fd, int is_read)
-{
- fd_set fds, *fdr = 0, *fdw = 0;
- struct timeval tv;
- int ret;
- tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
- FD_ZERO(&fds);
- FD_SET(fd, &fds);
- if (is_read) fdr = &fds;
- else fdw = &fds;
- ret = select(fd+1, fdr, fdw, 0, &tv);
-#ifndef _WIN32
- if (ret == -1) perror("select");
-#else
- if (ret == 0)
- fprintf(stderr, "select time-out\n");
- else if (ret == SOCKET_ERROR)
- fprintf(stderr, "select: %d\n", WSAGetLastError());
-#endif
- return ret;
-}
-
-#ifndef _WIN32
-/* This function does not work with Windows due to the lack of
- * getaddrinfo() in winsock. It is addapted from an example in "Beej's
- * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
-static int socket_connect(const char *host, const char *port)
-{
-#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
-
- int on = 1, fd;
- struct linger lng = { 0, 0 };
- struct addrinfo hints, *res;
- memset(&hints, 0, sizeof(struct addrinfo));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
- /* In Unix/Mac, getaddrinfo() is the most convenient way to get
- * server information. */
- if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
- if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
- /* The following two setsockopt() are used by ftplib
- * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
- * necessary. */
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
- if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
- if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
- freeaddrinfo(res);
- return fd;
-}
-#else
-/* MinGW's printf has problem with "%lld" */
-char *int64tostr(char *buf, int64_t x)
-{
- int cnt;
- int i = 0;
- do {
- buf[i++] = '0' + x % 10;
- x /= 10;
- } while (x);
- buf[i] = 0;
- for (cnt = i, i = 0; i < cnt/2; ++i) {
- int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
- }
- return buf;
-}
-
-int64_t strtoint64(const char *buf)
-{
- int64_t x;
- for (x = 0; *buf != '\0'; ++buf)
- x = x * 10 + ((int64_t) *buf - 48);
- return x;
-}
-/* In windows, the first thing is to establish the TCP connection. */
-int knet_win32_init()
-{
- WSADATA wsaData;
- return WSAStartup(MAKEWORD(2, 2), &wsaData);
-}
-void knet_win32_destroy()
-{
- WSACleanup();
-}
-/* A slightly modfied version of the following function also works on
- * Mac (and presummably Linux). However, this function is not stable on
- * my Mac. It sometimes works fine but sometimes does not. Therefore for
- * non-Windows OS, I do not use this one. */
-static SOCKET socket_connect(const char *host, const char *port)
-{
-#define __err_connect(func) \
- do { \
- fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
- return -1; \
- } while (0)
-
- int on = 1;
- SOCKET fd;
- struct linger lng = { 0, 0 };
- struct sockaddr_in server;
- struct hostent *hp = 0;
- // open socket
- if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
- if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
- // get host info
- if (isalpha(host[0])) hp = gethostbyname(host);
- else {
- struct in_addr addr;
- addr.s_addr = inet_addr(host);
- hp = gethostbyaddr((char*)&addr, 4, AF_INET);
- }
- if (hp == 0) __err_connect("gethost");
- // connect
- server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
- server.sin_family= AF_INET;
- server.sin_port = htons(atoi(port));
- if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
- // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
- return fd;
-}
-#endif
-
-static off_t my_netread(int fd, void *buf, off_t len)
-{
- off_t rest = len, curr, l = 0;
- /* recv() and read() may not read the required length of data with
- * one call. They have to be called repeatedly. */
- while (rest) {
- if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
- curr = netread(fd, buf + l, rest);
- /* According to the glibc manual, section 13.2, a zero returned
- * value indicates end-of-file (EOF), which should mean that
- * read() will not return zero if EOF has not been met but data
- * are not immediately available. */
- if (curr == 0) break;
- l += curr; rest -= curr;
- }
- return l;
-}
-
-/*************************
- * FTP specific routines *
- *************************/
-
-static int kftp_get_response(knetFile *ftp)
-{
-#ifndef _WIN32
- unsigned char c;
-#else
- char c;
-#endif
- int n = 0;
- char *p;
- if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
- while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
- //fputc(c, stderr);
- if (n >= ftp->max_response) {
- ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
- ftp->response = realloc(ftp->response, ftp->max_response);
- }
- ftp->response[n++] = c;
- if (c == '\n') {
- if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
- && ftp->response[3] != '-') break;
- n = 0;
- continue;
- }
- }
- if (n < 2) return -1;
- ftp->response[n-2] = 0;
- return strtol(ftp->response, &p, 0);
-}
-
-static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
-{
- if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
- netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
- return is_get? kftp_get_response(ftp) : 0;
-}
-
-static int kftp_pasv_prep(knetFile *ftp)
-{
- char *p;
- int v[6];
- kftp_send_cmd(ftp, "PASV\r\n", 1);
- for (p = ftp->response; *p && *p != '('; ++p);
- if (*p != '(') return -1;
- ++p;
- sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
- memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
- ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
- return 0;
-}
-
-
-static int kftp_pasv_connect(knetFile *ftp)
-{
- char host[80], port[10];
- if (ftp->pasv_port == 0) {
- fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
- return -1;
- }
- sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
- sprintf(port, "%d", ftp->pasv_port);
- ftp->fd = socket_connect(host, port);
- if (ftp->fd == -1) return -1;
- return 0;
-}
-
-int kftp_connect(knetFile *ftp)
-{
- ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
- if (ftp->ctrl_fd == -1) return -1;
- kftp_get_response(ftp);
- kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
- kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
- kftp_send_cmd(ftp, "TYPE I\r\n", 1);
- return 0;
-}
-
-int kftp_reconnect(knetFile *ftp)
-{
- if (ftp->ctrl_fd != -1) {
- netclose(ftp->ctrl_fd);
- ftp->ctrl_fd = -1;
- }
- netclose(ftp->fd);
- ftp->fd = -1;
- return kftp_connect(ftp);
-}
-
-// initialize ->type, ->host, ->retr and ->size
-knetFile *kftp_parse_url(const char *fn, const char *mode)
-{
- knetFile *fp;
- char *p;
- int l;
- if (strstr(fn, "ftp://") != fn) return 0;
- for (p = (char*)fn + 6; *p && *p != '/'; ++p);
- if (*p != '/') return 0;
- l = p - fn - 6;
- fp = calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_FTP;
- fp->fd = -1;
- /* the Linux/Mac version of socket_connect() also recognizes a port
- * like "ftp", but the Windows version does not. */
- fp->port = strdup("21");
- fp->host = calloc(l + 1, 1);
- if (strchr(mode, 'c')) fp->no_reconnect = 1;
- strncpy(fp->host, fn + 6, l);
- fp->retr = calloc(strlen(p) + 8, 1);
- sprintf(fp->retr, "RETR %s\r\n", p);
- fp->size_cmd = calloc(strlen(p) + 8, 1);
- sprintf(fp->size_cmd, "SIZE %s\r\n", p);
- fp->seek_offset = 0;
- return fp;
-}
-// place ->fd at offset off
-int kftp_connect_file(knetFile *fp)
-{
- int ret;
- long long file_size;
- if (fp->fd != -1) {
- netclose(fp->fd);
- if (fp->no_reconnect) kftp_get_response(fp);
- }
- kftp_pasv_prep(fp);
- kftp_send_cmd(fp, fp->size_cmd, 1);
-#ifndef _WIN32
- if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
- {
- fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
- return -1;
- }
-#else
- const char *p = fp->response;
- while (*p != ' ') ++p;
- while (*p < '0' || *p > '9') ++p;
- file_size = strtoint64(p);
-#endif
- fp->file_size = file_size;
- if (fp->offset>=0) {
- char tmp[32];
-#ifndef _WIN32
- sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
-#else
- strcpy(tmp, "REST ");
- int64tostr(tmp + 5, fp->offset);
- strcat(tmp, "\r\n");
-#endif
- kftp_send_cmd(fp, tmp, 1);
- }
- kftp_send_cmd(fp, fp->retr, 0);
- kftp_pasv_connect(fp);
- ret = kftp_get_response(fp);
- if (ret != 150) {
- fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- fp->is_ready = 1;
- return 0;
-}
-
-
-/**************************
- * HTTP specific routines *
- **************************/
-
-knetFile *khttp_parse_url(const char *fn, const char *mode)
-{
- knetFile *fp;
- char *p, *proxy, *q;
- int l;
- if (strstr(fn, "http://") != fn) return 0;
- // set ->http_host
- for (p = (char*)fn + 7; *p && *p != '/'; ++p);
- l = p - fn - 7;
- fp = calloc(1, sizeof(knetFile));
- fp->http_host = calloc(l + 1, 1);
- strncpy(fp->http_host, fn + 7, l);
- fp->http_host[l] = 0;
- for (q = fp->http_host; *q && *q != ':'; ++q);
- if (*q == ':') *q++ = 0;
- // get http_proxy
- proxy = getenv("http_proxy");
- // set ->host, ->port and ->path
- if (proxy == 0) {
- fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
- fp->port = strdup(*q? q : "80");
- fp->path = strdup(*p? p : "/");
- } else {
- fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
- for (q = fp->host; *q && *q != ':'; ++q);
- if (*q == ':') *q++ = 0;
- fp->port = strdup(*q? q : "80");
- fp->path = strdup(fn);
- }
- fp->type = KNF_TYPE_HTTP;
- fp->ctrl_fd = fp->fd = -1;
- fp->seek_offset = 0;
- return fp;
-}
-
-int khttp_connect_file(knetFile *fp)
-{
- int ret, l = 0;
- char *buf, *p;
- if (fp->fd != -1) netclose(fp->fd);
- fp->fd = socket_connect(fp->host, fp->port);
- buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
- l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
- l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
- l += sprintf(buf + l, "\r\n");
- netwrite(fp->fd, buf, l);
- l = 0;
- while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
- if (buf[l] == '\n' && l >= 3)
- if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
- ++l;
- }
- buf[l] = 0;
- if (l < 14) { // prematured header
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- ret = strtol(buf + 8, &p, 0); // HTTP return code
- if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
- off_t rest = fp->offset;
- while (rest) {
- off_t l = rest < 0x10000? rest : 0x10000;
- rest -= my_netread(fp->fd, buf, l);
- }
- } else if (ret != 206 && ret != 200) {
- free(buf);
- fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- free(buf);
- fp->is_ready = 1;
- return 0;
-}
-
-/********************
- * Generic routines *
- ********************/
-
-knetFile *knet_open(const char *fn, const char *mode)
-{
- knetFile *fp = 0;
- if (mode[0] != 'r') {
- fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
- return 0;
- }
- if (strstr(fn, "ftp://") == fn) {
- fp = kftp_parse_url(fn, mode);
- if (fp == 0) return 0;
- if (kftp_connect(fp) == -1) {
- knet_close(fp);
- return 0;
- }
- kftp_connect_file(fp);
- } else if (strstr(fn, "http://") == fn) {
- fp = khttp_parse_url(fn, mode);
- if (fp == 0) return 0;
- khttp_connect_file(fp);
- } else { // local file
-#ifdef _WIN32
- /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
- * be undefined on some systems, although it is defined on my
- * Mac and the Linux I have tested on. */
- int fd = open(fn, O_RDONLY | O_BINARY);
-#else
- int fd = open(fn, O_RDONLY);
-#endif
- if (fd == -1) {
- perror("open");
- return 0;
- }
- fp = (knetFile*)calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_LOCAL;
- fp->fd = fd;
- fp->ctrl_fd = -1;
- }
- if (fp && fp->fd == -1) {
- knet_close(fp);
- return 0;
- }
- return fp;
-}
-
-knetFile *knet_dopen(int fd, const char *mode)
-{
- knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_LOCAL;
- fp->fd = fd;
- return fp;
-}
-
-off_t knet_read(knetFile *fp, void *buf, off_t len)
-{
- off_t l = 0;
- if (fp->fd == -1) return 0;
- if (fp->type == KNF_TYPE_FTP) {
- if (fp->is_ready == 0) {
- if (!fp->no_reconnect) kftp_reconnect(fp);
- kftp_connect_file(fp);
- }
- } else if (fp->type == KNF_TYPE_HTTP) {
- if (fp->is_ready == 0)
- khttp_connect_file(fp);
- }
- if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
- off_t rest = len, curr;
- while (rest) {
- curr = read(fp->fd, buf + l, rest);
- if (curr == 0) break;
- l += curr; rest -= curr;
- }
- } else l = my_netread(fp->fd, buf, len);
- fp->offset += l;
- return l;
-}
-
-off_t knet_seek(knetFile *fp, int64_t off, int whence)
-{
- if (whence == SEEK_SET && off == fp->offset) return 0;
- if (fp->type == KNF_TYPE_LOCAL) {
- /* Be aware that lseek() returns the offset after seeking,
- * while fseek() returns zero on success. */
- off_t offset = lseek(fp->fd, off, whence);
- if (offset == -1) {
- // Be silent, it is OK for knet_seek to fail when the file is streamed
- // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
- return -1;
- }
- fp->offset = offset;
- return 0;
- }
- else if (fp->type == KNF_TYPE_FTP)
- {
- if (whence==SEEK_CUR)
- fp->offset += off;
- else if (whence==SEEK_SET)
- fp->offset = off;
- else if ( whence==SEEK_END)
- fp->offset = fp->file_size+off;
- fp->is_ready = 0;
- return 0;
- }
- else if (fp->type == KNF_TYPE_HTTP)
- {
- if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
- fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
- errno = ESPIPE;
- return -1;
- }
- if (whence==SEEK_CUR)
- fp->offset += off;
- else if (whence==SEEK_SET)
- fp->offset = off;
- fp->is_ready = 0;
- return fp->offset;
- }
- errno = EINVAL;
- fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
- return -1;
-}
-
-int knet_close(knetFile *fp)
-{
- if (fp == 0) return 0;
- if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
- if (fp->fd != -1) {
- /* On Linux/Mac, netclose() is an alias of close(), but on
- * Windows, it is an alias of closesocket(). */
- if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
- else netclose(fp->fd);
- }
- free(fp->host); free(fp->port);
- free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
- free(fp->path); free(fp->http_host); // HTTP specific
- free(fp);
- return 0;
-}
-
-#ifdef KNETFILE_MAIN
-int main(void)
-{
- char *buf;
- knetFile *fp;
- int type = 4, l;
-#ifdef _WIN32
- knet_win32_init();
-#endif
- buf = calloc(0x100000, 1);
- if (type == 0) {
- fp = knet_open("knetfile.c", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 1) { // NCBI FTP, large file
- fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
- knet_seek(fp, 2500000000ll, SEEK_SET);
- l = knet_read(fp, buf, 255);
- } else if (type == 2) {
- fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 3) {
- fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 4) {
- fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
- knet_read(fp, buf, 10000);
- knet_seek(fp, 20000, SEEK_SET);
- knet_seek(fp, 10000, SEEK_SET);
- l = knet_read(fp, buf+10000, 10000000) + 10000;
- }
- if (type != 4 && type != 1) {
- knet_read(fp, buf, 255);
- buf[255] = 0;
- printf("%s\n", buf);
- } else write(fileno(stdout), buf, l);
- knet_close(fp);
- free(buf);
- return 0;
-}
-#endif
diff --git a/external/vcflib/tabixpp/knetfile.h b/external/vcflib/tabixpp/knetfile.h
deleted file mode 100644
index 0a0e66f..0000000
--- a/external/vcflib/tabixpp/knetfile.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef KNETFILE_H
-#define KNETFILE_H
-
-#include <stdint.h>
-#include <fcntl.h>
-
-#ifndef _WIN32
-#define netread(fd, ptr, len) read(fd, ptr, len)
-#define netwrite(fd, ptr, len) write(fd, ptr, len)
-#define netclose(fd) close(fd)
-#else
-#include <winsock2.h>
-#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
-#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
-#define netclose(fd) closesocket(fd)
-#endif
-
-// FIXME: currently I/O is unbuffered
-
-#define KNF_TYPE_LOCAL 1
-#define KNF_TYPE_FTP 2
-#define KNF_TYPE_HTTP 3
-
-typedef struct knetFile_s {
- int type, fd;
- int64_t offset;
- char *host, *port;
-
- // the following are for FTP only
- int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
- char *response, *retr, *size_cmd;
- int64_t seek_offset; // for lazy seek
- int64_t file_size;
-
- // the following are for HTTP only
- char *path, *http_host;
-} knetFile;
-
-#define knet_tell(fp) ((fp)->offset)
-#define knet_fileno(fp) ((fp)->fd)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _WIN32
- int knet_win32_init();
- void knet_win32_destroy();
-#endif
-
- knetFile *knet_open(const char *fn, const char *mode);
-
- /*
- This only works with local files.
- */
- knetFile *knet_dopen(int fd, const char *mode);
-
- /*
- If ->is_ready==0, this routine updates ->fd; otherwise, it simply
- reads from ->fd.
- */
- off_t knet_read(knetFile *fp, void *buf, off_t len);
-
- /*
- This routine only sets ->offset and ->is_ready=0. It does not
- communicate with the FTP server.
- */
- off_t knet_seek(knetFile *fp, int64_t off, int whence);
- int knet_close(knetFile *fp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/external/vcflib/tabixpp/kseq.h b/external/vcflib/tabixpp/kseq.h
deleted file mode 100644
index 82face0..0000000
--- a/external/vcflib/tabixpp/kseq.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-/*
- 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
- */
-
-/* Last Modified: 12APR2009 */
-
-#ifndef AC_KSEQ_H
-#define AC_KSEQ_H
-
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-
-#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
-#define KS_SEP_TAB 1 // isspace() && !' '
-#define KS_SEP_MAX 1
-
-#define __KS_TYPE(type_t) \
- typedef struct __kstream_t { \
- unsigned char *buf; \
- int begin, end, is_eof; \
- type_t f; \
- } kstream_t;
-
-#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
-#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-
-#define __KS_BASIC(type_t, __bufsize) \
- static inline kstream_t *ks_init(type_t f) \
- { \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
- ks->f = f; \
- ks->buf = malloc(__bufsize); \
- return ks; \
- } \
- static inline void ks_destroy(kstream_t *ks) \
- { \
- if (ks) { \
- free(ks->buf); \
- free(ks); \
- } \
- }
-
-#define __KS_GETC(__read, __bufsize) \
- static inline int ks_getc(kstream_t *ks) \
- { \
- if (ks->is_eof && ks->begin >= ks->end) return -1; \
- if (ks->begin >= ks->end) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
- } \
- return (int)ks->buf[ks->begin++]; \
- }
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
- { \
- if (dret) *dret = 0; \
- str->l = 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
- for (;;) { \
- int i; \
- if (ks->begin >= ks->end) { \
- if (!ks->is_eof) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
- } else break; \
- } \
- if (delimiter > KS_SEP_MAX) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == delimiter) break; \
- } else if (delimiter == KS_SEP_SPACE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i])) break; \
- } else if (delimiter == KS_SEP_TAB) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
- } else i = 0; /* never come to here! */ \
- if (str->m - str->l < i - ks->begin + 1) { \
- str->m = str->l + (i - ks->begin) + 1; \
- kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
- } \
- memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
- str->l = str->l + (i - ks->begin); \
- ks->begin = i + 1; \
- if (i < ks->end) { \
- if (dret) *dret = ks->buf[i]; \
- break; \
- } \
- } \
- if (str->l == 0) { \
- str->m = 1; \
- str->s = (char*)calloc(1, 1); \
- } \
- str->s[str->l] = '\0'; \
- return str->l; \
- }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
- __KS_TYPE(type_t) \
- __KS_BASIC(type_t, __bufsize) \
- __KS_GETC(__read, __bufsize) \
- __KS_GETUNTIL(__read, __bufsize)
-
-#define __KSEQ_BASIC(type_t) \
- static inline kseq_t *kseq_init(type_t fd) \
- { \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
- s->f = ks_init(fd); \
- return s; \
- } \
- static inline void kseq_rewind(kseq_t *ks) \
- { \
- ks->last_char = 0; \
- ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
- } \
- static inline void kseq_destroy(kseq_t *ks) \
- { \
- if (!ks) return; \
- free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
- ks_destroy(ks->f); \
- free(ks); \
- }
-
-/* Return value:
- >=0 length of the sequence (normal)
- -1 end-of-file
- -2 truncated quality string
- */
-#define __KSEQ_READ \
- static int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
- if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* the first header char has been read */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (isgraph(c)) { /* printable non-space character */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l++] = (char)c; \
- } \
- } \
- if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
- while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* we should not stop here */ \
- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
- seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
- return seq->seq.l; \
- }
-
-#define __KSEQ_TYPE(type_t) \
- typedef struct { \
- kstring_t name, comment, seq, qual; \
- int last_char; \
- kstream_t *f; \
- } kseq_t;
-
-#define KSEQ_INIT(type_t, __read) \
- KSTREAM_INIT(type_t, __read, 4096) \
- __KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(type_t) \
- __KSEQ_READ
-
-#endif
diff --git a/external/vcflib/tabixpp/ksort.h b/external/vcflib/tabixpp/ksort.h
deleted file mode 100644
index eb46f28..0000000
--- a/external/vcflib/tabixpp/ksort.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-/*
- 2008-11-16 (0.1.4):
-
- * Fixed a bug in introsort() that happens in rare cases.
-
- 2008-11-05 (0.1.3):
-
- * Fixed a bug in introsort() for complex comparisons.
-
- * Fixed a bug in mergesort(). The previous version is not stable.
-
- 2008-09-15 (0.1.2):
-
- * Accelerated introsort. On my Mac (not on another Linux machine),
- my implementation is as fast as std::sort on random input.
-
- * Added combsort and in introsort, switch to combsort if the
- recursion is too deep.
-
- 2008-09-13 (0.1.1):
-
- * Added k-small algorithm
-
- 2008-09-05 (0.1.0):
-
- * Initial version
-
-*/
-
-#ifndef AC_KSORT_H
-#define AC_KSORT_H
-
-#include <stdlib.h>
-#include <string.h>
-
-typedef struct {
- void *left, *right;
- int depth;
-} ks_isort_stack_t;
-
-#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
-
-#define KSORT_INIT(name, type_t, __sort_lt) \
- void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
- { \
- type_t *a2[2], *a, *b; \
- int curr, shift; \
- \
- a2[0] = array; \
- a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
- for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
- a = a2[curr]; b = a2[1-curr]; \
- if (shift == 0) { \
- type_t *p = b, *i, *eb = a + n; \
- for (i = a; i < eb; i += 2) { \
- if (i == eb - 1) *p++ = *i; \
- else { \
- if (__sort_lt(*(i+1), *i)) { \
- *p++ = *(i+1); *p++ = *i; \
- } else { \
- *p++ = *i; *p++ = *(i+1); \
- } \
- } \
- } \
- } else { \
- size_t i, step = 1ul<<shift; \
- for (i = 0; i < n; i += step<<1) { \
- type_t *p, *j, *k, *ea, *eb; \
- if (n < i + step) { \
- ea = a + n; eb = a; \
- } else { \
- ea = a + i + step; \
- eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
- } \
- j = a + i; k = a + i + step; p = b + i; \
- while (j < ea && k < eb) { \
- if (__sort_lt(*k, *j)) *p++ = *k++; \
- else *p++ = *j++; \
- } \
- while (j < ea) *p++ = *j++; \
- while (k < eb) *p++ = *k++; \
- } \
- } \
- curr = 1 - curr; \
- } \
- if (curr == 1) { \
- type_t *p = a2[0], *i = a2[1], *eb = array + n; \
- for (; p < eb; ++i) *p++ = *i; \
- } \
- if (temp == 0) free(a2[1]); \
- } \
- void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
- { \
- size_t k = i; \
- type_t tmp = l[i]; \
- while ((k = (k << 1) + 1) < n) { \
- if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
- if (__sort_lt(l[k], tmp)) break; \
- l[i] = l[k]; i = k; \
- } \
- l[i] = tmp; \
- } \
- void ks_heapmake_##name(size_t lsize, type_t l[]) \
- { \
- size_t i; \
- for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
- ks_heapadjust_##name(i, lsize, l); \
- } \
- void ks_heapsort_##name(size_t lsize, type_t l[]) \
- { \
- size_t i; \
- for (i = lsize - 1; i > 0; --i) { \
- type_t tmp; \
- tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
- } \
- } \
- void __ks_insertsort_##name(type_t *s, type_t *t) \
- { \
- type_t *i, *j, swap_tmp; \
- for (i = s + 1; i < t; ++i) \
- for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
- swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
- } \
- } \
- void ks_combsort_##name(size_t n, type_t a[]) \
- { \
- const double shrink_factor = 1.2473309501039786540366528676643; \
- int do_swap; \
- size_t gap = n; \
- type_t tmp, *i, *j; \
- do { \
- if (gap > 2) { \
- gap = (size_t)(gap / shrink_factor); \
- if (gap == 9 || gap == 10) gap = 11; \
- } \
- do_swap = 0; \
- for (i = a; i < a + n - gap; ++i) { \
- j = i + gap; \
- if (__sort_lt(*j, *i)) { \
- tmp = *i; *i = *j; *j = tmp; \
- do_swap = 1; \
- } \
- } \
- } while (do_swap || gap > 2); \
- if (gap != 1) __ks_insertsort_##name(a, a + n); \
- } \
- void ks_introsort_##name(size_t n, type_t a[]) \
- { \
- int d; \
- ks_isort_stack_t *top, *stack; \
- type_t rp, swap_tmp; \
- type_t *s, *t, *i, *j, *k; \
- \
- if (n < 1) return; \
- else if (n == 2) { \
- if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
- return; \
- } \
- for (d = 2; 1ul<<d < n; ++d); \
- stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
- top = stack; s = a; t = a + (n-1); d <<= 1; \
- while (1) { \
- if (s < t) { \
- if (--d == 0) { \
- ks_combsort_##name(t - s + 1, s); \
- t = s; \
- continue; \
- } \
- i = s; j = t; k = i + ((j-i)>>1) + 1; \
- if (__sort_lt(*k, *i)) { \
- if (__sort_lt(*k, *j)) k = j; \
- } else k = __sort_lt(*j, *i)? i : j; \
- rp = *k; \
- if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
- for (;;) { \
- do ++i; while (__sort_lt(*i, rp)); \
- do --j; while (i <= j && __sort_lt(rp, *j)); \
- if (j <= i) break; \
- swap_tmp = *i; *i = *j; *j = swap_tmp; \
- } \
- swap_tmp = *i; *i = *t; *t = swap_tmp; \
- if (i-s > t-i) { \
- if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
- s = t-i > 16? i+1 : t; \
- } else { \
- if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
- t = i-s > 16? i-1 : s; \
- } \
- } else { \
- if (top == stack) { \
- free(stack); \
- __ks_insertsort_##name(a, a+n); \
- return; \
- } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
- } \
- } \
- } \
- /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
- /* 0 <= kk < n */ \
- type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
- { \
- type_t *low, *high, *k, *ll, *hh, *mid; \
- low = arr; high = arr + n - 1; k = arr + kk; \
- for (;;) { \
- if (high <= low) return *k; \
- if (high == low + 1) { \
- if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
- return *k; \
- } \
- mid = low + (high - low) / 2; \
- if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
- if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
- if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
- KSORT_SWAP(type_t, *mid, *(low+1)); \
- ll = low + 1; hh = high; \
- for (;;) { \
- do ++ll; while (__sort_lt(*ll, *low)); \
- do --hh; while (__sort_lt(*low, *hh)); \
- if (hh < ll) break; \
- KSORT_SWAP(type_t, *ll, *hh); \
- } \
- KSORT_SWAP(type_t, *low, *hh); \
- if (hh <= k) low = ll; \
- if (hh >= k) high = hh - 1; \
- } \
- }
-
-#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
-#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
-#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
-#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
-#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
-#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
-#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
-
-#define ks_lt_generic(a, b) ((a) < (b))
-#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
-
-typedef const char *ksstr_t;
-
-#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
-#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
-
-#endif
diff --git a/external/vcflib/tabixpp/kstring.c b/external/vcflib/tabixpp/kstring.c
deleted file mode 100644
index e0203fa..0000000
--- a/external/vcflib/tabixpp/kstring.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include <stdarg.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdint.h>
-#include "kstring.h"
-
-int ksprintf(kstring_t *s, const char *fmt, ...)
-{
- va_list ap;
- int l;
- va_start(ap, fmt);
- l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
- va_end(ap);
- if (l + 1 > s->m - s->l) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- va_start(ap, fmt);
- l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
- }
- va_end(ap);
- s->l += l;
- return l;
-}
-
-// s MUST BE a null terminated string; l = strlen(s)
-int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
-{
- int i, n, max, last_char, last_start, *offsets, l;
- n = 0; max = *_max; offsets = *_offsets;
- l = strlen(s);
-
-#define __ksplit_aux do { \
- if (_offsets) { \
- s[i] = 0; \
- if (n == max) { \
- max = max? max<<1 : 2; \
- offsets = (int*)realloc(offsets, sizeof(int) * max); \
- } \
- offsets[n++] = last_start; \
- } else ++n; \
- } while (0)
-
- for (i = 0, last_char = last_start = 0; i <= l; ++i) {
- if (delimiter == 0) {
- if (isspace(s[i]) || s[i] == 0) {
- if (isgraph(last_char)) __ksplit_aux; // the end of a field
- } else {
- if (isspace(last_char) || last_char == 0) last_start = i;
- }
- } else {
- if (s[i] == delimiter || s[i] == 0) {
- if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
- } else {
- if (last_char == delimiter || last_char == 0) last_start = i;
- }
- }
- last_char = s[i];
- }
- *_max = max; *_offsets = offsets;
- return n;
-}
-
-/**********************
- * Boyer-Moore search *
- **********************/
-
-// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
-int *ksBM_prep(const uint8_t *pat, int m)
-{
- int i, *suff, *prep, *bmGs, *bmBc;
- prep = calloc(m + 256, 1);
- bmGs = prep; bmBc = prep + m;
- { // preBmBc()
- for (i = 0; i < 256; ++i) bmBc[i] = m;
- for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
- }
- suff = calloc(m, sizeof(int));
- { // suffixes()
- int f = 0, g;
- suff[m - 1] = m;
- g = m - 1;
- for (i = m - 2; i >= 0; --i) {
- if (i > g && suff[i + m - 1 - f] < i - g)
- suff[i] = suff[i + m - 1 - f];
- else {
- if (i < g) g = i;
- f = i;
- while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
- suff[i] = f - g;
- }
- }
- }
- { // preBmGs()
- int j = 0;
- for (i = 0; i < m; ++i) bmGs[i] = m;
- for (i = m - 1; i >= 0; --i)
- if (suff[i] == i + 1)
- for (; j < m - 1 - i; ++j)
- if (bmGs[j] == m)
- bmGs[j] = m - 1 - i;
- for (i = 0; i <= m - 2; ++i)
- bmGs[m - 1 - suff[i]] = m - 1 - i;
- }
- free(suff);
- return prep;
-}
-
-int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches)
-{
- int i, j, *prep, *bmGs, *bmBc;
- int *matches = 0, mm = 0, nm = 0;
- prep = _prep? _prep : ksBM_prep(pat, m);
- bmGs = prep; bmBc = prep + m;
- j = 0;
- while (j <= n - m) {
- for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
- if (i < 0) {
- if (nm == mm) {
- mm = mm? mm<<1 : 1;
- matches = realloc(matches, mm * sizeof(int));
- }
- matches[nm++] = j;
- j += bmGs[0];
- } else {
- int max = bmBc[str[i+j]] - m + 1 + i;
- if (max < bmGs[i]) max = bmGs[i];
- j += max;
- }
- }
- *n_matches = nm;
- if (_prep == 0) free(prep);
- return matches;
-}
-
-#ifdef KSTRING_MAIN
-#include <stdio.h>
-int main()
-{
- kstring_t *s;
- int *fields, n, i;
- s = (kstring_t*)calloc(1, sizeof(kstring_t));
- // test ksprintf()
- ksprintf(s, " abcdefg: %d ", 100);
- printf("'%s'\n", s->s);
- // test ksplit()
- fields = ksplit(s, 0, &n);
- for (i = 0; i < n; ++i)
- printf("field[%d] = '%s'\n", i, s->s + fields[i]);
- free(s);
-
- {
- static char *str = "abcdefgcdg";
- static char *pat = "cd";
- int n, *matches;
- matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n);
- printf("%d: \n", n);
- for (i = 0; i < n; ++i)
- printf("- %d\n", matches[i]);
- free(matches);
- }
- return 0;
-}
-#endif
diff --git a/external/vcflib/tabixpp/kstring.h b/external/vcflib/tabixpp/kstring.h
deleted file mode 100644
index f4e5a99..0000000
--- a/external/vcflib/tabixpp/kstring.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef KSTRING_H
-#define KSTRING_H
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-int ksprintf(kstring_t *s, const char *fmt, ...);
-int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
-
-// calculate the auxiliary array, allocated by calloc()
-int *ksBM_prep(const uint8_t *pat, int m);
-
-/* Search pat in str and returned the list of matches. The size of the
- * list is returned as n_matches. _prep is the array returned by
- * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */
-int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches);
-
-static inline int kputsn(const char *p, int l, kstring_t *s)
-{
- if (s->l + l + 1 >= s->m) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- strncpy(s->s + s->l, p, l);
- s->l += l;
- s->s[s->l] = 0;
- return l;
-}
-
-static inline int kputs(const char *p, kstring_t *s)
-{
- return kputsn(p, strlen(p), s);
-}
-
-static inline int kputc(int c, kstring_t *s)
-{
- if (s->l + 1 >= s->m) {
- s->m = s->l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- s->s[s->l++] = c;
- s->s[s->l] = 0;
- return c;
-}
-
-static inline int *ksplit(kstring_t *s, int delimiter, int *n)
-{
- int max = 0, *offsets = 0;
- *n = ksplit_core(s->s, delimiter, &max, &offsets);
- return offsets;
-}
-
-#endif
diff --git a/external/vcflib/tabixpp/main.c b/external/vcflib/tabixpp/main.c
deleted file mode 100644
index 792c8a1..0000000
--- a/external/vcflib/tabixpp/main.c
+++ /dev/null
@@ -1,290 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <errno.h>
-#include "bgzf.h"
-#include "tabix.h"
-
-#define PACKAGE_VERSION "0.2.5 (r964)"
-
-#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
-
-int reheader_file(const char *header, const char *file, int meta)
-{
- BGZF *fp = _bgzf_open(file,"r");
- if (_bgzf_read_block(fp) != 0 || !fp->block_length)
- return -1;
-
- char *buffer = fp->uncompressed_block;
- int skip_until = 0;
-
- if ( buffer[0]==meta )
- {
- skip_until = 1;
-
- // Skip the header
- while (1)
- {
- if ( buffer[skip_until]=='\n' )
- {
- skip_until++;
- if ( skip_until>=fp->block_length )
- {
- if (_bgzf_read_block(fp) != 0 || !fp->block_length)
- error("no body?\n");
- skip_until = 0;
- }
- // The header has finished
- if ( buffer[skip_until]!=meta ) break;
- }
- skip_until++;
- if ( skip_until>=fp->block_length )
- {
- if (_bgzf_read_block(fp) != 0 || !fp->block_length)
- error("no body?\n");
- skip_until = 0;
- }
- }
- }
-
- FILE *fh = fopen(header,"r");
- if ( !fh )
- error("%s: %s", header,strerror(errno));
- int page_size = getpagesize();
- char *buf = valloc(page_size);
- BGZF *bgzf_out = _bgzf_fdopen(fileno(stdout), "w");
- ssize_t nread;
- while ( (nread=fread(buf,1,page_size-1,fh))>0 )
- {
- if ( nread<page_size-1 && buf[nread-1]!='\n' )
- buf[nread++] = '\n';
- if (_bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
- }
- fclose(fh);
-
- if ( fp->block_length - skip_until > 0 )
- {
- if (_bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
- error("Error: %s\n",fp->error);
- }
- if (_bgzf_flush(bgzf_out) < 0)
- error("Error: %s\n",bgzf_out->error);
-
- while (1)
- {
-#ifdef _USE_KNETFILE
- nread = knet_read(fp->x.fpr, buf, page_size);
-#else
- nread = fread(buf, 1, page_size, fp->file);
-#endif
- if ( nread<=0 )
- break;
-
-#ifdef _USE_KNETFILE
- int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
-#else
- int count = fwrite(buf, 1, nread, bgzf_out->file);
-#endif
- if (count != nread)
- error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
- }
-
- if (_bgzf_close(bgzf_out) < 0)
- error("Error: %s\n",bgzf_out->error);
-
- return 0;
-}
-
-
-int main(int argc, char *argv[])
-{
- int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
- ti_conf_t conf = ti_conf_gff;
- const char *reheader = NULL;
- while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
- switch (c) {
- case 'B': bed_reg = 1; break;
- case '0': conf.preset |= TI_FLAG_UCSC; break;
- case 'S': skip = atoi(optarg); break;
- case 'c': meta = optarg[0]; break;
- case 'p':
- if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam;
- else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf = ti_conf_vcf;
- else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl;
- else {
- fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
- return 1;
- }
- break;
- case 's': conf.sc = atoi(optarg); break;
- case 'b': conf.bc = atoi(optarg); break;
- case 'e': conf.ec = atoi(optarg); break;
- case 'l': list_chrms = 1; break;
- case 'h': print_header = 1; break;
- case 'f': force = 1; break;
- case 'r': reheader = optarg; break;
- }
- }
- if (skip >= 0) conf.line_skip = skip;
- if (meta >= 0) conf.meta_char = meta;
- if (optind == argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n");
- fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
- fprintf(stderr, "Usage: tabix <in.tab.bgz> [region1 [region2 [...]]]\n\n");
- fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n");
- fprintf(stderr, " -s INT sequence name column [1]\n");
- fprintf(stderr, " -b INT start column [4]\n");
- fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n");
- fprintf(stderr, " -S INT skip first INT lines [0]\n");
- fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n");
- fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n");
- fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n");
- fprintf(stderr, " -0 zero-based coordinate\n");
- fprintf(stderr, " -h print the header lines\n");
- fprintf(stderr, " -l list chromosome names\n");
- fprintf(stderr, " -f force to overwrite the index\n");
- fprintf(stderr, "\n");
- return 1;
- }
- if (list_chrms) {
- ti_index_t *idx;
- int i, n;
- const char **names;
- idx = ti_index_load(argv[optind]);
- if (idx == 0) {
- fprintf(stderr, "[main] fail to load the index file.\n");
- return 1;
- }
- names = ti_seqname(idx, &n);
- for (i = 0; i < n; ++i) printf("%s\n", names[i]);
- free(names);
- ti_index_destroy(idx);
- return 0;
- }
- if (reheader)
- return reheader_file(reheader,argv[optind],conf.meta_char);
-
- struct stat stat_tbi,stat_vcf;
- char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
- strcat(strcpy(fnidx, argv[optind]), ".tbi");
-
- if (optind + 1 == argc) {
- if (force == 0) {
- if (stat(fnidx, &stat_tbi) == 0)
- {
- // Before complaining, check if the VCF file isn't newer. This is a common source of errors,
- // people tend not to notice that tabix failed
- stat(argv[optind], &stat_vcf);
- if ( stat_vcf.st_mtime <= stat_tbi.st_mtime )
- {
- fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
- free(fnidx);
- return 1;
- }
- }
- }
- if ( bgzf_check_bgzf(argv[optind])!=1 )
- {
- fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
- free(fnidx);
- return 1;
- }
- return ti_index_build(argv[optind], &conf);
- }
- { // retrieve
- tabix_t *t;
- // Common source of errors: new VCF is used with an old index
- stat(fnidx, &stat_tbi);
- stat(argv[optind], &stat_vcf);
- if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
- {
- fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
- free(fnidx);
- return 1;
- }
- free(fnidx);
-
- if ((t = ti_open(argv[optind], 0)) == 0) {
- fprintf(stderr, "[main] fail to open the data file.\n");
- return 1;
- }
- if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
- ti_iter_t iter;
- const char *s;
- int len;
- iter = ti_query(t, 0, 0, 0);
- while ((s = ti_read(t, iter, &len)) != 0) {
- fputs(s, stdout); fputc('\n', stdout);
- }
- ti_iter_destroy(iter);
- } else { // retrieve from specified regions
- int i, len;
- ti_iter_t iter;
- const char *s;
- const ti_conf_t *idxconf;
-
- if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
- fprintf(stderr,"[tabix] failed to load the index file.\n");
- return 1;
- }
- idxconf = ti_get_conf(t->idx);
-
- if ( print_header )
- {
- // If requested, print the header lines here
- iter = ti_query(t, 0, 0, 0);
- while ((s = ti_read(t, iter, &len)) != 0) {
- if ((int)(*s) != idxconf->meta_char) break;
- fputs(s, stdout); fputc('\n', stdout);
- }
- ti_iter_destroy(iter);
- }
- if (bed_reg) {
- extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
- extern void *bed_read(const char *fn);
- extern void bed_destroy(void *_h);
-
- const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
- void *bed = bed_read(argv[optind+1]); // load the BED file
- ti_interval_t intv;
-
- if (bed == 0) {
- fprintf(stderr, "[main] fail to read the BED file.\n");
- return 1;
- }
- iter = ti_query(t, 0, 0, 0);
- while ((s = ti_read(t, iter, &len)) != 0) {
- int c;
- ti_get_intv(conf_, len, (char*)s, &intv);
- c = *intv.se; *intv.se = '\0';
- if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
- *intv.se = c;
- puts(s);
- }
- *intv.se = c;
- }
- ti_iter_destroy(iter);
- bed_destroy(bed);
- } else {
- for (i = optind + 1; i < argc; ++i) {
- int tid, beg, end;
- if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
- iter = ti_queryi(t, tid, beg, end);
- while ((s = ti_read(t, iter, &len)) != 0) {
- fputs(s, stdout); fputc('\n', stdout);
- }
- ti_iter_destroy(iter);
- }
- // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
- }
- }
- }
- ti_close(t);
- }
- return 0;
-}
diff --git a/external/vcflib/tabixpp/main.cpp b/external/vcflib/tabixpp/main.cpp
deleted file mode 100644
index 592bf30..0000000
--- a/external/vcflib/tabixpp/main.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include "tabix.hpp"
-#include <vector>
-
-using namespace std;
-
-// Command-line front end: writes records of a bgzip-compressed, tabix-indexed
-// file to stdout.
-//
-// Usage: <prog> file [region ...]
-//   * without any region argument every line of the file is printed;
-//   * the pseudo-region "header" prints the header block;
-//   * every other argument is handed to Tabix::setRegion() as a region string.
-//
-// Returns 1 when no file argument was supplied, 0 otherwise.
-int main(int argc, char** argv) {
-
-    if (argc < 2) {
-        cerr << argv[0] << " [file] [ [region] ... ]" << endl
-             << "Writes out regions from bgzf-compressed, tabix-indexed file." << endl
-             << "Supply 'header' to print out the header, and no regions to" << endl
-             << "print the contents of the entire file." << endl;
-        return 1;
-    }
-
-    string filename = string(argv[1]);
-    Tabix file(filename);
-
-    if (argc == 2) {
-        // No regions requested: dump the whole file.
-        // '\n' instead of endl avoids flushing stdout after every record.
-        string line;
-        while (file.getNextLine(line)) {
-            cout << line << '\n';
-        }
-        return 0;
-    }
-
-    for (int i = 2; i < argc; ++i) {
-        string region(argv[i]);
-        if (region == "header") {
-            string header;
-            file.getHeader(header);
-            cout << header;
-        } else if (!file.setRegion(region)) {
-            // setRegion() keeps the previous iterator when the region cannot
-            // be parsed; reading on would print records that belong to
-            // whatever query was active before, so skip this argument.
-            cerr << argv[0] << ": skipping invalid region '" << region << "'" << endl;
-        } else {
-            string line;
-            while (file.getNextLine(line)) {
-                cout << line << '\n';
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/external/vcflib/tabixpp/tabix.1 b/external/vcflib/tabixpp/tabix.1
deleted file mode 100644
index 1bd9533..0000000
--- a/external/vcflib/tabixpp/tabix.1
+++ /dev/null
@@ -1,132 +0,0 @@
-.TH tabix 1 "11 May 2010" "tabix-0.2.0" "Bioinformatics tools"
-.SH NAME
-.PP
-bgzip - Block compression/decompression utility
-.PP
-tabix - Generic indexer for TAB-delimited genome position files
-.SH SYNOPSIS
-.PP
-.B bgzip
-.RB [ \-cdhB ]
-.RB [ \-b
-.IR virtualOffset ]
-.RB [ \-s
-.IR size ]
-.RI [ file ]
-.PP
-.B tabix
-.RB [ \-0lf ]
-.RB [ \-p
-.IR gff|bed|sam|vcf ]
-.RB [ \-s
-.IR seqCol ]
-.RB [ \-b
-.IR begCol ]
-.RB [ \-e
-.IR endCol ]
-.RB [ \-S
-.IR lineSkip ]
-.RB [ \-c
-.IR metaChar ]
-.I in.tab.bgz
-.RI [ "region1 " [ "region2 " [ ... "]]]"
-
-.SH DESCRIPTION
-.PP
-Tabix indexes a TAB-delimited genome position file
-.I in.tab.bgz
-and creates an index file
-.I in.tab.bgz.tbi
-when
-.I region
-is absent from the command-line. The input data file must be position
-sorted and compressed by
-.B bgzip
-which has a
-.BR gzip (1)
-like interface. After indexing, tabix is able to quickly retrieve data
-lines overlapping
-.I regions
-specified in the format "chr:beginPos-endPos". Fast data retrieval also
-works over the network if a URI is given as the file name; in this case the
-index file will be downloaded if it is not present locally.
-
-.SH OPTIONS OF TABIX
-.TP 10
-.BI "-p " STR
-Input format for indexing. Valid values are: gff, bed, sam, vcf and
-psltab. This option should not be applied together with any of
-.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ;
-it is not used for data retrieval because this setting is stored in
-the index file. [gff]
-.TP
-.BI "-s " INT
-Column of sequence name. Option
-.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0
-are all stored in the index file and thus not used in data retrieval. [1]
-.TP
-.BI "-b " INT
-Column of start chromosomal position. [4]
-.TP
-.BI "-e " INT
-Column of end chromosomal position. The end column can be the same as the
-start column. [5]
-.TP
-.BI "-S " INT
-Skip first INT lines in the data file. [0]
-.TP
-.BI "-c " CHAR
-Skip lines starting with character CHAR. [#]
-.TP
-.B -0
-Specify that the position in the data file is 0-based (e.g. UCSC files)
-rather than 1-based.
-.TP
-.B -h
-Print the header/meta lines.
-.TP
-.B -B
-The second argument is a BED file. When this option is in use, the input
-file need not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
-with this option, the format of the input must be specified correctly on the command line.
-.TP
-.B -f
-Force overwriting of the index file if it is present.
-.TP
-.B -l
-List the sequence names stored in the index file.
-.RE
-
-.SH EXAMPLE
-(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
-
-tabix -p gff sorted.gff.gz;
-
-tabix sorted.gff.gz chr1:10,000,000-20,000,000;
-
-.SH NOTES
-It is straightforward to achieve overlap queries using the standard
-B-tree index (with or without binning) implemented in all SQL databases,
-or the R-tree index in PostgreSQL and Oracle. But there are still many
-reasons to use tabix. Firstly, tabix directly works with a lot of widely
-used TAB-delimited formats such as GFF/GTF and BED. We do not need to
-design database schema or specialized binary formats. Data do not need
-to be duplicated in different formats, either. Secondly, tabix works on
-compressed data files while most SQL databases do not. The GenCode
-annotation GTF can be compressed down to 4%. Thirdly, tabix is
-fast. The same indexing algorithm is known to work efficiently for an
-alignment with a few billion short reads. SQL databases probably cannot
-easily handle data at this scale. Last but not least, tabix supports
-remote data retrieval. One can put the data file and the index at an FTP
-or HTTP server, and other users or even web services will be able to get
-a slice without downloading the entire file.
-
-.SH AUTHOR
-.PP
-Tabix was written by Heng Li. The BGZF library was originally
-implemented by Bob Handsaker and modified by Heng Li for remote file
-access and in-memory caching.
-
-.SH SEE ALSO
-.PP
-.BR samtools (1)
diff --git a/external/vcflib/tabixpp/tabix.cpp b/external/vcflib/tabixpp/tabix.cpp
deleted file mode 100644
index 9feb190..0000000
--- a/external/vcflib/tabixpp/tabix.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-#include "tabix.hpp"
-
-// Default constructor: no file is opened. All handles and region fields are
-// zeroed so that no member is left with an indeterminate value.
-// NOTE(review): the destructor forwards `iter` and `t` to ti_iter_destroy()
-// and ti_close(); confirm that both accept a null handle.
-Tabix::Tabix(void)
-    : t(0), iter(0), idxconf(0), tid(0), beg(0), end(0) { }
-
-// Opens the bgzip-compressed file `file`, loads its index and positions the
-// iterator at the beginning of the file.
-//
-// Every failure (input not recognised as bgzip, index older than the data
-// file, data file or index not loadable) is reported on stderr and terminates
-// the process with exit(1).
-Tabix::Tabix(string& file)
-    : t(0), iter(0), idxconf(0), tid(0), beg(0), end(0) {
-    filename = file;
-    const char* cfilename = file.c_str();
-    // std::string owns the index path, so the early-exit paths below need no
-    // manual free().
-    const string fnidx = file + ".tbi";
-
-    // Diagnostic trace goes to stderr: callers such as the tabix++ command-line
-    // tool write the extracted records to stdout.
-    cerr <<"TABIX:const - cfilename = " << cfilename << endl;
-
-    if ( bgzf_check_bgzf(cfilename)!=1 )
-    {
-        cerr << "[tabix++] was bgzip used to compress this file? " << file << endl;
-        exit(1);
-    }
-
-    // Common source of errors: new VCF is used with an old index.
-    // The timestamps are only compared when both stat() calls succeed;
-    // otherwise the stat buffers hold indeterminate values. A missing or
-    // unreadable index is left for ti_lazy_index_load() below to report.
-    struct stat stat_tbi, stat_vcf;
-    if ( stat(fnidx.c_str(), &stat_tbi) == 0
-         && stat(cfilename, &stat_vcf) == 0
-         && stat_vcf.st_mtime > stat_tbi.st_mtime )
-    {
-        cerr << "[tabix++] the index file is older than the vcf file. Please use '-f' to overwrite or reindex." << endl;
-        exit(1);
-    }
-
-    if ((t = ti_open(cfilename, 0)) == 0) {
-        cerr << "[tabix++] fail to open the data file." << endl;
-        exit(1);
-    }
-
-    if (ti_lazy_index_load(t) < 0) {
-        cerr << "[tabix++] failed to load the index file." << endl;
-        exit(1);
-    }
-
-    idxconf = ti_get_conf(t->idx);
-
-    // set up the iterator, defaults to the beginning
-    iter = ti_query(t, 0, 0, 0);
-
-}
-
-// Releases the query iterator and then the tabix handle. Null handles are
-// skipped instead of being forwarded to the C API.
-Tabix::~Tabix(void) {
-    if (iter) ti_iter_destroy(iter);
-    if (t) ti_close(t);
-}
-
-
-// Restarts iteration from the beginning of the file and gathers the leading
-// lines whose first character equals the index's meta character into
-// `header`, each followed by '\n'. The first line that is not a meta line is
-// kept in `firstline` so that getNextLine() can still hand it out.
-void Tabix::getHeader(string& header) {
-    header.clear();
-
-    // drop the current query and start over from the top of the file
-    ti_iter_destroy(iter);
-    iter = ti_query(t, 0, 0, 0);
-
-    int length;
-    const char* row;
-    while ((row = ti_read(t, iter, &length)) != 0
-           && (int)(*row) == idxconf->meta_char) {
-        header.append(row);
-        header.append("\n");
-    }
-
-    // the loop stopped on a data line (not on end of input): stash it
-    if (row != 0) {
-        firstline = string(row);
-    }
-}
-
-// Parses `region` and, on success, replaces the current iterator with a query
-// for that region, discarding any line stashed by getHeader().
-// Returns false and keeps the current iterator when the region cannot be parsed.
-bool Tabix::setRegion(string& region) {
-    const bool parsed =
-        ti_parse_region(t->idx, region.c_str(), &tid, &beg, &end) == 0;
-    if (!parsed) {
-        return false;
-    }
-
-    firstline.clear();
-    ti_iter_destroy(iter);
-    iter = ti_queryi(t, tid, beg, end);
-    return true;
-}
-
-// Stores the next record in `line` and returns true, or returns false when the
-// current iterator yields nothing more.
-bool Tabix::getNextLine(string& line) {
-    // A line stashed while reading the header is delivered first.
-    if (!firstline.empty()) {
-        line = firstline;
-        firstline.clear();
-        return true;
-    }
-
-    int length;
-    const char* row = ti_read(t, iter, &length);
-    if (row == 0) {
-        return false;
-    }
-    line = string(row);
-    return true;
-}
diff --git a/external/vcflib/tabixpp/tabix.h b/external/vcflib/tabixpp/tabix.h
deleted file mode 100644
index 7b4497a..0000000
--- a/external/vcflib/tabixpp/tabix.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at live.co.uk> */
-
-#ifndef __TABIDX_H
-#define __TABIDX_H
-
-#include <stdint.h>
-#include "kstring.h"
-#include "bgzf.h"
-
-#define TI_PRESET_GENERIC 0
-#define TI_PRESET_SAM 1
-#define TI_PRESET_VCF 2
-
-#define TI_FLAG_UCSC 0x10000
-
-typedef int (*ti_fetch_f)(int l, const char *s, void *data);
-
-struct __ti_index_t;
-typedef struct __ti_index_t ti_index_t;
-
-struct __ti_iter_t;
-typedef struct __ti_iter_t *ti_iter_t;
-
-typedef struct { // handle returned by ti_open() and released with ti_close()
- BGZF *fp; // BGZF stream; presumably the opened data file (TODO confirm)
- ti_index_t *idx; // index handle, as accepted by ti_get_conf() and ti_parse_region()
- char *fn, *fnidx; // presumably the data-file and index-file names given to ti_open() (TODO confirm)
-} tabix_t;
-
-typedef struct { // indexing configuration; obtainable from an index via ti_get_conf()
- int32_t preset; // presumably a TI_PRESET_* value, possibly combined with TI_FLAG_UCSC (TODO confirm)
- int32_t sc, bc, ec; // seq col., beg col. and end col.
- int32_t meta_char, line_skip; // meta_char: leading character that marks header/meta lines; line_skip: presumably the number of leading lines to skip (TODO confirm)
-} ti_conf_t;
-
-typedef struct { // interval filled in by ti_get_intv() for one data line
- int beg, end; // begin and end coordinates of the record
- char *ss, *se; // ss: start of the sequence name inside the line; se: presumably one past its end (TODO confirm)
-} ti_interval_t;
-
-extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*******************
- * High-level APIs *
- *******************/
-
- tabix_t *ti_open(const char *fn, const char *fnidx);
- int ti_lazy_index_load(tabix_t *t);
- void ti_close(tabix_t *t);
- ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end);
- ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end);
- ti_iter_t ti_querys(tabix_t *t, const char *reg);
- const char *ti_read(tabix_t *t, ti_iter_t iter, int *len);
-
- /* Destroy the iterator */
- void ti_iter_destroy(ti_iter_t iter);
-
- /* Get the list of sequence names. Each "char*" pointer points to an
- * internal member of the index, so DO NOT modify the strings they
- * point to; otherwise the index will be corrupted. The returned
- * array itself should be freed by a single free() call by the routine
- * calling this function. The number of sequences is returned at *n. */
- const char **ti_seqname(const ti_index_t *idx, int *n);
-
- /******************
- * Low-level APIs *
- ******************/
-
- /* Build the index for file <fn>. File <fn>.tbi will be generated
- * and overwrite the file of the same name. Return -1 on failure. */
- int ti_index_build(const char *fn, const ti_conf_t *conf);
-
- /* Load the index from file <fn>.tbi. If <fn> is a URL and the index
- * file is not in the working directory, <fn>.tbi will be
- * downloaded. Return NULL on failure. */
- ti_index_t *ti_index_load(const char *fn);
-
- ti_index_t *ti_index_load_local(const char *fnidx);
-
- /* Destroy the index */
- void ti_index_destroy(ti_index_t *idx);
-
- /* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */
- int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end);
-
- int ti_get_tid(const ti_index_t *idx, const char *name);
-
- /* Get the iterator pointing to the first record at the current file
- * position. If the file has just been opened, the iterator points to the
- * first record in the file. */
- ti_iter_t ti_iter_first(void);
-
- /* Get the iterator pointing to the first record in region tid:beg-end */
- ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end);
-
- /* Get the data line pointed by the iterator and iterate to the next record. */
- const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len);
-
- const ti_conf_t *ti_get_conf(ti_index_t *idx);
- int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv);
-
- /*******************
- * Deprecated APIs *
- *******************/
-
- /* The callback version for random access */
- int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func);
-
- /* Read one line. */
- int ti_readline(BGZF *fp, kstring_t *str);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/external/vcflib/tabixpp/tabix.hpp b/external/vcflib/tabixpp/tabix.hpp
deleted file mode 100644
index 7b67a6f..0000000
--- a/external/vcflib/tabixpp/tabix.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <string>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-#include "tabix.h"
-#include <iostream>
-
-
-using namespace std;
-
-// C++ wrapper around the C tabix API: opens a bgzip-compressed, tabix-indexed
-// file and hands out its header and records line by line.
-class Tabix {
-
-    tabix_t *t;               // tabix handle obtained from ti_open()
-    ti_iter_t iter;           // iterator of the currently active query
-    const ti_conf_t *idxconf; // configuration read from the index (meta character etc.)
-    int tid, beg, end;        // region filled in by the last setRegion() call
-    string firstline;         // data line consumed by getHeader(), returned by the next getNextLine()
-
-    // The object owns `t` and `iter` and releases them in its destructor, so a
-    // member-wise copy would release the same handles twice. Copying is
-    // therefore disabled: declared private and intentionally left undefined.
-    Tabix(const Tabix&);
-    Tabix& operator=(const Tabix&);
-
-public:
-
-    string filename; // file name passed to the constructor
-
-    Tabix(void);
-    Tabix(string& file);
-    ~Tabix(void);
-
-    // Fills `header` with the leading meta lines of the file.
-    void getHeader(string& header);
-    // Restricts subsequent reads to `region`; returns false if it cannot be parsed.
-    bool setRegion(string& region);
-    // Stores the next record in `line`; returns false when there is none.
-    bool getNextLine(string& line);
-
-};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/tvc.git
More information about the debian-med-commit
mailing list