[med-svn] [Git][med-team/libvcflib][master] 5 commits: routine-update: New upstream version
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sat Feb 11 17:31:30 GMT 2023
Étienne Mollier pushed to branch master at Debian Med / libvcflib
Commits:
70019347 by Étienne Mollier at 2023-02-11T18:09:07+01:00
routine-update: New upstream version
- - - - -
7957615c by Étienne Mollier at 2023-02-11T18:10:01+01:00
New upstream version 1.0.9+dfsg
- - - - -
97bc2c0e by Étienne Mollier at 2023-02-11T18:10:48+01:00
Update upstream source from tag 'upstream/1.0.9+dfsg'
Update to upstream version '1.0.9+dfsg'
with Debian dir 458871504476fe6f1b50ded7e7fe8b4b1d9f7ada
- - - - -
9a6a47d3 by Étienne Mollier at 2023-02-11T18:15:43+01:00
routine-update: Ready to upload to unstable
- - - - -
d72d88cf by Étienne Mollier at 2023-02-11T18:30:52+01:00
un-release
- - - - -
12 changed files:
- RELEASE_NOTES.md
- debian/changelog
- src/Variant.cpp
- src/Variant.h
- src/cigar.cpp
- src/cigar.hpp
- src/legacy.cpp
- src/pythonffi.cpp
- src/vcf-wfa.cpp
- src/vcf-wfa.h
- src/vcfwave.cpp
- test/tests/realign.py
Changes:
=====================================
RELEASE_NOTES.md
=====================================
@@ -12,6 +12,12 @@ and
- tabix -p vcf my_file.vcf.gz
- pangenie, vg deconstruct, vcfbub
+## ChangeLog v1.0.8 (20230210)
+
+Vcflib maintenance release - mostly for including in Debian
+
++ Another fix so downstream packages, such as freebayes, no longer need WFAlib.
+
## ChangeLog v1.0.7 (20230207)
Vcflib maintenance release - mostly for including in Debian
=====================================
debian/changelog
=====================================
@@ -1,11 +1,11 @@
-libvcflib (1.0.8+dfsg-1) UNRELEASED; urgency=medium
+libvcflib (1.0.9+dfsg-1) UNRELEASED; urgency=medium
* Team upload.
- * New upstream version 1.0.8+dfsg
+ * New upstream version 1.0.9+dfsg
* d/patches: refresh everything.
* wfa2_linking: add patch; fix linker errors.
- -- Étienne Mollier <emollier at debian.org> Sat, 11 Feb 2023 16:07:36 +0100
+ -- Étienne Mollier <emollier at debian.org> Sat, 11 Feb 2023 18:11:13 +0100
libvcflib (1.0.7+dfsg-2) unstable; urgency=medium
=====================================
src/Variant.cpp
=====================================
@@ -8,6 +8,7 @@
*/
#include "Variant.h"
+#include "cigar.hpp"
#include <utility>
namespace vcflib {
@@ -2690,4 +2691,209 @@ vector<Variant*> Variant::matchingHaplotypes() {
}
}
+
+// Reintroduce old version for freebayes
+
+map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
+ bool useMNPs,
+ bool useEntropy,
+ float matchScore,
+ float mismatchScore,
+ float gapOpenPenalty,
+ float gapExtendPenalty,
+ float repeatGapExtendPenalty,
+ string flankingRefLeft,
+ string flankingRefRight) {
+
+ map<string, vector<VariantAllele> > variantAlleles;
+
+ if (isSymbolicSV()){
+ // Don't ever align SVs. It just wrecks things.
+ return this->flatAlternates();
+ }
+ // add the reference allele
+ variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
+
+ // single SNP case, no ambiguity possible, no need to spend a lot of
+ // compute aligning ref and alt fields
+ if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+ variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
+ return variantAlleles;
+ }
+
+ // padding is used to ensure a stable alignment of the alternates to the reference
+ // without having to go back and look at the full reference sequence
+ int paddingLen = max(10, (int) (ref.size())); // dynamically determine optimum padding length
+ for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+ string& alternate = *a;
+ paddingLen = max(paddingLen, (int) (alternate.size()));
+ }
+ char padChar = 'Z';
+ char anchorChar = 'Q';
+ string padding(paddingLen, padChar);
+
+ // this 'anchored' string is done for stability
+ // the assumption is that there should be a positional match in the first base
+ // this is true for VCF 4.1, and standard best practices
+ // using the anchor char ensures this without other kinds of realignment
+ string reference_M;
+ if (flankingRefLeft.empty() && flankingRefRight.empty()) {
+ reference_M = padding + ref + padding;
+ reference_M[paddingLen] = anchorChar;
+ } else {
+ reference_M = flankingRefLeft + ref + flankingRefRight;
+ paddingLen = flankingRefLeft.size();
+ }
+
+ // passed to sw.Align
+ unsigned int referencePos;
+
+ string cigar;
+
+ for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+
+ string& alternate = *a;
+ vector<VariantAllele>& variants = variantAlleles[alternate];
+ string alternateQuery_M;
+ if (flankingRefLeft.empty() && flankingRefRight.empty()) {
+ alternateQuery_M = padding + alternate + padding;
+ alternateQuery_M[paddingLen] = anchorChar;
+ } else {
+ alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
+ }
+ //const unsigned int alternateLen = alternate.size();
+
+ if (true) {
+ CSmithWatermanGotoh sw(matchScore,
+ mismatchScore,
+ gapOpenPenalty,
+ gapExtendPenalty);
+ if (useEntropy) sw.EnableEntropyGapPenalty(1);
+ if (repeatGapExtendPenalty != 0){
+ sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+ }
+ sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
+ } else { // disabled for now
+ StripedSmithWaterman::Aligner aligner;
+ StripedSmithWaterman::Filter sswFilter;
+ StripedSmithWaterman::Alignment alignment;
+ aligner.Align(alternateQuery_M.c_str(),
+ reference_M.c_str(),
+ reference_M.size(), sswFilter, &alignment);
+ cigar = alignment.cigar_string;
+ }
+
+ // left-realign the alignment...
+
+ vector<pair<int, string> > cigarData = old_splitCigar(cigar);
+
+ if (cigarData.front().second != "M"
+ || cigarData.back().second != "M"
+ || cigarData.front().first < paddingLen
+ || cigarData.back().first < paddingLen) {
+ cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
+ cerr << cigar << endl;
+ cerr << reference_M << endl;
+ cerr << alternateQuery_M << endl;
+ exit(1);
+ } else {
+ cigarData.front().first -= paddingLen;
+ cigarData.back().first -= paddingLen;;
+ }
+ //cigarData = cleanCigar(cigarData);
+ cigar = old_joinCigar(cigarData);
+
+ int altpos = 0;
+ int refpos = 0;
+
+ for (vector<pair<int, string> >::iterator e = cigarData.begin();
+ e != cigarData.end(); ++e) {
+
+ int len = e->first;
+ string type = e->second;
+
+ switch (type.at(0)) {
+ case 'I':
+ if (includePreviousBaseForIndels) {
+ if (!variants.empty() &&
+ variants.back().ref != variants.back().alt) {
+ VariantAllele a =
+ VariantAllele("",
+ alternate.substr(altpos, len),
+ refpos + position);
+ variants.back() = variants.back() + a;
+ } else {
+ VariantAllele a =
+ VariantAllele(ref.substr(refpos - 1, 1),
+ alternate.substr(altpos - 1, len + 1),
+ refpos + position - 1);
+ variants.push_back(a);
+ }
+ } else {
+ variants.push_back(VariantAllele("",
+ alternate.substr(altpos, len),
+ refpos + position));
+ }
+ altpos += len;
+ break;
+ case 'D':
+ if (includePreviousBaseForIndels) {
+ if (!variants.empty() &&
+ variants.back().ref != variants.back().alt) {
+ VariantAllele a
+ = VariantAllele(ref.substr(refpos, len)
+ , "", refpos + position);
+ variants.back() = variants.back() + a;
+ } else {
+ VariantAllele a
+ = VariantAllele(ref.substr(refpos - 1, len + 1),
+ alternate.substr(altpos - 1, 1),
+ refpos + position - 1);
+ variants.push_back(a);
+ }
+ } else {
+ variants.push_back(VariantAllele(ref.substr(refpos, len),
+ "", refpos + position));
+ }
+ refpos += len;
+ break;
+
+ // zk has added (!variants.empty()) solves the seg fault in
+ // vcfstats, but need to test
+ case 'M':
+ {
+ for (int i = 0; i < len; ++i) {
+ VariantAllele a
+ = VariantAllele(ref.substr(refpos + i, 1),
+ alternate.substr(altpos + i, 1),
+ (refpos + i + position));
+ if (useMNPs && (!variants.empty()) &&
+ variants.back().ref.size() == variants.back().alt.size()
+ && variants.back().ref != variants.back().alt) {
+ variants.back() = variants.back() + a;
+ } else {
+ variants.push_back(a);
+ }
+ }
+ }
+ refpos += len;
+ altpos += len;
+ break;
+ case 'S':
+ {
+ refpos += len;
+ altpos += len;
+ break;
+ }
+ default:
+ {
+ break;
+ }
+ }
+ }
+ }
+ return variantAlleles;
+}
+
+
} // end namespace vcf
=====================================
src/Variant.h
=====================================
@@ -234,6 +234,16 @@ public:
bool keepGeno=true,
bool debug=false);
+ map<string, vector<VariantAllele> > parsedAlternates(bool includePreviousBaseForIndels = false,
+ bool useMNPs = false,
+ bool useEntropy = false,
+ float matchScore = 10.0f,
+ float mismatchScore = -9.0f,
+ float gapOpenPenalty = 15.0f,
+ float gapExtendPenalty = 6.66f,
+ float repeatGapExtendPenalty = 0.0f,
+ string flankingRefLeft = "",
+ string flankingRefRight = "");
// the same output format as parsedAlternates, without parsing
map<string, vector<VariantAllele> > flatAlternates(void);
=====================================
src/cigar.cpp
=====================================
@@ -177,4 +177,52 @@ bool isEmptyCigarElement(const pair<int, char>& elem) {
return elem.first == 0;
}
+vector<pair<int, string> > old_splitCigar(const string& cigarStr) {
+ vector<pair<int, string> > cigar;
+ string number;
+ string type;
+ // strings go [Number][Type] ...
+ for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+ char c = *s;
+ if (isdigit(c)) {
+ if (type.empty()) {
+ number += c;
+ } else {
+ // signal for next token, push back the last pair, clean up
+ cigar.push_back(make_pair(atoi(number.c_str()), type));
+ number.clear();
+ type.clear();
+ number += c;
+ }
+ } else {
+ type += c;
+ }
+ }
+ if (!number.empty() && !type.empty()) {
+ cigar.push_back(make_pair(atoi(number.c_str()), type));
+ }
+ return cigar;
+}
+
+string old_joinCigar(const vector<pair<int, string> >& cigar) {
+ string cigarStr;
+ for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
+ if (c->first) {
+ cigarStr += convert(c->first) + c->second;
+ }
+ }
+ return cigarStr;
+}
+
+string old_joinCigar(const vector<pair<int, char> >& cigar) {
+ string cigarStr;
+ for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
+ if (c->first) {
+ cigarStr += convert(c->first) + string(1, c->second);
+ }
+ }
+ return cigarStr;
+}
+
+
}
=====================================
src/cigar.hpp
=====================================
@@ -16,10 +16,12 @@ typedef vector<pair<int, char> > Cigar;
string varCigar(vector<VariantAllele>& vav, bool xForMismatch = false);
string mergeCigar(const string& c1, const string& c2);
vector<pair<int, char> > splitUnpackedCigar(const string& cigarStr);
+vector<pair<int, string> > old_splitCigar(const string& cigarStr);
vector<pair<int, char> > splitCigar(const string& cigarStr);
list<pair<int, char> > splitCigarList(const string& cigarStr);
int cigarRefLen(const vector<pair<int, char> >& cigar);
vector<pair<int, char> > cleanCigar(const vector<pair<int, char> >& cigar);
+string old_joinCigar(const vector<pair<int, string> >& cigar);
string joinCigar(const vector<pair<int, char> >& cigar);
string joinCigarList(const list<pair<int, char> >& cigar);
bool isEmptyCigarElement(const pair<int, char>& elem);
=====================================
src/legacy.cpp
=====================================
@@ -2,8 +2,8 @@
vcflib C++ library for parsing and manipulating VCF files. This file contains
legacy material that will be phased out.
- Copyright © 2010-2022 Erik Garrison
- Copyright © 2020-2022 Pjotr Prins
+ Copyright © 2010-2023 Erik Garrison
+ Copyright © 2020-2023 Pjotr Prins
This software is published under the MIT License. See the LICENSE file.
*/
@@ -21,7 +21,8 @@ namespace vcflib {
// 'refs'. In this function Smith-Waterman is used with padding on
// both sides of a ref and each alt. The SW method quadratic in nature
// and painful with long sequences. Recently WFA is introduced with
-// runs in linear time.
+// runs in linear time: use WfaVariant::parsedAlternates.
+
//
// Returns map of [REF,ALTs] with attached VariantAllele records
@@ -40,6 +41,7 @@ map<string, vector<VariantAllele> > Variant::legacy_parsedAlternates(
bool useWaveFront,
bool debug) {
+
bool OLDPADDING = true; // we want to use the first bp for alignment
map<string, vector<VariantAllele> > variantAlleles; // return type
@@ -662,4 +664,6 @@ void Variant::legacy_reduceAlleles(
}
}
+
+
} // namespace vcflib
=====================================
src/pythonffi.cpp
=====================================
@@ -64,7 +64,7 @@ PYBIND11_MODULE(pyvcflib, m)
py::class_<WfaVariant, Variant>(m, "WfaVariant", "WFA VCF record")
.def(py::init<VariantCallFile &>() )
- .def("parsedAlternates", &WfaVariant::parsedAlternates);
+ .def("wfa_parsedAlternates", &WfaVariant::wfa_parsedAlternates);
py::class_<VariantCallFile>(m, "VariantCallFile", "VCF file")
.def(py::init())
=====================================
src/vcf-wfa.cpp
=====================================
@@ -22,7 +22,7 @@ namespace vcflib {
// Returns map of [REF,ALTs] with attached VariantAllele records
-map<string, pair<vector<VariantAllele>,bool> > WfaVariant::parsedAlternates(
+map<string, pair<vector<VariantAllele>,bool> > WfaVariant::wfa_parsedAlternates(
bool includePreviousBaseForIndels,
bool useMNPs,
bool useEntropy,
=====================================
src/vcf-wfa.h
=====================================
@@ -20,7 +20,7 @@ public:
WfaVariant(VariantCallFile& v) : Variant(v)
{ };
- map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
+ map<string, pair<vector<VariantAllele>,bool> > wfa_parsedAlternates(
bool includePreviousBaseForIndels = false,
bool useMNPs = false,
bool useEntropy = false,
=====================================
src/vcfwave.cpp
=====================================
@@ -254,7 +254,7 @@ int main(int argc, char** argv) {
}
map<string, pair<vector<VariantAllele>, bool> > varAlleles =
- var.parsedAlternates(includePreviousBaseForIndels, useMNPs,
+ var.wfa_parsedAlternates(includePreviousBaseForIndels, useMNPs,
false, // bool useEntropy = false,
"", // string flankingRefLeft = "",
"", // string flankingRefRight = "",
=====================================
test/tests/realign.py
=====================================
@@ -64,7 +64,7 @@ class RealignTest(unittest.TestCase):
wfa_params.affine2p_penalties.gap_extension2 = 1
wfa_params.alignment_scope = alignment_scope_t.compute_alignment;
# A dict is returned of alleles with variants and is_reversed
- wf = var.parsedAlternates(True,True,False,"","",wfa_params,True,64,1,True)
+ wf = var.wfa_parsedAlternates(True,True,False,"","",wfa_params,True,64,1,True)
print(f'ref={var.ref}')
print(var.info)
for key1, value1 in wf.items():
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/ac907126881ca1656e00fe776fd5e4fe939cd662...d72d88cf45106c6ea8f39b44a32473df078a5ada
--
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/ac907126881ca1656e00fe776fd5e4fe939cd662...d72d88cf45106c6ea8f39b44a32473df078a5ada
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230211/ee4dc0b6/attachment-0001.htm>
More information about the debian-med-commit
mailing list