[med-svn] [Git][med-team/libvcflib][master] 5 commits: routine-update: New upstream version

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Feb 11 17:31:30 GMT 2023



Étienne Mollier pushed to branch master at Debian Med / libvcflib


Commits:
70019347 by Étienne Mollier at 2023-02-11T18:09:07+01:00
routine-update: New upstream version

- - - - -
7957615c by Étienne Mollier at 2023-02-11T18:10:01+01:00
New upstream version 1.0.9+dfsg
- - - - -
97bc2c0e by Étienne Mollier at 2023-02-11T18:10:48+01:00
Update upstream source from tag 'upstream/1.0.9+dfsg'

Update to upstream version '1.0.9+dfsg'
with Debian dir 458871504476fe6f1b50ded7e7fe8b4b1d9f7ada
- - - - -
9a6a47d3 by Étienne Mollier at 2023-02-11T18:15:43+01:00
routine-update: Ready to upload to unstable

- - - - -
d72d88cf by Étienne Mollier at 2023-02-11T18:30:52+01:00
un-release

- - - - -


12 changed files:

- RELEASE_NOTES.md
- debian/changelog
- src/Variant.cpp
- src/Variant.h
- src/cigar.cpp
- src/cigar.hpp
- src/legacy.cpp
- src/pythonffi.cpp
- src/vcf-wfa.cpp
- src/vcf-wfa.h
- src/vcfwave.cpp
- test/tests/realign.py


Changes:

=====================================
RELEASE_NOTES.md
=====================================
@@ -12,6 +12,12 @@ and
 - tabix -p vcf my_file.vcf.gz
 - pangenie, vg deconstruct, vcfbub
 
+## ChangeLog v1.0.8 (20230210)
+
+Vcflib maintenance release - mostly for including in Debian
+
++ Another fix so downstream packages, such as freebayes, no longer need WFAlib.
+
 ## ChangeLog v1.0.7 (20230207)
 
 Vcflib maintenance release - mostly for including in Debian


=====================================
debian/changelog
=====================================
@@ -1,11 +1,11 @@
-libvcflib (1.0.8+dfsg-1) UNRELEASED; urgency=medium
+libvcflib (1.0.9+dfsg-1) UNRELEASED; urgency=medium
 
   * Team upload.
-  * New upstream version 1.0.8+dfsg
+  * New upstream version 1.0.9+dfsg
   * d/patches: refresh everything.
   * wfa2_linking: add patch; fix linker errors.
 
- -- Étienne Mollier <emollier at debian.org>  Sat, 11 Feb 2023 16:07:36 +0100
+ -- Étienne Mollier <emollier at debian.org>  Sat, 11 Feb 2023 18:11:13 +0100
 
 libvcflib (1.0.7+dfsg-2) unstable; urgency=medium
 


=====================================
src/Variant.cpp
=====================================
@@ -8,6 +8,7 @@
 */
 
 #include "Variant.h"
+#include "cigar.hpp"
 #include <utility>
 
 namespace vcflib {
@@ -2690,4 +2691,209 @@ vector<Variant*> Variant::matchingHaplotypes() {
         }
     }
 
+
+// Reintroduce old version for freebayes
+
+map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
+                                                              bool useMNPs,
+                                                              bool useEntropy,
+                                                              float matchScore,
+                                                              float mismatchScore,
+                                                              float gapOpenPenalty,
+                                                              float gapExtendPenalty,
+                                                              float repeatGapExtendPenalty,
+                                                              string flankingRefLeft,
+                                                              string flankingRefRight) {
+
+    map<string, vector<VariantAllele> > variantAlleles;
+
+    if (isSymbolicSV()){
+        // Don't ever align SVs. It just wrecks things.
+        return this->flatAlternates();
+    }
+    // add the reference allele
+    variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
+
+    // single SNP case, no ambiguity possible, no need to spend a lot of
+    // compute aligning ref and alt fields
+    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+        variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
+        return variantAlleles;
+    }
+
+    // padding is used to ensure a stable alignment of the alternates to the reference
+    // without having to go back and look at the full reference sequence
+    int paddingLen = max(10, (int) (ref.size()));  // dynamically determine optimum padding length
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+        string& alternate = *a;
+        paddingLen = max(paddingLen, (int) (alternate.size()));
+    }
+    char padChar = 'Z';
+    char anchorChar = 'Q';
+    string padding(paddingLen, padChar);
+
+    // this 'anchored' string is done for stability
+    // the assumption is that there should be a positional match in the first base
+    // this is true for VCF 4.1, and standard best practices
+    // using the anchor char ensures this without other kinds of realignment
+    string reference_M;
+    if (flankingRefLeft.empty() && flankingRefRight.empty()) {
+        reference_M = padding + ref + padding;
+        reference_M[paddingLen] = anchorChar;
+    } else {
+        reference_M = flankingRefLeft + ref + flankingRefRight;
+        paddingLen = flankingRefLeft.size();
+    }
+
+    // passed to sw.Align
+    unsigned int referencePos;
+
+    string cigar;
+
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+
+      string& alternate = *a;
+      vector<VariantAllele>& variants = variantAlleles[alternate];
+      string alternateQuery_M;
+      if (flankingRefLeft.empty() && flankingRefRight.empty()) {
+	alternateQuery_M = padding + alternate + padding;
+	alternateQuery_M[paddingLen] = anchorChar;
+      } else {
+	alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
+      }
+      //const unsigned int alternateLen = alternate.size();
+
+      if (true) {
+	CSmithWatermanGotoh sw(matchScore,
+			       mismatchScore,
+			       gapOpenPenalty,
+			       gapExtendPenalty);
+	if (useEntropy) sw.EnableEntropyGapPenalty(1);
+	if (repeatGapExtendPenalty != 0){
+	  sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+	}
+	sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
+      } else {  // disabled for now
+	StripedSmithWaterman::Aligner aligner;
+	StripedSmithWaterman::Filter sswFilter;
+	StripedSmithWaterman::Alignment alignment;
+	aligner.Align(alternateQuery_M.c_str(),
+		      reference_M.c_str(),
+		      reference_M.size(), sswFilter, &alignment);
+	cigar = alignment.cigar_string;
+      }
+
+      // left-realign the alignment...
+
+      vector<pair<int, string> > cigarData = old_splitCigar(cigar);
+
+      if (cigarData.front().second != "M"
+	  || cigarData.back().second != "M"
+	  || cigarData.front().first < paddingLen
+	  || cigarData.back().first < paddingLen) {
+	cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
+	cerr << cigar << endl;
+	cerr << reference_M << endl;
+	cerr << alternateQuery_M << endl;
+	exit(1);
+      } else {
+	cigarData.front().first -= paddingLen;
+	cigarData.back().first -= paddingLen;;
+      }
+      //cigarData = cleanCigar(cigarData);
+      cigar = old_joinCigar(cigarData);
+
+      int altpos = 0;
+      int refpos = 0;
+
+      for (vector<pair<int, string> >::iterator e = cigarData.begin();
+	   e != cigarData.end(); ++e) {
+
+	int len = e->first;
+	string type = e->second;
+
+	switch (type.at(0)) {
+	case 'I':
+	  if (includePreviousBaseForIndels) {
+	    if (!variants.empty() &&
+		variants.back().ref != variants.back().alt) {
+	      VariantAllele a =
+		VariantAllele("",
+			      alternate.substr(altpos, len),
+			      refpos + position);
+	      variants.back() = variants.back() + a;
+	    } else {
+	      VariantAllele a =
+		VariantAllele(ref.substr(refpos - 1, 1),
+			      alternate.substr(altpos - 1, len + 1),
+			      refpos + position - 1);
+	      variants.push_back(a);
+	    }
+	  } else {
+	    variants.push_back(VariantAllele("",
+					     alternate.substr(altpos, len),
+					     refpos + position));
+	  }
+	  altpos += len;
+	  break;
+	case 'D':
+	  if (includePreviousBaseForIndels) {
+	    if (!variants.empty() &&
+		variants.back().ref != variants.back().alt) {
+	      VariantAllele a
+		= VariantAllele(ref.substr(refpos, len)
+				, "", refpos + position);
+	      variants.back() = variants.back() + a;
+	      } else {
+	      VariantAllele a
+		= VariantAllele(ref.substr(refpos - 1, len + 1),
+				alternate.substr(altpos - 1, 1),
+				refpos + position - 1);
+	      variants.push_back(a);
+	    }
+	  } else {
+	    variants.push_back(VariantAllele(ref.substr(refpos, len),
+					     "", refpos + position));
+	  }
+	  refpos += len;
+	  break;
+
+	  // zk has added (!variants.empty()) solves the seg fault in
+          // vcfstats, but need to test
+	case 'M':
+	  {
+	    for (int i = 0; i < len; ++i) {
+	      VariantAllele a
+		= VariantAllele(ref.substr(refpos + i, 1),
+				alternate.substr(altpos + i, 1),
+				(refpos + i + position));
+	      if (useMNPs && (!variants.empty()) &&
+		  variants.back().ref.size() == variants.back().alt.size()
+		  && variants.back().ref != variants.back().alt) {
+		  variants.back() = variants.back() + a;
+	      } else {
+		variants.push_back(a);
+	      }
+	    }
+	  }
+	  refpos += len;
+	  altpos += len;
+	  break;
+	case 'S':
+	  {
+	    refpos += len;
+	    altpos += len;
+	    break;
+	  }
+	default:
+	  {
+	    break;
+	  }
+	}
+      }
+    }
+    return variantAlleles;
+}
+
+
 } // end namespace vcf


=====================================
src/Variant.h
=====================================
@@ -234,6 +234,16 @@ public:
         bool keepGeno=true,
         bool debug=false);
 
+    map<string, vector<VariantAllele> > parsedAlternates(bool includePreviousBaseForIndels = false,
+                                                         bool useMNPs = false,
+                                                         bool useEntropy = false,
+                                                         float matchScore = 10.0f,
+                                                         float mismatchScore = -9.0f,
+                                                         float gapOpenPenalty = 15.0f,
+                                                         float gapExtendPenalty = 6.66f,
+                                                         float repeatGapExtendPenalty = 0.0f,
+                                                         string flankingRefLeft = "",
+                                                         string flankingRefRight = "");
 
     // the same output format as parsedAlternates, without parsing
     map<string, vector<VariantAllele> > flatAlternates(void);


=====================================
src/cigar.cpp
=====================================
@@ -177,4 +177,52 @@ bool isEmptyCigarElement(const pair<int, char>& elem) {
     return elem.first == 0;
 }
 
+vector<pair<int, string> > old_splitCigar(const string& cigarStr) {
+    vector<pair<int, string> > cigar;
+    string number;
+    string type;
+    // strings go [Number][Type] ...
+    for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+        char c = *s;
+        if (isdigit(c)) {
+            if (type.empty()) {
+                number += c;
+            } else {
+                // signal for next token, push back the last pair, clean up
+                cigar.push_back(make_pair(atoi(number.c_str()), type));
+                number.clear();
+                type.clear();
+                number += c;
+            }
+        } else {
+            type += c;
+        }
+    }
+    if (!number.empty() && !type.empty()) {
+        cigar.push_back(make_pair(atoi(number.c_str()), type));
+    }
+    return cigar;
+}
+
+string old_joinCigar(const vector<pair<int, string> >& cigar) {
+    string cigarStr;
+    for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
+        if (c->first) {
+            cigarStr += convert(c->first) + c->second;
+        }
+    }
+    return cigarStr;
+}
+
+string old_joinCigar(const vector<pair<int, char> >& cigar) {
+    string cigarStr;
+    for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
+        if (c->first) {
+            cigarStr += convert(c->first) + string(1, c->second);
+        }
+    }
+    return cigarStr;
+}
+
+
 }


=====================================
src/cigar.hpp
=====================================
@@ -16,10 +16,12 @@ typedef vector<pair<int, char> > Cigar;
 string varCigar(vector<VariantAllele>& vav, bool xForMismatch = false);
 string mergeCigar(const string& c1, const string& c2);
 vector<pair<int, char> > splitUnpackedCigar(const string& cigarStr);
+vector<pair<int, string> > old_splitCigar(const string& cigarStr);
 vector<pair<int, char> > splitCigar(const string& cigarStr);
 list<pair<int, char> > splitCigarList(const string& cigarStr);
 int cigarRefLen(const vector<pair<int, char> >& cigar);
 vector<pair<int, char> > cleanCigar(const vector<pair<int, char> >& cigar);
+string old_joinCigar(const vector<pair<int, string> >& cigar);
 string joinCigar(const vector<pair<int, char> >& cigar);
 string joinCigarList(const list<pair<int, char> >& cigar);
 bool isEmptyCigarElement(const pair<int, char>& elem);


=====================================
src/legacy.cpp
=====================================
@@ -2,8 +2,8 @@
     vcflib C++ library for parsing and manipulating VCF files. This file contains
     legacy material that will be phased out.
 
-    Copyright © 2010-2022 Erik Garrison
-    Copyright © 2020-2022 Pjotr Prins
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
 
     This software is published under the MIT License. See the LICENSE file.
 */
@@ -21,7 +21,8 @@ namespace vcflib {
 // 'refs'. In this function Smith-Waterman is used with padding on
 // both sides of a ref and each alt. The SW method quadratic in nature
 // and painful with long sequences. Recently WFA is introduced with
-// runs in linear time.
+// runs in linear time: use WfaVariant::parsedAlternates.
+
 //
 // Returns map of [REF,ALTs] with attached VariantAllele records
 
@@ -40,6 +41,7 @@ map<string, vector<VariantAllele> > Variant::legacy_parsedAlternates(
     bool useWaveFront,
     bool debug) {
 
+
     bool OLDPADDING = true; // we want to use the first bp for alignment
 
     map<string, vector<VariantAllele> > variantAlleles; // return type
@@ -662,4 +664,6 @@ void Variant::legacy_reduceAlleles(
     }
 }
 
+
+
 } // namespace vcflib


=====================================
src/pythonffi.cpp
=====================================
@@ -64,7 +64,7 @@ PYBIND11_MODULE(pyvcflib, m)
 
   py::class_<WfaVariant, Variant>(m, "WfaVariant", "WFA VCF record")
       .def(py::init<VariantCallFile &>() )
-      .def("parsedAlternates", &WfaVariant::parsedAlternates);
+      .def("wfa_parsedAlternates", &WfaVariant::wfa_parsedAlternates);
 
   py::class_<VariantCallFile>(m, "VariantCallFile", "VCF file")
       .def(py::init())


=====================================
src/vcf-wfa.cpp
=====================================
@@ -22,7 +22,7 @@ namespace vcflib {
 // Returns map of [REF,ALTs] with attached VariantAllele records
 
 
-map<string, pair<vector<VariantAllele>,bool> > WfaVariant::parsedAlternates(
+map<string, pair<vector<VariantAllele>,bool> > WfaVariant::wfa_parsedAlternates(
     bool includePreviousBaseForIndels,
     bool useMNPs,
     bool useEntropy,


=====================================
src/vcf-wfa.h
=====================================
@@ -20,7 +20,7 @@ public:
     WfaVariant(VariantCallFile& v) : Variant(v)
       { };
 
-    map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
+    map<string, pair<vector<VariantAllele>,bool> > wfa_parsedAlternates(
            bool includePreviousBaseForIndels = false,
            bool useMNPs = false,
            bool useEntropy = false,


=====================================
src/vcfwave.cpp
=====================================
@@ -254,7 +254,7 @@ int main(int argc, char** argv) {
         }
 
         map<string, pair<vector<VariantAllele>, bool> > varAlleles =
-           var.parsedAlternates(includePreviousBaseForIndels, useMNPs,
+           var.wfa_parsedAlternates(includePreviousBaseForIndels, useMNPs,
                                 false, // bool useEntropy = false,
                                 "",    // string flankingRefLeft = "",
                                 "",    // string flankingRefRight = "",


=====================================
test/tests/realign.py
=====================================
@@ -64,7 +64,7 @@ class RealignTest(unittest.TestCase):
         wfa_params.affine2p_penalties.gap_extension2 = 1
         wfa_params.alignment_scope = alignment_scope_t.compute_alignment;
         # A dict is returned of alleles with variants and is_reversed
-        wf = var.parsedAlternates(True,True,False,"","",wfa_params,True,64,1,True)
+        wf = var.wfa_parsedAlternates(True,True,False,"","",wfa_params,True,64,1,True)
         print(f'ref={var.ref}')
         print(var.info)
         for key1, value1 in wf.items():



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/ac907126881ca1656e00fe776fd5e4fe939cd662...d72d88cf45106c6ea8f39b44a32473df078a5ada

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/ac907126881ca1656e00fe776fd5e4fe939cd662...d72d88cf45106c6ea8f39b44a32473df078a5ada
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230211/ee4dc0b6/attachment-0001.htm>


More information about the debian-med-commit mailing list