[med-svn] [Git][med-team/libvcflib][upstream] New upstream version 1.0.8+dfsg

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Feb 11 15:10:40 GMT 2023



Étienne Mollier pushed to branch upstream at Debian Med / libvcflib


Commits:
7dedad94 by Étienne Mollier at 2023-02-11T15:43:44+01:00
New upstream version 1.0.8+dfsg
- - - - -


9 changed files:

- CMakeLists.txt
- src/Variant.cpp
- src/Variant.h
- src/legacy.cpp
- src/pythonffi.cpp
- + src/vcf-wfa.cpp
- + src/vcf-wfa.h
- src/vcfwave.cpp
- test/tests/realign.py


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -43,6 +43,8 @@ option(WFA_GITMODULE "Force local git submodule for WFA2LIB" ON) # disable in di
 include(CheckIPOSupported) # adds lto
 check_ipo_supported(RESULT ipo_supported OUTPUT output)
 
+set(EXTRA_FLAGS " ") # otherwise it injects OFF
+
 # ---- Dependencies
 
 if(OPENMP)
@@ -143,6 +145,7 @@ file(GLOB INCLUDES
 set(vcflib_SOURCE
     src/vcf-c-api.cpp
     src/legacy.cpp
+    src/vcf-wfa.cpp
     src/Variant.cpp
     src/rnglib.cpp
     src/var.cpp
@@ -434,9 +437,15 @@ if (NOT BUILD_ONLY_LIB)
   foreach(BIN ${BINS})
     add_executable(${BIN} src/${BIN}.cpp)
     add_dependencies(${BIN} vcflib)
-    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib ${WFALIB})
+    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib)
   endforeach(BIN ${BINS})
-  # target_link_libraries(vcfwave PUBLIC ${WFALIB})
+  # These binaries include WFALIB
+  target_link_libraries(vcfallelicprimitives PUBLIC ${WFALIB})
+  target_link_libraries(vcfcleancomplex PUBLIC ${WFALIB})
+  target_link_libraries(vcfparsealts PUBLIC ${WFALIB})
+  target_link_libraries(vcfroc PUBLIC ${WFALIB})
+  target_link_libraries(vcfstats PUBLIC ${WFALIB})
+  target_link_libraries(vcfwave PUBLIC ${WFALIB})
   install(TARGETS ${BINS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
   # ---- Copy scripts
@@ -456,6 +465,7 @@ install(TARGETS pyvcflib LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 enable_testing()
 
+
 function(add_pytest TEST_FILE)
   add_test(
       NAME ${TEST_FILE}


=====================================
src/Variant.cpp
=====================================
@@ -2082,311 +2082,6 @@ int ploidy(const map<int, int>& genotype) {
 
 
 
-// parsedAlternates returns a hash of 'ref' and a vector of alts. A
-// single record may be split into multiple records with new
-// 'refs'. In this function Smith-Waterman is used with padding on
-// both sides of a ref and each alt. The SW method quadratic in nature
-// and painful with long sequences. Recently WFA is introduced with
-// runs in linear time.
-//
-// Returns map of [REF,ALTs] with attached VariantAllele records
-
-
-map<string, pair<vector<VariantAllele>,bool> > Variant::parsedAlternates(
-    bool includePreviousBaseForIndels,
-    bool useMNPs,
-    bool useEntropy,
-    string flankingRefLeft,
-    string flankingRefRight,
-    wavefront_aligner_attr_t* wfaParams,
-    int invKmerLen,
-    int invMinLen,
-    int threads,
-    bool debug) {
-
-    // return type is a hash of allele containing a list of variants and a
-    // boolean value signifying an inversion of the variant
-    map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
-
-    if (isSymbolicSV()){
-        // Don't ever align symbolic SVs. It just wrecks things.
-        for (auto& f : this->flatAlternates()) {
-            variantAlleles[f.first] = make_pair(f.second, false);
-        }
-        return variantAlleles;
-    }
-    // add the reference allele
-    variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
-
-    // single SNP case, no ambiguity possible, no need to spend a lot of
-    // compute aligning ref and alt fields
-    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
-        variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
-        return variantAlleles;
-    }
-
-    const string& reference_M = ref;
-
-    std::vector<rkmh::hash_t> ref_sketch_fwd;
-    std::vector<rkmh::hash_t> ref_sketch_rev;
-    if (invKmerLen && ref.size() >= invKmerLen
-        && ref.size() >= invMinLen) {
-        ref_sketch_fwd = rkmh::hash_sequence(
-            ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
-        string fer = reverse_complement(ref);
-        ref_sketch_rev = rkmh::hash_sequence(
-            fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
-
-    }
-
-    /*
-    for (auto a: alt) { // iterate ALT strings
-        // unsigned int referencePos;
-        string& alternate = a;
-        variantAlleles[alternate]; // create the slot
-    }
-    */
-
-// #pragma omp parallel for
-    for (auto a: alt) { // iterate ALT strings
-        //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
-        //auto& a = alt[idx];
-        // unsigned int referencePos;
-        string alternate = a;
-        pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
-        bool is_inv = false;
-        // get the alt/ref mapping in inversion space
-        if (invKmerLen && alternate.size() >= invKmerLen
-            && alternate.size() >= invMinLen
-            && ref.size() >= invKmerLen
-            && ref.size() >= invMinLen
-            && max((int)ref.size(), (int)alt.size()) > invMinLen) {
-            // check if it's more likely for us to align as an inversion
-            auto alt_sketch = rkmh::hash_sequence(
-                a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
-            if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
-                > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
-                is_inv = true;
-                // flip the alt
-                string alternate_rev = reverse_complement(alternate);
-                // cout << "alt: " << alternate_rev << endl;
-                alternate = alternate_rev;
-            }
-            variantAlleles[alternate]; // create slot
-            variantAlleles[alternate].second = is_inv;
-            /*
-            cerr << "comparing "
-                 << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
-                 << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
-            */
-        }
-
-        const string& alternateQuery_M = alternate;
-
-        string cigar;
-        vector<pair<int, char> > cigarData;
-
-        /*
-         * WFA2-lib
-         */
-        /*
-        // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
-        WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
-        aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
-        cigar = aligner.getAlignmentCigar();
-        */
-        auto wfp = *wfaParams;
-        size_t max_len = max(reference_M.size(), alternateQuery_M.size());
-        if (max_len > 1000) {
-            wfp.memory_mode = wavefront_memory_ultralow;
-        } else {
-            wfp.memory_mode = wavefront_memory_high;
-        }
-        auto wf_aligner = wavefront_aligner_new(&wfp);
-
-        wavefront_aligner_set_max_num_threads(wf_aligner,threads);
-        wavefront_aligner_set_heuristic_none(wf_aligner);
-        wavefront_aligner_set_alignment_end_to_end(wf_aligner);
-        wavefront_align(wf_aligner,
-                        reference_M.c_str(), reference_M.size(),
-                        alternateQuery_M.c_str(), alternateQuery_M.size());
-        //cerr << "WFA_input\t" << ">
-        /*
-          cigar_print_pretty(stderr,
-          reference_M.c_str(), reference_M.size(),
-          alternateQuery_M.c_str(), alternateQuery_M.size(),
-          &wf_aligner->cigar,wf_aligner->mm_allocator);
-        */
-        // Fetch CIGAR
-        char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
-        int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
-        // Create string and return
-        cigar = std::string(buffer,buf_len);
-        wavefront_aligner_delete(wf_aligner);
-        if (cigar == "") {
-            if (debug) {
-                cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
-            }
-            cerr << ">fail.pattern" << endl
-                 << reference_M << endl
-                 << ">fail.query" << endl
-                 << alternateQuery_M << endl;
-            variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
-            exit(1);
-        }
-        cigarData = splitUnpackedCigar(cigar);
-
-        if (debug)
-            cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
-
-        if (cigarData.size() == 0) {
-            cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
-                 << "ref " << reference_M << ","
-                 << "allele " << alternateQuery_M << endl;
-
-            exit(1);
-        }
-
-        // now left align!
-        //
-        // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
-        //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
-
-        if (debug) {
-            cigar = joinCigar(cigarData);
-            cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
-        }
-
-        // Walk the CIGAR for one alternate and build up variantAlleles
-        vector<VariantAllele> &variants = variantAlleles[alternate].first;
-        int altpos = 0;
-        int refpos = 0;
-
-        for (auto e: cigarData) {
-            int  mlen  = e.first;  // CIGAR matchlen
-            char mtype = e.second; // CIGAR matchtype
-
-            switch (mtype) {
-            case 'I': // CIGAR INSERT
-            {
-                auto allele = VariantAllele("",
-                                            alternate.substr(altpos, mlen),
-                                            refpos + position);
-                // inject insertion from allele
-                if (variants.size() && variants.back().is_pure_indel()) {
-                    variants.back() = variants.back() + allele;
-                } else {
-                    variants.push_back(allele);
-                }
-                altpos += mlen;
-            }
-            break;
-            case 'D': // CIGAR DELETE
-            {
-                auto allele = VariantAllele(ref.substr(refpos, mlen),
-                                            "", refpos + position);
-                // inject deletion from ref
-                if (variants.size() && variants.back().is_pure_indel()) {
-                    variants.back() = variants.back() + allele;
-                } else {
-                    variants.push_back(allele);
-                }
-                refpos += mlen;
-            }
-            break;
-            case 'M': // CIGAR match and variant
-            case 'X':
-                variants.push_back(VariantAllele(ref.substr(refpos, mlen),
-                                                 alternate.substr(altpos, mlen),
-                                                 refpos + position));
-                refpos += mlen;
-                altpos += mlen;
-                break;
-            case 'S': // S operations specify segments at the start
-                      // and/or end of the query that do not appear in
-                      // a local alignment
-                refpos += mlen;
-                altpos += mlen;
-                break;
-
-            default: // Ignore N, H
-                cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
-                cerr << "pos: " << position << endl;
-                cerr << "cigar: " << cigar << endl;
-                cerr << "ref:   " << reference_M << endl;
-                cerr << "allele:" << alternateQuery_M << endl;
-                if (debug) exit(1);
-                break;
-            } // switch mtype
-        }
-        if (includePreviousBaseForIndels) {
-            for (uint64_t i = 0; i < variants.size(); ++i) {
-                auto& v = variants[i];
-                if (v.is_pure_indel()) {
-                    if (i == 0) { // special case, we're at the beginning
-                        // do we have a next allele?
-                        if (i == variants.size()-1) {
-                            // if not-- panic
-                            cerr << "allele base fail: no subsequent alleles to indel" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                        // else
-                        // while the next allele is an indel
-                        auto j = i+1;
-                        while (j < variants.size()
-                               // stop when we have sequence in both ref or alt
-                               && v.is_pure_indel()) {
-                            auto q = variants[j++];
-                            // merge with us
-                            shift_mid_left(v, q);
-                        }
-                        // panic if we reach the end of the variants
-                        if (j == variants.size() && v.is_pure_indel()) {
-                            cerr << "allele base fail: can't get an additional base next" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                    } else {
-                        // while the next allele is an indel
-                        int j = i-1; // i is guaranteed >=1
-                        while (j >= 0
-                               // stop when we have sequence in both ref or alt
-                               && v.is_pure_indel()) {
-                            auto q = variants[j--];
-                            // move the middle base between the alleles
-                            // to be at the start of v
-                            // or merge if both are pure indels
-                            shift_mid_right(q, v);
-                        }
-                        // panic if we reach the end of the variants
-                        if (j <= 0 && v.is_pure_indel()) {
-                            cerr << "allele base fail: can't get an additional base prev" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                        // while the previous allele is an indel
-                        // merge with us
-                        // stop when we have sequence in both ref or alt
-                        // if allele we reach is a positional match
-                        // panic if we reach the start of the variants
-
-                    }
-                    // try to get the previous reference or alternate base
-                    // continue until the
-                    // we first check if we have an indel just before us
-                    // if so, we need to merge this into the same allele and go further back
-                }
-            }
-        }
-    }
-
-    return variantAlleles;
-}
-
 
 map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
     map<string, vector<VariantAllele> > variantAlleles;


=====================================
src/Variant.h
=====================================
@@ -1,8 +1,8 @@
 /*
     vcflib C++ library for parsing and manipulating VCF files
 
-    Copyright © 2010-2022 Erik Garrison
-    Copyright © 2020-2022 Pjotr Prins
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
 
     This software is published under the MIT License. See the LICENSE file.
 */
@@ -33,7 +33,6 @@
 #include "rkmh.hpp"
 #include "LeftAlign.hpp"
 #include <Fasta.h>
-#include "wavefront/wfa.hpp"
 
 extern "C" {
   #include "filevercmp.h"
@@ -210,18 +209,6 @@ public:
     set<string> altSet(void);  // set of alleles, rather than vector of them
     map<string, int> altAlleleIndexes;  // reverse lookup for alleles
 
-    map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
-           bool includePreviousBaseForIndels = false,
-           bool useMNPs = false,
-           bool useEntropy = false,
-           string flankingRefLeft = "",
-           string flankingRefRight = "",
-           wavefront_aligner_attr_t* wfaParams = NULL,
-           int invKmerLen = 17,
-           int invMinLen = 1000,
-           int threads = 1,
-           bool debug = false);
-
     // Legacy version of parsedAlterneates:
     map<string, vector<VariantAllele> > legacy_parsedAlternates(
            bool includePreviousBaseForIndels = false,


=====================================
src/legacy.cpp
=====================================
@@ -10,6 +10,7 @@
 
 #include <utility>
 #include "Variant.h"
+#include "vcf-wfa.h"
 #include "allele.hpp"
 #include "cigar.hpp"
 


=====================================
src/pythonffi.cpp
=====================================
@@ -1,8 +1,9 @@
 // Python ffi calls C++ functions
 //
-// Copyright © 2022 Pjotr Prins
+// Copyright © 2022-2023 Pjotr Prins
 
 #include "Variant.h"
+#include "vcf-wfa.h"
 
 // Pybind11
 #include <pybind11/pybind11.h>
@@ -59,8 +60,12 @@ PYBIND11_MODULE(pyvcflib, m)
       .def_readonly("sampleNames", &Variant::sampleNames)
       .def_readonly("samples", &Variant::samples)
       .def("legacy_parsedAlternates", &Variant::legacy_parsedAlternates)
-      .def("parsedAlternates", &Variant::parsedAlternates)
       ;
+
+  py::class_<WfaVariant, Variant>(m, "WfaVariant", "WFA VCF record")
+      .def(py::init<VariantCallFile &>() )
+      .def("parsedAlternates", &WfaVariant::parsedAlternates);
+
   py::class_<VariantCallFile>(m, "VariantCallFile", "VCF file")
       .def(py::init())
       .def("openFile",&VariantCallFile::openFile,"Open the VCF")


=====================================
src/vcf-wfa.cpp
=====================================
@@ -0,0 +1,320 @@
+/*
+    vcflib C++ library for parsing and manipulating VCF files
+
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
+
+    This software is published under the MIT License. See the LICENSE file.
+*/
+
+#include "Variant.h"
+#include "vcf-wfa.h"
+
+namespace vcflib {
+
+// parsedAlternates returns a hash of 'ref' and a vector of alts. A
+// single record may be split into multiple records with new
+// 'refs'. In this function Smith-Waterman is used with padding on
+// both sides of a ref and each alt. The SW method quadratic in nature
+// and painful with long sequences. Recently WFA is introduced with
+// runs in linear time.
+//
+// Returns map of [REF,ALTs] with attached VariantAllele records
+
+
+map<string, pair<vector<VariantAllele>,bool> > WfaVariant::parsedAlternates(
+    bool includePreviousBaseForIndels,
+    bool useMNPs,
+    bool useEntropy,
+    string flankingRefLeft,
+    string flankingRefRight,
+    wavefront_aligner_attr_t* wfaParams,
+    int invKmerLen,
+    int invMinLen,
+    int threads,
+    bool debug) {
+
+    // return type is a hash of allele containing a list of variants and a
+    // boolean value signifying an inversion of the variant
+    map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
+
+    if (isSymbolicSV()){
+        // Don't ever align symbolic SVs. It just wrecks things.
+        for (auto& f : this->flatAlternates()) {
+            variantAlleles[f.first] = make_pair(f.second, false);
+        }
+        return variantAlleles;
+    }
+    // add the reference allele
+    variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
+
+    // single SNP case, no ambiguity possible, no need to spend a lot of
+    // compute aligning ref and alt fields
+    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+        variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+        return variantAlleles;
+    }
+
+    const string& reference_M = ref;
+
+    std::vector<rkmh::hash_t> ref_sketch_fwd;
+    std::vector<rkmh::hash_t> ref_sketch_rev;
+    if (invKmerLen && ref.size() >= invKmerLen
+        && ref.size() >= invMinLen) {
+        ref_sketch_fwd = rkmh::hash_sequence(
+            ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
+        string fer = reverse_complement(ref);
+        ref_sketch_rev = rkmh::hash_sequence(
+            fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
+
+    }
+
+    /*
+    for (auto a: alt) { // iterate ALT strings
+        // unsigned int referencePos;
+        string& alternate = a;
+        variantAlleles[alternate]; // create the slot
+    }
+    */
+
+// #pragma omp parallel for
+    for (auto a: alt) { // iterate ALT strings
+        //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
+        //auto& a = alt[idx];
+        // unsigned int referencePos;
+        string alternate = a;
+        pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
+        bool is_inv = false;
+        // get the alt/ref mapping in inversion space
+        if (invKmerLen && alternate.size() >= invKmerLen
+            && alternate.size() >= invMinLen
+            && ref.size() >= invKmerLen
+            && ref.size() >= invMinLen
+            && max((int)ref.size(), (int)alt.size()) > invMinLen) {
+            // check if it's more likely for us to align as an inversion
+            auto alt_sketch = rkmh::hash_sequence(
+                a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
+            if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+                > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
+                is_inv = true;
+                // flip the alt
+                string alternate_rev = reverse_complement(alternate);
+                // cout << "alt: " << alternate_rev << endl;
+                alternate = alternate_rev;
+            }
+            variantAlleles[alternate]; // create slot
+            variantAlleles[alternate].second = is_inv;
+            /*
+            cerr << "comparing "
+                 << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+                 << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
+            */
+        }
+
+        const string& alternateQuery_M = alternate;
+
+        string cigar;
+        vector<pair<int, char> > cigarData;
+
+        /*
+         * WFA2-lib
+         */
+        /*
+        // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
+        WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
+        aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
+        cigar = aligner.getAlignmentCigar();
+        */
+        auto wfp = *wfaParams;
+        size_t max_len = max(reference_M.size(), alternateQuery_M.size());
+        if (max_len > 1000) {
+            wfp.memory_mode = wavefront_memory_ultralow;
+        } else {
+            wfp.memory_mode = wavefront_memory_high;
+        }
+        auto wf_aligner = wavefront_aligner_new(&wfp);
+
+        wavefront_aligner_set_max_num_threads(wf_aligner,threads);
+        wavefront_aligner_set_heuristic_none(wf_aligner);
+        wavefront_aligner_set_alignment_end_to_end(wf_aligner);
+        wavefront_align(wf_aligner,
+                        reference_M.c_str(), reference_M.size(),
+                        alternateQuery_M.c_str(), alternateQuery_M.size());
+        //cerr << "WFA_input\t" << ">
+        /*
+          cigar_print_pretty(stderr,
+          reference_M.c_str(), reference_M.size(),
+          alternateQuery_M.c_str(), alternateQuery_M.size(),
+          &wf_aligner->cigar,wf_aligner->mm_allocator);
+        */
+        // Fetch CIGAR
+        char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
+        int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
+        // Create string and return
+        cigar = std::string(buffer,buf_len);
+        wavefront_aligner_delete(wf_aligner);
+        if (cigar == "") {
+            if (debug) {
+                cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
+            }
+            cerr << ">fail.pattern" << endl
+                 << reference_M << endl
+                 << ">fail.query" << endl
+                 << alternateQuery_M << endl;
+            variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+            exit(1);
+        }
+        cigarData = splitUnpackedCigar(cigar);
+
+        if (debug)
+            cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
+
+        if (cigarData.size() == 0) {
+            cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
+                 << "ref " << reference_M << ","
+                 << "allele " << alternateQuery_M << endl;
+
+            exit(1);
+        }
+
+        // now left align!
+        //
+        // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
+        //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
+
+        if (debug) {
+            cigar = joinCigar(cigarData);
+            cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
+        }
+
+        // Walk the CIGAR for one alternate and build up variantAlleles
+        vector<VariantAllele> &variants = variantAlleles[alternate].first;
+        int altpos = 0;
+        int refpos = 0;
+
+        for (auto e: cigarData) {
+            int  mlen  = e.first;  // CIGAR matchlen
+            char mtype = e.second; // CIGAR matchtype
+
+            switch (mtype) {
+            case 'I': // CIGAR INSERT
+            {
+                auto allele = VariantAllele("",
+                                            alternate.substr(altpos, mlen),
+                                            refpos + position);
+                // inject insertion from allele
+                if (variants.size() && variants.back().is_pure_indel()) {
+                    variants.back() = variants.back() + allele;
+                } else {
+                    variants.push_back(allele);
+                }
+                altpos += mlen;
+            }
+            break;
+            case 'D': // CIGAR DELETE
+            {
+                auto allele = VariantAllele(ref.substr(refpos, mlen),
+                                            "", refpos + position);
+                // inject deletion from ref
+                if (variants.size() && variants.back().is_pure_indel()) {
+                    variants.back() = variants.back() + allele;
+                } else {
+                    variants.push_back(allele);
+                }
+                refpos += mlen;
+            }
+            break;
+            case 'M': // CIGAR match and variant
+            case 'X':
+                variants.push_back(VariantAllele(ref.substr(refpos, mlen),
+                                                 alternate.substr(altpos, mlen),
+                                                 refpos + position));
+                refpos += mlen;
+                altpos += mlen;
+                break;
+            case 'S': // S operations specify segments at the start
+                      // and/or end of the query that do not appear in
+                      // a local alignment
+                refpos += mlen;
+                altpos += mlen;
+                break;
+
+            default: // Ignore N, H
+                cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
+                cerr << "pos: " << position << endl;
+                cerr << "cigar: " << cigar << endl;
+                cerr << "ref:   " << reference_M << endl;
+                cerr << "allele:" << alternateQuery_M << endl;
+                if (debug) exit(1);
+                break;
+            } // switch mtype
+        }
+        if (includePreviousBaseForIndels) {
+            for (uint64_t i = 0; i < variants.size(); ++i) {
+                auto& v = variants[i];
+                if (v.is_pure_indel()) {
+                    if (i == 0) { // special case, we're at the beginning
+                        // do we have a next allele?
+                        if (i == variants.size()-1) {
+                            // if not-- panic
+                            cerr << "allele base fail: no subsequent alleles to indel" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                        // else
+                        // while the next allele is an indel
+                        auto j = i+1;
+                        while (j < variants.size()
+                               // stop when we have sequence in both ref or alt
+                               && v.is_pure_indel()) {
+                            auto q = variants[j++];
+                            // merge with us
+                            shift_mid_left(v, q);
+                        }
+                        // panic if we reach the end of the variants
+                        if (j == variants.size() && v.is_pure_indel()) {
+                            cerr << "allele base fail: can't get an additional base next" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                    } else {
+                        // while the next allele is an indel
+                        int j = i-1; // i is guaranteed >=1
+                        while (j >= 0
+                               // stop when we have sequence in both ref or alt
+                               && v.is_pure_indel()) {
+                            auto q = variants[j--];
+                            // move the middle base between the alleles
+                            // to be at the start of v
+                            // or merge if both are pure indels
+                            shift_mid_right(q, v);
+                        }
+                        // panic if we reach the end of the variants
+                        if (j <= 0 && v.is_pure_indel()) {
+                            cerr << "allele base fail: can't get an additional base prev" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                        // while the previous allele is an indel
+                        // merge with us
+                        // stop when we have sequence in both ref or alt
+                        // if allele we reach is a positional match
+                        // panic if we reach the start of the variants
+
+                    }
+                    // try to get the previous reference or alternate base
+                    // continue until the
+                    // we first check if we have an indel just before us
+                    // if so, we need to merge this into the same allele and go further back
+                }
+            }
+        }
+    }
+
+    return variantAlleles;
+}
+
+} // namespace


=====================================
src/vcf-wfa.h
=====================================
@@ -0,0 +1,36 @@
+/*
+    vcflib C++ library for parsing and manipulating VCF files
+
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
+
+    This software is published under the MIT License. See the LICENSE file.
+*/
+
+#pragma once
+
+#include "wavefront/wfa.hpp"
+
+namespace vcflib {
+
+class WfaVariant : public Variant
+{
+
+public:
+    WfaVariant(VariantCallFile& v) : Variant(v)
+      { };
+
+    map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
+           bool includePreviousBaseForIndels = false,
+           bool useMNPs = false,
+           bool useEntropy = false,
+           string flankingRefLeft = "",
+           string flankingRefRight = "",
+           wavefront_aligner_attr_t* wfaParams = NULL,
+           int invKmerLen = 17,
+           int invMinLen = 1000,
+           int threads = 1,
+           bool debug = false);
+};
+
+}


=====================================
src/vcfwave.cpp
=====================================
@@ -17,6 +17,7 @@
 #include <omp.h>
 
 #include "Variant.h"
+#include "vcf-wfa.h"
 #include "convert.h"
 #include "join.h"
 #include "split.h"
@@ -218,7 +219,7 @@ int main(int argc, char** argv) {
     variantFile.addHeaderLine("##INFO=<ID=INV,Number=A,Type=String,Description=\"Count of haplotypes which are aligned in the inverted orientation using vcflib vcfwave.\">");
     cout << variantFile.header << endl;
 
-    Variant var(variantFile);
+    WfaVariant var(variantFile);
     double amount = 0.0, prev_amount = 0.0;
     uint64_t start = get_timestamp();
 


=====================================
test/tests/realign.py
=====================================
@@ -48,7 +48,7 @@ class RealignTest(unittest.TestCase):
     def test_wfbug2(self):
         vcf = VariantCallFile()
         vcf.openFile("../samples/10134514.vcf")
-        var = Variant(vcf)
+        var = WfaVariant(vcf)
         vcf.getNextVariant(var)
         self.assertEqual(var.name,"grch38#chr4_10083863-10181258.vcf:grch38#chr4")
         self.assertEqual(var.ref,'GGAGAATCCCAATTGATGG')



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/commit/7dedad94a2684662e68466bb3af460f27f26e279

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/commit/7dedad94a2684662e68466bb3af460f27f26e279
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230211/df9a0b71/attachment-0001.htm>


More information about the debian-med-commit mailing list