[med-svn] [Git][med-team/libvcflib][master] 6 commits: routine-update: New upstream version

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Feb 11 15:10:29 GMT 2023



Étienne Mollier pushed to branch master at Debian Med / libvcflib


Commits:
f967d75a by Étienne Mollier at 2023-02-11T15:43:42+01:00
routine-update: New upstream version

- - - - -
7dedad94 by Étienne Mollier at 2023-02-11T15:43:44+01:00
New upstream version 1.0.8+dfsg
- - - - -
64cb33fe by Étienne Mollier at 2023-02-11T15:44:41+01:00
Update upstream source from tag 'upstream/1.0.8+dfsg'

Update to upstream version '1.0.8+dfsg'
with Debian dir 5a92572a6ab838586acd820dadfabd883d5555a7
- - - - -
8b337461 by Étienne Mollier at 2023-02-11T16:05:26+01:00
d/patches: refresh everything.

- - - - -
ef3016d1 by Étienne Mollier at 2023-02-11T16:07:05+01:00
wfa2_linking: add patch; fix linker errors.

- - - - -
6c92e525 by Étienne Mollier at 2023-02-11T16:09:18+01:00
update changelog.

- - - - -


18 changed files:

- CMakeLists.txt
- debian/changelog
- debian/patches/no_fsom
- debian/patches/pkg-config.patch
- debian/patches/series
- debian/patches/shared_lib.patch
- debian/patches/tabix_linking
- debian/patches/use_debian_packaged_fastahack.patch
- debian/patches/use_debian_packaged_smithwaterman.patch
- + debian/patches/wfa2_linking
- src/Variant.cpp
- src/Variant.h
- src/legacy.cpp
- src/pythonffi.cpp
- + src/vcf-wfa.cpp
- + src/vcf-wfa.h
- src/vcfwave.cpp
- test/tests/realign.py


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -43,6 +43,8 @@ option(WFA_GITMODULE "Force local git submodule for WFA2LIB" ON) # disable in di
 include(CheckIPOSupported) # adds lto
 check_ipo_supported(RESULT ipo_supported OUTPUT output)
 
+set(EXTRA_FLAGS " ") # otherwise it injects OFF
+
 # ---- Dependencies
 
 if(OPENMP)
@@ -143,6 +145,7 @@ file(GLOB INCLUDES
 set(vcflib_SOURCE
     src/vcf-c-api.cpp
     src/legacy.cpp
+    src/vcf-wfa.cpp
     src/Variant.cpp
     src/rnglib.cpp
     src/var.cpp
@@ -434,9 +437,15 @@ if (NOT BUILD_ONLY_LIB)
   foreach(BIN ${BINS})
     add_executable(${BIN} src/${BIN}.cpp)
     add_dependencies(${BIN} vcflib)
-    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib ${WFALIB})
+    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib)
   endforeach(BIN ${BINS})
-  # target_link_libraries(vcfwave PUBLIC ${WFALIB})
+  # These binaries include WFALIB
+  target_link_libraries(vcfallelicprimitives PUBLIC ${WFALIB})
+  target_link_libraries(vcfcleancomplex PUBLIC ${WFALIB})
+  target_link_libraries(vcfparsealts PUBLIC ${WFALIB})
+  target_link_libraries(vcfroc PUBLIC ${WFALIB})
+  target_link_libraries(vcfstats PUBLIC ${WFALIB})
+  target_link_libraries(vcfwave PUBLIC ${WFALIB})
   install(TARGETS ${BINS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
   # ---- Copy scripts
@@ -456,6 +465,7 @@ install(TARGETS pyvcflib LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 enable_testing()
 
+
 function(add_pytest TEST_FILE)
   add_test(
       NAME ${TEST_FILE}


=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+libvcflib (1.0.8+dfsg-1) UNRELEASED; urgency=medium
+
+  * Team upload.
+  * New upstream version 1.0.8+dfsg
+  * d/patches: refresh everything.
+  * wfa2_linking: add patch; fix linker errors.
+
+ -- Étienne Mollier <emollier at debian.org>  Sat, 11 Feb 2023 16:07:36 +0100
+
 libvcflib (1.0.7+dfsg-2) unstable; urgency=medium
 
   * Restrict architectures to 64bit only


=====================================
debian/patches/no_fsom
=====================================
@@ -1,9 +1,9 @@
 From: Michael R. Crusoe <crusoe at debian.org>
 Subject: fsom is not used
 Forwarded: not-needed
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -161,7 +161,6 @@
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -164,7 +164,6 @@
      contrib/smithwaterman/IndelAllele.cpp
      contrib/smithwaterman/disorder.cpp
      contrib/smithwaterman/LeftAlign.cpp


=====================================
debian/patches/pkg-config.patch
=====================================
@@ -3,7 +3,7 @@ Last-Update: Fri, 10 Feb 2017 09:09:35 +0100
 Description: Add pkg-config file to simplify building of freebayes
 Forwarded: not-needed
 --- /dev/null
-+++ b/libvcflib.pc
++++ libvcflib/libvcflib.pc
 @@ -0,0 +1,12 @@
 +prefix=/usr
 +exec_prefix=${prefix}


=====================================
debian/patches/series
=====================================
@@ -6,3 +6,4 @@ shared_lib.patch
 pkg-config.patch
 tabix_linking
 #tests ## FIXME: deactivated for the moment
+wfa2_linking


=====================================
debian/patches/shared_lib.patch
=====================================
@@ -2,9 +2,9 @@ Author: Andreas Tille <tille at debian.org>
 Last-Update: Thu, 15 Sep 2016 22:26:26 +0200
 Description: Create shared lib instead of static
 Forwarded: not-needed
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -157,19 +157,19 @@
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -160,19 +160,19 @@
      contrib/c-progress-bar/progress.c
  )
  
@@ -30,7 +30,7 @@ Forwarded: not-needed
  set(BINS
      vcfecho
      dumpContigsFromHeader
-@@ -302,6 +302,10 @@
+@@ -305,6 +305,10 @@
  file (STRINGS "VERSION" BUILD_NUMBER)
  add_definitions(-DVCFLIB_VERSION="${BUILD_NUMBER}")
  add_definitions(-DVERSION="${BUILD_NUMBER}")


=====================================
debian/patches/tabix_linking
=====================================
@@ -1,9 +1,9 @@
 Author: Michael R. Crusoe <crusoe at debian.org>
 Description: link against tabixpp
 Forwarded: not-needed
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -168,6 +168,7 @@
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -171,6 +171,7 @@
  target_link_libraries(vcflib
      PkgConfig::SMITHWATERMAN
      PkgConfig::FASTAHACK


=====================================
debian/patches/use_debian_packaged_fastahack.patch
=====================================
@@ -2,9 +2,9 @@ Author: Andreas Tille <tille at debian.org>
 Last-Update: 2021-01-28
 Description: Use Debian packaged libfastahack (and libdisorder)
 Forwarded: not-needed
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -59,6 +59,7 @@
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -61,6 +61,7 @@
  pkg_check_modules(HTSLIB htslib)   # Optionally builds from contrib/
  pkg_check_modules(TABIXPP tabixpp) # Optionally builds from contrib/
  pkg_check_modules(SMITHWATERMAN REQUIRED IMPORTED_TARGET libsmithwaterman)
@@ -12,7 +12,7 @@ Forwarded: not-needed
  
  # ---- Build switches
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-@@ -108,7 +109,6 @@
+@@ -110,7 +111,6 @@
  # ---- Include files
  
  include_directories(include)
@@ -20,7 +20,7 @@ Forwarded: not-needed
  include_directories(contrib/intervaltree)
  include_directories(contrib/multichoose)
  include_directories(contrib/filevercmp)
-@@ -135,7 +135,6 @@
+@@ -137,7 +137,6 @@
    src/*.h*
    contrib/multichoose/*.h*
    contrib/intervaltree/*.h*
@@ -28,7 +28,7 @@ Forwarded: not-needed
    contrib/filevercmp/*.h*
    )
  
-@@ -154,13 +153,13 @@
+@@ -157,13 +156,13 @@
      src/LeftAlign.cpp
      src/cigar.cpp
      src/allele.cpp
@@ -43,8 +43,8 @@ Forwarded: not-needed
      )
  
  if (TABIXPP_LOCAL) # add the tabixpp source file
---- a/src/vcf2dag.cpp
-+++ b/src/vcf2dag.cpp
+--- libvcflib.orig/src/vcf2dag.cpp
++++ libvcflib/src/vcf2dag.cpp
 @@ -11,7 +11,7 @@
  #include "BedReader.h"
  #include "IntervalTree.h"
@@ -54,8 +54,8 @@ Forwarded: not-needed
  #include <algorithm>
  #include <list>
  #include <set>
---- a/src/vcf2fasta.cpp
-+++ b/src/vcf2fasta.cpp
+--- libvcflib.orig/src/vcf2fasta.cpp
++++ libvcflib/src/vcf2fasta.cpp
 @@ -13,7 +13,7 @@
  #include "split.h"
  #include <set>
@@ -65,8 +65,8 @@ Forwarded: not-needed
  #include <iostream>
  #include <fstream>
  
---- a/src/vcfcheck.cpp
-+++ b/src/vcfcheck.cpp
+--- libvcflib.orig/src/vcfcheck.cpp
++++ libvcflib/src/vcfcheck.cpp
 @@ -10,7 +10,7 @@
  
  #include "Variant.h"
@@ -76,8 +76,8 @@ Forwarded: not-needed
  #include "gpatInfo.hpp"
  #include <getopt.h>
  #include <string.h>
---- a/src/vcfentropy.cpp
-+++ b/src/vcfentropy.cpp
+--- libvcflib.orig/src/vcfentropy.cpp
++++ libvcflib/src/vcfentropy.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -87,8 +87,8 @@ Forwarded: not-needed
  #include <getopt.h>
  #include "disorder.h"
  
---- a/src/vcfevenregions.cpp
-+++ b/src/vcfevenregions.cpp
+--- libvcflib.orig/src/vcfevenregions.cpp
++++ libvcflib/src/vcfevenregions.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -98,8 +98,8 @@ Forwarded: not-needed
  #include <getopt.h>
  #include <cmath>
  
---- a/src/vcfgeno2haplo.cpp
-+++ b/src/vcfgeno2haplo.cpp
+--- libvcflib.orig/src/vcfgeno2haplo.cpp
++++ libvcflib/src/vcfgeno2haplo.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -109,8 +109,8 @@ Forwarded: not-needed
  #include "gpatInfo.hpp"
  #include <algorithm>
  #include <list>
---- a/src/vcfinfosummarize.cpp
-+++ b/src/vcfinfosummarize.cpp
+--- libvcflib.orig/src/vcfinfosummarize.cpp
++++ libvcflib/src/vcfinfosummarize.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -120,8 +120,8 @@ Forwarded: not-needed
  #include "gpatInfo.hpp"
  #include <getopt.h>
  #include <algorithm>
---- a/src/vcfintersect.cpp
-+++ b/src/vcfintersect.cpp
+--- libvcflib.orig/src/vcfintersect.cpp
++++ libvcflib/src/vcfintersect.cpp
 @@ -11,7 +11,7 @@
  #include "BedReader.h"
  #include "IntervalTree.h"
@@ -131,8 +131,8 @@ Forwarded: not-needed
  #include <algorithm>
  #include <list>
  #include <set>
---- a/src/vcfleftalign.cpp
-+++ b/src/vcfleftalign.cpp
+--- libvcflib.orig/src/vcfleftalign.cpp
++++ libvcflib/src/vcfleftalign.cpp
 @@ -11,7 +11,7 @@
  #include "convert.h"
  #include "join.h"
@@ -142,8 +142,8 @@ Forwarded: not-needed
  #include <set>
  #include <vector>
  #include <getopt.h>
---- a/src/vcfnormalizesvs.cpp
-+++ b/src/vcfnormalizesvs.cpp
+--- libvcflib.orig/src/vcfnormalizesvs.cpp
++++ libvcflib/src/vcfnormalizesvs.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -153,8 +153,8 @@ Forwarded: not-needed
  
  using namespace std;
  using namespace vcflib;
---- a/src/vcfprimers.cpp
-+++ b/src/vcfprimers.cpp
+--- libvcflib.orig/src/vcfprimers.cpp
++++ libvcflib/src/vcfprimers.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"
@@ -164,8 +164,8 @@ Forwarded: not-needed
  #include <getopt.h>
  
  using namespace std;
---- a/src/vcfremap.cpp
-+++ b/src/vcfremap.cpp
+--- libvcflib.orig/src/vcfremap.cpp
++++ libvcflib/src/vcfremap.cpp
 @@ -11,7 +11,7 @@
  #include "BedReader.h"
  #include "IntervalTree.h"
@@ -175,8 +175,8 @@ Forwarded: not-needed
  #include <algorithm>
  #include <list>
  #include <set>
---- a/src/vcfroc.cpp
-+++ b/src/vcfroc.cpp
+--- libvcflib.orig/src/vcfroc.cpp
++++ libvcflib/src/vcfroc.cpp
 @@ -11,7 +11,7 @@
  #include "BedReader.h"
  #include "IntervalTree.h"
@@ -186,8 +186,8 @@ Forwarded: not-needed
  #include <algorithm>
  #include <list>
  #include <set>
---- a/src/vcfsample2info.cpp
-+++ b/src/vcfsample2info.cpp
+--- libvcflib.orig/src/vcfsample2info.cpp
++++ libvcflib/src/vcfsample2info.cpp
 @@ -9,7 +9,7 @@
  
  #include "Variant.h"


=====================================
debian/patches/use_debian_packaged_smithwaterman.patch
=====================================
@@ -2,8 +2,8 @@ Author: Andreas Tille <tille at debian.org>
 Last-Update: 2021-01-28
 Description: Use Debian packaged libsmithwaterman
 Forwarded: not-needed
---- a/src/Variant.h
-+++ b/src/Variant.h
+--- libvcflib.orig/src/Variant.h
++++ libvcflib/src/Variant.h
 @@ -26,7 +26,7 @@
  #include "split.h"
  #include "join.h"
@@ -13,9 +13,9 @@ Forwarded: not-needed
  #include "ssw_cpp.hpp"
  #include "convert.h"
  #include "multichoose.h"
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -58,6 +58,7 @@
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -60,6 +60,7 @@
  
  pkg_check_modules(HTSLIB htslib)   # Optionally builds from contrib/
  pkg_check_modules(TABIXPP tabixpp) # Optionally builds from contrib/
@@ -23,7 +23,7 @@ Forwarded: not-needed
  
  # ---- Build switches
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-@@ -109,7 +110,6 @@
+@@ -111,7 +112,6 @@
  include_directories(include)
  include_directories(contrib/fastahack)
  include_directories(contrib/intervaltree)
@@ -31,7 +31,7 @@ Forwarded: not-needed
  include_directories(contrib/multichoose)
  include_directories(contrib/filevercmp)
  include_directories(contrib/c-progress-bar)
-@@ -135,7 +135,6 @@
+@@ -137,7 +137,6 @@
    src/*.h*
    contrib/multichoose/*.h*
    contrib/intervaltree/*.h*
@@ -39,7 +39,7 @@ Forwarded: not-needed
    contrib/fastahack/*.h*
    contrib/filevercmp/*.h*
    )
-@@ -156,15 +155,14 @@
+@@ -159,15 +158,14 @@
      src/cigar.cpp
      src/allele.cpp
      contrib/fastahack/Fasta.cpp


=====================================
debian/patches/wfa2_linking
=====================================
@@ -0,0 +1,17 @@
+Description: set -lwfa2 for the linker
+ This works around a failure to catch the wfa2 library by the CMake files.
+Author: Étienne Mollier <emollier at debian.org>
+Forwarded: no
+Last-Update: 2023-02-11
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- libvcflib.orig/CMakeLists.txt
++++ libvcflib/CMakeLists.txt
+@@ -172,6 +172,7 @@
+     PkgConfig::SMITHWATERMAN
+     PkgConfig::FASTAHACK
+     tabixpp
++    wfa2
+     )
+ 
+ set(BINS


=====================================
src/Variant.cpp
=====================================
@@ -2082,311 +2082,6 @@ int ploidy(const map<int, int>& genotype) {
 
 
 
-// parsedAlternates returns a hash of 'ref' and a vector of alts. A
-// single record may be split into multiple records with new
-// 'refs'. In this function Smith-Waterman is used with padding on
-// both sides of a ref and each alt. The SW method quadratic in nature
-// and painful with long sequences. Recently WFA is introduced with
-// runs in linear time.
-//
-// Returns map of [REF,ALTs] with attached VariantAllele records
-
-
-map<string, pair<vector<VariantAllele>,bool> > Variant::parsedAlternates(
-    bool includePreviousBaseForIndels,
-    bool useMNPs,
-    bool useEntropy,
-    string flankingRefLeft,
-    string flankingRefRight,
-    wavefront_aligner_attr_t* wfaParams,
-    int invKmerLen,
-    int invMinLen,
-    int threads,
-    bool debug) {
-
-    // return type is a hash of allele containing a list of variants and a
-    // boolean value signifying an inversion of the variant
-    map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
-
-    if (isSymbolicSV()){
-        // Don't ever align symbolic SVs. It just wrecks things.
-        for (auto& f : this->flatAlternates()) {
-            variantAlleles[f.first] = make_pair(f.second, false);
-        }
-        return variantAlleles;
-    }
-    // add the reference allele
-    variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
-
-    // single SNP case, no ambiguity possible, no need to spend a lot of
-    // compute aligning ref and alt fields
-    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
-        variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
-        return variantAlleles;
-    }
-
-    const string& reference_M = ref;
-
-    std::vector<rkmh::hash_t> ref_sketch_fwd;
-    std::vector<rkmh::hash_t> ref_sketch_rev;
-    if (invKmerLen && ref.size() >= invKmerLen
-        && ref.size() >= invMinLen) {
-        ref_sketch_fwd = rkmh::hash_sequence(
-            ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
-        string fer = reverse_complement(ref);
-        ref_sketch_rev = rkmh::hash_sequence(
-            fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
-
-    }
-
-    /*
-    for (auto a: alt) { // iterate ALT strings
-        // unsigned int referencePos;
-        string& alternate = a;
-        variantAlleles[alternate]; // create the slot
-    }
-    */
-
-// #pragma omp parallel for
-    for (auto a: alt) { // iterate ALT strings
-        //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
-        //auto& a = alt[idx];
-        // unsigned int referencePos;
-        string alternate = a;
-        pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
-        bool is_inv = false;
-        // get the alt/ref mapping in inversion space
-        if (invKmerLen && alternate.size() >= invKmerLen
-            && alternate.size() >= invMinLen
-            && ref.size() >= invKmerLen
-            && ref.size() >= invMinLen
-            && max((int)ref.size(), (int)alt.size()) > invMinLen) {
-            // check if it's more likely for us to align as an inversion
-            auto alt_sketch = rkmh::hash_sequence(
-                a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
-            if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
-                > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
-                is_inv = true;
-                // flip the alt
-                string alternate_rev = reverse_complement(alternate);
-                // cout << "alt: " << alternate_rev << endl;
-                alternate = alternate_rev;
-            }
-            variantAlleles[alternate]; // create slot
-            variantAlleles[alternate].second = is_inv;
-            /*
-            cerr << "comparing "
-                 << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
-                 << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
-            */
-        }
-
-        const string& alternateQuery_M = alternate;
-
-        string cigar;
-        vector<pair<int, char> > cigarData;
-
-        /*
-         * WFA2-lib
-         */
-        /*
-        // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
-        WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
-        aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
-        cigar = aligner.getAlignmentCigar();
-        */
-        auto wfp = *wfaParams;
-        size_t max_len = max(reference_M.size(), alternateQuery_M.size());
-        if (max_len > 1000) {
-            wfp.memory_mode = wavefront_memory_ultralow;
-        } else {
-            wfp.memory_mode = wavefront_memory_high;
-        }
-        auto wf_aligner = wavefront_aligner_new(&wfp);
-
-        wavefront_aligner_set_max_num_threads(wf_aligner,threads);
-        wavefront_aligner_set_heuristic_none(wf_aligner);
-        wavefront_aligner_set_alignment_end_to_end(wf_aligner);
-        wavefront_align(wf_aligner,
-                        reference_M.c_str(), reference_M.size(),
-                        alternateQuery_M.c_str(), alternateQuery_M.size());
-        //cerr << "WFA_input\t" << ">
-        /*
-          cigar_print_pretty(stderr,
-          reference_M.c_str(), reference_M.size(),
-          alternateQuery_M.c_str(), alternateQuery_M.size(),
-          &wf_aligner->cigar,wf_aligner->mm_allocator);
-        */
-        // Fetch CIGAR
-        char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
-        int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
-        // Create string and return
-        cigar = std::string(buffer,buf_len);
-        wavefront_aligner_delete(wf_aligner);
-        if (cigar == "") {
-            if (debug) {
-                cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
-            }
-            cerr << ">fail.pattern" << endl
-                 << reference_M << endl
-                 << ">fail.query" << endl
-                 << alternateQuery_M << endl;
-            variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
-            exit(1);
-        }
-        cigarData = splitUnpackedCigar(cigar);
-
-        if (debug)
-            cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
-
-        if (cigarData.size() == 0) {
-            cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
-                 << "ref " << reference_M << ","
-                 << "allele " << alternateQuery_M << endl;
-
-            exit(1);
-        }
-
-        // now left align!
-        //
-        // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
-        //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
-
-        if (debug) {
-            cigar = joinCigar(cigarData);
-            cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
-        }
-
-        // Walk the CIGAR for one alternate and build up variantAlleles
-        vector<VariantAllele> &variants = variantAlleles[alternate].first;
-        int altpos = 0;
-        int refpos = 0;
-
-        for (auto e: cigarData) {
-            int  mlen  = e.first;  // CIGAR matchlen
-            char mtype = e.second; // CIGAR matchtype
-
-            switch (mtype) {
-            case 'I': // CIGAR INSERT
-            {
-                auto allele = VariantAllele("",
-                                            alternate.substr(altpos, mlen),
-                                            refpos + position);
-                // inject insertion from allele
-                if (variants.size() && variants.back().is_pure_indel()) {
-                    variants.back() = variants.back() + allele;
-                } else {
-                    variants.push_back(allele);
-                }
-                altpos += mlen;
-            }
-            break;
-            case 'D': // CIGAR DELETE
-            {
-                auto allele = VariantAllele(ref.substr(refpos, mlen),
-                                            "", refpos + position);
-                // inject deletion from ref
-                if (variants.size() && variants.back().is_pure_indel()) {
-                    variants.back() = variants.back() + allele;
-                } else {
-                    variants.push_back(allele);
-                }
-                refpos += mlen;
-            }
-            break;
-            case 'M': // CIGAR match and variant
-            case 'X':
-                variants.push_back(VariantAllele(ref.substr(refpos, mlen),
-                                                 alternate.substr(altpos, mlen),
-                                                 refpos + position));
-                refpos += mlen;
-                altpos += mlen;
-                break;
-            case 'S': // S operations specify segments at the start
-                      // and/or end of the query that do not appear in
-                      // a local alignment
-                refpos += mlen;
-                altpos += mlen;
-                break;
-
-            default: // Ignore N, H
-                cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
-                cerr << "pos: " << position << endl;
-                cerr << "cigar: " << cigar << endl;
-                cerr << "ref:   " << reference_M << endl;
-                cerr << "allele:" << alternateQuery_M << endl;
-                if (debug) exit(1);
-                break;
-            } // switch mtype
-        }
-        if (includePreviousBaseForIndels) {
-            for (uint64_t i = 0; i < variants.size(); ++i) {
-                auto& v = variants[i];
-                if (v.is_pure_indel()) {
-                    if (i == 0) { // special case, we're at the beginning
-                        // do we have a next allele?
-                        if (i == variants.size()-1) {
-                            // if not-- panic
-                            cerr << "allele base fail: no subsequent alleles to indel" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                        // else
-                        // while the next allele is an indel
-                        auto j = i+1;
-                        while (j < variants.size()
-                               // stop when we have sequence in both ref or alt
-                               && v.is_pure_indel()) {
-                            auto q = variants[j++];
-                            // merge with us
-                            shift_mid_left(v, q);
-                        }
-                        // panic if we reach the end of the variants
-                        if (j == variants.size() && v.is_pure_indel()) {
-                            cerr << "allele base fail: can't get an additional base next" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                    } else {
-                        // while the next allele is an indel
-                        int j = i-1; // i is guaranteed >=1
-                        while (j >= 0
-                               // stop when we have sequence in both ref or alt
-                               && v.is_pure_indel()) {
-                            auto q = variants[j--];
-                            // move the middle base between the alleles
-                            // to be at the start of v
-                            // or merge if both are pure indels
-                            shift_mid_right(q, v);
-                        }
-                        // panic if we reach the end of the variants
-                        if (j <= 0 && v.is_pure_indel()) {
-                            cerr << "allele base fail: can't get an additional base prev" << endl;
-                            cerr << v << endl;
-                            cerr << "variant at " << position << endl;
-                            exit(1);
-                        }
-                        // while the previous allele is an indel
-                        // merge with us
-                        // stop when we have sequence in both ref or alt
-                        // if allele we reach is a positional match
-                        // panic if we reach the start of the variants
-
-                    }
-                    // try to get the previous reference or alternate base
-                    // continue until the
-                    // we first check if we have an indel just before us
-                    // if so, we need to merge this into the same allele and go further back
-                }
-            }
-        }
-    }
-
-    return variantAlleles;
-}
-
 
 map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
     map<string, vector<VariantAllele> > variantAlleles;


=====================================
src/Variant.h
=====================================
@@ -1,8 +1,8 @@
 /*
     vcflib C++ library for parsing and manipulating VCF files
 
-    Copyright © 2010-2022 Erik Garrison
-    Copyright © 2020-2022 Pjotr Prins
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
 
     This software is published under the MIT License. See the LICENSE file.
 */
@@ -33,7 +33,6 @@
 #include "rkmh.hpp"
 #include "LeftAlign.hpp"
 #include <Fasta.h>
-#include "wavefront/wfa.hpp"
 
 extern "C" {
   #include "filevercmp.h"
@@ -210,18 +209,6 @@ public:
     set<string> altSet(void);  // set of alleles, rather than vector of them
     map<string, int> altAlleleIndexes;  // reverse lookup for alleles
 
-    map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
-           bool includePreviousBaseForIndels = false,
-           bool useMNPs = false,
-           bool useEntropy = false,
-           string flankingRefLeft = "",
-           string flankingRefRight = "",
-           wavefront_aligner_attr_t* wfaParams = NULL,
-           int invKmerLen = 17,
-           int invMinLen = 1000,
-           int threads = 1,
-           bool debug = false);
-
     // Legacy version of parsedAlterneates:
     map<string, vector<VariantAllele> > legacy_parsedAlternates(
            bool includePreviousBaseForIndels = false,


=====================================
src/legacy.cpp
=====================================
@@ -10,6 +10,7 @@
 
 #include <utility>
 #include "Variant.h"
+#include "vcf-wfa.h"
 #include "allele.hpp"
 #include "cigar.hpp"
 


=====================================
src/pythonffi.cpp
=====================================
@@ -1,8 +1,9 @@
 // Python ffi calls C++ functions
 //
-// Copyright © 2022 Pjotr Prins
+// Copyright © 2022-2023 Pjotr Prins
 
 #include "Variant.h"
+#include "vcf-wfa.h"
 
 // Pybind11
 #include <pybind11/pybind11.h>
@@ -59,8 +60,12 @@ PYBIND11_MODULE(pyvcflib, m)
       .def_readonly("sampleNames", &Variant::sampleNames)
       .def_readonly("samples", &Variant::samples)
       .def("legacy_parsedAlternates", &Variant::legacy_parsedAlternates)
-      .def("parsedAlternates", &Variant::parsedAlternates)
       ;
+
+  py::class_<WfaVariant, Variant>(m, "WfaVariant", "WFA VCF record")
+      .def(py::init<VariantCallFile &>() )
+      .def("parsedAlternates", &WfaVariant::parsedAlternates);
+
   py::class_<VariantCallFile>(m, "VariantCallFile", "VCF file")
       .def(py::init())
       .def("openFile",&VariantCallFile::openFile,"Open the VCF")


=====================================
src/vcf-wfa.cpp
=====================================
@@ -0,0 +1,320 @@
+/*
+    vcflib C++ library for parsing and manipulating VCF files
+
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
+
+    This software is published under the MIT License. See the LICENSE file.
+*/
+
+#include "Variant.h"
+#include "vcf-wfa.h"
+
+namespace vcflib {
+
+// parsedAlternates returns a hash of 'ref' and a vector of alts. A
+// single record may be split into multiple records with new
+// 'refs'. In this function Smith-Waterman is used with padding on
+// both sides of a ref and each alt. The SW method quadratic in nature
+// and painful with long sequences. Recently WFA is introduced with
+// runs in linear time.
+//
+// Returns map of [REF,ALTs] with attached VariantAllele records
+
+
+map<string, pair<vector<VariantAllele>,bool> > WfaVariant::parsedAlternates(
+    bool includePreviousBaseForIndels,
+    bool useMNPs,
+    bool useEntropy,
+    string flankingRefLeft,
+    string flankingRefRight,
+    wavefront_aligner_attr_t* wfaParams,
+    int invKmerLen,
+    int invMinLen,
+    int threads,
+    bool debug) {
+
+    // return type is a hash of allele containing a list of variants and a
+    // boolean value signifying an inversion of the variant
+    map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
+
+    if (isSymbolicSV()){
+        // Don't ever align symbolic SVs. It just wrecks things.
+        for (auto& f : this->flatAlternates()) {
+            variantAlleles[f.first] = make_pair(f.second, false);
+        }
+        return variantAlleles;
+    }
+    // add the reference allele
+    variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
+
+    // single SNP case, no ambiguity possible, no need to spend a lot of
+    // compute aligning ref and alt fields
+    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+        variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+        return variantAlleles;
+    }
+
+    const string& reference_M = ref;
+
+    std::vector<rkmh::hash_t> ref_sketch_fwd;
+    std::vector<rkmh::hash_t> ref_sketch_rev;
+    if (invKmerLen && ref.size() >= invKmerLen
+        && ref.size() >= invMinLen) {
+        ref_sketch_fwd = rkmh::hash_sequence(
+            ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
+        string fer = reverse_complement(ref);
+        ref_sketch_rev = rkmh::hash_sequence(
+            fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
+
+    }
+
+    /*
+    for (auto a: alt) { // iterate ALT strings
+        // unsigned int referencePos;
+        string& alternate = a;
+        variantAlleles[alternate]; // create the slot
+    }
+    */
+
+// #pragma omp parallel for
+    for (auto a: alt) { // iterate ALT strings
+        //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
+        //auto& a = alt[idx];
+        // unsigned int referencePos;
+        string alternate = a;
+        pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
+        bool is_inv = false;
+        // get the alt/ref mapping in inversion space
+        if (invKmerLen && alternate.size() >= invKmerLen
+            && alternate.size() >= invMinLen
+            && ref.size() >= invKmerLen
+            && ref.size() >= invMinLen
+            && max((int)ref.size(), (int)alt.size()) > invMinLen) {
+            // check if it's more likely for us to align as an inversion
+            auto alt_sketch = rkmh::hash_sequence(
+                a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
+            if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+                > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
+                is_inv = true;
+                // flip the alt
+                string alternate_rev = reverse_complement(alternate);
+                // cout << "alt: " << alternate_rev << endl;
+                alternate = alternate_rev;
+            }
+            variantAlleles[alternate]; // create slot
+            variantAlleles[alternate].second = is_inv;
+            /*
+            cerr << "comparing "
+                 << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+                 << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
+            */
+        }
+
+        const string& alternateQuery_M = alternate;
+
+        string cigar;
+        vector<pair<int, char> > cigarData;
+
+        /*
+         * WFA2-lib
+         */
+        /*
+        // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
+        WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
+        aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
+        cigar = aligner.getAlignmentCigar();
+        */
+        auto wfp = *wfaParams;
+        size_t max_len = max(reference_M.size(), alternateQuery_M.size());
+        if (max_len > 1000) {
+            wfp.memory_mode = wavefront_memory_ultralow;
+        } else {
+            wfp.memory_mode = wavefront_memory_high;
+        }
+        auto wf_aligner = wavefront_aligner_new(&wfp);
+
+        wavefront_aligner_set_max_num_threads(wf_aligner,threads);
+        wavefront_aligner_set_heuristic_none(wf_aligner);
+        wavefront_aligner_set_alignment_end_to_end(wf_aligner);
+        wavefront_align(wf_aligner,
+                        reference_M.c_str(), reference_M.size(),
+                        alternateQuery_M.c_str(), alternateQuery_M.size());
+        //cerr << "WFA_input\t" << ">
+        /*
+          cigar_print_pretty(stderr,
+          reference_M.c_str(), reference_M.size(),
+          alternateQuery_M.c_str(), alternateQuery_M.size(),
+          &wf_aligner->cigar,wf_aligner->mm_allocator);
+        */
+        // Fetch CIGAR
+        char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
+        int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
+        // Create string and return
+        cigar = std::string(buffer,buf_len);
+        wavefront_aligner_delete(wf_aligner);
+        if (cigar == "") {
+            if (debug) {
+                cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
+            }
+            cerr << ">fail.pattern" << endl
+                 << reference_M << endl
+                 << ">fail.query" << endl
+                 << alternateQuery_M << endl;
+            variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+            exit(1);
+        }
+        cigarData = splitUnpackedCigar(cigar);
+
+        if (debug)
+            cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
+
+        if (cigarData.size() == 0) {
+            cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
+                 << "ref " << reference_M << ","
+                 << "allele " << alternateQuery_M << endl;
+
+            exit(1);
+        }
+
+        // now left align!
+        //
+        // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
+        //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
+
+        if (debug) {
+            cigar = joinCigar(cigarData);
+            cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
+        }
+
+        // Walk the CIGAR for one alternate and build up variantAlleles
+        vector<VariantAllele> &variants = variantAlleles[alternate].first;
+        int altpos = 0;
+        int refpos = 0;
+
+        for (auto e: cigarData) {
+            int  mlen  = e.first;  // CIGAR matchlen
+            char mtype = e.second; // CIGAR matchtype
+
+            switch (mtype) {
+            case 'I': // CIGAR INSERT
+            {
+                auto allele = VariantAllele("",
+                                            alternate.substr(altpos, mlen),
+                                            refpos + position);
+                // inject insertion from allele
+                if (variants.size() && variants.back().is_pure_indel()) {
+                    variants.back() = variants.back() + allele;
+                } else {
+                    variants.push_back(allele);
+                }
+                altpos += mlen;
+            }
+            break;
+            case 'D': // CIGAR DELETE
+            {
+                auto allele = VariantAllele(ref.substr(refpos, mlen),
+                                            "", refpos + position);
+                // inject deletion from ref
+                if (variants.size() && variants.back().is_pure_indel()) {
+                    variants.back() = variants.back() + allele;
+                } else {
+                    variants.push_back(allele);
+                }
+                refpos += mlen;
+            }
+            break;
+            case 'M': // CIGAR match and variant
+            case 'X':
+                variants.push_back(VariantAllele(ref.substr(refpos, mlen),
+                                                 alternate.substr(altpos, mlen),
+                                                 refpos + position));
+                refpos += mlen;
+                altpos += mlen;
+                break;
+            case 'S': // S operations specify segments at the start
+                      // and/or end of the query that do not appear in
+                      // a local alignment
+                refpos += mlen;
+                altpos += mlen;
+                break;
+
+            default: // Ignore N, H
+                cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
+                cerr << "pos: " << position << endl;
+                cerr << "cigar: " << cigar << endl;
+                cerr << "ref:   " << reference_M << endl;
+                cerr << "allele:" << alternateQuery_M << endl;
+                if (debug) exit(1);
+                break;
+            } // switch mtype
+        }
+        if (includePreviousBaseForIndels) {
+            for (uint64_t i = 0; i < variants.size(); ++i) {
+                auto& v = variants[i];
+                if (v.is_pure_indel()) {
+                    if (i == 0) { // special case, we're at the beginning
+                        // do we have a next allele?
+                        if (i == variants.size()-1) {
+                            // if not-- panic
+                            cerr << "allele base fail: no subsequent alleles to indel" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                        // else
+                        // while the next allele is an indel
+                        auto j = i+1;
+                        while (j < variants.size()
+                               // stop when we have sequence in both ref or alt
+                               && v.is_pure_indel()) {
+                            auto q = variants[j++];
+                            // merge with us
+                            shift_mid_left(v, q);
+                        }
+                        // panic if we reach the end of the variants
+                        if (j == variants.size() && v.is_pure_indel()) {
+                            cerr << "allele base fail: can't get an additional base next" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                    } else {
+                        // while the next allele is an indel
+                        int j = i-1; // i is guaranteed >=1
+                        while (j >= 0
+                               // stop when we have sequence in both ref or alt
+                               && v.is_pure_indel()) {
+                            auto q = variants[j--];
+                            // move the middle base between the alleles
+                            // to be at the start of v
+                            // or merge if both are pure indels
+                            shift_mid_right(q, v);
+                        }
+                        // panic if we reach the end of the variants
+                        if (j <= 0 && v.is_pure_indel()) {
+                            cerr << "allele base fail: can't get an additional base prev" << endl;
+                            cerr << v << endl;
+                            cerr << "variant at " << position << endl;
+                            exit(1);
+                        }
+                        // while the previous allele is an indel
+                        // merge with us
+                        // stop when we have sequence in both ref or alt
+                        // if allele we reach is a positional match
+                        // panic if we reach the start of the variants
+
+                    }
+                    // try to get the previous reference or alternate base
+                    // continue until the
+                    // we first check if we have an indel just before us
+                    // if so, we need to merge this into the same allele and go further back
+                }
+            }
+        }
+    }
+
+    return variantAlleles;
+}
+
+} // namespace


=====================================
src/vcf-wfa.h
=====================================
@@ -0,0 +1,36 @@
+/*
+    vcflib C++ library for parsing and manipulating VCF files
+
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
+
+    This software is published under the MIT License. See the LICENSE file.
+*/
+
+#pragma once
+
+#include "wavefront/wfa.hpp"
+
+namespace vcflib {
+
+class WfaVariant : public Variant
+{
+
+public:
+    WfaVariant(VariantCallFile& v) : Variant(v)
+      { };
+
+    map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
+           bool includePreviousBaseForIndels = false,
+           bool useMNPs = false,
+           bool useEntropy = false,
+           string flankingRefLeft = "",
+           string flankingRefRight = "",
+           wavefront_aligner_attr_t* wfaParams = NULL,
+           int invKmerLen = 17,
+           int invMinLen = 1000,
+           int threads = 1,
+           bool debug = false);
+};
+
+}


=====================================
src/vcfwave.cpp
=====================================
@@ -17,6 +17,7 @@
 #include <omp.h>
 
 #include "Variant.h"
+#include "vcf-wfa.h"
 #include "convert.h"
 #include "join.h"
 #include "split.h"
@@ -218,7 +219,7 @@ int main(int argc, char** argv) {
     variantFile.addHeaderLine("##INFO=<ID=INV,Number=A,Type=String,Description=\"Count of haplotypes which are aligned in the inverted orientation using vcflib vcfwave.\">");
     cout << variantFile.header << endl;
 
-    Variant var(variantFile);
+    WfaVariant var(variantFile);
     double amount = 0.0, prev_amount = 0.0;
     uint64_t start = get_timestamp();
 


=====================================
test/tests/realign.py
=====================================
@@ -48,7 +48,7 @@ class RealignTest(unittest.TestCase):
     def test_wfbug2(self):
         vcf = VariantCallFile()
         vcf.openFile("../samples/10134514.vcf")
-        var = Variant(vcf)
+        var = WfaVariant(vcf)
         vcf.getNextVariant(var)
         self.assertEqual(var.name,"grch38#chr4_10083863-10181258.vcf:grch38#chr4")
         self.assertEqual(var.ref,'GGAGAATCCCAATTGATGG')



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/058df1f3b6f945dd6e9d9417163175de2ef67dfb...6c92e5254330625b867a59f0e24b360697bcaf46

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/058df1f3b6f945dd6e9d9417163175de2ef67dfb...6c92e5254330625b867a59f0e24b360697bcaf46
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230211/a845fcc6/attachment-0001.htm>


More information about the debian-med-commit mailing list