[med-svn] [Git][med-team/libvcflib][upstream] New upstream version 1.0.8+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sat Feb 11 15:10:40 GMT 2023
Étienne Mollier pushed to branch upstream at Debian Med / libvcflib
Commits:
7dedad94 by Étienne Mollier at 2023-02-11T15:43:44+01:00
New upstream version 1.0.8+dfsg
- - - - -
9 changed files:
- CMakeLists.txt
- src/Variant.cpp
- src/Variant.h
- src/legacy.cpp
- src/pythonffi.cpp
- + src/vcf-wfa.cpp
- + src/vcf-wfa.h
- src/vcfwave.cpp
- test/tests/realign.py
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -43,6 +43,8 @@ option(WFA_GITMODULE "Force local git submodule for WFA2LIB" ON) # disable in di
include(CheckIPOSupported) # adds lto
check_ipo_supported(RESULT ipo_supported OUTPUT output)
+set(EXTRA_FLAGS " ") # otherwise it injects OFF
+
# ---- Dependencies
if(OPENMP)
@@ -143,6 +145,7 @@ file(GLOB INCLUDES
set(vcflib_SOURCE
src/vcf-c-api.cpp
src/legacy.cpp
+ src/vcf-wfa.cpp
src/Variant.cpp
src/rnglib.cpp
src/var.cpp
@@ -434,9 +437,15 @@ if (NOT BUILD_ONLY_LIB)
foreach(BIN ${BINS})
add_executable(${BIN} src/${BIN}.cpp)
add_dependencies(${BIN} vcflib)
- target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib ${WFALIB})
+ target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib)
endforeach(BIN ${BINS})
- # target_link_libraries(vcfwave PUBLIC ${WFALIB})
+ # These binaries include WFALIB
+ target_link_libraries(vcfallelicprimitives PUBLIC ${WFALIB})
+ target_link_libraries(vcfcleancomplex PUBLIC ${WFALIB})
+ target_link_libraries(vcfparsealts PUBLIC ${WFALIB})
+ target_link_libraries(vcfroc PUBLIC ${WFALIB})
+ target_link_libraries(vcfstats PUBLIC ${WFALIB})
+ target_link_libraries(vcfwave PUBLIC ${WFALIB})
install(TARGETS ${BINS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
# ---- Copy scripts
@@ -456,6 +465,7 @@ install(TARGETS pyvcflib LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
enable_testing()
+
function(add_pytest TEST_FILE)
add_test(
NAME ${TEST_FILE}
=====================================
src/Variant.cpp
=====================================
@@ -2082,311 +2082,6 @@ int ploidy(const map<int, int>& genotype) {
-// parsedAlternates returns a hash of 'ref' and a vector of alts. A
-// single record may be split into multiple records with new
-// 'refs'. In this function Smith-Waterman is used with padding on
-// both sides of a ref and each alt. The SW method quadratic in nature
-// and painful with long sequences. Recently WFA is introduced with
-// runs in linear time.
-//
-// Returns map of [REF,ALTs] with attached VariantAllele records
-
-
-map<string, pair<vector<VariantAllele>,bool> > Variant::parsedAlternates(
- bool includePreviousBaseForIndels,
- bool useMNPs,
- bool useEntropy,
- string flankingRefLeft,
- string flankingRefRight,
- wavefront_aligner_attr_t* wfaParams,
- int invKmerLen,
- int invMinLen,
- int threads,
- bool debug) {
-
- // return type is a hash of allele containing a list of variants and a
- // boolean value signifying an inversion of the variant
- map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
-
- if (isSymbolicSV()){
- // Don't ever align symbolic SVs. It just wrecks things.
- for (auto& f : this->flatAlternates()) {
- variantAlleles[f.first] = make_pair(f.second, false);
- }
- return variantAlleles;
- }
- // add the reference allele
- variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
-
- // single SNP case, no ambiguity possible, no need to spend a lot of
- // compute aligning ref and alt fields
- if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
- variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
- return variantAlleles;
- }
-
- const string& reference_M = ref;
-
- std::vector<rkmh::hash_t> ref_sketch_fwd;
- std::vector<rkmh::hash_t> ref_sketch_rev;
- if (invKmerLen && ref.size() >= invKmerLen
- && ref.size() >= invMinLen) {
- ref_sketch_fwd = rkmh::hash_sequence(
- ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
- string fer = reverse_complement(ref);
- ref_sketch_rev = rkmh::hash_sequence(
- fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
-
- }
-
- /*
- for (auto a: alt) { // iterate ALT strings
- // unsigned int referencePos;
- string& alternate = a;
- variantAlleles[alternate]; // create the slot
- }
- */
-
-// #pragma omp parallel for
- for (auto a: alt) { // iterate ALT strings
- //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
- //auto& a = alt[idx];
- // unsigned int referencePos;
- string alternate = a;
- pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
- bool is_inv = false;
- // get the alt/ref mapping in inversion space
- if (invKmerLen && alternate.size() >= invKmerLen
- && alternate.size() >= invMinLen
- && ref.size() >= invKmerLen
- && ref.size() >= invMinLen
- && max((int)ref.size(), (int)alt.size()) > invMinLen) {
- // check if it's more likely for us to align as an inversion
- auto alt_sketch = rkmh::hash_sequence(
- a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
- if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
- > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
- is_inv = true;
- // flip the alt
- string alternate_rev = reverse_complement(alternate);
- // cout << "alt: " << alternate_rev << endl;
- alternate = alternate_rev;
- }
- variantAlleles[alternate]; // create slot
- variantAlleles[alternate].second = is_inv;
- /*
- cerr << "comparing "
- << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
- << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
- */
- }
-
- const string& alternateQuery_M = alternate;
-
- string cigar;
- vector<pair<int, char> > cigarData;
-
- /*
- * WFA2-lib
- */
- /*
- // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
- WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
- aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
- cigar = aligner.getAlignmentCigar();
- */
- auto wfp = *wfaParams;
- size_t max_len = max(reference_M.size(), alternateQuery_M.size());
- if (max_len > 1000) {
- wfp.memory_mode = wavefront_memory_ultralow;
- } else {
- wfp.memory_mode = wavefront_memory_high;
- }
- auto wf_aligner = wavefront_aligner_new(&wfp);
-
- wavefront_aligner_set_max_num_threads(wf_aligner,threads);
- wavefront_aligner_set_heuristic_none(wf_aligner);
- wavefront_aligner_set_alignment_end_to_end(wf_aligner);
- wavefront_align(wf_aligner,
- reference_M.c_str(), reference_M.size(),
- alternateQuery_M.c_str(), alternateQuery_M.size());
- //cerr << "WFA_input\t" << ">
- /*
- cigar_print_pretty(stderr,
- reference_M.c_str(), reference_M.size(),
- alternateQuery_M.c_str(), alternateQuery_M.size(),
- &wf_aligner->cigar,wf_aligner->mm_allocator);
- */
- // Fetch CIGAR
- char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
- int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
- // Create string and return
- cigar = std::string(buffer,buf_len);
- wavefront_aligner_delete(wf_aligner);
- if (cigar == "") {
- if (debug) {
- cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
- }
- cerr << ">fail.pattern" << endl
- << reference_M << endl
- << ">fail.query" << endl
- << alternateQuery_M << endl;
- variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
- exit(1);
- }
- cigarData = splitUnpackedCigar(cigar);
-
- if (debug)
- cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
-
- if (cigarData.size() == 0) {
- cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
- << "ref " << reference_M << ","
- << "allele " << alternateQuery_M << endl;
-
- exit(1);
- }
-
- // now left align!
- //
- // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
- //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
-
- if (debug) {
- cigar = joinCigar(cigarData);
- cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
- }
-
- // Walk the CIGAR for one alternate and build up variantAlleles
- vector<VariantAllele> &variants = variantAlleles[alternate].first;
- int altpos = 0;
- int refpos = 0;
-
- for (auto e: cigarData) {
- int mlen = e.first; // CIGAR matchlen
- char mtype = e.second; // CIGAR matchtype
-
- switch (mtype) {
- case 'I': // CIGAR INSERT
- {
- auto allele = VariantAllele("",
- alternate.substr(altpos, mlen),
- refpos + position);
- // inject insertion from allele
- if (variants.size() && variants.back().is_pure_indel()) {
- variants.back() = variants.back() + allele;
- } else {
- variants.push_back(allele);
- }
- altpos += mlen;
- }
- break;
- case 'D': // CIGAR DELETE
- {
- auto allele = VariantAllele(ref.substr(refpos, mlen),
- "", refpos + position);
- // inject deletion from ref
- if (variants.size() && variants.back().is_pure_indel()) {
- variants.back() = variants.back() + allele;
- } else {
- variants.push_back(allele);
- }
- refpos += mlen;
- }
- break;
- case 'M': // CIGAR match and variant
- case 'X':
- variants.push_back(VariantAllele(ref.substr(refpos, mlen),
- alternate.substr(altpos, mlen),
- refpos + position));
- refpos += mlen;
- altpos += mlen;
- break;
- case 'S': // S operations specify segments at the start
- // and/or end of the query that do not appear in
- // a local alignment
- refpos += mlen;
- altpos += mlen;
- break;
-
- default: // Ignore N, H
- cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
- cerr << "pos: " << position << endl;
- cerr << "cigar: " << cigar << endl;
- cerr << "ref: " << reference_M << endl;
- cerr << "allele:" << alternateQuery_M << endl;
- if (debug) exit(1);
- break;
- } // switch mtype
- }
- if (includePreviousBaseForIndels) {
- for (uint64_t i = 0; i < variants.size(); ++i) {
- auto& v = variants[i];
- if (v.is_pure_indel()) {
- if (i == 0) { // special case, we're at the beginning
- // do we have a next allele?
- if (i == variants.size()-1) {
- // if not-- panic
- cerr << "allele base fail: no subsequent alleles to indel" << endl;
- cerr << v << endl;
- cerr << "variant at " << position << endl;
- exit(1);
- }
- // else
- // while the next allele is an indel
- auto j = i+1;
- while (j < variants.size()
- // stop when we have sequence in both ref or alt
- && v.is_pure_indel()) {
- auto q = variants[j++];
- // merge with us
- shift_mid_left(v, q);
- }
- // panic if we reach the end of the variants
- if (j == variants.size() && v.is_pure_indel()) {
- cerr << "allele base fail: can't get an additional base next" << endl;
- cerr << v << endl;
- cerr << "variant at " << position << endl;
- exit(1);
- }
- } else {
- // while the next allele is an indel
- int j = i-1; // i is guaranteed >=1
- while (j >= 0
- // stop when we have sequence in both ref or alt
- && v.is_pure_indel()) {
- auto q = variants[j--];
- // move the middle base between the alleles
- // to be at the start of v
- // or merge if both are pure indels
- shift_mid_right(q, v);
- }
- // panic if we reach the end of the variants
- if (j <= 0 && v.is_pure_indel()) {
- cerr << "allele base fail: can't get an additional base prev" << endl;
- cerr << v << endl;
- cerr << "variant at " << position << endl;
- exit(1);
- }
- // while the previous allele is an indel
- // merge with us
- // stop when we have sequence in both ref or alt
- // if allele we reach is a positional match
- // panic if we reach the start of the variants
-
- }
- // try to get the previous reference or alternate base
- // continue until the
- // we first check if we have an indel just before us
- // if so, we need to merge this into the same allele and go further back
- }
- }
- }
- }
-
- return variantAlleles;
-}
-
map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
map<string, vector<VariantAllele> > variantAlleles;
=====================================
src/Variant.h
=====================================
@@ -1,8 +1,8 @@
/*
vcflib C++ library for parsing and manipulating VCF files
- Copyright © 2010-2022 Erik Garrison
- Copyright © 2020-2022 Pjotr Prins
+ Copyright © 2010-2023 Erik Garrison
+ Copyright © 2020-2023 Pjotr Prins
This software is published under the MIT License. See the LICENSE file.
*/
@@ -33,7 +33,6 @@
#include "rkmh.hpp"
#include "LeftAlign.hpp"
#include <Fasta.h>
-#include "wavefront/wfa.hpp"
extern "C" {
#include "filevercmp.h"
@@ -210,18 +209,6 @@ public:
set<string> altSet(void); // set of alleles, rather than vector of them
map<string, int> altAlleleIndexes; // reverse lookup for alleles
- map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
- bool includePreviousBaseForIndels = false,
- bool useMNPs = false,
- bool useEntropy = false,
- string flankingRefLeft = "",
- string flankingRefRight = "",
- wavefront_aligner_attr_t* wfaParams = NULL,
- int invKmerLen = 17,
- int invMinLen = 1000,
- int threads = 1,
- bool debug = false);
-
// Legacy version of parsedAlterneates:
map<string, vector<VariantAllele> > legacy_parsedAlternates(
bool includePreviousBaseForIndels = false,
=====================================
src/legacy.cpp
=====================================
@@ -10,6 +10,7 @@
#include <utility>
#include "Variant.h"
+#include "vcf-wfa.h"
#include "allele.hpp"
#include "cigar.hpp"
=====================================
src/pythonffi.cpp
=====================================
@@ -1,8 +1,9 @@
// Python ffi calls C++ functions
//
-// Copyright © 2022 Pjotr Prins
+// Copyright © 2022-2023 Pjotr Prins
#include "Variant.h"
+#include "vcf-wfa.h"
// Pybind11
#include <pybind11/pybind11.h>
@@ -59,8 +60,12 @@ PYBIND11_MODULE(pyvcflib, m)
.def_readonly("sampleNames", &Variant::sampleNames)
.def_readonly("samples", &Variant::samples)
.def("legacy_parsedAlternates", &Variant::legacy_parsedAlternates)
- .def("parsedAlternates", &Variant::parsedAlternates)
;
+
+ py::class_<WfaVariant, Variant>(m, "WfaVariant", "WFA VCF record")
+ .def(py::init<VariantCallFile &>() )
+ .def("parsedAlternates", &WfaVariant::parsedAlternates);
+
py::class_<VariantCallFile>(m, "VariantCallFile", "VCF file")
.def(py::init())
.def("openFile",&VariantCallFile::openFile,"Open the VCF")
=====================================
src/vcf-wfa.cpp
=====================================
@@ -0,0 +1,320 @@
+/*
+ vcflib C++ library for parsing and manipulating VCF files
+
+ Copyright © 2010-2023 Erik Garrison
+ Copyright © 2020-2023 Pjotr Prins
+
+ This software is published under the MIT License. See the LICENSE file.
+*/
+
+#include "Variant.h"
+#include "vcf-wfa.h"
+
+namespace vcflib {
+
+// parsedAlternates returns a hash of 'ref' and a vector of alts. A
+// single record may be split into multiple records with new
+// 'refs'. In this function Smith-Waterman is used with padding on
+// both sides of a ref and each alt. The SW method quadratic in nature
+// and painful with long sequences. Recently WFA is introduced with
+// runs in linear time.
+//
+// Returns map of [REF,ALTs] with attached VariantAllele records
+
+
+map<string, pair<vector<VariantAllele>,bool> > WfaVariant::parsedAlternates(
+ bool includePreviousBaseForIndels,
+ bool useMNPs,
+ bool useEntropy,
+ string flankingRefLeft,
+ string flankingRefRight,
+ wavefront_aligner_attr_t* wfaParams,
+ int invKmerLen,
+ int invMinLen,
+ int threads,
+ bool debug) {
+
+ // return type is a hash of allele containing a list of variants and a
+ // boolean value signifying an inversion of the variant
+ map<string, pair<vector<VariantAllele>,bool> > variantAlleles;
+
+ if (isSymbolicSV()){
+ // Don't ever align symbolic SVs. It just wrecks things.
+ for (auto& f : this->flatAlternates()) {
+ variantAlleles[f.first] = make_pair(f.second, false);
+ }
+ return variantAlleles;
+ }
+ // add the reference allele
+ variantAlleles[ref].first.push_back(VariantAllele(ref, ref, position));
+
+ // single SNP case, no ambiguity possible, no need to spend a lot of
+ // compute aligning ref and alt fields
+ if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+ variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+ return variantAlleles;
+ }
+
+ const string& reference_M = ref;
+
+ std::vector<rkmh::hash_t> ref_sketch_fwd;
+ std::vector<rkmh::hash_t> ref_sketch_rev;
+ if (invKmerLen && ref.size() >= invKmerLen
+ && ref.size() >= invMinLen) {
+ ref_sketch_fwd = rkmh::hash_sequence(
+ ref.c_str(), ref.size(), invKmerLen, ref.size()-invKmerLen+1);
+ string fer = reverse_complement(ref);
+ ref_sketch_rev = rkmh::hash_sequence(
+ fer.c_str(), fer.size(), invKmerLen, fer.size()-invKmerLen+1);
+
+ }
+
+ /*
+ for (auto a: alt) { // iterate ALT strings
+ // unsigned int referencePos;
+ string& alternate = a;
+ variantAlleles[alternate]; // create the slot
+ }
+ */
+
+// #pragma omp parallel for
+ for (auto a: alt) { // iterate ALT strings
+ //for (uint64_t idx = 0; idx < alt.size(); ++idx) {
+ //auto& a = alt[idx];
+ // unsigned int referencePos;
+ string alternate = a;
+ pair<vector<VariantAllele>, bool>& _v = variantAlleles[alternate];
+ bool is_inv = false;
+ // get the alt/ref mapping in inversion space
+ if (invKmerLen && alternate.size() >= invKmerLen
+ && alternate.size() >= invMinLen
+ && ref.size() >= invKmerLen
+ && ref.size() >= invMinLen
+ && max((int)ref.size(), (int)alt.size()) > invMinLen) {
+ // check if it's more likely for us to align as an inversion
+ auto alt_sketch = rkmh::hash_sequence(
+ a.c_str(), a.size(), invKmerLen, a.size()-invKmerLen+1);
+ if (rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+ > rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen)) {
+ is_inv = true;
+ // flip the alt
+ string alternate_rev = reverse_complement(alternate);
+ // cout << "alt: " << alternate_rev << endl;
+ alternate = alternate_rev;
+ }
+ variantAlleles[alternate]; // create slot
+ variantAlleles[alternate].second = is_inv;
+ /*
+ cerr << "comparing "
+ << " vs ref fwd " << rkmh::compare(alt_sketch, ref_sketch_fwd, invKmerLen)
+ << " vs rev " << rkmh::compare(alt_sketch, ref_sketch_rev, invKmerLen) << endl;
+ */
+ }
+
+ const string& alternateQuery_M = alternate;
+
+ string cigar;
+ vector<pair<int, char> > cigarData;
+
+ /*
+ * WFA2-lib
+ */
+ /*
+ // the C++ WFA2-lib interface is not yet stable due to heuristic mode initialization issues
+ WFAlignerGapAffine2Pieces aligner(19,39,3,81,1,WFAligner::Alignment,WFAligner::MemoryHigh);
+ aligner.alignEnd2End(reference_M.c_str(), reference_M.size(), alternateQuery_M.c_str(), alternateQuery_M.size());
+ cigar = aligner.getAlignmentCigar();
+ */
+ auto wfp = *wfaParams;
+ size_t max_len = max(reference_M.size(), alternateQuery_M.size());
+ if (max_len > 1000) {
+ wfp.memory_mode = wavefront_memory_ultralow;
+ } else {
+ wfp.memory_mode = wavefront_memory_high;
+ }
+ auto wf_aligner = wavefront_aligner_new(&wfp);
+
+ wavefront_aligner_set_max_num_threads(wf_aligner,threads);
+ wavefront_aligner_set_heuristic_none(wf_aligner);
+ wavefront_aligner_set_alignment_end_to_end(wf_aligner);
+ wavefront_align(wf_aligner,
+ reference_M.c_str(), reference_M.size(),
+ alternateQuery_M.c_str(), alternateQuery_M.size());
+ //cerr << "WFA_input\t" << ">
+ /*
+ cigar_print_pretty(stderr,
+ reference_M.c_str(), reference_M.size(),
+ alternateQuery_M.c_str(), alternateQuery_M.size(),
+ &wf_aligner->cigar,wf_aligner->mm_allocator);
+ */
+ // Fetch CIGAR
+ char* buffer = wf_aligner->cigar->operations + wf_aligner->cigar->begin_offset;
+ int buf_len = wf_aligner->cigar->end_offset - wf_aligner->cigar->begin_offset;
+ // Create string and return
+ cigar = std::string(buffer,buf_len);
+ wavefront_aligner_delete(wf_aligner);
+ if (cigar == "") {
+ if (debug) {
+ cerr << "ERROR: stopped WF because there is no CIGAR!" << endl;
+ }
+ cerr << ">fail.pattern" << endl
+ << reference_M << endl
+ << ">fail.query" << endl
+ << alternateQuery_M << endl;
+ variantAlleles[alt.front()].first.push_back(VariantAllele(ref, alt.front(), position));
+ exit(1);
+ }
+ cigarData = splitUnpackedCigar(cigar);
+
+ if (debug)
+ cerr << "biWF= " << position << ":" << cigar << "/" << joinCigar(cigarData) << ":" << reference_M << "," << alternateQuery_M << endl;
+
+ if (cigarData.size() == 0) {
+ cerr << "Algorithm error: CIGAR <" << cigar << "> is empty for "
+ << "ref " << reference_M << ","
+ << "allele " << alternateQuery_M << endl;
+
+ exit(1);
+ }
+
+ // now left align!
+ //
+ // TODO: currently broken! it seems to mess up our indel alleles (they become length 0 on ref or alt)
+ //stablyLeftAlign(alternateQuery_M, reference_M, cigarData, 5, true);
+
+ if (debug) {
+ cigar = joinCigar(cigarData);
+ cerr << position << ":" << cigar << ":" << reference_M << "," << alternateQuery_M << endl;
+ }
+
+ // Walk the CIGAR for one alternate and build up variantAlleles
+ vector<VariantAllele> &variants = variantAlleles[alternate].first;
+ int altpos = 0;
+ int refpos = 0;
+
+ for (auto e: cigarData) {
+ int mlen = e.first; // CIGAR matchlen
+ char mtype = e.second; // CIGAR matchtype
+
+ switch (mtype) {
+ case 'I': // CIGAR INSERT
+ {
+ auto allele = VariantAllele("",
+ alternate.substr(altpos, mlen),
+ refpos + position);
+ // inject insertion from allele
+ if (variants.size() && variants.back().is_pure_indel()) {
+ variants.back() = variants.back() + allele;
+ } else {
+ variants.push_back(allele);
+ }
+ altpos += mlen;
+ }
+ break;
+ case 'D': // CIGAR DELETE
+ {
+ auto allele = VariantAllele(ref.substr(refpos, mlen),
+ "", refpos + position);
+ // inject deletion from ref
+ if (variants.size() && variants.back().is_pure_indel()) {
+ variants.back() = variants.back() + allele;
+ } else {
+ variants.push_back(allele);
+ }
+ refpos += mlen;
+ }
+ break;
+ case 'M': // CIGAR match and variant
+ case 'X':
+ variants.push_back(VariantAllele(ref.substr(refpos, mlen),
+ alternate.substr(altpos, mlen),
+ refpos + position));
+ refpos += mlen;
+ altpos += mlen;
+ break;
+ case 'S': // S operations specify segments at the start
+ // and/or end of the query that do not appear in
+ // a local alignment
+ refpos += mlen;
+ altpos += mlen;
+ break;
+
+ default: // Ignore N, H
+ cerr << "Hit unexpected CIGAR " << mtype << " in " << endl;
+ cerr << "pos: " << position << endl;
+ cerr << "cigar: " << cigar << endl;
+ cerr << "ref: " << reference_M << endl;
+ cerr << "allele:" << alternateQuery_M << endl;
+ if (debug) exit(1);
+ break;
+ } // switch mtype
+ }
+ if (includePreviousBaseForIndels) {
+ for (uint64_t i = 0; i < variants.size(); ++i) {
+ auto& v = variants[i];
+ if (v.is_pure_indel()) {
+ if (i == 0) { // special case, we're at the beginning
+ // do we have a next allele?
+ if (i == variants.size()-1) {
+ // if not-- panic
+ cerr << "allele base fail: no subsequent alleles to indel" << endl;
+ cerr << v << endl;
+ cerr << "variant at " << position << endl;
+ exit(1);
+ }
+ // else
+ // while the next allele is an indel
+ auto j = i+1;
+ while (j < variants.size()
+ // stop when we have sequence in both ref or alt
+ && v.is_pure_indel()) {
+ auto q = variants[j++];
+ // merge with us
+ shift_mid_left(v, q);
+ }
+ // panic if we reach the end of the variants
+ if (j == variants.size() && v.is_pure_indel()) {
+ cerr << "allele base fail: can't get an additional base next" << endl;
+ cerr << v << endl;
+ cerr << "variant at " << position << endl;
+ exit(1);
+ }
+ } else {
+ // while the next allele is an indel
+ int j = i-1; // i is guaranteed >=1
+ while (j >= 0
+ // stop when we have sequence in both ref or alt
+ && v.is_pure_indel()) {
+ auto q = variants[j--];
+ // move the middle base between the alleles
+ // to be at the start of v
+ // or merge if both are pure indels
+ shift_mid_right(q, v);
+ }
+ // panic if we reach the end of the variants
+ if (j <= 0 && v.is_pure_indel()) {
+ cerr << "allele base fail: can't get an additional base prev" << endl;
+ cerr << v << endl;
+ cerr << "variant at " << position << endl;
+ exit(1);
+ }
+ // while the previous allele is an indel
+ // merge with us
+ // stop when we have sequence in both ref or alt
+ // if allele we reach is a positional match
+ // panic if we reach the start of the variants
+
+ }
+ // try to get the previous reference or alternate base
+ // continue until the
+ // we first check if we have an indel just before us
+ // if so, we need to merge this into the same allele and go further back
+ }
+ }
+ }
+ }
+
+ return variantAlleles;
+}
+
+} // namespace
=====================================
src/vcf-wfa.h
=====================================
@@ -0,0 +1,36 @@
+/*
+ vcflib C++ library for parsing and manipulating VCF files
+
+ Copyright © 2010-2023 Erik Garrison
+ Copyright © 2020-2023 Pjotr Prins
+
+ This software is published under the MIT License. See the LICENSE file.
+*/
+
+#pragma once
+
+#include "wavefront/wfa.hpp"
+
+namespace vcflib {
+
+class WfaVariant : public Variant
+{
+
+public:
+ WfaVariant(VariantCallFile& v) : Variant(v)
+ { };
+
+ map<string, pair<vector<VariantAllele>,bool> > parsedAlternates(
+ bool includePreviousBaseForIndels = false,
+ bool useMNPs = false,
+ bool useEntropy = false,
+ string flankingRefLeft = "",
+ string flankingRefRight = "",
+ wavefront_aligner_attr_t* wfaParams = NULL,
+ int invKmerLen = 17,
+ int invMinLen = 1000,
+ int threads = 1,
+ bool debug = false);
+};
+
+}
=====================================
src/vcfwave.cpp
=====================================
@@ -17,6 +17,7 @@
#include <omp.h>
#include "Variant.h"
+#include "vcf-wfa.h"
#include "convert.h"
#include "join.h"
#include "split.h"
@@ -218,7 +219,7 @@ int main(int argc, char** argv) {
variantFile.addHeaderLine("##INFO=<ID=INV,Number=A,Type=String,Description=\"Count of haplotypes which are aligned in the inverted orientation using vcflib vcfwave.\">");
cout << variantFile.header << endl;
- Variant var(variantFile);
+ WfaVariant var(variantFile);
double amount = 0.0, prev_amount = 0.0;
uint64_t start = get_timestamp();
=====================================
test/tests/realign.py
=====================================
@@ -48,7 +48,7 @@ class RealignTest(unittest.TestCase):
def test_wfbug2(self):
vcf = VariantCallFile()
vcf.openFile("../samples/10134514.vcf")
- var = Variant(vcf)
+ var = WfaVariant(vcf)
vcf.getNextVariant(var)
self.assertEqual(var.name,"grch38#chr4_10083863-10181258.vcf:grch38#chr4")
self.assertEqual(var.ref,'GGAGAATCCCAATTGATGG')
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/commit/7dedad94a2684662e68466bb3af460f27f26e279
--
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/commit/7dedad94a2684662e68466bb3af460f27f26e279
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230211/df9a0b71/attachment-0001.htm>
More information about the debian-med-commit
mailing list