[med-svn] [Git][med-team/libvcflib][master] 5 commits: routine-update: New upstream version

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Feb 7 18:03:11 GMT 2023



Andreas Tille pushed to branch master at Debian Med / libvcflib


Commits:
3a00ac1e by Andreas Tille at 2023-02-07T18:36:48+01:00
routine-update: New upstream version

- - - - -
4311b248 by Andreas Tille at 2023-02-07T18:36:49+01:00
New upstream version 1.0.7+dfsg
- - - - -
c335f578 by Andreas Tille at 2023-02-07T18:37:16+01:00
Update upstream source from tag 'upstream/1.0.7+dfsg'

Update to upstream version '1.0.7+dfsg'
with Debian dir a4acb67812d0f1f1762a013bfaa15ed92ace93b7
- - - - -
a05c1f08 by Andreas Tille at 2023-02-07T18:37:21+01:00
Remove duplicate line from changelog.

Changes-By: lintian-brush

- - - - -
b27cf4d0 by Andreas Tille at 2023-02-07T18:41:32+01:00
routine-update: Ready to upload to unstable

- - - - -


15 changed files:

- CMakeLists.txt
- README.md
- RELEASE_NOTES.md
- VERSION
- debian/changelog
- doc/vcfcreatemulti.md
- guix.scm
- man/vcfcreatemulti.1
- src/Variant.cpp
- src/vcffilter.cpp
- test/pytest/vcfcreatemulti.md
- + test/pytest/vcffilter.md
- − test/tests/mainTest.cpp
- − test/tests/variantFileTests.h
- − test/tests/variantTests.h


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -100,7 +100,7 @@ endif(GPROF)
 if (ZIG)
   find_program(ZIG_EXE NAMES "zig")
   if (NOT ZIG_EXE)
-    MESSAGE(FATAL_ERROR "zig binary not found in PATH. zig is used for vcfcreatemulti. Either use cmake -DZIG=OFF option or add zig to the PATH")
+    MESSAGE(FATAL_ERROR "zig binary not found in PATH. zig is used for vcfcreatemulti's latest features. Either use cmake -DZIG=OFF option or add zig to the PATH")
   endif (NOT ZIG_EXE)
 endif(ZIG)
 
@@ -357,11 +357,14 @@ endif(HTSLIB_LOCAL)
 
 if(WFA_GITMODULE)
   set(WFA_INCLUDE_DIRS ${WFA_LOCAL})
-  add_subdirectory(${WFA_LOCAL})
+  add_subdirectory(${WFA_LOCAL} EXCLUDE_FROM_ALL)
   set(WFALIB wfa2) # pick up the wfa2 lib target from the included CMakeLists.txt
 else(WFA_GITMODULE)
   set(WFA_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/wfa2lib)
   find_library(WFALIB wfa2 wfa) # distro search for shared lib
+  if(NOT WFALIB) # find_library() above searched for wfa2/wfa and found neither
+    message(STATUS "ERROR: Can not find libwfa! Make sure it is installed or use the git submodule instead") # STATUS only reports; configuration continues
+  endif()
 endif(WFA_GITMODULE)
 
 include_directories(${WFA_INCLUDE_DIRS})
@@ -434,11 +437,11 @@ if (NOT BUILD_ONLY_LIB)
     target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib ${WFALIB})
   endforeach(BIN ${BINS})
   # target_link_libraries(vcfwave PUBLIC ${WFALIB})
-  install(TARGETS ${BINS} RUNTIME DESTINATION bin)
+  install(TARGETS ${BINS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
   # ---- Copy scripts
   foreach(SCRIPT ${SCRIPTS})
-    install(PROGRAMS ./scripts/${SCRIPT} DESTINATION bin RENAME ${SCRIPT})
+    install(PROGRAMS ./scripts/${SCRIPT} DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME ${SCRIPT})
   endforeach(SCRIPT ${SCRIPTS})
 
 endif()
@@ -447,7 +450,7 @@ endif()
 pybind11_add_module(pyvcflib "${CMAKE_SOURCE_DIR}/src/pythonffi.cpp")
 add_dependencies(pyvcflib ${vcflib_DEPS})
 target_link_libraries(pyvcflib PUBLIC vcflib ${vcflib_LIBS} ${WFALIB})
-install(TARGETS pyvcflib LIBRARY DESTINATION lib)
+install(TARGETS pyvcflib LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 # ---- Test
 
@@ -509,13 +512,13 @@ add_pydoctest(vcflib-api)
 add_pydoctest(vcf2tsv)
 add_pydoctest(vcfallelicprimitives)
 add_pydoctest(vcfwave)
+add_pydoctest(vcffilter)
 if (ZIG)
   add_pydoctest(vcfcreatemulti)
 endif (ZIG)
 
 add_pydoctest(vcfnulldotslashdot)
 add_doctest(doc/vcfintersect)
-add_doctest(doc/vcffilter)
 
 # ---- Build docs
 #
@@ -563,7 +566,7 @@ endif (PANDOC)
 
 # ---- Install
 
-install(TARGETS vcflib ARCHIVE DESTINATION lib)
+install(TARGETS vcflib ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) # static archive goes to the GNUInstallDirs library dir, not the binary dir
 
 install(FILES ${INCLUDES} DESTINATION include)
 


=====================================
README.md
=====================================
@@ -332,7 +332,7 @@ VCFLIB uses the cmake build system, after a recursive checkout of the sources ma
 git clone --recursive https://github.com/vcflib/vcflib.git
 cd vcflib
 mkdir -p build && cd build
-cmake  -DCMAKE_BUILD_TYPE=Debug -DZIG=OFF -DOPENMP=OFF ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DZIG=OFF -DOPENMP=OFF ..
 cmake --build .
 cmake --install .
 ```
@@ -340,7 +340,7 @@ cmake --install .
 and to run the tests
 
 ```sh
-ctest --verbose
+ctest . --verbose
 ```
 
 Executables are built into the `./build` directory in the repository.
@@ -393,6 +393,16 @@ Check out htslib in tabixpp (recursively) and
 The standard build creates `build/vcflib.a`. Take a hint from the
 [cmake](./CMakeLists.txt) file that builds all the vcflib tools.
 
+## distro builds
+
+Distros, such as Debian, should build with something like
+
+```
+cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DZIG=OFF -DWFA_GITMODULE=OFF ..
+```
+
+See the CMakeLists.txt header for more.
+
 ## source code
 
 See [vcfecho.cpp](./src/vcfecho.cpp) for basic usage.


=====================================
RELEASE_NOTES.md
=====================================
@@ -5,14 +5,22 @@ and
 
 ## TODO
 
-- [ ] vcfcreatemulti: fix problem with slow and wrong complex regions
-- [ ] complete vcfcreatemulti merge multiple rows
+- [ ] vcfcreatemulti: fix problem with slow and wrong complex regions (implement backtrack)
       + [ ] check for indels which are really the same
       + [ ] combine vcfwave duplicated functionality
 - bgzip
 - tabix -p vcf my_file.vcf.gz
 - pangenie, vg deconstruct, vcfbub
 
+## ChangeLog v1.0.7 (20230207)
+
+Vcflib maintenance release - mostly for including in Debian
+
++ Fixed regression discovered by garguantua_kerr and atille of Debian (thanks!)
++ Added note on bio-vcf in vcffilter doc
++ notes on vcfcreatemulti and backtracking
++ CMake: honour include(GNUInstallDirs) paths (I forgot)
+
 ## ChangeLog v1.0.6 (20230129)
 
 Vcflib maintenance release - mostly for including in Debian


=====================================
VERSION
=====================================
@@ -1 +1 @@
-1.0.6
+1.0.7


=====================================
debian/changelog
=====================================
@@ -1,4 +1,4 @@
-libvcflib (1.0.6+dfsg-1) UNRELEASED; urgency=medium
+libvcflib (1.0.7+dfsg-1) unstable; urgency=medium
 
   [ Andreas Tille ]
   * Team Upload.
@@ -21,7 +21,7 @@ libvcflib (1.0.6+dfsg-1) UNRELEASED; urgency=medium
   * Pass more flags to get past the configure step
   * d/rules: Get tests running, do not install py shared object lib
 
- -- Nilesh Patra <nilesh at debian.org>  Mon, 06 Feb 2023 23:50:29 +0530
+ -- Andreas Tille <tille at debian.org>  Tue, 07 Feb 2023 18:37:40 +0100
 
 libvcflib (1.0.3+dfsg-2) unstable; urgency=medium
 


=====================================
doc/vcfcreatemulti.md
=====================================
@@ -109,6 +109,7 @@ ALT-SNP2 ACTGACTA       1/0
 ```
 
 In words: the result is incorrect.
+
 At this point, for analysis, there is little else to do but go to the original data (pangenome or VCF) and compare the results.
 What `vcfcreatemulti` helps to do is point out that there is a complex region here with ample variation and the resulting layout is a problem (too many ALTs as in 'too many cooks'!).
 
@@ -118,12 +119,47 @@ To help vcflib show's a `WARNING: Too many ALT alleles to fit in sample(s)' and
 grep MULTI= ./test/tmp/vcfcreatemulti_2.vcf -c
 ```
 
-Finds 3 marked records.
+Finds 3 marked records. One of them is derived from the combination:
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+```
+
+resulting in
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G,GTTTCTTGAAAAACCAAAGGT 60  . AC=20,
+1,20;AF=0.224719,0.011236,0.227273;AN=89,89,88;AT=>601>602>603>605>606,>601>602>604>605>606,>601>602>603>605>606
+,>601>606,>602>603>605,>602>604>605;NS=45;LV=0;MULTI=ALTPROBLEM;combined=36377478-36377496  GT  0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 3|3 0|0 0|0 0|0 0|0 3|0 3|0 0|2 0|0 0|3 0|3 3|3 3|3
+0|0 3|3 0|0 0|3 0|3 0|0 3|0 3|3 0|3 0|3 0|0 0|3 0
+```
+
+This is a combination of:
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+```
+
+Where the ALTs end up being a duplication and there is some overlap in the genotype calling.
 
 One future solution might be to have vcfcreatemulti ignore SNPs, or only take the first one, but that somewhat would do away with pointing out complex arrangements. Another solution might be to edit the ALTs and merge ALT-SNP1 into ALT-SNP2 so we get `ACTGCCTA`.
-I have not made up my mind yet.
 Contributions and ideas are welcome!
 
+Having a think about this: the safest approach is to backtrack on a conflict and leave it alone. So, when a variant comes up that conflicts with the combined record (so far) we should drop merging that variant and leave it alone. This will typically happen with a long ALT that overlaps many SNPs. We could come up with all types of solutions, but the point of this algorithm is to 'fix' the obvious cases. At this point we continue and show the MULTI=ALTPROBLEM info field. It is not satisfactory and it is slow too. We can have a stab at the backtrack in the future.
+
 ## Source code
 
 [vcfcreatemulti.cpp](../../src/vcfcreatemulti.cpp)


=====================================
guix.scm
=====================================
@@ -8,7 +8,7 @@
 ;;
 ;; For the tests you need /usr/bin/env. In a container create it with
 ;;
-;;   mkdir -p /usr/bin ; ln -s $GUIX_ENVIRONMENT/bin/env /usr/bin/env
+;;   mkdir -p /usr/bin /bin ; ln -v -s $GUIX_ENVIRONMENT/bin/env /usr/bin/env ; ln -v -s $GUIX_ENVIRONMENT/bin/bash /bin/bash
 ;;
 ;; or in one go
 ;;


=====================================
man/vcfcreatemulti.1
=====================================
@@ -118,6 +118,7 @@ ALT-SNP2 ACTGACTA       1/0
 .fi
 .PP
 In words: the result is incorrect.
+.PP
 At this point, for analysis, there is little else to do but go to the
 original data (pangenome or VCF) and compare the results.
 What \f[C]vcfcreatemulti\f[R] helps to do is point out that there is a
@@ -136,14 +137,66 @@ grep MULTI= ./test/tmp/vcfcreatemulti_2.vcf -c
 .fi
 .PP
 Finds 3 marked records.
+One of them is derived from the combination:
+.IP
+.nf
+\f[C]
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+\f[R]
+.fi
+.PP
+resulting in
+.IP
+.nf
+\f[C]
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G,GTTTCTTGAAAAACCAAAGGT 60  . AC=20,
+1,20;AF=0.224719,0.011236,0.227273;AN=89,89,88;AT=>601>602>603>605>606,>601>602>604>605>606,>601>602>603>605>606
+,>601>606,>602>603>605,>602>604>605;NS=45;LV=0;MULTI=ALTPROBLEM;combined=36377478-36377496  GT  0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 3|3 0|0 0|0 0|0 0|0 3|0 3|0 0|2 0|0 0|3 0|3 3|3 3|3
+0|0 3|3 0|0 0|3 0|3 0|0 3|0 3|3 0|3 0|3 0|0 0|3 0
+\f[R]
+.fi
+.PP
+This is a combination of:
+.IP
+.nf
+\f[C]
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+\f[R]
+.fi
+.PP
+Where the ALTs end up being a duplication and there is some overlap in
+the genotype calling.
 .PP
 One future solution might be to have vcfcreatemulti ignore SNPs, or only
 take the first one, but that somewhat would do away with pointing out
 complex arrangements.
 Another solution might be to edit the ALTs and merge ALT-SNP1 into
 ALT-SNP2 so we get \f[C]ACTGCCTA\f[R].
-I have not made up my mind yet.
 Contributions and ideas are welcome!
+.PP
+Having a think about this: the safest approach is to backtrack on a
+conflict and leave it alone.
+So, when a variant comes up that conflicts with the combined record (so
+far) we should drop merging that variant and leave it alone.
+This will typically happen with a long ALT that overlaps many SNPs.
+We could come up with all types of solutions, but the point of this
+algorithm is to `fix' the obvious cases.
+At this point we continue and show the MULTI=ALTPROBLEM info field.
+It is not satisfactory and it is slow too.
+We can have a stab at the backtrack in the future.
 .SH ./vcfcreatemulti ../samples/grch38#chr8_36353854-36453166.vcf > ../test/data/regression/vcfcreatemulti_2.vcf
 .RS
 .RS


=====================================
src/Variant.cpp
=====================================
@@ -1,8 +1,8 @@
 /*
     vcflib C++ library for parsing and manipulating VCF files
 
-    Copyright © 2010-2022 Erik Garrison
-    Copyright © 2020-2022 Pjotr Prins
+    Copyright © 2010-2023 Erik Garrison
+    Copyright © 2020-2023 Pjotr Prins
 
     This software is published under the MIT License. See the LICENSE file.
 */
@@ -1829,20 +1829,22 @@ bool VariantCallFile::parseHeader(string& hs) {
                 // field
                 if (entryType == "INFO" || entryType == "FORMAT") {
                     vector<string> fields = split(entryData, "=,");
-                    map<string,string> mapper;
-                    string key = "";
-                    for (auto field: fields) {
-                        // split into key-value pairs and add to mapper
-                        if (key == "")
-                            key = field;
-                        else {
-                            mapper[key] = field;
-                            key = "";
-                        }
+                    if (fields[0] != "ID") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[0] != \"ID\"" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    string id = fields[1];
+                    if (fields[2] != "Number") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[2] != \"Number\"" << endl
+                             << headerLine << endl;
+                        exit(1);
                     }
-                    string id = mapper["ID"];
                     int number;
-                    string numberstr = mapper["NUMBER"].c_str();
+                    // string numberstr = mapper["Number"].c_str();
+                    string numberstr = fields[3].c_str();
                     // XXX TODO VCF has variable numbers of fields...
                     if (numberstr == "A") {
                         number = ALLELE_NUMBER;
@@ -1853,7 +1855,15 @@ bool VariantCallFile::parseHeader(string& hs) {
                     } else {
                         convert(numberstr, number);
                     }
-                    VariantFieldType type = typeStrToVariantFieldType(mapper["TYPE"]);
+                    if (fields[4] != "Type") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[4] != \"Type\"" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    VariantFieldType type = typeStrToVariantFieldType(fields[5]);
+
+                    // VariantFieldType type = typeStrToVariantFieldType(mapper["TYPE"]);
                     if (entryType == "INFO") {
                         infoCounts[id] = number;
                         infoTypes[id] = type;


=====================================
src/vcffilter.cpp
=====================================
@@ -15,7 +15,7 @@ using namespace std;
 using namespace vcflib;
 
 void printSummary(char** argv) {
-  cerr << "vcflib " << VCFLIB_VERSION << " filter the specified vcf file using the set of filters" << endl << endl
+  cerr << "vcflib filter the specified vcf file using the set of filters" << endl << endl
        << "usage: " << argv[0] << " [options] <vcf file>" << endl
          << endl
          << "options:" << endl


=====================================
test/pytest/vcfcreatemulti.md
=====================================
@@ -109,6 +109,7 @@ ALT-SNP2 ACTGACTA       1/0
 ```
 
 In words: the result is incorrect.
+
 At this point, for analysis, there is little else to do but go to the original data (pangenome or VCF) and compare the results.
 What `vcfcreatemulti` helps to do is point out that there is a complex region here with ample variation and the resulting layout is a problem (too many ALTs as in 'too many cooks'!).
 
@@ -118,12 +119,47 @@ To help vcflib show's a `WARNING: Too many ALT alleles to fit in sample(s)' and
 grep MULTI= ./test/tmp/vcfcreatemulti_2.vcf -c
 ```
 
-Finds 3 marked records.
+Finds 3 marked records. One of them is derived from the combination:
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+```
+
+resulting in
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G,GTTTCTTGAAAAACCAAAGGT 60  . AC=20,
+1,20;AF=0.224719,0.011236,0.227273;AN=89,89,88;AT=>601>602>603>605>606,>601>602>604>605>606,>601>602>603>605>606
+,>601>606,>602>603>605,>602>604>605;NS=45;LV=0;MULTI=ALTPROBLEM;combined=36377478-36377496  GT  0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 3|3 0|0 0|0 0|0 0|0 3|0 3|0 0|2 0|0 0|3 0|3 3|3 3|3
+0|0 3|3 0|0 0|3 0|3 0|0 3|0 3|3 0|3 0|3 0|0 0|3 0
+```
+
+This is a combination of:
+
+```
+grch38#chr8 36377478  >601>606  GTTTCTTGAAAAACCAAATGT GTTTCTTGAAAAACCAAAGGT,G 60  . AC=20,1;AF=0.224719,0.011236
+;AN=89;AT=>601>602>603>605>606,>601>602>604>605>606,>601>606;NS=45;LV=0 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
+0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|0 0|2 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1
+0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+grch38#chr8 36377496  >602>605  T G 60  . AC=20;AF=0.227273;AN=88;AT=>602>603>605,>602>604>605;NS=45;LV=1;PS=>60
+1>606 GT  0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 1|1 0|0 0|0 0|0 0|0 1|0 1|
+0 0|. 0|0 0|1 0|1 1|1 1|1 0|0 1|1 0|0 0|1 0|1 0|0 1|0 1|1 0|1 0|1 0|0 0|1 0
+```
+
+Where the ALTs end up being a duplication and there is some overlap in the genotype calling.
 
 One future solution might be to have vcfcreatemulti ignore SNPs, or only take the first one, but that somewhat would do away with pointing out complex arrangements. Another solution might be to edit the ALTs and merge ALT-SNP1 into ALT-SNP2 so we get `ACTGCCTA`.
-I have not made up my mind yet.
 Contributions and ideas are welcome!
 
+Having a think about this: the safest approach is to backtrack on a conflict and leave it alone. So, when a variant comes up that conflicts with the combined record (so far) we should drop merging that variant and leave it alone. This will typically happen with a long ALT that overlaps many SNPs. We could come up with all types of solutions, but the point of this algorithm is to 'fix' the obvious cases. At this point we continue and show the MULTI=ALTPROBLEM info field. It is not satisfactory and it is slow too. We can have a stab at the backtrack in the future.
+
 ## Source code
 
 [vcfcreatemulti.cpp](../../src/vcfcreatemulti.cpp)


=====================================
test/pytest/vcffilter.md
=====================================
@@ -0,0 +1,119 @@
+% VCFFILTER(1) vcffilter (vcflib) | vcffilter (VCF filter)
+% Erik Garrison and vcflib contributors
+
+# NAME
+
+**vcffilter**
+
+# SYNOPSIS
+
+**vcffilter** [options] <vcf file>
+
+# DESCRIPTION
+
+VCF filter the specified vcf file using the set of filters.
+
+
+# OPTIONS
+<!--
+
+    >>> from rtest import run_stdout, head, cat, sh
+
+-->
+
+Current command line options:
+
+```
+
+>>> head("vcffilter -h",39)
+vcflib filter the specified vcf file using the set of filters
+>
+usage: vcffilter [options] <vcf file>
+>
+options:
+    -f, --info-filter     specifies a filter to apply to the info fields of records,
+                          removes alleles which do not pass the filter
+    -g, --genotype-filter specifies a filter to apply to the genotype fields of records
+    -k, --keep-info       used in conjunction with '-g', keeps variant info, but removes genotype
+    -s, --filter-sites    filter entire records, not just alleles
+    -t, --tag-pass        tag vcf records as positively filtered with this tag, print all records
+    -F, --tag-fail        tag vcf records as negatively filtered with this tag, print all records
+    -A, --append-filter   append the existing filter tag, don't just replace it
+    -a, --allele-tag      apply -t on a per-allele basis.  adds or sets the corresponding INFO field tag
+    -v, --invert          inverts the filter, e.g. grep -v
+    -o, --or              use logical OR instead of AND to combine filters
+    -r, --region          specify a region on which to target the filtering, requires a BGZF
+                          compressed file which has been indexed with tabix.  any number of
+                          regions may be specified.
+>
+Filter the specified vcf file using the set of filters.
+Filters are specified in the form "<ID> <operator> <value>:
+ -f "DP > 10"  # for info fields
+ -g "GT = 1|1" # for genotype fields
+ -f "CpG"  # for 'flag' fields
+>
+Operators can be any of: =, !, <, >, |, &
+>
+Any number of filters may be specified.  They are combined via logical AND
+unless --or is specified on the command line.  Obtain logical negation through
+the use of parentheses, e.g. "! ( DP = 10 )"
+>
+For convenience, you can specify "QUAL" to refer to the quality of the site, even
+though it does not appear in the INFO fields.
+>
+type: filter
+>
+
+```
+
+
+
+# EXIT VALUES
+
+**0**
+: Success
+
+**not 0**
+: Failure
+
+# EXAMPLES
+
+Filter VCF records that have an allele count > 10 results in 1471 matches for
+
+```python
+>>> sh("../build/vcffilter -f 'AC > 10' ../samples/grch38#chr4_10083863-10181258.vcf|wc -l")
+546
+
+```
+
+# SEE ALSO
+
+[vcflib](./vcflib.md)(1)
+
+Note that [bio-vcf](https://github.com/vcflib/bio-vcf) may easily give 5x better performance. E.g.
+
+```
+bio-vcf --filter 'r.info.ac>10'
+```
+
+instead of
+
+```
+vcffilter -f 'AC > 10'
+```
+
+will filter all AC fields larger than 10.
+
+# OTHER
+
+## Source code
+
+[vcffilter.cpp](https://github.com/vcflib/vcflib/blob/master/src/vcffilter.cpp)
+
+# LICENSE
+
+Copyright 2011-2023 (C) Erik Garrison and vcflib contributors. MIT licensed.
+
+<!--
+  Created with ./scripts/bin2md.rb scripts/bin2md-template.erb
+-->


=====================================
test/tests/mainTest.cpp deleted
=====================================
@@ -1,11 +0,0 @@
-#include <stdio.h>
-
-#include "gtest/gtest.h"
-#include "variantTests.h"
-#include "variantFileTests.h"
-
-GTEST_API_ int main(int argc, char** argv)
-{
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}


=====================================
test/tests/variantFileTests.h deleted
=====================================
@@ -1,80 +0,0 @@
-#include "gtest/gtest.h"
-#include "Variant.h"
-#include <iostream>
-#include <string>
-#include <vector>
-
-TEST(VariantCallFile, open){
-
-  vcflib::VariantCallFile variantFile;
-
-  std::string filename = "../samples/sample.vcf";
-
-  variantFile.open(filename);
-  
-  ASSERT_TRUE(variantFile.is_open());
-
-
-};
-
-
-TEST(VariantCallFile, recordCountUncompressed){
-  vcflib::VariantCallFile variantFile;
-  
-  std::string filename = "../samples/sample.vcf";
-  variantFile.open(filename);
-  vcflib::Variant var(variantFile);
-  
-  long int count = 0;
-
-  while (variantFile.getNextVariant(var)) {
-    count+= 1;
-  }
-  ASSERT_EQ(count, 9);
-}
-
-TEST(VariantCallFile, recordCountCompressed){
-  vcflib::VariantCallFile variantFile;
-
-  std::string filename = "../samples/sample.compressed.vcf.gz";
-  variantFile.open(filename);
-  vcflib::Variant var(variantFile);
-
-  long int count = 0;
-  while (variantFile.getNextVariant(var)) {
-    count+= 1;
-  }
-  ASSERT_EQ(count, 9);
-}
-
-TEST(VariantCallFile, sampleNamesCompressed){
-  vcflib::VariantCallFile variantFile;
-
-  std::string filename = "../samples/sample.compressed.vcf.gz";
-  variantFile.open(filename);
-
-  int sampleSize = variantFile.sampleNames.size();
-
-  ASSERT_EQ(sampleSize, 3);
-
-  std::vector<std::string> names;
-  names.push_back("NA00001");
-  names.push_back("NA00002");
-  names.push_back("NA00003");
-  
-  int counter = 0;
-
-  for(std::vector<std::string>::iterator it 
-	= variantFile.sampleNames.begin(); 
-      it != variantFile.sampleNames.end(); it++){
-    
-    ASSERT_EQ(*it, names[counter]);
-    
-    counter+=1;
-
-  }
-
-
-  
-
-}


=====================================
test/tests/variantTests.h deleted
=====================================
@@ -1,37 +0,0 @@
-#include "gtest/gtest.h"
-#include "Variant.h"
-#include <iostream>
-#include <string>
-#include <vector>
-
-
-
-TEST(Variant, correctGenotype){
-
-  const char *truth[] = {"0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", 
"0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|1", "0|1", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "1|1", "1|1", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "2|2", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", 
"0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "1|1", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", 
"0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "1|1", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0", "0|0"};
-
-  vcflib::VariantCallFile variantFile;
-
-  std::string filename = "../samples/1kg-phaseIII-v5a.20130502.genotypes.chr22-16-16.5mb.vcf.gz";
-
-  variantFile.open(filename);  
-
-  vcflib::Variant var(variantFile);
-  
-  int index = 0;
-  
-  while (variantFile.getNextVariant(var)) {
-
-    std::string tCall = string(truth[index]);
-    std::string eCall = var.samples["NA21144"]["GT"].front();
-
-  
-    ASSERT_EQ(tCall, eCall);
-   
-    index+=1;
-    
-    if(index > 999){
-      break;
-    }
-  }
-}



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/d3984a705aff55e7c362f9735b30a6ba72c99b3d...b27cf4d015ed38cfe71abb2243c218b6536ca401

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/d3984a705aff55e7c362f9735b30a6ba72c99b3d...b27cf4d015ed38cfe71abb2243c218b6536ca401
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230207/b7abd7c5/attachment-0001.htm>


More information about the debian-med-commit mailing list