[med-svn] [Git][med-team/libvcflib][upstream] 2 commits: New upstream version 1.0..6.zip+dfsg

Andreas Tille (@tille) gitlab at salsa.debian.org
Sun Feb 5 17:21:54 GMT 2023



Andreas Tille pushed to branch upstream at Debian Med / libvcflib


Commits:
c4cb620d by Andreas Tille at 2023-02-05T18:07:42+01:00
New upstream version 1.0..6.zip+dfsg
- - - - -
a206b088 by Andreas Tille at 2023-02-05T18:18:28+01:00
New upstream version 1.0.6+dfsg
- - - - -


10 changed files:

- .gitmodules
- CMakeLists.txt
- RELEASE_NOTES.md
- VERSION
- guix.scm
- src/vcfcreatemulti.cpp
- src/vcfwave.cpp
- src/zig/build.zig
- src/zig/vcf.zig
- test/doc/performance.md


Changes:

=====================================
.gitmodules
=====================================
@@ -28,7 +28,7 @@
 [submodule "contrib/tabixpp"]
 	path = contrib/tabixpp
 	url = https://github.com/vcflib/tabixpp.git
-  branch = pjotrp
+  branch = master
 [submodule "contrib/WFA2-lib"]
 	path = contrib/WFA2-lib
 	url = https://github.com/smarco/WFA2-lib


=====================================
CMakeLists.txt
=====================================
@@ -1,3 +1,11 @@
+# Debian currently builds with:
+#
+#   cd build
+#   cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DZIG=OFF -DWFA_GITMODULE=OFF ..
+#   make
+#   make test
+#   make install
+#
 cmake_minimum_required(VERSION 3.16)
 project(vcflib)
 
@@ -5,10 +13,12 @@ set(CMAKE_CXX_STANDARD 17)
 
 include(ExternalProject)
 include(FeatureSummary)
+include(GNUInstallDirs)
 
 find_package(PkgConfig REQUIRED)
 find_package(pybind11 CONFIG)
 
+include(GNUInstallDirs)
 include(FindBZip2)
 include(FindLibLZMA)
 include(FindZLIB)
@@ -27,7 +37,8 @@ option(OPENMP "Enable OpenMP" ON) # disabling does not work because of vcfwave
 option(PROFILING "Enable profiling" OFF)
 option(GPROF "Enable gprof profiling" OFF)
 option(ASAN "Use address sanitiser" OFF)
-option(ZIG "Disable using the zig compiler" ON)
+option(ZIG "Set to OFF to disable the zig code" ON)
+option(WFA_GITMODULE "Force local git submodule for WFA2LIB" ON) # disable in distros, you may need to add path to WFA_INCLUDE_DIRS
 
 include(CheckIPOSupported) # adds lto
 check_ipo_supported(RESULT ipo_supported OUTPUT output)
@@ -42,8 +53,6 @@ endif(OPENMP)
 
 find_package(ZLIB)
 set_package_properties(ZLIB PROPERTIES TYPE REQUIRED)
-#find_package(LIBLZMA)
-#set_package_properties(LIBLZMA PROPERTIES TYPE REQUIRED)
 find_package(Threads)
 set_package_properties(Threads PROPERTIES TYPE REQUIRED)
 
@@ -91,7 +100,7 @@ endif(GPROF)
 if (ZIG)
   find_program(ZIG_EXE NAMES "zig")
   if (NOT ZIG_EXE)
-    MESSAGE(FATAL_ERROR "zig binary not found in PATH: either use cmake -DNO_ZIG option or add zig to the PATH")
+    MESSAGE(FATAL_ERROR "zig binary not found in PATH. zig is used for vcfcreatemulti. Either use cmake -DZIG=OFF option or add zig to the PATH")
   endif (NOT ZIG_EXE)
 endif(ZIG)
 
@@ -118,7 +127,9 @@ if (NOT TABIX_FOUND)
   )
 endif()
 
-set(WFA_LOCAL contrib/WFA2-lib)
+if(WFA_GITMODULE)
+  set(WFA_LOCAL contrib/WFA2-lib)
+endif()
 
 file(GLOB INCLUDES
   src/*.h*
@@ -248,23 +259,12 @@ set(BINS
     vcfinfo2qual
     vcfglbound
     vcfinfosummarize
+    vcfcreatemulti
 )
 
-if (ZIG)
-  list(APPEND BINS vcfcreatemulti)
-endif (ZIG)
-
 set(SCRIPTS
     bed2region
     bgziptabix
-    # plotBfst.R
-    # plotHaplotypes.R
-    # plotHapLrt.R
-    # plotPfst.R
-    # plot_roc.r
-    # plotSmoothed.R
-    # plotWCfst.R
-    # plotXPEHH.R
     vcf2bed.py
     vcf2sqlite.py
     vcfbiallelic
@@ -307,7 +307,6 @@ file (STRINGS "VERSION" BUILD_NUMBER)
 add_definitions(-DVCFLIB_VERSION="${BUILD_NUMBER}")
 add_definitions(-DVERSION="${BUILD_NUMBER}")
 
-
 # ---- Build htslib
 #
 # Note by default we use the distributed htslib! These are
@@ -356,42 +355,17 @@ if (HTSLIB_LOCAL)
 
 endif(HTSLIB_LOCAL)
 
-# FIXME: hard-coded compile switches need to be fixed upstream
-if (ASAN)
-  set(wfa_MAKE_ARGS BUILD_WFA_PARALLEL=1 BUILD_TOOLS=0 BUILD_EXAMPLES=0 CC=gcc CC_FLAGS=-fPIC setup asan lib_wfa)
-else()
-  set(wfa_MAKE_ARGS BUILD_WFA_PARALLEL=1 BUILD_TOOLS=0 BUILD_EXAMPLES=0 CC=gcc CC_FLAGS=-fPIC setup lib_wfa)
-endif()
-
-# if ((${CMAKE_BUILD_TYPE} MATCHES Release) OR (${CMAKE_BUILD_TYPE} MATCHES RelWithDebInfo))
-#   set(wfa_MAKE_ARGS ${wfa_MAKE_ARGS})
-# endif()
-
-ExternalProject_Add(wfa-EXT
-    SOURCE_DIR "${CMAKE_SOURCE_DIR}/${WFA_LOCAL}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    INSTALL_COMMAND ""
-    BUILD_IN_SOURCE ON
-    BUILD_ALWAYS ON
-    BUILD_COMMAND $(MAKE) ${wfa_MAKE_ARGS}
-)
-ExternalProject_Get_property(wfa-EXT SOURCE_DIR)
-set(WFA_INCLUDE_DIRS ${SOURCE_DIR})
-set(WFA_LINK_LIBRARIES ${SOURCE_DIR}/lib/libwfa.a )
-# set(WFA_LINK_LIBRARIES ${SOURCE_DIR}/lib/libwfa.a ${SOURCE_DIR}/lib/libwfacpp.a )
-link_directories($SOURCE_DIR}/lib)
-MESSAGE(STATUS ${WFA_LINK_LIBRARIES})
-
-# add_library(WAFCPP_STATIC_LIB STATIC IMPORTED)
-# set_target_properties(WAFCPP_STATIC_LIB PROPERTIES IMPORTED_LOCATION ${WFA_LINK_LIBRARIES})
-# add_dependencies(WAFCPP_STATIC_LIB wfa-EXT)
-
-add_custom_target(wfa
-    DEPENDS wfa-EXT
-    VERBATIM)
+if(WFA_GITMODULE)
+  set(WFA_INCLUDE_DIRS ${WFA_LOCAL})
+  add_subdirectory(${WFA_LOCAL})
+  set(WFALIB wfa2) # pick up the wfa2 lib target from the included CMakeLists.txt
+else(WFA_GITMODULE)
+  set(WFA_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/wfa2lib)
+  find_library(WFALIB wfa2 wfa) # distro search for shared lib
+endif(WFA_GITMODULE)
 
 include_directories(${WFA_INCLUDE_DIRS})
+MESSAGE(STATUS "WFA using include ${WFA_INCLUDE_DIRS}")
 
 # ZIG VCF imported library is part of VCFLIB source tree. We designate it an
 # external project so we can run 'zig build'
@@ -404,31 +378,25 @@ ExternalProject_Add(ZIG-EXT
     INSTALL_COMMAND ""
     BUILD_IN_SOURCE ON
     BUILD_ALWAYS ON
-    BUILD_COMMAND ${CMAKE_SOURCE_DIR}/src/zig/compile.sh -Drelease-fast=true -freference-trace
+    BUILD_COMMAND ${CMAKE_SOURCE_DIR}/src/zig/compile.sh # -Drelease-fast=true -freference-trace
 )
 ExternalProject_Get_property(ZIG-EXT SOURCE_DIR)
 set(ZIG_INCLUDE_DIRS ${SOURCE_DIR})
 set(ZIG_LINK_LIBRARIES ${SOURCE_DIR}/zig-out/lib/libzig.a)
-# link_directories($SOURCE_DIR}/lib)
-# MESSAGE(STATUS ${ZIG_LINK_LIBRARIES})
 
 add_library(ZIGCPP_STATIC_LIB STATIC IMPORTED)
 set_target_properties(ZIGCPP_STATIC_LIB PROPERTIES IMPORTED_LOCATION ${ZIG_LINK_LIBRARIES})
 add_dependencies(ZIGCPP_STATIC_LIB ZIG-EXT vcf.zig)
 
-# add_custom_target(ZIGLIB
-#     DEPENDS ZIG-EXT
-#     VERBATIM)
-
 include_directories(${ZIG_INCLUDE_DIRS})
 
 else (ZIG)
   set (ZIG_LINK_LIBRARIES )
+  add_definitions(-DNO_ZIG=1)
 endif (ZIG)
 
 set(vcflib_DEPS
   CURL::libcurl
-  wfa
   )
 
 if (ZIG)
@@ -455,7 +423,6 @@ if (ZIG)
   list(APPEND vcflib_LIBS ${ZIG_LINK_LIBRARIES})
 endif()
 
-# target_link_libraries(vcflib PUBLIC ${vcflib_LIBS})
 add_dependencies(vcflib ${vcflib_DEPS})
 
 # ---- Build all
@@ -464,8 +431,9 @@ if (NOT BUILD_ONLY_LIB)
   foreach(BIN ${BINS})
     add_executable(${BIN} src/${BIN}.cpp)
     add_dependencies(${BIN} vcflib)
-    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib)
+    target_link_libraries(${BIN} PUBLIC ${vcflib_LIBS} vcflib ${WFALIB})
   endforeach(BIN ${BINS})
+  # target_link_libraries(vcfwave PUBLIC ${WFALIB})
   install(TARGETS ${BINS} RUNTIME DESTINATION bin)
 
   # ---- Copy scripts
@@ -478,7 +446,7 @@ endif()
 # ---- Python bindings - mostly for testing at this stage
 pybind11_add_module(pyvcflib "${CMAKE_SOURCE_DIR}/src/pythonffi.cpp")
 add_dependencies(pyvcflib ${vcflib_DEPS})
-target_link_libraries(pyvcflib PUBLIC vcflib "${vcflib_LIBS}")
+target_link_libraries(pyvcflib PUBLIC vcflib ${vcflib_LIBS} ${WFALIB})
 install(TARGETS pyvcflib LIBRARY DESTINATION lib)
 
 # ---- Test
@@ -548,7 +516,6 @@ endif (ZIG)
 add_pydoctest(vcfnulldotslashdot)
 add_doctest(doc/vcfintersect)
 add_doctest(doc/vcffilter)
-# add_pydoctest_fullname(../README.md)
 
 # ---- Build docs
 #


=====================================
RELEASE_NOTES.md
=====================================
@@ -5,32 +5,35 @@ and
 
 ## TODO
 
-- [X] vcfcreatemulti merge multiple rows
-      + [X] rewrite vcfcreatemulti using zig
-      + [X] merge genotypes correctly, with tests
-      + [X] adjust info and genotypes for variants that have multiple alts already (now errors)
-      + [X] handle phase
-      + [X] document using with `vcfwave` and `bcftools norm -m-`
-      + [X] document building with zig
-      + [X] added progress bar to vcfwave and vcfcreatemulti with update to tabixpp
-      + [X] default vcfwave and vcfcreatemulti to nextgen mode
-      + [X] why is vcfwave on a single thread?
-      + [X] check file is sorted for vcfcreatemulti and improve suggestions
-      + [X] update WFA-LIB to main
-      + [X] add tests (zig memory handler)
-      + [X] check for memory leaks
-      + [ ] make CI pass with recent zig in path
-- [ ] RELEASE 1.0.5
+- [ ] vcfcreatemulti: fix problem with slow and wrong complex regions
 - [ ] complete vcfcreatemulti merge multiple rows
       + [ ] check for indels which are really the same
       + [ ] combine vcfwave duplicated functionality
-- [ ] vcfuniq combine genotypes from non-unique records: use vcfcreatemulti and bcftools norm as above
-- [ ] vcfwave check polyploids?
-- [ ] RELEASE 1.0.6
+- bgzip
+- tabix -p vcf my_file.vcf.gz
+- pangenie, vg deconstruct, vcfbub
 
-## ChangeLog v1.0.5 (2023xxxx)
+## ChangeLog v1.0.6 (20230129)
 
-Release with some major changes.
+Vcflib maintenance release, mostly for inclusion in Debian.
+
++ Fixed zig complaining about leaking memory
++ Added CMake Debian support with -DWFA_GITMODULE=OFF
++ Introduced CMake include(GNUInstallDirs)
++ Successfully built wfa2 using embedded CMakeLists.txt
++ Cleaned up CMakeLists.txt removing comments etc.
++ Reintroduced vcfcreatemulti in legacy mode when ZIG=OFF (for Debian)
+
+## ChangeLog v1.0.5 (20230116)
+
+Vcflib's first *Humpty Dumpty* release: [vcfcreatemulti](./doc/vcfcreatemulti.md) is the natural companion to [vcfwave](./doc/vcfwave.md).
+
+Often variant callers are not perfect.
+**vcfwave** with its companion tool **vcfcreatemulti** can take an existing VCF file that contains multiple complex overlapping and even nested alleles and, unlike Humpty Dumpty, take them apart and put them together again.
+This should, hopefully, create sane VCF output that is useful for analysis and gets rid of false positives.
+
+We created these tools by including the state-of-the-art [biWFA](https://github.com/smarco/WFA2-lib) wavefront aligner.
+The tools are particularly useful for the output of structural variation callers and pangenome genotypers, such as those used by the Human Pangenome Reference Consortium (HPRC), because that output contains overlapping ALT segments.
 
 Important changes:
 
@@ -58,6 +61,15 @@ Introduction of O(n) wavefront algorithm WF to replace O(n^2) Smith-Waterman SW.
 + Added python testing framework
 + Added tabixpp back in as a submodule, fixes https://github.com/vcflib/vcflib/issues/305
 + Optimizations and bug fixes. (thanks @mphschmitt)
++ vcfcreatemulti merge multiple rows
++ rewrite of vcfcreatemulti using zig
++ vcfcreatemulti merge genotypes correctly, with tests
++ vcfcreatemulti adjust info and genotypes for variants that have multiple alts already (now errors)
++ vcfcreatemulti handle phase
++ vcfcreatemulti document building with zig
++ added progress bar to vcfwave and vcfcreatemulti with update to tabixpp
++ vcfwave and vcfcreatemulti now default to nextgen mode
++ vcfcreatemulti checks that the input file is sorted and gives improved suggestions
 
 ## ChangeLog v1.0.4
 


=====================================
VERSION
=====================================
@@ -1 +1 @@
-1.0.4
+1.0.6


=====================================
guix.scm
=====================================
@@ -91,7 +91,7 @@
        ("smithwaterman" ,smithwaterman)
        ("tabixpp" ,tabixpp)
        ("xz" ,xz)
-       ; ("zig" ,zig) ;; note we use zig-0.9.1
+       ("zig" ,zig) ;; note we use zig-0.9.1
        ("zlib" ,zlib)))
     (native-inputs
      `(("pkg-config" ,pkg-config)))


=====================================
src/vcfcreatemulti.cpp
=====================================
@@ -19,12 +19,15 @@ extern "C" {
 using namespace std;
 using namespace vcflib;
 
+#ifdef NO_ZIG
+bool nextGen  = false;
+#else
 bool nextGen  = true;
+#endif
+
 bool quiet = false;
 off_t file_size = -1;
 
-// extern "C" void *zig_create_multi_allelic(Variant *retvar, Variant *varlist[], long size);
-
 void printSummary(char** argv) {
     cerr << R"(
 Usage: vcfcreatemulti [options] [file]
@@ -112,6 +115,7 @@ Variant createMultiallelic_legacy(vector<Variant>& vars) {
     return mvar;
 }
 
+#ifndef NO_ZIG
 Variant createMultiallelic_zig(vector<Variant>& vars) {
 
     if (vars.size() == 1) {
@@ -127,12 +131,15 @@ Variant createMultiallelic_zig(vector<Variant>& vars) {
 
     return *mvar;
 }
+#endif
 
 // This function is called for every line/variant in the VCF file
 Variant createMultiallelic(vector<Variant>& vars) {
+    #ifndef NO_ZIG
     if (nextGen)
         return createMultiallelic_zig(vars);
     else
+    #endif
         return createMultiallelic_legacy(vars);
 }
 
@@ -211,7 +218,7 @@ int main(int argc, char** argv) {
     size_t prev_pos = 0;
 
     if (!quiet)
-        cerr << "vcfcreatemulti processing..." << endl;
+        cerr << "vcfcreatemulti " << VCFLIB_VERSION << " processing..." << endl;
 
     while (variantFile.getNextVariant(var)) {
 
@@ -276,10 +283,12 @@ int main(int argc, char** argv) {
         cout << result << endl;
     }
 
+    #ifndef NO_ZIG
     if (nextGen) {
         zig_display_warnings();
         zig_cleanup();
     }
+    #endif
 
     if (!quiet) cerr << endl;
 


=====================================
src/vcfwave.cpp
=====================================
@@ -223,7 +223,7 @@ int main(int argc, char** argv) {
     uint64_t start = get_timestamp();
 
     if (!quiet)
-        cerr << "vcfwave processing..." << endl;
+        cerr << "vcfwave " << VCFLIB_VERSION << " processing..." << endl;
     while (variantFile.getNextVariant(var)) {
 
         amount = (double)variantFile.file_pos()/(double)file_size;


=====================================
src/zig/build.zig
=====================================
@@ -19,7 +19,7 @@ pub fn build(b: *std.build.Builder) void {
 
     const main_tests = b.addTest("vcf.zig");
     main_tests.setBuildMode(mode);
-    main_tests.addLibraryPath("../../build");
+    // main_tests.addLibraryPath("../../build");
     // main_tests.addObjectFile("../../build/libvcflib.a");
 
     const test_step = b.step("test", "Run library tests");


=====================================
src/zig/vcf.zig
=====================================
@@ -56,7 +56,6 @@ var gpa = std.heap.GeneralPurposeAllocator(.{}){};
 const allocator = gpa.allocator();
 
 var warnings = std.StringHashMap(bool).init(allocator); // Note: not thread safe
-// var warnings = ArrayList([] const u8).init(allocator); // NOTE: not thread safe
 
 pub fn warning(str: [] const u8) !void {
     try warnings.put(str,true);
@@ -308,29 +307,27 @@ export fn zig_create_multi_allelic(variant: ?*anyopaque, varlist: [*c]?* anyopaq
 
     // Get genotypes and update mvar
     var genotypes = samples.reduce_renumber_genotypes(Variant,vs) catch unreachable;
-    // mvar.set_samples(genotypes.c_samples());
+    defer genotypes.s_samples.deinit();
+
     mvar.set_samples(genotypes.s_samples);
     var ninfo = ArrayList([] const u8).init(allocator);
-    ninfo.deinit();
+    defer ninfo.deinit();
+    
     if (genotypes.g_err != samples.VcfSampleError.None) {
         ninfo.append("ALTPROBLEM") catch unreachable;
         mvar.set_info("MULTI",ninfo);
     }
         
-    genotypes.s_samples.deinit();
-    
     return mvar.v;
 }
 
 /// The C++ code should call this to cleanup
 
 export fn zig_cleanup() void {    
-    // p("zig cleaning up!",.{});
-    // std.debug.assert(!gpa.deinit());
     warnings.deinit();
 
-    const leaked = gpa.deinit();
-    if (leaked) p("MEM FAIL",.{});
+    // ---- Not cleaning up the GPA unless we are debugging
+    // std.debug.assert(!gpa.deinit());
 }
 
 


=====================================
test/doc/performance.md
=====================================
@@ -54,3 +54,7 @@ tux02:~/tmp/vcflib/build$ /usr/bin/time -v ./vcfwave ../samples/chr18.grch38.vcf
 hmm. Not so exciting. Looks like we need to parallelize at a different level. Memory used was about 6Gb, and that is not bad.
 
 The good news is that the output is identical.
+
+## vcfcreatemulti
+
+This program is actually slower than vcfwave.



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/79f5652d833028a9d9db89711c99ab5ef5aeedc5...a206b08809db1f11293430dc38a800de33d2509f

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/-/compare/79f5652d833028a9d9db89711c99ab5ef5aeedc5...a206b08809db1f11293430dc38a800de33d2509f
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230205/c99dc370/attachment-0001.htm>


More information about the debian-med-commit mailing list