[med-svn] [gatb-core] 02/04: New upstream version 1.4.0+dfsg
Andreas Tille
tille at debian.org
Fri Dec 1 14:31:38 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository gatb-core.
commit 51eb9b0b060086f7f91dac009275f0ad7fffd0fd
Author: Andreas Tille <tille at debian.org>
Date: Fri Dec 1 15:24:01 2017 +0100
New upstream version 1.4.0+dfsg
---
.travis.yml | 31 +
README.md | 9 +-
gatb-core/.gitignore | 3 +
gatb-core/CMakeLists.txt | 15 +-
gatb-core/RELEASES.md | 84 +
gatb-core/cmake/Delivery.cmake | 2 +-
gatb-core/cmake/GetDate.cmake | 31 -
gatb-core/doc/doxygen/src/mainpage.hpp | 10 +-
gatb-core/doc/doxygen/src/snippetspage.hpp | 45 +-
gatb-core/docker/Dockerfile.clang | 5 +-
gatb-core/examples/.DS_Store | Bin 10244 -> 0 bytes
gatb-core/examples/Makefile | 24 +
gatb-core/examples/README.md | 94 +
gatb-core/examples/bank/bank15.cpp | 16 +-
gatb-core/examples/bank/bank21.cpp | 97 +
gatb-core/examples/bank/bank22.cpp | 50 +
gatb-core/examples/bank/{bank15.cpp => bank23.cpp} | 21 +-
gatb-core/examples/bank/bank24.cpp | 77 +
gatb-core/examples/bank/bank25.cpp | 93 +
gatb-core/examples/bank/bank26.cpp | 54 +
gatb-core/examples/bank/bank27.cpp | 70 +
gatb-core/examples/bank/bank28.cpp | 64 +
gatb-core/examples/kmer/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/examples/storage/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/examples/tools/multithreading8.cpp | 89 +
gatb-core/scripts/NewProject/CMakeLists.txt | 6 +-
gatb-core/scripts/delivery_compile.sh | 2 +-
gatb-core/scripts/make_official_release.sh | 2 +-
gatb-core/src/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/src/gatb/.DS_Store | Bin 8196 -> 0 bytes
gatb-core/src/gatb/bank/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/src/gatb/bank/api/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/src/gatb/bank/api/Sequence.hpp | 116 +
gatb-core/src/gatb/bank/impl/AbstractBank.hpp | 2 +-
gatb-core/src/gatb/bank/impl/Bank.cpp | 18 +
gatb-core/src/gatb/bank/impl/BankBinary.cpp | 2 +-
gatb-core/src/gatb/bank/impl/BankFasta.cpp | 6 +-
gatb-core/src/gatb/bcalm2/bcalm_algo.cpp | 648 ++++--
gatb-core/src/gatb/bcalm2/bcalm_algo.hpp | 88 +-
gatb-core/src/gatb/bcalm2/bglue_algo.cpp | 922 ++++----
gatb-core/src/gatb/bcalm2/bglue_algo.hpp | 158 +-
gatb-core/src/gatb/bcalm2/lockstdqueue.h | 52 -
gatb-core/src/gatb/bcalm2/logging.cpp | 40 +
gatb-core/src/gatb/bcalm2/logging.hpp | 6 +
gatb-core/src/gatb/bcalm2/ograph.cpp | 340 ++-
gatb-core/src/gatb/bcalm2/ograph.h | 22 +-
gatb-core/src/gatb/debruijn/impl/ExtremityInfo.hpp | 4 +
gatb-core/src/gatb/debruijn/impl/Graph.cpp | 23 +-
gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp | 621 ++++-
gatb-core/src/gatb/debruijn/impl/GraphUnitigs.hpp | 39 +-
.../src/gatb/debruijn/impl/IterativeExtensions.cpp | 24 +-
gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp | 381 +++
gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp | 40 +
.../src/gatb/debruijn/impl/Simplifications.cpp | 220 +-
.../src/gatb/debruijn/impl/Simplifications.hpp | 15 +-
.../debruijn/impl/UnitigsConstructionAlgorithm.cpp | 194 +-
.../debruijn/impl/UnitigsConstructionAlgorithm.hpp | 29 +-
gatb-core/src/gatb/debruijn/impl/dag_vector.hpp | 325 +++
gatb-core/src/gatb/debruijn/impl/rank_vector.hpp | 137 ++
gatb-core/src/gatb/gatb_core.hpp | 5 +-
gatb-core/src/gatb/kmer/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/src/gatb/kmer/impl/.DS_Store | Bin 10244 -> 0 bytes
.../src/gatb/kmer/impl/ConfigurationAlgorithm.cpp | 4 +-
.../src/gatb/kmer/impl/CountProcessorHistogram.hpp | 3 +
gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp | 20 +-
gatb-core/src/gatb/kmer/impl/Model.hpp | 256 +-
gatb-core/src/gatb/kmer/impl/PartiInfo.hpp | 28 +-
gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp | 1477 ++++++++++--
gatb-core/src/gatb/kmer/impl/PartitionsCommand.hpp | 236 +-
.../src/gatb/kmer/impl/RepartitionAlgorithm.cpp | 10 +-
.../src/gatb/kmer/impl/Sequence2SuperKmer.hpp | 9 +
.../src/gatb/kmer/impl/SortingCountAlgorithm.cpp | 583 ++++-
.../src/gatb/kmer/impl/SortingCountAlgorithm.hpp | 6 +
gatb-core/src/gatb/system/api/Exception.hpp | 19 +-
gatb-core/src/gatb/system/impl/FileSystemLinux.cpp | 2 +-
gatb-core/src/gatb/system/impl/FileSystemMacos.cpp | 2 +-
.../src/gatb/system/impl/SystemInfoCommon.cpp | 1 -
gatb-core/src/gatb/system/impl/ThreadLinux.cpp | 15 +-
gatb-core/src/gatb/system/impl/ThreadMacos.cpp | 16 +-
.../gatb/template/TemplateSpecialization10.cpp.in | 8 +-
gatb-core/src/gatb/tools/.DS_Store | Bin 6148 -> 0 bytes
.../src/gatb/tools/collections/impl/Hash16.hpp | 5 +
.../gatb/tools/collections/impl/IteratorFile.hpp | 10 +-
.../src/gatb/tools/collections/impl/MapMPHF.hpp | 311 ++-
gatb-core/src/gatb/tools/compression/DnaCoder.cpp | 1784 ++++++++++++++
gatb-core/src/gatb/tools/compression/DnaCoder.hpp | 307 +++
.../src/gatb/tools/compression/HeaderCoder.cpp | 789 +++++++
.../src/gatb/tools/compression/HeaderCoder.hpp | 186 ++
gatb-core/src/gatb/tools/compression/Leon.cpp | 2434 ++++++++++++++++++++
gatb-core/src/gatb/tools/compression/Leon.hpp | 490 ++++
.../src/gatb/tools/compression/RangeCoder.cpp | 2 +-
.../src/gatb/tools/compression/RangeCoder.hpp | 4 +-
gatb-core/src/gatb/tools/designpattern/.DS_Store | Bin 6148 -> 0 bytes
.../src/gatb/tools/designpattern/api/.DS_Store | Bin 6148 -> 0 bytes
.../src/gatb/tools/designpattern/api/Iterator.hpp | 12 +-
.../src/gatb/tools/designpattern/impl/.DS_Store | Bin 6148 -> 0 bytes
.../tools/designpattern/impl/IteratorHelpers.hpp | 60 +-
gatb-core/src/gatb/tools/misc/.DS_Store | Bin 8196 -> 0 bytes
gatb-core/src/gatb/tools/misc/api/.DS_Store | Bin 6148 -> 0 bytes
gatb-core/src/gatb/tools/misc/api/IHistogram.hpp | 5 +
gatb-core/src/gatb/tools/misc/impl/.DS_Store | Bin 8196 -> 0 bytes
gatb-core/src/gatb/tools/misc/impl/Histogram.cpp | 15 +
gatb-core/src/gatb/tools/misc/impl/Histogram.hpp | 16 +-
gatb-core/src/gatb/tools/misc/impl/Pool.hpp | 34 +-
gatb-core/src/gatb/tools/misc/impl/Stringify.hpp | 4 +
gatb-core/src/gatb/tools/misc/impl/Tool.cpp | 2 +-
gatb-core/src/gatb/tools/storage/impl/Storage.cpp | 285 +++
gatb-core/src/gatb/tools/storage/impl/Storage.hpp | 98 +-
gatb-core/src/gatb/tools/storage/impl/Storage.tpp | 4 +-
.../src/gatb/tools/storage/impl/StorageHDF5.hpp | 20 +-
.../db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.gz | Bin 0 -> 3891060 bytes
...IST7035_TAAGGCGA_L001_R1_001_5OK.fastq.leon-ref | Bin 0 -> 3276064 bytes
gatb-core/test/db/README.md | 22 +
gatb-core/test/db/giab.hg002.2D_6K.fastq.gz | Bin 0 -> 23908774 bytes
gatb-core/test/db/leon1.fastq | 28 +
gatb-core/test/db/leon1.fastq.leon-ref | Bin 0 -> 134856 bytes
gatb-core/test/db/leon2.fastq | 28 +
gatb-core/test/db/leon2.fastq.leon-ref | Bin 0 -> 134856 bytes
gatb-core/test/jenkins/leon/README.md | 62 +
gatb-core/test/jenkins/leon/download.sh | 72 +
gatb-core/test/jenkins/leon/test_integrity.sh | 31 +
.../tool-leon-functional-tests-compile.sh} | 53 +-
.../leon/tool-leon-functional-tests-test.sh | 266 +++
gatb-core/test/jenkins/publish-doc-api.sh | 20 +-
.../jenkins/test-bin-debian7-64bits-gcc-4.7.sh | 12 +-
...cc-4.2.1 => test-bin-macos-10.9.5-gcc-4.2.1.sh} | 12 +-
.../test-docker-gatb-core-compile-clang36.sh | 42 +
.../test-snippets-debian7-64bits-gcc-4.7.sh | 4 +-
.../test/jenkins/test-suite-fedora20-gcc-4.8.sh | 6 +-
.../test/slaves/ubuntu16-shell-provisioner.sh | 99 +
gatb-core/test/unit/src/bank/TestBank.cpp | 2 +-
gatb-core/test/unit/src/bank/TestLeon.cpp | 435 ++++
gatb-core/test/unit/src/debruijn/TestDebruijn.cpp | 2 +-
.../src/debruijn/TestSimplificationsUnitigs.cpp | 65 +-
gatb-core/test/unit/src/kmer/TestDSK.cpp | 4 +-
gatb-core/test/unit/src/kmer/TestMPHF.cpp | 7 +-
gatb-core/test/unit/src/main.cpp | 28 +-
.../test/unit/src/tools/collections/TestMap.cpp | 10 +-
gatb-core/thirdparty/json/json.hpp | 659 ++++++
gatb-core/tools/CMakeLists.txt | 4 +-
gatb-core/tools/dbgh5.cpp | 2 +-
gatb-core/tools/leon.cpp | 61 +
142 files changed, 15051 insertions(+), 2028 deletions(-)
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..672d2f3
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,31 @@
+language: cpp
+os:
+- linux
+compiler:
+- clang
+- gcc
+addons:
+ apt:
+ sources:
+ - ubuntu-toolchain-r-test
+ - llvm-toolchain-precise-3.7
+ - george-edison55-precise-backports # for cmake 3
+ packages:
+ - libcppunit-dev
+ - g++-4.8
+ - clang-3.7
+ - cmake
+ - cmake-data
+install:
+- if [ "`echo $CXX`" == "g++" ] && [ "$TRAVIS_OS_NAME" == "linux" ]; then export CXX=g++-4.8; fi
+- if [ "`echo $CXX`" == "clang++" ] && [ "$TRAVIS_OS_NAME" == "linux" ]; then export CXX=clang++-3.7; fi
+matrix:
+script:
+- cd gatb-core
+- mkdir build
+- cd build
+- cmake .. && make
+- export CPPUNIT_VERBOSE=TRUE && ./bin/gatb-core-cppunit
+env:
+ global:
+ - MAKEFLAGS="-j 4"
diff --git a/README.md b/README.md
index f5ff695..afe9880 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
|---------|-------------|-------------|-------------|---------------|--------------|--------------|
|*Debian 8*| | [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-docker-gatb-core-compile-gcc48/badge/icon)](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-docker-gatb-core-compile-gcc48/) | [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-docker-gatb-core-compile-gcc49/badge/icon)](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-docker-gatb-core-compile-gcc49/) | [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/ [...]
|*Debian 7*| [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-suite-debian7-64bits-gcc-4.7/badge/icon)](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-suite-debian7-64bits-gcc-4.7/) | - | - | - | - | [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-valgrind-debian7-64bits-gcc-4.7/badge/icon)](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-valgrind-debian7-64bits-gcc-4.7/) |
+|*Fedora 20*| - | [![Build Status](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-suite-fedora20-gcc-4.8/badge/icon)](https://ci.inria.fr/gatb-core/view/GATB-CORE/job/test-suite-fedora20-gcc-4.8/) | - | - | - | - |
| **Mac OSX** | **clang-600** | **gcc 4.2.1** |
| :--: |---------------|---------------|
@@ -128,11 +129,17 @@ More about GATB-CORE code compiling instruction is available [here](http://gatb-
Read [this documentation](https://gatb.inria.fr/use-eclipse-to-develop-gatb-core-softwares/).
+# Work on GATB-Core code using Xcode
-# GATB-Core programming tutorial
+Read [this documentation](https://gatb.inria.fr/use-xcode-to-develop-gatb-core-softwares/).
+
+
+# Learning GATB-Core: tutorial
You can follow [this link](https://gatb.inria.fr/gatb-programming-tutorial/) to start the GATB-Core Online Tutorial trail.
+The project also contains many [code examples](https://github.com/GATB/gatb-core/tree/master/gatb-core/examples) that can be easily compiled and executed to review how to use GATB-Core APIs.
+
# Documentation
The complete GATB-Core documentation is available [here](http://gatb-core.gforge.inria.fr/doc/api/). It contains: API, code snippets, compile instructions, *etc*.
diff --git a/gatb-core/.gitignore b/gatb-core/.gitignore
index 3963c80..33967a1 100644
--- a/gatb-core/.gitignore
+++ b/gatb-core/.gitignore
@@ -2,3 +2,6 @@ CMakeLists.txt.user
/build*
/.settings/
/.cproject
+*.leon
+.DS_Store
+
diff --git a/gatb-core/CMakeLists.txt b/gatb-core/CMakeLists.txt
index f6d7f67..0dc1f4d 100644
--- a/gatb-core/CMakeLists.txt
+++ b/gatb-core/CMakeLists.txt
@@ -8,7 +8,7 @@ cmake_minimum_required (VERSION 3.1.0)
################################################################################
# The default version number is the latest official build
SET (gatb-core_VERSION_MAJOR 1)
-SET (gatb-core_VERSION_MINOR 3)
+SET (gatb-core_VERSION_MINOR 4)
SET (gatb-core_VERSION_PATCH 0)
# But, it is possible to define another release number during a local build
@@ -48,13 +48,12 @@ set (CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
################################################################################
include (DefineInteger)
include (CppUnit)
-include (GetDate)
# We check whether we have native 128 bits integers
DefineInteger (k)
# We get the current date
-GetCurrentDate (gatb-core-date)
+string (TIMESTAMP gatb-core-date "%Y-%m-%d/%H:%M:%S")
################################################################################
# COMPILER DEFINITIONS
@@ -271,10 +270,12 @@ ENDIF()
# EXAMPLES GENERATION
################################################################################
IF (EXISTS "${PROJECT_SOURCE_DIR}/examples")
- IF (NOT DEFINED GATB_CORE_EXCLUDE_EXAMPLES)
- ADD_SUBDIRECTORY(examples EXCLUDE_FROM_ALL)
+ IF (GATB_CORE_INCLUDE_EXAMPLES)
+ ADD_SUBDIRECTORY(examples)
ENDIF()
ENDIF()
+# add example snippets into binary archive (used by CPack directive)
+INSTALL(DIRECTORY "${PROJECT_SOURCE_DIR}/examples/" DESTINATION "examples")
################################################################################
# INSTALL
@@ -287,10 +288,6 @@ IF (NOT DEFINED GATB_CORE_INSTALL_EXCLUDE)
INSTALL (DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/boost DESTINATION ./include)
ENDIF()
-IF (NOT DEFINED GATB_CORE_EXCLUDE_EXAMPLES)
- INSTALL (DIRECTORY ${PROJECT_SOURCE_DIR}/examples DESTINATION . OPTIONAL FILES_MATCHING PATTERN "*.cpp")
-ENDIF()
-
################################################################################
# DELIVERY
################################################################################
diff --git a/gatb-core/RELEASES.md b/gatb-core/RELEASES.md
index ad986d1..bd1f9a0 100644
--- a/gatb-core/RELEASES.md
+++ b/gatb-core/RELEASES.md
@@ -1,4 +1,88 @@
--------------------------------------------------------------------------------
+# RELEASE 1.4.0
+
+* Integration of Leon compressor into GATB-Core :
+ * It means that the Leon file format can now be handled natively by all softwares relying upon GATB-Core. In other words, you can apply data processing on reads without decompression of the Leon file.
+ * more details at https://github.com/GATB/gatb-core/wiki/Using-GATB-Core-integrated-Leon-compressor
+ * unit tests + large-scale test suite of Leon compressor; cf. https://ci.inria.fr/gatb-core/view/Leon/job/tool-leon-functional-tests/lastBuild/console
+
+* Time and memory optimisations :
+
+ * Faster k-mer counting (inspired by KMC3 but not yet as fast :)
+
+ * More efficient graph representation using compressed vectors (in `GraphUnitigs.cpp`)
+
+ * Faster unitigs compaction (engineering improvements in BCALM code)
+
+ * New compact encoding scheme to load the abundance values in memory (encoded on 8 bits, value range = 0 to 50k with 5% max error)
+
+* Parameterizable graph simplifications steps (see `Graph.hpp` and Minia): optional tip-clipping, bulge and erroneous connection removal
+
+* Preliminary support for loading unitigs (in `GraphUnitigs.cpp`) from a GFA1 graph format generated by BCALM (using `scripts/convertToGFA.py` in BCALM repository)
+
+* Adding new ways to compile, making compilation easier :
+
+ * Added a simple makefile to compile a GATB tool without CMake (see `examples/Makefile`)
+
+ * Added support for Docker. Using `docker/Dockerfile` one can build a docker image containing GATB-core.
+
+ * 2 new ways to compile example codes snippets :
+ * `cmake -DGATB_CORE_INCLUDE_EXAMPLES=True ..`
+ or
+ * `cd examples ; make [folder]/[examplename]`; for instance, `make kmer/kmer2` will compile `kmer2.cpp`
+
+* Various bugfixes
+
+
+--------------------------------------------------------------------------------
+# RELEASE 1.3.0
+
+## Summary
+
+* A new graph object is introduced: GraphUnitigs, optimized to traverse unitigs but not to query individual kmers.
+* A few graph API functions changed.
+* Updated MPHF and HDF5.
+* This release now requires your compiler to be C++11-compatible.
+
+## Details
+
+* Tech notice
+
+ * Compiling GATB-Core library now requires c++/11 capable compilers.
+
+ * CMake 3.1.0 is the minimum release of CMake required to compile GATB-Core.
+
+ * HDF5 library (used for data storage) upgraded to latest release [1.8.18](https://support.hdfgroup.org/HDF5/release/obtain518.html)
+
+ * Parameters "-mphf none", "-mphf emphf" and "-mphf boophf" and variable WITH_MPHF are deprecated. Please remove them from your applications (e.g. in Graph::create()). BooPHF is now the default MPHF object and it is always compiled. Emphf has been removed from the library.
+
+ * Debug compilation is now done using standard Cmake rule "-DCMAKE_BUILD_TYPE=Debug", instead of "-Ddebug=1".
+
+* API changes
+
+ * Developers, please pay attention to these breaking changes:
+
+ * `Graph::Vector` is now `GraphVector`
+ * `Graph::Iterator` is now `GraphIterator`
+ * `Graph::create()` does not accept anymore '-mphf ...' (see Tech Notice, above)
+
+* New features
+
+ * New GraphUnitigs class that offers a de Bruijn graph representation based on unitigs (created by BCALM2) loaded in memory. It has the same API as the Graph class although some functions aren't implemented, as accessing a node that is not an extremity of a unitig isn't supported in this representation. The representation is designed to traverse unitigs quickly, skipping over all non-branching nodes. This representation doesn't use the Bloom filter nor the MPHF. To use this represent [...]
+
+ * New functions to traverse the graph have been added. See `simplePath*` in Graph.hpp. These functions are mostly designed to take advantage of GraphUnitigs and they have the same API in Graph too. They also will replace the Traversal class. Partial compatibility with the original Graph class has been implemented so far.
+
+ * [BooPHF](https://github.com/rizkg/BBHash) is now the default MPHF object used by GATB-Core
+
+ * In addition to HDF5, we introduce a new experimental support for raw file format. It was made for two reasons: avoid potential memory leaks due to hdf5 (unclear at this point), and avoid hdf5 file corruption (whenever a job is interrupted after kmer counting, sometimes the h5 file containing the kmer counts cannot be re-opened). The format is experimental, so use at your own risks. The file format is basically the same content as the previous HDF5 format but with each dataset being [...]
+
+--------------------------------------------------------------------------------
+# RELEASE 1.2.2
+
+#### This is a bug-fix release :
+* fixed a compilation issue with old versions of clang compilers (prior to clang 4.3 on mac). This gatb-core release is the last one to officially support clang versions older than 4.3 on mac and 3.2 on linux.
+
+--------------------------------------------------------------------------------
# RELEASE 1.2.1
* bug fixes when MPHF is queried on a false positive node.
diff --git a/gatb-core/cmake/Delivery.cmake b/gatb-core/cmake/Delivery.cmake
index 292e662..373fd01 100644
--- a/gatb-core/cmake/Delivery.cmake
+++ b/gatb-core/cmake/Delivery.cmake
@@ -11,7 +11,7 @@ IF (NOT CPACK_USER_NAME)
ENDIF (NOT CPACK_USER_NAME)
# We get the date
-GetCurrentDate (CPACK_DATE)
+string (TIMESTAMP CPACK_DATE "%Y-%m-%d/%H:%M:%S")
# We may have to set (if not defined) the CPACK_GFORGE_PROJECT_NAME
IF (NOT CPACK_GFORGE_PROJECT_NAME)
diff --git a/gatb-core/cmake/GetDate.cmake b/gatb-core/cmake/GetDate.cmake
deleted file mode 100644
index 8bb29ed..0000000
--- a/gatb-core/cmake/GetDate.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-################################################################################
-# Current Date
-################################################################################
-
-# For the moment, we can't use string(timestamp...) for getting the current date,
-# because it needs a 2.8.11 version for cmake. Since we may have to compile the
-# stuff on servers where we are not admin, we may face too old cmake versions for
-# supporting this feature.
-# Ideally, we should use: string (TIMESTAMP gatb-core-date "%Y-%m-%d %H:%M:%S")
-
-# The trick is to rely on PERL (likely to be installed on the machine)
-# (see http://osdir.com/ml/programming.tools.cmake.user/2006-06/msg00323.html)
-
-INCLUDE(FindPerl)
-
-MACRO (GetCurrentDate dateString)
- # We execute a command that retrieves the current date.
- EXECUTE_PROCESS (
- COMMAND "${PERL_EXECUTABLE}" "-le" "@T=localtime; printf (\"%04d-%02d-%02d/%02d:%02d:%02d\",$T[5]+1900,$T[4]+1,$T[3],$T[2],$T[1],$T[0])"
- OUTPUT_VARIABLE ${dateString}
- )
-ENDMACRO()
-
-
-MACRO (GetCurrentDateShort dateString)
- # We execute a command that retrieves the current date.
- EXECUTE_PROCESS (
- COMMAND "${PERL_EXECUTABLE}" "-le" "@T=localtime; printf (\"%04d%02d%02d\",$T[5]+1900,$T[4]+1,$T[3])"
- OUTPUT_VARIABLE ${dateString}
- )
-ENDMACRO()
diff --git a/gatb-core/doc/doxygen/src/mainpage.hpp b/gatb-core/doc/doxygen/src/mainpage.hpp
index 6ab7374..3a38946 100644
--- a/gatb-core/doc/doxygen/src/mainpage.hpp
+++ b/gatb-core/doc/doxygen/src/mainpage.hpp
@@ -44,7 +44,7 @@
* gatb::core is a high-performance and low memory footprint C++ library.
*
* It supports the following operations natively:
- * - FASTA/FASTQ parsing
+ * - FASTA/FASTQ parsing and writing; support of plain text and gzipped files
* - K-mer counting
* - Minimizer computation of k-mers, partitioning of datasets by minimizers
* - de Bruijn graph construction
@@ -56,6 +56,14 @@
* So, one can say that GATB-CORE library provides means to build and use De Bruijn graphs with a low memory footprint,
* which comes initially from the <a href="https://project.inria.fr/gatb/software/minia/">minia</a> assembly tool.
*
+ * However, in addition to the de Bruijn graph data structure, GATB-Core provides several
+ * other ones that can be of interest for general purpose developments. These are:
+ *
+ * - Open-Addressing Hash Table
+ * - Linked-List Hash Table
+ * - Bloom Filters. There are several flavors: basic, cache-optimized, optimized for k-mer neighbours; accessible through BloomFactory.
+ * - Minimal Perfect Hash Function (BBHash)
+ *
* The documentation you are reading is the official documentation of the gatb::core reference API. The
* audience is therefore developers interested in creating bioinformatics softwares.
*
diff --git a/gatb-core/doc/doxygen/src/snippetspage.hpp b/gatb-core/doc/doxygen/src/snippetspage.hpp
index 0cd1219..b59b6e1 100644
--- a/gatb-core/doc/doxygen/src/snippetspage.hpp
+++ b/gatb-core/doc/doxygen/src/snippetspage.hpp
@@ -64,19 +64,22 @@
*
* If CppUnit is installed, a unit tests binary should be generated; you can launch it with
* \code
- * cd <some_directory>/gatb-core/gatb-core/build
+ * cd <some_directory>/gatb-core/gatb-core/build/bin
*
* export CPPUNIT_VERBOSE=1
*
- * # Copy database for unit tests
- * cp -r ../test/db $BUILD_DIR/test/
- *
* # Launch the full test suite
- * cd bin
- * ./gatb-core-cppunit
+ * ./gatb-core-cppunit all <gatb-core-home>/gatb-core/test/db
* \endcode
* You can use the exit status code of the command to know the success status (0 for success).
*
+ * Use
+ * \code
+ * ./gatb-core-cppunit -h
+ * \endcode
+ *
+ * to get more information on how to use this testing program.
+ *
* Note that one may set the environment variable CPPUNIT_VERBOSE to 1 to known which tests pass.
*
* By default, gatb::core supports kmer sizes up to 128. In fact, it has 4 different implementations
@@ -92,28 +95,39 @@
*
* A directory named <a href="https://github.com/GATB/gatb-core/tree/master/gatb-core/examples">examples</a> holds some snippets that show how to use services provided by the library.
*
- * In order to compile them, you will first need to compile the library.
+ * In order to compile them, you will first need to compile the full library (see above).
*
* A simple way to generate the snippets is to type:
* \code
+ * cmake -DGATB_CORE_INCLUDE_EXAMPLES=True -DCMAKE_BUILD_TYPE=Debug .. [1]
* make examples
* \endcode
*
+ * [1] some code snippets use assert(), so it is required to compile in Debug mode.
+ *
+ * You can also compile a single snippet using its name, such as:
+ * \code
+ * make bank1
+ * \endcode
+ *
* Then, you can have a look at the header of each snippet source file to review how
* to use them. You can also have a look at <a href="https://github.com/GATB/gatb-core/blob/master/gatb-core/test/jenkins/test-snippets-debian7-64bits-gcc-4.7.sh">this script</a> we used on a Jenkins platform
* to automatically run and test all these snippets; in that script, you'll see how
* they are started (i.e. what is the command-line to use).
*
+ * Some documentation about these code snippets is also available <a href="https://github.com/GATB/gatb-core/tree/master/gatb-core/examples">here</a>.
+ *
************************************************************************************
- * \section use_eclipse Use Eclipse c++ IDE to work with GATB-Core
+ * \section use_eclipse Use Eclipse c++ IDE or Xcode IDE to work with GATB-Core
*
* If you are interested in using an IDE to develop c++ source codes relying on the
* GATB-Core library, we have written this manual to explain how to setup Eclipse
- * c++ framework:
+ * c++ framework or Xcode one:
*
* <a href="https://gatb.inria.fr/use-eclipse-to-develop-gatb-core-softwares/">https://gatb.inria.fr/use-eclipse-to-develop-gatb-core-softwares/</a>
+ * <a href="https://gatb.inria.fr/use-xcode-to-develop-gatb-core-softwares/">https://gatb.inria.fr/use-xcode-to-develop-gatb-core-softwares/</a>
*
- * You also have the opportunity to easily work on <a href="https://github.com/GATB/gatb-core/tree/master/gatb-core/examples">code snippets</a> directly from Eclipse. Please, refer to the above link.
+ * You also have the opportunity to easily work on <a href="https://github.com/GATB/gatb-core/tree/master/gatb-core/examples">code snippets</a> directly from Eclipse/Xcode. Please, refer to the above links.
*
*************************************************************************************
* \page new_project Quick project creation
@@ -543,6 +557,17 @@
* \n
*
**************************************************************************************
+ * \section snippets_bank_snippet7b Filter sequences using Phred quality
+ *
+ * This snippet shows how to parse a FastQ file and filter out reads by Phred quality.
+ *
+ * Code is from example bank24.cpp:
+ *\snippet bank24.cpp snippet1
+ * [go back to \ref snippets_bank "top"]
+ *
+ * \n
+ *
+ **************************************************************************************
* \section snippets_bank_snippet8 Conversion of a FASTA bank to a binary format
*
* This snippet shows how to parse a nucleic bank and convert it to a binary format.
diff --git a/gatb-core/docker/Dockerfile.clang b/gatb-core/docker/Dockerfile.clang
index 7188fcd..a7dfc9c 100644
--- a/gatb-core/docker/Dockerfile.clang
+++ b/gatb-core/docker/Dockerfile.clang
@@ -155,12 +155,13 @@ RUN add-apt-repository "deb http://apt.llvm.org/jessie/ llvm-toolchain-jessie-${
# install cmake since we have to control which version we use.
# Cmake install procedure: https://cmake.org/install/
#
+ENV CC=/usr/bin/clang-${CLANG_VERSION} \
+ CXX=/usr/bin/clang++-${CLANG_VERSION}
+
RUN cd /opt \
&& export CMAKE_URL="http://cmake.org/files/v${CMAKE_SERIES}/cmake-${CMAKE_VERSION}.tar.gz" \
&& wget --no-check-certificate ${CMAKE_URL} -O - | tar xzf - \
&& cd cmake-${CMAKE_VERSION} \
- && export CC=/usr/bin/clang-${CLANG_VERSION} \
- && export CXX=/usr/bin/clang++-${CLANG_VERSION} \
&& ./bootstrap && make && make install && cd /opt && rm -rf cmake-${CMAKE_VERSION}
# ###
diff --git a/gatb-core/examples/.DS_Store b/gatb-core/examples/.DS_Store
deleted file mode 100644
index 3f1e0fc..0000000
Binary files a/gatb-core/examples/.DS_Store and /dev/null differ
diff --git a/gatb-core/examples/Makefile b/gatb-core/examples/Makefile
new file mode 100644
index 0000000..40e7153
--- /dev/null
+++ b/gatb-core/examples/Makefile
@@ -0,0 +1,24 @@
+# example GATB-core makefile
+
+.PHONY: all clean
+
+# change this to the folder where gatb-core is
+GATB=$(shell pwd)/../
+
+GATB_LIB=$(GATB)/build/lib
+
+CXXFLAGS = -std=c++0x -O3 -I$(GATB)/src -I$(GATB)/build/include -I$(GATB)/thirdparty
+LDFLAGS= -L$(GATB_LIB) -lgatbcore -lpthread -lz -lhdf5 -std=c++0x -ldl -static
+
+
+SRCS = $(wildcard *.cpp)
+OBJS = $(patsubst %.cpp,%,$(SRCS))
+
+all: $(OBJS)
+
+clean:
+ rm -fv $(OBJS)
+
+%: %.cpp
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
diff --git a/gatb-core/examples/README.md b/gatb-core/examples/README.md
new file mode 100644
index 0000000..a4178f3
--- /dev/null
+++ b/gatb-core/examples/README.md
@@ -0,0 +1,94 @@
+# GATB-Core example code snippets
+
+This directory contains many ready-to-compile code snippets you can use to learn [GATB-Core c++ API](http://gatb-core.gforge.inria.fr/doc/api/).
+
+## Dependencies
+
+The following third parties have to be already installed to compile GATB-Core examples:
+
+* a **C++/11 capable compiler** (*e.g.* gcc 4.7+, clang 3.5+, Apple/clang 6.0+)
+* **CMake 3.1+**
+
+
+## Compile GATB-CORE snippets
+
+### Get a copy of the project
+
+If not already done:
+
+ cd <some_directory>
+ git clone https://github.com/GATB/gatb-core.git
+
+### Compile all snippets
+
+ cd <some_directory>/gatb-core/gatb-core
+ mkdir build
+ cd build
+ cmake -DGATB_CORE_INCLUDE_EXAMPLES=True -DCMAKE_BUILD_TYPE=Debug ..
+ make -j8 examples
+
+### Compile a particular snippet
+
+Instead of using:
+
+ make -j8 examples
+
+simply do:
+
+ make -j8 bank15
+
+to compile snippet "bank15.cpp". Apply the same recipe to compile any other code snippets.
+
+### Simple Makefile
+
+Alternatively, you can use the provided "makefile" script to compile a single example. It needs to be modified to point to the correct path of the GATB-core library. Maybe it will need some tweaking (try removing the "-static" flag if compilation fails).
+
+Try it, from this folder:
+
+ make bank/bank1
+
+should compile the first bank1.cpp example
+
+### Run a compiled code snippet
+
+Have a look at the beginning of each c++ source code: you'll see how to use the example programs.
+
+For instance, taking the above example "bank15.cpp", you run it as follows:
+
+ # from within the 'build' directory:
+ ./bin/bank15 ../test/db/reads1.fa
+
+## Documentation
+
+Basic APIs explained:
+
+* [Bank](http://gatb-core.gforge.inria.fr/doc/api/snippets_bank.html): read/write FastA and FastQ files (plain text and gzip)
+* [Iterator](http://gatb-core.gforge.inria.fr/doc/api/snippets_iterators.html): go through a Bank by iterating over its sequences
+* [k-mer](http://gatb-core.gforge.inria.fr/doc/api/snippets_kmer.html): from sequences to k-mers
+* [De Bruijn Graph](http://gatb-core.gforge.inria.fr/doc/api/snippets_graph.html): from sequences to De Bruijn Graphs
+
+Advanced APIs explained:
+
+* [Multi-threading](http://gatb-core.gforge.inria.fr/doc/api/snippets_multithread.html): easy way to manage multi-threaded tasks on Linux and OSX
+* [Storage](http://gatb-core.gforge.inria.fr/doc/api/snippets_storage.html): easy way to handle HDF5 storage
+
+Make your own full-featured GATB-Tool:
+
+* [Tool API](http://gatb-core.gforge.inria.fr/doc/api/snippets_tools.html): make a command-line based tool quickly
+
+The complete GATB-Core API reference documentation is [here](http://gatb-core.gforge.inria.fr/doc/api/index.html).
+
+## Online tutorial
+
+Some of these code snippets are also available for direct use from the [GATB-Core online Tutorial](https://gatb.inria.fr/gatb-programming-tutorial/).
+
+## Contact
+
+To contact a developer, request help, *etc*, use:
+
+ https://gatb.inria.fr/contact/
+
+
+## License
+
+GATB is free software; you can redistribute it and/or modify it under the [Affero GPL v3 license](http://www.gnu.org/licenses/agpl-3.0.en.html).
diff --git a/gatb-core/examples/bank/bank15.cpp b/gatb-core/examples/bank/bank15.cpp
index 0d4b753..707eb7d 100644
--- a/gatb-core/examples/bank/bank15.cpp
+++ b/gatb-core/examples/bank/bank15.cpp
@@ -26,12 +26,13 @@ int main (int argc, char* argv[])
// We declare a Bank instance.
IBank* bank = Bank::open (filename);
LOCAL (bank);
-
- // IN A NEAR FUTURE, WE WILL HAVE STL LIKE ITERATORS.
-#if 0
- for (BankFasta::iterator it = bank->begin(); it != bank->end(); ++it)
+ Iterator<Sequence>* it = bank->iterator();
+ LOCAL (it);
+
+ // We loop over sequences.
+ for (it->first(); !it->isDone(); it->next())
{
- Sequence& seq = *it;
+ Sequence& seq = it->item();
Data& data = seq.getData();
@@ -40,10 +41,9 @@ int main (int argc, char* argv[])
if (data.size() < seqMinSize) { seqMinSize = data.size(); }
dataSize += data.size ();
}
-#endif
- std::cout << "data size : " << dataSize << std::endl;
- std::cout << "sequence number : " << nbSequences << std::endl;
+ std::cout << "# letters : " << dataSize << std::endl;
+ std::cout << "# sequences : " << nbSequences << std::endl;
std::cout << "sequence max size : " << seqMaxSize << std::endl;
std::cout << "sequence min size : " << seqMinSize << std::endl;
}
diff --git a/gatb-core/examples/bank/bank21.cpp b/gatb-core/examples/bank/bank21.cpp
new file mode 100644
index 0000000..1e14ab4
--- /dev/null
+++ b/gatb-core/examples/bank/bank21.cpp
@@ -0,0 +1,97 @@
+//! [snippet1]
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/* WARNING:
+ * assert() calls are compiled out unless 'cmake' was run in Debug mode:
+ * cmake -D CMAKE_BUILD_TYPE=Debug ..
+ * This snippet relies on assert() to report differences, so the two
+ * lines below keep assertions enabled whatever the build type is.
+ */
+#undef NDEBUG
+#include <assert.h>
+
+/********************************************************************************/
+/* Compare two banks for equality */
+/* */
+/* This snippet shows how to compare two banks: */
+/* - first bank is either a Fasta or a Fastq file */
+/* - second bank is the leon-lossless-compressed file of the first bank */
+/* */
+/* Note: we use here a try/catch block in case the bank opening doesn't work. */
+/* */
+/* Cmd-line: bank21 <fasta/q file> <leon file> */
+/* */
+/* Sample: bank21 gatb-core/gatb-core/test/db/sample.fastq \ */
+/* gatb-core/gatb-core/test/db/sample.fastq.leon */
+/* */
+/********************************************************************************/
+/** Entry point: compares a Fasta/Fastq bank with its Leon-compressed version.
+ *
+ * Both banks are first counted, then walked in lockstep; every mismatch is
+ * reported through assert(), which aborts the program.
+ *
+ * \param[in] argc : number of command line arguments (at least 3 expected)
+ * \param[in] argv : argv[1] is the Fasta/Fastq file, argv[2] its Leon file
+ * \return EXIT_SUCCESS when the comparison ran to its end, EXIT_FAILURE when
+ *         an argument is missing or a GATB exception was caught
+ */
+int main (int argc, char* argv[])
+{
+    if (argc < 3)
+    {
+        std::cerr << "you must provide two banks." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // We define a try/catch block in case some method fails (bad filename for instance)
+    try
+    {
+        // The reference bank must be a plain Fasta or Fastq file
+        string btype = Bank::getType(argv[1]);
+        assert(
+            btype.compare("fasta")==0 ||
+            btype.compare("fastq")==0
+        );
+
+        // We open the reference file; LOCAL releases it when we leave the try block
+        IBank* fasBank = Bank::open (argv[1]);
+        LOCAL (fasBank);
+
+        // The second bank must be a Leon file
+        btype = Bank::getType(argv[2]);
+        assert(btype.compare("leon")==0);
+
+        // We open its leon-lossless-compressed representation (released the same way)
+        IBank* leonBank = Bank::open (argv[2]);
+        LOCAL (leonBank);
+
+        u_int64_t nbSeqFas  = 0;
+        u_int64_t nbSeqLeon = 0;
+
+        // We create iterators over both banks.
+        Iterator<Sequence>* itFas  = fasBank->iterator();
+        Iterator<Sequence>* itLeon = leonBank->iterator();
+        {
+            // we use a GATB-Core macro to automatically release
+            // memory allocated here as soon as we leave this code
+            // block
+            LOCAL(itFas);
+            LOCAL(itLeon);
+
+            // We do not use estimate() methods. Instead, we count
+            // exact number of sequences in both banks
+            for (itFas->first();  !itFas->isDone();  itFas->next())  { nbSeqFas++;  }
+            for (itLeon->first(); !itLeon->isDone(); itLeon->next()) { nbSeqLeon++; }
+            assert(nbSeqFas==nbSeqLeon);
+        }
+
+        // We create a PairedIterator to go through both banks simultaneously.
+        // Fresh iterators are needed: the previous ones were released above.
+        itFas  = fasBank->iterator();
+        itLeon = leonBank->iterator();
+        LOCAL(itFas);
+        LOCAL(itLeon);
+        PairedIterator<Sequence,Sequence> it (itFas, itLeon);
+
+        for (it.first(); !it.isDone(); it.next())
+        {
+            // check sequence comment for equality
+            assert(it->first.getComment().compare(it->second.getComment())==0);
+            // check sequence letters for equality
+            assert(it->first.toString().compare(it->second.toString())==0);
+            // check sequence quality for equality
+            assert(it->first.getQuality().compare(it->second.getQuality())==0);
+        }
+    }
+    catch (Exception& e)
+    {
+        // A failed comparison run must not look like a success to the caller
+        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+//! [snippet1]
diff --git a/gatb-core/examples/bank/bank22.cpp b/gatb-core/examples/bank/bank22.cpp
new file mode 100644
index 0000000..4e4be6b
--- /dev/null
+++ b/gatb-core/examples/bank/bank22.cpp
@@ -0,0 +1,50 @@
+//! [snippet1]
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/********************************************************************************/
+/* Extract some sequences from a Fastq sequence file. */
+/* */
+/* Cmd-line: bank22 <in_bank> <out_bank> <nb_seq_to_retain> */
+/* */
+/* Sample: bank22 gatb-core/gatb-core/test/db/giab.hg002.2D.fastq.gz \ */
+/* /tmp/new_file.fastq */
+/* 500 */
+/* */
+/********************************************************************************/
+/** Entry point: copies the first sequences of a bank into a new Fastq file.
+ *
+ * \param[in] argc : number of command line arguments (at least 4 expected)
+ * \param[in] argv : argv[1] is the input bank, argv[2] the file to create,
+ *                   argv[3] the number of sequences to retain
+ * \return EXIT_SUCCESS on completion, EXIT_FAILURE when an argument is
+ *         missing or a GATB exception was caught
+ */
+int main (int argc, char* argv[])
+{
+    if (argc < 4)
+    {
+        std::cerr << "you must provide: in_bank_name out_bank_name nb_seq_to_retain" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // We define a try/catch block in case some method fails (bad filename for instance)
+    try
+    {
+        // We open the reference file; LOCAL releases it when we leave the try block
+        IBank* inBank = Bank::open (argv[1]);
+        LOCAL (inBank);
+
+        std::string outB(argv[2]);
+        BankFasta outputBank (
+            outB,   // file name to create
+            true    // write fastq instead of default fasta
+            //,true // optional you can use gzip compression directly
+        );
+
+        // We create a single iterator over the input bank (released by LOCAL).
+        Iterator<Sequence>* itseq = inBank->iterator();
+        LOCAL(itseq);
+
+        int limit = atoi(argv[3]);
+        int count = 0;
+
+        // 'count < limit' guards the first pass, so a limit <= 0 writes nothing;
+        // the break stops right after the last retained sequence.
+        for (itseq->first(); count < limit && !itseq->isDone(); itseq->next())
+        {
+            outputBank.insert (itseq->item());
+            if (++count >= limit) { break; }
+        }
+        outputBank.flush();
+    }
+    catch (Exception& e)
+    {
+        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+//! [snippet1]
diff --git a/gatb-core/examples/bank/bank15.cpp b/gatb-core/examples/bank/bank23.cpp
similarity index 74%
copy from gatb-core/examples/bank/bank15.cpp
copy to gatb-core/examples/bank/bank23.cpp
index 0d4b753..42939af 100644
--- a/gatb-core/examples/bank/bank15.cpp
+++ b/gatb-core/examples/bank/bank23.cpp
@@ -7,12 +7,12 @@
/********************************************************************************/
/* Bank management */
/* */
-/* This snippet shows how to open a FASTA bank and iterate its sequences */
+/* This snippet shows how to open a bank and iterate its sequences */
/* to provide some stats: data size, nb. sequences, etc. */
/* */
-/* Cmd-line: bank15 <fasta/q file> */
+/* Cmd-line: bank23 <fasta/q file> */
/* */
-/* Sample: bank15 gatb-core/gatb-core/test/db/reads1.fa */
+/* Sample: bank23 gatb-core/gatb-core/test/db/reads1.fa */
/* */
/********************************************************************************/
int main (int argc, char* argv[])
@@ -26,12 +26,12 @@ int main (int argc, char* argv[])
// We declare a Bank instance.
IBank* bank = Bank::open (filename);
LOCAL (bank);
-
- // IN A NEAR FUTURE, WE WILL HAVE STL LIKE ITERATORS.
-#if 0
- for (BankFasta::iterator it = bank->begin(); it != bank->end(); ++it)
+ ProgressIterator<Sequence> iter (*bank, "Iterating sequences");
+
+ // We loop over sequences.
+ for (iter.first(); !iter.isDone(); iter.next())
{
- Sequence& seq = *it;
+ Sequence& seq = iter.item();
Data& data = seq.getData();
@@ -40,10 +40,9 @@ int main (int argc, char* argv[])
if (data.size() < seqMinSize) { seqMinSize = data.size(); }
dataSize += data.size ();
}
-#endif
- std::cout << "data size : " << dataSize << std::endl;
- std::cout << "sequence number : " << nbSequences << std::endl;
+ std::cout << "# letters : " << dataSize << std::endl;
+ std::cout << "# sequences : " << nbSequences << std::endl;
std::cout << "sequence max size : " << seqMaxSize << std::endl;
std::cout << "sequence min size : " << seqMinSize << std::endl;
}
diff --git a/gatb-core/examples/bank/bank24.cpp b/gatb-core/examples/bank/bank24.cpp
new file mode 100644
index 0000000..688d17f
--- /dev/null
+++ b/gatb-core/examples/bank/bank24.cpp
@@ -0,0 +1,77 @@
+//! [snippet1]
+
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/********************************************************************************/
+/* Bank management */
+/* */
+/* This snippet shows how to open a FastQ file and iterate its sequences */
+/* using a filter keeping only reads having a mean Phred score >= 30. */
+/* */
+/* Cmd-line: bank24 <fastq file> */
+/* */
+/* Sample: bank24 gatb-core/gatb-core/test/db/sample.fastq */
+/* */
+/********************************************************************************/
+/** Mean Phred score compared against by QualityFilter. */
+int threshold = 30;
+
+/** Compute the mean Phred score of a quality string.
+ *
+ * Quality information is supposed to be FastQ-Sanger-encoded: each letter
+ * of the quality string is an ASCII code ranging from 33 to 126, and the
+ * score of a letter is its code minus 33.
+ *
+ * \param[in] quality : the quality string of a sequence
+ * \return the mean score truncated to an integer, or 0 when the string is
+ *         empty (a sequence without quality information)
+ */
+int computeMeanPhredScore(const std::string& quality){
+    // Nothing to average: return 0 instead of dividing by zero.
+    if (quality.empty()) { return 0; }
+
+    int score=0;
+    for(char c : quality){
+        score += (c-33);
+    }
+    // Divide as signed integers so that an out-of-range (negative) sum is
+    // not turned into a huge unsigned value.
+    return score / static_cast<int>(quality.size());
+}
+
+/** Functor telling whether the mean Phred score of a sequence reaches 'threshold'. */
+struct QualityFilter {
+    /** \param[in] seq : the sequence to check
+     *  \return true if the mean Phred score of 'seq' is >= threshold */
+    bool operator () (Sequence& seq) const {
+        return computeMeanPhredScore(seq.getQuality()) >= threshold;
+    }
+};
+
+/********************************************************************************/
+// START Application
+int main (int argc, char* argv[])
+{
+    // A sequence file is mandatory as first argument.
+    // Online GATB-Tutorial: this argument is automatically filled in with an
+    // appropriate file.
+    if (argc < 2)
+    {
+        std::cerr << "Please, provide a sequence file." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Any GATB failure (bad filename for instance) ends up in the catch block below
+    try
+    {
+        // The bank is opened and handed over to LOCAL for automatic release
+        IBank* bank = Bank::open (argv[1]);
+        LOCAL (bank);
+
+        // The bank iterator is wrapped into a filtering iterator driven by QualityFilter
+        QualityFilter qualityFilter;
+        FilterIterator<Sequence,QualityFilter> filtered (bank->iterator(), qualityFilter);
+
+        // We walk through the sequences delivered by the filtering iterator
+        filtered.first();
+        while (!filtered.isDone())
+        {
+            // Quality string of the current sequence
+            const std::string& quality = filtered.item().getQuality();
+
+            // We dump the quality string followed by its mean Phred score
+            std::cout << "[" << quality << "] " << computeMeanPhredScore(quality) << std::endl;
+
+            filtered.next();
+        }
+    }
+    catch (Exception& e)
+    {
+        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+    }
+}
+
+//! [snippet1]
+
diff --git a/gatb-core/examples/bank/bank25.cpp b/gatb-core/examples/bank/bank25.cpp
new file mode 100644
index 0000000..23e9086
--- /dev/null
+++ b/gatb-core/examples/bank/bank25.cpp
@@ -0,0 +1,93 @@
+//! [snippet1]
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/* !!!!! WARNING !!!!!
+ * DO NOT EDIT: snippet used to test Leon!
+ */
+#undef NDEBUG
+#include <assert.h>
+
+/********************************************************************************/
+/* Compare two banks for equality */
+/* */
+/* This snippet shows how to compare two banks: */
+/* - first bank is either a Fasta or a Fastq file */
+/* - second bank is either a Fasta or a Fastq or a Leon lossless file */
+/* */
+/* Note: we use here a try/catch block in case the bank opening doesn't work. */
+/* */
+/* Cmd-line: bank25 <fasta/q file> <leon file> */
+/* */
+/* Sample: bank25 gatb-core/gatb-core/test/db/sample.fastq \ */
+/* gatb-core/gatb-core/test/db/sample.fastq.leon */
+/* */
+/********************************************************************************/
+/** Entry point: checks that two banks hold the same sequences.
+ *
+ * Both banks are first counted, then walked in lockstep; differences are
+ * reported through assert().
+ *
+ * \param[in] argc : number of command line arguments (at least 3 expected)
+ * \param[in] argv : argv[1] is the reference bank, argv[2] the bank compared with it
+ * \return EXIT_FAILURE when fewer than two banks are given
+ */
+int main (int argc, char* argv[])
+{
+ if (argc < 3)
+ {
+ std::cerr << "you must provide two banks." << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ // We define a try/catch block in case some method fails (bad filename for instance)
+ try
+ {
+ // The reference bank must be a Fasta or a Fastq file
+ string btype = Bank::getType(argv[1]);
+ assert(
+ btype.compare("fasta")==0 ||
+ btype.compare("fastq")==0
+ );
+ // We open the reference file
+ IBank* fasBank = Bank::open (argv[1]);
+
+ // NOTE(review): the type of the second bank is fetched but not checked afterwards
+ btype = Bank::getType(argv[2]);
+
+ // We open the second bank (despite its name, 'leonBank' is not checked to be a Leon file)
+ IBank* leonBank = Bank::open (argv[2]);
+
+ u_int64_t nbSeqFas = 0;
+ u_int64_t nbSeqLeon = 0;
+
+ // We create iterators over both banks.
+ Iterator<Sequence>* itFas = fasBank->iterator();
+ Iterator<Sequence>* itLeon = leonBank->iterator();
+ {
+ // we use a GATB-Core macro to automatically release
+ // memory allocated here as soon as we leave this code
+ // block
+ LOCAL(itFas);
+ LOCAL(itLeon);
+
+ // We do not use estimate() methods. Instead, we count
+ // exact number of sequences in both banks
+ for (itFas->first(); !itFas->isDone(); itFas->next()){nbSeqFas++;}
+ for (itLeon->first(); !itLeon->isDone(); itLeon->next()){nbSeqLeon++;}
+ assert(nbSeqFas==nbSeqLeon);
+ }
+
+ // We create a PairedIterator to go through both banks simultaneously
+ // (new iterators are requested from the banks for this second pass)
+ itFas = fasBank->iterator();
+ itLeon = leonBank->iterator();
+ LOCAL(itFas);
+ LOCAL(itLeon);
+ PairedIterator<Sequence,Sequence> it (itFas, itLeon);
+
+ for (it.first(); !it.isDone(); it.next())
+ {
+ // check sequence comment for equality
+ assert(it->first.getComment().compare(it->second.getComment())==0);
+ // check sequence letters for equality
+ assert(it->first.toString().compare(it->second.toString())==0);
+ // check sequence quality for equality
+ assert(it->first.getQuality().compare(it->second.getQuality())==0);
+ }
+ }
+ catch (Exception& e)
+ {
+ // NOTE(review): the exception is only printed; main then ends without an explicit failure code
+ std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+ }
+}
+//! [snippet1]
diff --git a/gatb-core/examples/bank/bank26.cpp b/gatb-core/examples/bank/bank26.cpp
new file mode 100644
index 0000000..e39115a
--- /dev/null
+++ b/gatb-core/examples/bank/bank26.cpp
@@ -0,0 +1,54 @@
+//! [snippet1]
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/* !!!!! WARNING !!!!!
+ * DO NOT EDIT: snippet used to test Leon!
+ */
+
+/********************************************************************************/
+/* Bank: count sequences */
+/* */
+/* This snippet shows how to open a bank and iterate its sequences. */
+/* */
+/* Cmd-line: bank26 <fasta/q file> */
+/* */
+/* Sample: bank26 gatb-core/gatb-core/test/db/reads1.fa */
+/* */
+/********************************************************************************/
+/** Entry point: prints the comment and the data size of each sequence of a bank.
+ *
+ * \param[in] argc : number of command line arguments (at least 2 expected)
+ * \param[in] argv : argv[1] is the bank to open
+ * \return EXIT_FAILURE when no bank is given
+ */
+int main (int argc, char* argv[])
+{
+ if (argc < 2)
+ {
+ std::cerr << "you must provide a bank." << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ // We define a try/catch block in case some method fails (bad filename for instance)
+ try
+ {
+ // We declare an input Bank and use it locally
+ IBank* inputBank = Bank::open (argv[1]);
+ LOCAL (inputBank);
+
+ // We create an iterator over this bank.
+ Iterator<Sequence>* it = inputBank->iterator();
+ LOCAL (it);
+
+ // We loop over sequences.
+ for (it->first(); !it->isDone(); it->next())
+ {
+ // Shortcut
+ Sequence& seq = it->item();
+
+ // We dump comment and data size, one sequence per line
+ std::cout << seq.getComment() << "[" << seq.getDataSize() << "] " << std::endl;
+ }
+ }
+ catch (Exception& e)
+ {
+ std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+ }
+}
+//! [snippet1]
diff --git a/gatb-core/examples/bank/bank27.cpp b/gatb-core/examples/bank/bank27.cpp
new file mode 100644
index 0000000..666c76f
--- /dev/null
+++ b/gatb-core/examples/bank/bank27.cpp
@@ -0,0 +1,70 @@
+//! [snippet1]
+
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/* !!!!! WARNING !!!!!
+ * DO NOT EDIT: snippet used to test Leon!
+ */
+
+/********************************************************************************/
+/* Bank management */
+/* */
+/* This snippet shows how to open a FASTA bank and iterate its sequences */
+/* to provide some stats: data size, nb. sequences, etc. */
+/* */
+/* Cmd-line: bank27 -in <fasta/q file> */
+/* */
+/* Sample: bank27 -in gatb-core/gatb-core/test/db/reads1.fa */
+/* */
+/********************************************************************************/
+/** Entry point: prints some statistics about the sequences of a bank.
+ *
+ * The bank is given with the STR_URI_INPUT option; the optional STR_KMER_SIZE
+ * option (31 when absent) sets the size under which a sequence is counted
+ * as "small".
+ *
+ * \param[in] argc : number of command line arguments
+ * \param[in] argv : command line arguments, handed over to the options parser
+ * \return the value of OptionFailure::displayErrors when option parsing fails
+ */
+int main (int argc, char* argv[])
+{
+ /** We create a command line parser. */
+ OptionsParser parser ("BankStats");
+ parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));
+ parser.push_back (new OptionOneParam (STR_KMER_SIZE, "k-mer size", false));
+
+ try
+ {
+ /** We parse the user options. */
+ IProperties* options = parser.parse (argc, argv);
+
+ // We get information about the bank.
+ // seqMinSize starts with all bits set so that the first sequence always lowers it.
+ u_int64_t nbSequences=0, dataSize=0, seqMaxSize=0, seqMinSize=~0, nbSmallSequences=0;
+
+ // k-mer size given by the user, 31 otherwise
+ u_int64_t kmerSize = options->get(STR_KMER_SIZE) ?
+ options->getInt(STR_KMER_SIZE) : 31;
+ // We declare an input Bank and use it locally
+ IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT));
+ LOCAL (inputBank);
+
+ // NOTE(review): unlike the bank, this iterator is not wrapped in LOCAL — confirm whether it should be released
+ Iterator<Sequence>* it = inputBank->iterator();
+ for (it->first(); !it->isDone(); it->next())
+ {
+ Data& data = it->item().getData();
+
+ // We update the counters with the size of the current sequence
+ nbSequences ++;
+ if (data.size() > seqMaxSize) { seqMaxSize = data.size(); }
+ if (data.size() < seqMinSize) { seqMinSize = data.size(); }
+ if (data.size() < kmerSize) { nbSmallSequences++; }
+ dataSize += data.size ();
+ }
+
+ std::cout << "nb. nucleotides : " << dataSize << std::endl;
+ std::cout << "nb. sequences : " << nbSequences << std::endl;
+ std::cout << "sequence max size : " << seqMaxSize << std::endl;
+ std::cout << "sequence min size : " << seqMinSize << std::endl;
+ std::cout << "nb. small sequences : " << nbSmallSequences << " (< " << kmerSize << " nucl.)" << std::endl;
+ }
+ catch (OptionFailure& e)
+ {
+ // Option parsing failed: the parser errors are displayed
+ return e.displayErrors (std::cout);
+ }
+ catch (Exception& e)
+ {
+ std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+ }
+}
+//! [snippet1]
diff --git a/gatb-core/examples/bank/bank28.cpp b/gatb-core/examples/bank/bank28.cpp
new file mode 100644
index 0000000..c16b62a
--- /dev/null
+++ b/gatb-core/examples/bank/bank28.cpp
@@ -0,0 +1,64 @@
+//! [snippet1]
+
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+#include <iostream>
+
+/* !!!!! WARNING !!!!!
+ * DO NOT EDIT: snippet used to test Leon!
+ */
+
+/********************************************************************************/
+/* Bank management */
+/* */
+/* Same as bank27 but provide results in a convenient way for script handling. */
+/* */
+/* Cmd-line: bank28 -in <fasta/q file> */
+/* */
+/* Sample: bank28 -in gatb-core/gatb-core/test/db/reads1.fa */
+/* */
+/********************************************************************************/
+/** Entry point: prints bank statistics on a single line, for script handling.
+ *
+ * The bank is given with the STR_URI_INPUT option; the optional STR_KMER_SIZE
+ * option (31 when absent) sets the size under which a sequence is counted
+ * as "small". Output is five space-separated numbers without end of line:
+ * data size, number of sequences, max size, min size, number of small sequences.
+ *
+ * \param[in] argc : number of command line arguments
+ * \param[in] argv : command line arguments, handed over to the options parser
+ * \return the value of OptionFailure::displayErrors when option parsing fails
+ */
+int main (int argc, char* argv[])
+{
+    /** We create a command line parser. */
+    OptionsParser parser ("BankStats");
+    parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));
+    parser.push_back (new OptionOneParam (STR_KMER_SIZE, "k-mer size", false));
+
+    try
+    {
+        /** We parse the user options. */
+        IProperties* options = parser.parse (argc, argv);
+
+        // We get information about the bank.
+        // seqMinSize starts with all bits set so that the first sequence always lowers it.
+        u_int64_t nbSequences=0, dataSize=0, seqMaxSize=0, seqMinSize=~0, nbSmallSequences=0;
+
+        // k-mer size given by the user, 31 otherwise
+        u_int64_t kmerSize = options->get(STR_KMER_SIZE) ?
+            options->getInt(STR_KMER_SIZE) : 31;
+        // We declare an input Bank and use it locally
+        IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT));
+        LOCAL (inputBank);
+
+        // NOTE(review): unlike the bank, this iterator is not wrapped in LOCAL — confirm whether it should be released
+        Iterator<Sequence>* it = inputBank->iterator();
+        for (it->first(); !it->isDone(); it->next())
+        {
+            Data& data = it->item().getData();
+
+            nbSequences ++;
+            if (data.size() > seqMaxSize)  { seqMaxSize = data.size(); }
+            if (data.size() < seqMinSize)  { seqMinSize = data.size(); }
+            if (data.size() < kmerSize)    { nbSmallSequences++; }
+            dataSize += data.size ();
+        }
+
+        // The counters are 64-bit values: they are streamed rather than passed to
+        // printf("%u"), which expects 'unsigned int' and would truncate large banks.
+        // Same layout as before: single spaces, no trailing end of line.
+        std::cout << dataSize << " " << nbSequences << " " << seqMaxSize << " "
+                  << seqMinSize << " " << nbSmallSequences;
+    }
+    catch (OptionFailure& e)
+    {
+        return e.displayErrors (std::cout);
+    }
+    catch (Exception& e)
+    {
+        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
+    }
+}
+//! [snippet1]
diff --git a/gatb-core/examples/kmer/.DS_Store b/gatb-core/examples/kmer/.DS_Store
deleted file mode 100644
index fade974..0000000
Binary files a/gatb-core/examples/kmer/.DS_Store and /dev/null differ
diff --git a/gatb-core/examples/storage/.DS_Store b/gatb-core/examples/storage/.DS_Store
deleted file mode 100644
index 8989602..0000000
Binary files a/gatb-core/examples/storage/.DS_Store and /dev/null differ
diff --git a/gatb-core/examples/tools/multithreading8.cpp b/gatb-core/examples/tools/multithreading8.cpp
new file mode 100644
index 0000000..6c5af67
--- /dev/null
+++ b/gatb-core/examples/tools/multithreading8.cpp
@@ -0,0 +1,89 @@
+//! [snippet1]
+
+// We include what we need for the test
+#include <gatb/gatb_core.hpp>
+
+using namespace std;
+
+/********************************************************************************/
+/* Multithreaded iteration of two banks. */
+/* (same as multithreading6 but with PairedIterator) */
+/* */
+/* Cmd-line: multithreading8 <fasta/q file 1> <fasta/q file 2> [nb cores] */
+/* */
+/* Sample: multithreading8 test/db/reads1.fa test/db/reads2.fa */
+
+/* */
+/* */
+/********************************************************************************/
+/** Entry point: counts nucleotides of two banks iterated pairwise on several cores.
+ *
+ * \param[in] argc : number of command line arguments (at least 3 expected)
+ * \param[in] argv : argv[1] and argv[2] are the two FASTA files; the optional
+ *                   argv[3] is the number of cores (0, the default, lets the
+ *                   dispatcher choose)
+ * \return EXIT_FAILURE when one of the two banks is missing
+ */
+int main (int argc, char* argv[])
+{
+    // Both argv[1] and argv[2] are read below, so two file paths are mandatory.
+    if (argc < 3)
+    {
+        cerr << "you must provide two FASTA file paths." << endl;
+        return EXIT_FAILURE;
+    }
+
+    // We get a handle on each bank
+    BankFasta bank1 (argv[1]);
+    BankFasta bank2 (argv[2]);
+
+    // We get the number of cores to be used.
+    size_t nbCores = (argc >=4 ? atoi(argv[3]) : 0);
+
+    // We create a dispatcher (use all cores by default).
+    Dispatcher dispatcher (nbCores);
+
+    // We will count nucleotides occurrences.
+    ThreadObject<int> sumA, sumC, sumG, sumT, sumN;
+
+    PairedIterator<Sequence> itPair(bank1.iterator(), bank2.iterator());
+
+    // We iterate the banks. Note how we provide the paired iterator to the dispatcher
+    dispatcher.iterate (itPair, [&] (pair<Sequence, Sequence> &seqPair)
+    /*for (itPair.first(); !itPair.isDone(); itPair.next())
+    {
+        pair<Sequence, Sequence> &seqPair = itPair.item();*/ // single-threaded, but if you enable that code, do comment-out the localA/C/T/G objects (as we're not in a thread)
+    {
+        // We use shortcuts references for the different local sums. It avoids to retrieve
+        // them each time a nucleotide of the sequence is handled (see for loop below)
+        // and may give much better performance.
+        int& localA = sumA();
+        int& localC = sumC();
+        int& localG = sumG();
+        int& localT = sumT();
+        int& localN = sumN();
+
+        // We handle both sequences of the pair, one after the other.
+        for (unsigned member = 0; member < 2; ++member)
+        {
+            Sequence &seq = (member == 0) ? seqPair.first : seqPair.second;
+
+            // We loop the nucleotides of the current sequence.
+            for (size_t i=0; i<seq.getDataSize(); i++)
+            {
+                switch (seq.getDataBuffer()[i])
+                {
+                    case 'A':  localA++;  break;
+                    case 'C':  localC++;  break;
+                    case 'G':  localG++;  break;
+                    case 'T':  localT++;  break;
+                    case 'N':  localN++;  break;
+                }
+            }
+        }
+    }
+    , 1 /*groupSize of 1*/);
+
+    // We add the per-thread sums into the global ones.
+    sumA.foreach ([&] (int n) { *sumA += n; });
+    sumC.foreach ([&] (int n) { *sumC += n; });
+    sumG.foreach ([&] (int n) { *sumG += n; });
+    sumT.foreach ([&] (int n) { *sumT += n; });
+    sumN.foreach ([&] (int n) { *sumN += n; });
+
+    cout << "|A|=" << *sumA << endl;
+    cout << "|C|=" << *sumC << endl;
+    cout << "|G|=" << *sumG << endl;
+    cout << "|T|=" << *sumT << endl;
+    cout << "|N|=" << *sumN << endl;
+}
+//! [snippet1]
diff --git a/gatb-core/scripts/NewProject/CMakeLists.txt b/gatb-core/scripts/NewProject/CMakeLists.txt
index e58d66d..a7382ee 100644
--- a/gatb-core/scripts/NewProject/CMakeLists.txt
+++ b/gatb-core/scripts/NewProject/CMakeLists.txt
@@ -1,9 +1,9 @@
project(XXX)
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 3.1.0)
################################################################################
-# The version number.
+# The version number of YOUR tool.
################################################################################
SET (gatb-tool_VERSION_MAJOR 1)
SET (gatb-tool_VERSION_MINOR 0)
@@ -41,7 +41,7 @@ SET (CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake)
# We don't want to install some GATB-CORE artifacts
SET (GATB_CORE_EXCLUDE_TOOLS 1)
SET (GATB_CORE_EXCLUDE_TESTS 1)
-SET (GATB_CORE_EXCLUDE_EXAMPLES 1)
+SET (GATB_CORE_INCLUDE_EXAMPLES 0)
# GATB CORE
include (GatbCore)
diff --git a/gatb-core/scripts/delivery_compile.sh b/gatb-core/scripts/delivery_compile.sh
index e7264d7..11a5497 100755
--- a/gatb-core/scripts/delivery_compile.sh
+++ b/gatb-core/scripts/delivery_compile.sh
@@ -23,7 +23,7 @@ cat $CONFIG_FILE_IN
# clean, compile and package library
make clean
-make -j8 package
+make package
# get back the to official config_sha1.hpp
git checkout $CONFIG_FILE_IN
diff --git a/gatb-core/scripts/make_official_release.sh b/gatb-core/scripts/make_official_release.sh
index a965b27..8f164a2 100755
--- a/gatb-core/scripts/make_official_release.sh
+++ b/gatb-core/scripts/make_official_release.sh
@@ -168,4 +168,4 @@ cmake -DGH_LOGIN=$GH_LOGIN -DGH_TOKEN=$GH_TOKEN -DGH_OWNER=$GH_OWNER \
-DGH_REPO=$GH_REPO -DMAJOR=$MAJOR_V -DMINOR=$MINOR_V -DPATCH=$PATCH_V \
-DSILENT_MODE=$SILENT -DCPACK_USER_NAME=$IF_LOGIN ..
-make -j8 $COMMAND
+make $COMMAND
diff --git a/gatb-core/src/.DS_Store b/gatb-core/src/.DS_Store
deleted file mode 100644
index 42cce0a..0000000
Binary files a/gatb-core/src/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/.DS_Store b/gatb-core/src/gatb/.DS_Store
deleted file mode 100644
index 83b614f..0000000
Binary files a/gatb-core/src/gatb/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/bank/.DS_Store b/gatb-core/src/gatb/bank/.DS_Store
deleted file mode 100644
index c0a9ee9..0000000
Binary files a/gatb-core/src/gatb/bank/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/bank/api/.DS_Store b/gatb-core/src/gatb/bank/api/.DS_Store
deleted file mode 100644
index c8b7219..0000000
Binary files a/gatb-core/src/gatb/bank/api/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/bank/api/Sequence.hpp b/gatb-core/src/gatb/bank/api/Sequence.hpp
index 9f68de5..fbed862 100644
--- a/gatb-core/src/gatb/bank/api/Sequence.hpp
+++ b/gatb-core/src/gatb/bank/api/Sequence.hpp
@@ -155,6 +155,122 @@ private:
/********************************************************************************/
} } } /* end of namespaces. */
+
+#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
+
+//specialization of composite iterator for Sequence, so that the sequence index is correctly computed
+namespace gatb {
+ namespace core {
+ namespace tools {
+ namespace dp {
+ namespace impl {
+
+/** Specialization of CompositeIterator for bank::Sequence.
+ *
+ * It goes through a list of delegate iterators one after the other and sets,
+ * on each delivered sequence, an index counted from 0 over the whole
+ * composition (see setIndex calls in first() and next()).
+ */
+template <>
+class CompositeIterator <bank::Sequence> : public Iterator <bank::Sequence>
+{
+public:
+
+    /** Constructor.
+     * \param[in] iterators : the iterators vector
+     */
+    CompositeIterator (std::vector <Iterator<bank::Sequence>*>& iterators)
+        : _seqIndex(0), _iterators(iterators), _currentIdx(0), _currentIt(0), _isDone(true)
+    {
+        for (size_t i=0; i<_iterators.size(); i++)  { _iterators[i]->use(); }
+
+        /** With an empty vector there is no delegate to point to: we keep a null current
+         * iterator (isDone() stays true, so item() must not be called). */
+        if (!_iterators.empty())  { _currentIt = _iterators[_currentIdx]; }
+    }
+
+    /** Destructor. */
+    virtual ~CompositeIterator ()
+    {
+        for (size_t i=0; i<_iterators.size(); i++)  { _iterators[i]->forget(); }
+    }
+
+    /** \copydoc Iterator::first */
+    void first()
+    {
+        _seqIndex = 0;
+
+        /** We initialize attributes. */
+        _currentIdx = 0;
+        _isDone     = true;
+
+        /** We look for the first non finished iterator. */
+        update (true);
+
+        if (_isDone==false)  { _currentIt->item().setIndex(_seqIndex); }
+    }
+
+    /** \copydoc Iterator::next */
+    void next()
+    {
+        _currentIt->next();
+        _isDone = _currentIt->isDone();
+
+        /** The current delegate is exhausted: we switch to the next non finished one. */
+        if (_isDone == true)  { update (false); }
+        _seqIndex++;
+
+        if (_isDone==false)  { _currentIt->item().setIndex(_seqIndex); }
+    }
+
+    /** \copydoc Iterator::isDone */
+    bool isDone() { return _isDone; }
+
+    /** \copydoc Iterator::item */
+    bank::Sequence& item ()  { return _currentIt->item(); }
+
+    /** IMPORTANT : the Item argument provided to 'setItem' must be the object to be modified by
+     * one of the delegate iterator AND NOT the current item of CompositeIterator. Therefore,
+     * we make point the delegate current item to this provided Item argument. */
+    void setItem (bank::Sequence& i)  { _currentIt->setItem (i); }
+
+    /** Get a vector holding the composite structure of the iterator. */
+    virtual std::vector<Iterator<bank::Sequence>*> getComposition()  { return _iterators; }
+
+private:
+    /** Index given to the current sequence (0 for the first one delivered). */
+    size_t _seqIndex;
+
+    /** Delegate iterators, visited in vector order. */
+    std::vector <Iterator<bank::Sequence>*> _iterators;
+
+    /** Position of the current delegate in _iterators, and the delegate itself. */
+    size_t                    _currentIdx;
+    Iterator<bank::Sequence>* _currentIt;
+
+    bool _isDone;
+
+    /** Move to the next delegate iterator that still has items.
+     * \param[in] isFirst : true when called from first(): the search then starts at the
+     *      current index and no previous delegate is finalized.
+     */
+    void update (bool isFirst)
+    {
+        if (_currentIdx >= _iterators.size()) { _isDone=true; return; }
+
+        if (!isFirst)  { _currentIdx++; }
+
+        while (_currentIdx < _iterators.size() && _isDone == true)
+        {
+            Iterator<bank::Sequence>* previous = _currentIt;
+
+            /** We get the next iterator. */
+            _currentIt = _iterators[_currentIdx];
+            assert (_currentIt != 0);
+
+            /** We have to take the reference of the previous iterator. */
+            _currentIt->setItem (previous->item());
+
+            /** We have to "first" this iterator. */
+            _currentIt->first();
+
+            /** We update the 'isDone' status. */
+            _isDone = _currentIt->isDone();
+
+            if (_isDone==true)  { _currentIdx++; }
+
+            /** We can finish the previous item (only if not first call). */
+            if (!isFirst)  { previous->finalize(); }
+        }
+    }
+};
+
+ }}}}} //end of namespace
/********************************************************************************/
#endif /* _GATB_CORE_BANK_SEQUENCE_HPP_ */
diff --git a/gatb-core/src/gatb/bank/impl/AbstractBank.hpp b/gatb-core/src/gatb/bank/impl/AbstractBank.hpp
index b44ae91..393d1cd 100644
--- a/gatb-core/src/gatb/bank/impl/AbstractBank.hpp
+++ b/gatb-core/src/gatb/bank/impl/AbstractBank.hpp
@@ -50,7 +50,7 @@ class AbstractBank : public IBank, public system::SmartPointer
public:
/** Constructor. */
- AbstractBank () : _estimateThreshold(5000) {}
+ AbstractBank () : _estimateThreshold(50000) {}
std::string getIdNb (int i) { return std::string("not_a_compo_bank"); }
diff --git a/gatb-core/src/gatb/bank/impl/Bank.cpp b/gatb-core/src/gatb/bank/impl/Bank.cpp
index b944067..ca3ddc2 100644
--- a/gatb-core/src/gatb/bank/impl/Bank.cpp
+++ b/gatb-core/src/gatb/bank/impl/Bank.cpp
@@ -20,6 +20,8 @@
#include <gatb/bank/impl/Bank.hpp>
#include <gatb/bank/impl/BankFasta.hpp>
+#include <gatb/tools/compression/Leon.hpp>
+
#include <gatb/bank/impl/BankBinary.hpp>
#include <gatb/bank/impl/BankAlbum.hpp>
@@ -46,6 +48,7 @@ Bank::Bank ()
/** We register most known factories. */
_registerFactory_ ("album", new BankAlbumFactory(), false);
_registerFactory_ ("fasta", new BankFastaFactory(), false);
+ _registerFactory_ ("leon", new BankLeonFactory(), false);
_registerFactory_ ("binary", new BankBinaryFactory(), false);
DEBUG (("Bank::Bank, found %ld factories\n", _factories.size()));
@@ -175,6 +178,21 @@ std::string Bank::_getType_ (const std::string& uri)
if (bank != 0)
{
result = it->name;
+ if(!result.compare("fasta"))
+ {
+ //distinguish fasta and fastq
+ tools::dp::Iterator<Sequence>* its = bank->iterator(); LOCAL(its);
+ its->first();
+ if(!its->isDone())
+ {
+ std::string qual = its->item().getQuality();
+ if(!qual.empty())
+ {
+ result= "fastq";
+ }
+ }
+ }
+
delete bank;
break;
}
diff --git a/gatb-core/src/gatb/bank/impl/BankBinary.cpp b/gatb-core/src/gatb/bank/impl/BankBinary.cpp
index aa3ab98..83719dc 100644
--- a/gatb-core/src/gatb/bank/impl/BankBinary.cpp
+++ b/gatb-core/src/gatb/bank/impl/BankBinary.cpp
@@ -544,7 +544,7 @@ void BankBinary::Iterator::estimate (u_int64_t& number, u_int64_t& totalSize, u
u_int64_t current = ftell (file);
// we go to the end of the file
- fseeko (file, 0, SEEK_END);
+ fseek (file, 0, SEEK_END); /* cygwin doesnt like fseeko and fseek/fseeko seems similar */
// we keep the current location in the file
u_int64_t end = ftell (file);
diff --git a/gatb-core/src/gatb/bank/impl/BankFasta.cpp b/gatb-core/src/gatb/bank/impl/BankFasta.cpp
index 09c80c8..2e78464 100644
--- a/gatb-core/src/gatb/bank/impl/BankFasta.cpp
+++ b/gatb-core/src/gatb/bank/impl/BankFasta.cpp
@@ -654,10 +654,10 @@ void BankFasta::Iterator::init ()
/** We check that we can open the file. */
if ((*bf)->stream == NULL)
{
- /** We first try do do some cleanup. */
- finalize (); // GR: cannot finalize here because finalize returns if (_isInitialized == false) which is the case here
-
+ // there used to be some cleanup here but what's the point, we're going to throw an exception anyway
+
//GR : dunno why this exception does not show up, adding a message here
+ //RC: the exception doesn't even trigger, or there is an exception but the message doesn't show?
fprintf(stderr,"unable to open file %s : %s \n",fname,strerror(errno));
/** We launch an exception. */
diff --git a/gatb-core/src/gatb/bcalm2/bcalm_algo.cpp b/gatb-core/src/gatb/bcalm2/bcalm_algo.cpp
index 141e456..7d35c02 100644
--- a/gatb-core/src/gatb/bcalm2/bcalm_algo.cpp
+++ b/gatb-core/src/gatb/bcalm2/bcalm_algo.cpp
@@ -1,5 +1,94 @@
#include "bcalm_algo.hpp"
+
#include <libgen.h> // for basename()
+#include "logging.hpp"
+#include "ograph.h"
+
+#include <assert.h>
+#include <iostream>
+#include <iomanip>
+#include <algorithm>
+#include <chrono>
+#include <tuple>
+
+#include <gatb/tools/designpattern/impl/Command.hpp>
+
+#include <atomic>
+#include <thread>
+
+#include "ThreadPool.h"
+
+#include <gatb/system/impl/System.hpp>
+#include <gatb/tools/misc/impl/Property.hpp>
+
+#include <gatb/tools/storage/impl/Storage.hpp>
+#include <gatb/tools/storage/impl/StorageTools.hpp>
+
+#include <gatb/tools/math/NativeInt64.hpp>
+#include <gatb/tools/math/NativeInt128.hpp>
+#include <gatb/tools/math/LargeInt.hpp>
+
+#include <gatb/bank/impl/Banks.hpp>
+#include <gatb/bank/impl/Bank.hpp>
+#include <gatb/bank/impl/BankHelpers.hpp>
+#include <gatb/bank/impl/BankConverterAlgorithm.hpp>
+
+#include <gatb/kmer/impl/Model.hpp>
+
+#include <gatb/kmer/impl/PartiInfo.hpp> // for repartitor
+#include <gatb/tools/misc/impl/Progress.hpp>
+#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
+
+#define get_wtime() chrono::system_clock::now()
+#ifndef diff_wtime
+#define diff_wtime(x,y) chrono::duration_cast<chrono::nanoseconds>(y - x).count()
+#endif
+
+//#define BINSEQ // "graph4 is not ready" according to antoine. also, initBinSeq provokes segfault at end of bcalm
+
+#ifdef BINSEQ
+#include "binSeq.h"
+#define BUCKET_STR_TYPE binSeq
+#define TO_BUCKET_STR(x) binSeq(x)
+#define FROM_BUCKET_STR(x) (x.str())
+#else
+#define BUCKET_STR_TYPE string
+#define TO_BUCKET_STR(x) x
+#define FROM_BUCKET_STR(x) x
+#endif
+
+
+// timing-related variables
+
+#define THREAD_SAFE_TIMING
+#ifdef THREAD_SAFE_TIMING
+typedef std::atomic<double> atomic_double;
+#else
+#define atomic_double_add(d1,d2) d1 += d2;
+typedef double atomic_double;
+#endif
+
+
+using namespace gatb::core::system;
+using namespace gatb::core::system::impl;
+
+using namespace gatb::core::bank;
+using namespace gatb::core::bank::impl;
+
+using namespace gatb::core::kmer;
+using namespace gatb::core::kmer::impl;
+
+using namespace gatb::core::tools::storage;
+using namespace gatb::core::tools::storage::impl;
+using namespace gatb::core::tools::misc;
+using namespace gatb::core::tools::misc::impl;
+using namespace gatb::core::tools::dp;
+using namespace gatb::core::tools::dp::impl;
+
+
+
+
+
/*
* some notes: this code could be further optimized.
@@ -19,27 +108,139 @@ static void atomic_double_add(std::atomic<double> &d1, double d2) {
static atomic_double global_wtime_compactions (0), global_wtime_cdistribution (0), global_wtime_add_nodes (0), global_wtime_create_buckets (0), global_wtime_foreach_bucket (0), global_wtime_lambda (0), global_wtime_parallel (0), global_wtime_longest_lambda (0), global_wtime_best_sched(0);
static bool time_lambdas = true;
-static std::mutex lambda_timing_mutex, active_minimizers_mutex;
+static std::mutex lambda_timing_mutex;
static size_t nb_threads_simulate=1; // this is somewhat a legacy parameter, i should get rid of (and replace by nb_threads)
-static unsigned long memory_usage(string message="", bool verbose=true)
-{
- // using Progress.cpp of gatb-core
- u_int64_t mem = System::info().getMemorySelfUsed() / 1024;
- u_int64_t memMaxProcess = System::info().getMemorySelfMaxUsed() / 1024;
- char tmp[128];
- snprintf (tmp, sizeof(tmp), " -- memory [current, maximum (maxRSS)]: [%4lu, %4lu] MB ",
- mem, memMaxProcess);
- if (verbose)
- {
- std::cout << message << " " << tmp << std::endl;
- }
- return mem;
-}
+namespace gatb { namespace core { namespace debruijn { namespace impl {
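+ /* formerly lambda functions inside bcalm2, but InsertIntoQueues needs them too; macros avoid having to typedef Model again. note: they use whatever variable named "model" is in scope where they are expanded, and they evaluate a and b more than once */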
+ #define minimizerMin(a,b) ((model.compareIntMinimizers(a,b)) ? a : b)
+ #define minimizerMax(a,b) ((model.compareIntMinimizers(a,b)) ? b : a)
-namespace gatb { namespace core { namespace debruijn { namespace impl {
+ /* class (formerly a simple lambda function) to process a kmer and decide which bucket(s) it should go to */
+ /* needed to make it a class because i want it to remember its thread index */
+ template <int SPAN>
+ class InsertIntoQueues
+ {
+ typedef typename Kmer<SPAN>::Type Type;
+ typedef typename Kmer<SPAN>::Count Count;
+ typedef typename Kmer<SPAN>::ModelCanonical ModelCanon;
+ typedef typename Kmer<SPAN>::template ModelMinimizer <ModelCanon> Model;
+
+ // new version, no longer using a queue-type object.
+ typedef std::tuple<uint32_t, Type, uint32_t, uint32_t, uint32_t> tuple_t;
+ typedef vector<tuple_t> flat_vector_queue_t;
+
+ unsigned int p, k, abundance_threshold, nb_threads;
+ Model &model, &modelK1;
+ std::atomic<unsigned long> &nb_left_min_diff_right_min, &nb_kmers_in_partition;
+ Repartitor &repart;
+ int _currentThreadIndex;
+ std::vector<BankFasta*> &traveller_kmers_files;
+ vector<std::mutex> &traveller_kmers_save_mutex;
+
+ // saving traveller kmers in plain ASCII in files: a bit wasteful, but went to the easy solution
+ void save_traveller_kmer (uint32_t minimizer, const string& seq, int abundance, uint32_t leftmin, uint32_t rightmin, int p) {
+ Sequence s (Data::ASCII);
+ s.getData().setRef ((char*)seq.c_str(), seq.size());
+ s._comment = to_string(abundance); //abundance in comment
+ traveller_kmers_save_mutex[p].lock();
+ traveller_kmers_files[p]->insert(s);
+ traveller_kmers_save_mutex[p].unlock();
+ }
+
+ public:
+ vector<flat_vector_queue_t> &flat_bucket_queues;
+
+ /* function to add a kmer to a bucket */
+ void add_to_bucket_queue(uint32_t minimizer, /* string seq, */ Type &kmer, uint32_t abundance, uint32_t leftmin, uint32_t rightmin)
+ {
+ //bucket_queues.push_back(minimizer,std::make_tuple(TO_BUCKET_STR(seq),leftmin,rightmin,abundance));
+ flat_bucket_queues[getThreadIndex()].push_back(std::make_tuple(minimizer, kmer, abundance, leftmin, rightmin));
+ }
+
+ /* boilerplate constructor */
+ InsertIntoQueues(vector<flat_vector_queue_t> &flat_bucket_queues,
+ Model &model,
+ Model &modelK1,
+ unsigned int p, unsigned int k, unsigned int nb_threads,
+ int abundance_threshold,
+ Repartitor &repart,
+ std::atomic<unsigned long> &nb_left_min_diff_right_min,
+ std::atomic<unsigned long> &nb_kmers_in_partition,
+ std::vector<BankFasta*> &traveller_kmers_files,
+ vector<std::mutex> &traveller_kmers_save_mutex
+ ) :
+ p(p), k(k), abundance_threshold(abundance_threshold), nb_threads(nb_threads),
+ model(model), modelK1(modelK1),
+ nb_left_min_diff_right_min(nb_left_min_diff_right_min), nb_kmers_in_partition(nb_kmers_in_partition),
+ repart(repart), _currentThreadIndex(-1), traveller_kmers_files(traveller_kmers_files),
+ traveller_kmers_save_mutex(traveller_kmers_save_mutex), flat_bucket_queues(flat_bucket_queues) {}
+
+ /* does the actual work of processing a kmer, computing its minimizers, saving it to the right queue (basically the queue corresponding to its thread) */
+ void operator() (Count& item) {
+ // if the abundance threshold is higher than the h5 abundance,
+ // filter out this kmer (useful when you want to re-use same .h5 but with higher "-abundance" parameter)
+ size_t abundance = item.abundance;
+ if (abundance < (size_t)abundance_threshold)
+ return;
+
+ Type current = item.value; // current is a canonical kmer (i checked)
+ uint32_t leftMin(modelK1.getMinimizerValue(current >> 2)); // that's because the lowest bit in the gatb kmer representation are the rightmost sequence nucleotides
+ uint32_t rightMin(modelK1.getMinimizerValue(current));
+
+ ++nb_kmers_in_partition;
+
+ if (repart(leftMin) == p)
+ add_to_bucket_queue(leftMin, current, abundance, leftMin, rightMin);
+
+ if (leftMin != rightMin)
+ {
+ nb_left_min_diff_right_min ++;
+
+ if (repart(rightMin) == p)
+ add_to_bucket_queue(rightMin, current, abundance, leftMin, rightMin);
+
+ // handle "traveller kmers"
+ uint32_t max_minimizer = minimizerMax(leftMin, rightMin);
+ uint32_t min_minimizer = minimizerMin(leftMin, rightMin);
+ if (repart(max_minimizer) != repart(min_minimizer))
+ {
+ string seq = model.toString(current);
+ save_traveller_kmer(max_minimizer, seq, abundance, leftMin, rightMin, repart(max_minimizer));
+ //add_to_bucket_queue(max_minimizer, seq, leftMin, rightMin, repart(max_minimizer)); // no longer saved into the queue, but to a file instead
+
+ // sanity check
+ if (repart(max_minimizer) < repart(min_minimizer))
+ { printf("unexpected problem: traveller kmer = %s, min_minimizer=%d max_minimizer=%d, repart(min_minimizer)=%d, repart(max_minimizer)=%d\n", seq.c_str(), min_minimizer, max_minimizer, repart(min_minimizer), repart(max_minimizer)); exit(1); }
+ }
+ }
+
+ // sanity check
+ if (repart(leftMin) != p && repart(rightMin) != p)
+ { printf("unexpected problem: repart bucket\n"); exit(1); }
+ }
+
+ /* neat trick taken from erwan's later work in gatb to find the thread id of a dispatched function */
+ int getThreadIndex()
+ {
+ if (_currentThreadIndex < 0)
+ {
+ std::pair<IThread*,size_t> info;
+ if (ThreadGroup::findThreadInfo (System::thread().getThreadSelf(), info) == true)
+ {
+ _currentThreadIndex = info.second;
+ }
+ else
+ {
+ throw Exception("Unable to find thread index during InsertIntoQueues");
+ }
+ }
+ return _currentThreadIndex;
+ }
+
+ };
template<size_t SPAN>
void bcalm2(Storage *storage,
@@ -127,26 +328,17 @@ void bcalm2(Storage *storage,
Model model(kmerSize, minSize, typename Kmer<SPAN>::ComparatorMinimizerFrequencyOrLex(), freq_order);
Model modelK1(kmerSize-1, minSize, typename Kmer<SPAN>::ComparatorMinimizerFrequencyOrLex(), freq_order);
- auto minimizerMin = [&repart, &model] (uint32_t a, uint32_t b)
- {
- return (model.compareIntMinimizers(a,b)) ? a : b;
- };
-
- auto minimizerMax = [&repart, &model] (uint32_t a, uint32_t b)
- {
- return (model.compareIntMinimizers(a,b)) ? b : a;
- };
-
std::vector<BankFasta*> out_to_glue(nb_threads); // each thread will write to its own glue file, to avoid locks
// remove potential old glue files
for (unsigned int i = 0; i < 10000 /* there cannot be more than 10000 threads, right? unsure if i'll pay for that asumption someday*/; i++)
+ {
if (System::file().doesExist(prefix + ".glue." + std::to_string(i)))
- {
System::file().remove (prefix + ".glue." + std::to_string(i));
- }
+ }
unsigned long *nb_seqs_in_glue = new unsigned long[nb_threads];
+ unsigned long *nb_pretips = new unsigned long[nb_threads];
// another system could have been to send all sequences in a queue, and a thread responsible for writing to glue would dequeue (might be faster)
for (unsigned int i = 0; i < (unsigned int)nb_threads; i++)
@@ -154,6 +346,7 @@ void bcalm2(Storage *storage,
string glue_file = prefix + ".glue." + std::to_string(i);
out_to_glue[i] = new BankFasta(glue_file);
nb_seqs_in_glue[i] = 0;
+ nb_pretips[i] = 0;
}
double weighted_best_theoretical_speedup_cumul = 0;
@@ -165,33 +358,17 @@ void bcalm2(Storage *storage,
auto start_buckets=chrono::system_clock::now();
- std::vector<std::set<uint32_t>> active_minimizers;
- active_minimizers.resize(nb_partitions);
-
- /* now our vocabulary is: a "DSK partition" == a "partition" == a "super-bucket" */
+ /* now our vocabulary is: a "DSK partition" == a "partition" == a "super-bucket" */
/* buckets remain what they are in bcalm-original */
/* a travelling kmer is one that goes to two buckets from different superbuckets */
// I used to save traveller kmers into bucket_queues, but this would be a memory hog. Let's use files instead. Total volume will be small (a few gigs for human), but that's memory saved
std::vector<BankFasta*> traveller_kmers_files(nb_partitions);
+ vector<std::mutex> traveller_kmers_save_mutex(nb_partitions);
std::string traveller_kmers_prefix = prefix + ".doubledKmers.";
- std::mutex *traveller_kmers_save_mutex = new std::mutex[nb_partitions];
for (unsigned int i = 0; i < nb_partitions; i++)
traveller_kmers_files[i] = new BankFasta(traveller_kmers_prefix + std::to_string(i));
-
- auto save_traveller_kmer = [&traveller_kmers_files, &traveller_kmers_save_mutex]
- (uint32_t minimizer, string seq, int abundance, uint32_t leftmin, uint32_t rightmin, int p) {
- // saving traveller kmers in plain ASCII in files: a bit wasteful, but went to the easy solution
- Sequence s (Data::ASCII);
- s.getData().setRef ((char*)seq.c_str(), seq.size());
- s._comment = to_string(abundance); //abundance in comment
- traveller_kmers_save_mutex[p].lock();
- traveller_kmers_files[p]->insert(s);
- traveller_kmers_files[p]->flush();
- traveller_kmers_save_mutex[p].unlock();
-
- };
-
+
Dispatcher dispatcher (nb_threads); // setting up a multi-threaded dispatcher, so I guess we can say that things are getting pretty serious now
// i want to do this but i'm not inside an Algorithm object:
@@ -200,8 +377,7 @@ void bcalm2(Storage *storage,
);*/
// copied from createIterator in Algorithm.hpp
- // We create some listener to be notified every 1000 iterations and attach it to the iterator.
-
+ // We create some listener to be notified every 1000 iterations and attach it to the iterator.
IteratorListener* listener;
if (verbose)
listener = new ProgressTimer(nb_partitions, "Iterating DSK partitions");
@@ -213,6 +389,16 @@ void bcalm2(Storage *storage,
nb_partitions/100);
it_parts->addObserver (listener);
LOCAL(it_parts);
+
+ bcalm_logging = verbose;
+ logging("prior to queues allocation");
+
+ // new version, no longer using a queue-type object.
+ typedef std::tuple<uint32_t, Type, uint32_t, uint32_t, uint32_t> tuple_t;
+ typedef vector<tuple_t> flat_vector_queue_t;
+ vector<flat_vector_queue_t> flat_bucket_queues(nb_threads);
+
+ logging("Starting BCALM2");
/*
*
@@ -225,111 +411,18 @@ void bcalm2(Storage *storage,
{
uint32_t p = it_parts->item(); /* partition index */
- size_t k = kmerSize;
-
- // create many queues in place of Buckets
- // (this code used to be outside the partition loop, but I think it's a good idea to reinit the queues after each superbucket(=partition) to avoid queues leaking memory
-
- // this implementation is supposedly efficient, but:
- // - as fast as the lockbasedqueue below
- // - uses much more memory
- //moodycamel::ConcurrentQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t> > bucket_queues[rg];
-
- // another queue system, very simple, with locks
- // it's fine but uses a linked list, so more memory than I'd like
- //LockBasedQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t> > bucket_queues[rg];
+ bool verbose_partition = verbose && ((p % ((nb_partitions+9)/10)) == 0); // only print verbose information 10 times at most
- // still uses more memory than i'd like
- // LockStdQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t> > bucket_queues[rg];
-
- //LockStdQueue<std::tuple<BUCKET_STR_TYPE,uint32_t, uint32_t> > *bucket_queues=new LockStdQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t > > [rg];
- LockStdQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t, uint32_t> > *bucket_queues=new LockStdQueue<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t, uint32_t> > [rg]; // graph3<span> switch
-
- //LockStdVector<std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t> > bucket_queues[rg]; // very inefficient
-
-
- /* lambda function to add a kmer to a bucket */
- auto add_to_bucket_queue = [&active_minimizers, &bucket_queues](uint32_t minimizer, string seq, int abundance, uint32_t leftmin, uint32_t rightmin, int p)
- {
- //std::cout << "adding elt to bucket: " << seq << " "<< minimizer<<std::endl;
- //bucket_queues[minimizer].enqueue(std::make_tuple(TO_BUCKET_STR(seq),leftmin,rightmin));
- bucket_queues[minimizer].enqueue(std::make_tuple(TO_BUCKET_STR(seq),leftmin,rightmin,abundance)); // graph3<span> switch
-
- if (active_minimizers[p].find(minimizer) == active_minimizers[p].end())
- {
- active_minimizers_mutex.lock();
- active_minimizers[p].insert(minimizer);
- active_minimizers_mutex.unlock();
- }
- };
+ size_t k = kmerSize;
std::atomic<unsigned long> nb_left_min_diff_right_min;
std::atomic<unsigned long> nb_kmers_in_partition;
nb_kmers_in_partition = 0;
nb_left_min_diff_right_min = 0;
- std::atomic<uint32_t> kmerInGraph;
- kmerInGraph = 0;
-
- /* lambda function to process a kmer and decide which bucket(s) it should go to */
- auto insertIntoQueues = [p, &minimizerMax, &minimizerMin, &add_to_bucket_queue,
- &bucket_queues, &modelK1, &k, &repart, &nb_left_min_diff_right_min,
- &kmerInGraph, &model, &save_traveller_kmer, abundance_threshold,
- &nb_kmers_in_partition]
- (Count& item) {
-
- // if the abundance threshold is higher than the h5 abundance,
- // filter out this kmer (useful when you want to re-use same .h5 but with higher "-abundance" parameter)
- size_t abundance = item.abundance;
- if (abundance < (size_t)abundance_threshold)
- return;
-
- Type current = item.value;
-
- string seq = model.toString(current);
- typename Model::Kmer kmmerBegin = modelK1.codeSeed(seq.substr(0, k - 1).c_str(), Data::ASCII);
- uint32_t leftMin(modelK1.getMinimizerValue(kmmerBegin.value()));
- typename Model::Kmer kmmerEnd = modelK1.codeSeed(seq.substr(seq.size() - k + 1, k - 1).c_str(), Data::ASCII);
- uint32_t rightMin(modelK1.getMinimizerValue(kmmerEnd.value()));
- // string seq;
- // uint32_t leftMin(0);
- // uint32_t rightMin(0);
-
- ++kmerInGraph;
- ++nb_kmers_in_partition;
-
- if (repart(leftMin) == p)
- add_to_bucket_queue(leftMin, seq, abundance, leftMin, rightMin, p);
-
- if (leftMin != rightMin)
- {
- nb_left_min_diff_right_min ++;
-
- if (repart(rightMin) == p)
- add_to_bucket_queue(rightMin, seq, abundance, leftMin, rightMin, p);
-
- // handle traveller kmers
- uint32_t max_minimizer = minimizerMax(leftMin, rightMin);
- uint32_t min_minimizer = minimizerMin(leftMin, rightMin);
- if (repart(max_minimizer) != repart(min_minimizer))
- {
- /* I call that a "traveller kmer" */
- save_traveller_kmer(max_minimizer, seq, abundance, leftMin, rightMin, repart(max_minimizer));
- //add_to_bucket_queue(max_minimizer, seq, leftMin, rightMin, repart(max_minimizer)); // no longer saved into the queue, but to a file instead
-
- // sanity check
- if (repart(max_minimizer) < repart(min_minimizer))
- { printf("wtf? traveller kmer = %s, min_minimizer=%d max_minimizer=%d, repart(min_minimizer)=%d, repart(max_minimizer)=%d\n", seq.c_str(), min_minimizer, max_minimizer, repart(min_minimizer), repart(max_minimizer)); exit(1); }
- }
- }
-
- // sanity check
- if (repart(leftMin) != p && repart(rightMin) != p)
- { printf("wtf? repart bucket\n"); exit(1); }
-
- };
-
auto start_createbucket_t=get_wtime();
+
+ InsertIntoQueues<SPAN> insertIntoQueues(flat_bucket_queues, model, modelK1, p, k, nb_threads, abundance_threshold, repart, nb_left_min_diff_right_min, nb_kmers_in_partition, traveller_kmers_files, traveller_kmers_save_mutex);
/* MAIN FIRST LOOP: expand a superbucket by inserting kmers into queues. this creates buckets */
// do it for all passes (because the union of passes correspond to a partition)
@@ -339,50 +432,147 @@ void bcalm2(Storage *storage,
unsigned long interm_partition_index = p + pass_index * nb_partitions;
Iterator<Count>* it_kmers = partition[interm_partition_index].iterator();
LOCAL (it_kmers);
+
+ if (pass_index == 0) // the first time,
+ for (int i = 0; i < nb_threads; i++) // resize approximately the bucket queues
+ flat_bucket_queues[i].reserve(partition[interm_partition_index].getNbItems()/nb_threads);
+
dispatcher.iterate (it_kmers, insertIntoQueues);
+ /*for (it_kmers->first (); !it_kmers->isDone(); it_kmers->next()) // non-dispatcher version
+ insertIntoQueues(it_kmers->item());*/
}
- if (verbose)
+ if (verbose_partition)
cout << endl << "Iterated " << nb_kmers_in_partition << " kmers, among them " << nb_left_min_diff_right_min << " were doubled" << endl;
// also add traveller kmers that were saved to disk from a previous superbucket
// but why don't we need to examine other partitions for potential traveller kmers?
// no, because we iterate partitions in minimizer order.
- // but then you might again something else:
+ // but then you might say again something else:
// "i thought bcalm1 needed to iterate partitions in minimizer order, but not bcalm2"
// -> indeed, bcalm2 algorithm doesn't, but in the implementation i still choose to iterate in minimizer order.
// because it seemed like a good idea at the time, when handling traveller kmers.
- // looking back, it might be a good idea to not do that anymore.
- // this could enable loading multiple partitions at once (and more parallelization)
+ // an alternative possibility would be to revert to minimizer-type 0 and repartition-type 0
+ // advantages:
+ // - this could enable loading multiple partitions at once (and more parallelization)
+ // - faster kmer counting (16 mins vs 18 mins for cami medium, 1B distinct kmers)
+ // but so far, I have not seen the need to load multiple partitions and the gain for dsk isn't big
+ // disadvantages:
+ // - would need to do a pass to write all traveller kmers to disk at first
+ traveller_kmers_files[p]->flush();
string traveller_kmers_file = traveller_kmers_prefix + std::to_string(p);
- unsigned long nb_traveller_kmers_loaded = 0;
+ std::atomic<unsigned long> nb_traveller_kmers_loaded;
+ nb_traveller_kmers_loaded = 0;
+
if (System::file().doesExist(traveller_kmers_file)) // for some partitions, there may be no traveller kmers
{
+
BankFasta traveller_kmers_bank (traveller_kmers_file);
BankFasta::Iterator it (traveller_kmers_bank);
- for (it.first(); !it.isDone(); it.next())
+
+ class InsertTravellerKmer
{
- string seq = it->toString();
- string comment = it->getComment();
- int abundance = atoi(comment.c_str());
+ int _currentThreadIndex;
+ vector<flat_vector_queue_t> &flat_bucket_queues;
+ Model &model, &modelK1;
+ int k;
+ std::atomic<unsigned long> &nb_traveller_kmers_loaded;
- // those could be saved in the BankFasta comment eventually
- typename Model::Kmer kmmerBegin = modelK1.codeSeed(seq.substr(0, k - 1).c_str(), Data::ASCII);
- uint32_t leftMin(modelK1.getMinimizerValue(kmmerBegin.value()));
- typename Model::Kmer kmmerEnd = modelK1.codeSeed(seq.substr(seq.size() - k + 1, k - 1).c_str(), Data::ASCII);
- uint32_t rightMin(modelK1.getMinimizerValue(kmmerEnd.value()));
+ public:
+ InsertTravellerKmer(vector<flat_vector_queue_t> &flat_bucket_queues, Model& model, Model &modelK1, int k, std::atomic<unsigned long> &nb_traveller_kmers_loaded)
+ : _currentThreadIndex(-1), flat_bucket_queues(flat_bucket_queues), model(model), modelK1(modelK1), k(k), nb_traveller_kmers_loaded(nb_traveller_kmers_loaded) {}
- uint32_t max_minimizer = minimizerMax(leftMin, rightMin);
- add_to_bucket_queue(max_minimizer, seq, abundance, leftMin, rightMin, p);
- nb_traveller_kmers_loaded++;
- }
- if (verbose)
+ int getThreadIndex()
+ {
+ if (_currentThreadIndex < 0)
+ {
+ std::pair<IThread*,size_t> info;
+ if (ThreadGroup::findThreadInfo (System::thread().getThreadSelf(), info) == true)
+ _currentThreadIndex = info.second;
+ else
+ throw Exception("Unable to find thread index during InsertIntoQueues");
+ }
+ return _currentThreadIndex;
+ }
+ void operator () (const Sequence &sequence)
+ {
+ string seq = sequence.toString();
+ string comment = sequence.getComment();
+ uint32_t abundance = atoi(comment.c_str());
+
+ // those could be saved in the BankFasta comment eventually
+ typename Model::Kmer current = model.codeSeed(seq.c_str(), Data::ASCII);
+ Type kmer = current.value();
+ uint32_t leftMin(modelK1.getMinimizerValue(kmer >> 2));
+ uint32_t rightMin(modelK1.getMinimizerValue(kmer));
+
+ uint32_t max_minimizer = minimizerMax(leftMin, rightMin);
+ //add_to_bucket_queue(max_minimizer, seq, abundance, leftMin, rightMin, p);
+ flat_bucket_queues[getThreadIndex()].push_back(std::make_tuple(max_minimizer, kmer, abundance, leftMin, rightMin));
+ nb_traveller_kmers_loaded++;
+ }
+ };
+ InsertTravellerKmer insertTravellerKmer(flat_bucket_queues, model, modelK1, k, nb_traveller_kmers_loaded);
+
+ dispatcher.iterate(it,insertTravellerKmer);
+
+ if (verbose_partition)
std::cout << "Loaded " << nb_traveller_kmers_loaded << " doubled kmers for partition " << p << endl;
traveller_kmers_bank.finalize();
System::file().remove (traveller_kmers_file);
}
+
+ /* now that each thread has filled its own flat_bucket_queues entry,
+ * sort each of those entries by minimizer */
+
+ //logging("begin sorting bucket queues");
+ ThreadPool pool_sort(nb_threads);
+ for (int thread = 0; thread < nb_threads; thread++)
+ {
+ auto sort_cmp = [] (tuple_t const &a, tuple_t const &b) -> bool { return get<0>(a) < get<0>(b); };
+
+ // todo: check whether the minimizers are already almost sorted in one direction or the other; that would make the sort here easier
+ auto sort_bucket = [&sort_cmp, &flat_bucket_queues, thread] (int thread_id)
+ {std::sort(flat_bucket_queues[thread].begin(), flat_bucket_queues[thread].end(), sort_cmp);};
+
+ if (nb_threads > 1)
+ pool_sort.enqueue(sort_bucket);
+ else
+ sort_bucket(0);
+ }
+ pool_sort.join();
+ //logging("end sorting bucket queues");
+
+ /* remember which minimizers occur in flat_bucket_queues, and for each thread the position where each minimizer starts */
+ set<uint32_t> set_minimizers;
+ vector<uint64_t> nb_kmers_per_minimizer(rg);
+ for (uint64_t i = 0; i < rg; i++)
+ nb_kmers_per_minimizer[i] = 0;
+
+ vector<vector<uint64_t>> start_minimizers(nb_threads);
+ for (int thread = 0; thread < nb_threads; thread++)
+ {
+ // should be done in parallel possibly, if it takes time.
+ set<uint32_t> set_minimizers_thread;
+ start_minimizers[thread].resize(rg);
+ uint64_t pos=0;
+ //std:: cout << "iterating flat bucket queues for thread " << thread << " elts: " << flat_bucket_queues[thread].size() << std::endl;
+ for (auto v: flat_bucket_queues[thread])
+ {
+ uint32_t minimizer = get<0>(v);
+ if (set_minimizers_thread.find(minimizer) == set_minimizers_thread.end())
+ {
+ set_minimizers.insert(minimizer);
+ set_minimizers_thread.insert(minimizer);
+ start_minimizers[thread][minimizer] = pos;
+ }
+ nb_kmers_per_minimizer[minimizer]++;
+ pos++;
+ }
+ }
+
auto end_createbucket_t=get_wtime();
atomic_double_add(global_wtime_create_buckets, diff_wtime(start_createbucket_t, end_createbucket_t));
@@ -392,42 +582,57 @@ void bcalm2(Storage *storage,
auto start_foreach_bucket_t=get_wtime();
/**FOREACH BUCKET **/
- for(auto actualMinimizer : active_minimizers[p])
+ for(auto actualMinimizer : set_minimizers)
{
- auto lambdaCompact = [&bucket_queues, actualMinimizer,
- &maxBucket, &lambda_timings, &repart, &modelK1, &out_to_glue, &nb_seqs_in_glue, kmerSize, minSize](int thread_id) {
+ auto lambdaCompact = [&nb_kmers_per_minimizer, actualMinimizer, &model,
+ &maxBucket, &lambda_timings, &repart, &modelK1, &out_to_glue, &nb_seqs_in_glue, &nb_pretips, kmerSize, minSize,
+ nb_threads, &start_minimizers, &flat_bucket_queues](int thread_id) {
auto start_nodes_t=get_wtime();
- bool debug = false;
-
// (make sure to change other places labelled "// graph3" and "// graph4" as well)
//graph4 g(kmerSize-1,actualMinimizer,minSize); // graph4
- uint number_elements(bucket_queues[actualMinimizer].size_approx());
+ uint number_elements(nb_kmers_per_minimizer[actualMinimizer]);
#ifdef BINSEQ
graph4 graphCompactor(kmerSize-1,actualMinimizer,minSize,number_elements);
#else
// cout<<"here"<<endl;
//graph3 graphCompactor(kmerSize-1,actualMinimizer,minSize,number_elements);
graph3<SPAN> graphCompactor(kmerSize-1,actualMinimizer,minSize,number_elements); // graph3<span> switch
+ graphCompactor.pre_tip_cleaning = false; // this is the actual trigger for bcalm pre-tip simplifications.
+ // i'm leaving it off for now because the gains do not seem that big
#endif
/* add nodes to graph */
- //std::tuple<BUCKET_STR_TYPE,uint,uint> bucket_elt;
- std::tuple<BUCKET_STR_TYPE,uint,uint,uint> bucket_elt; // graph3<span> switch
+ //while (bucket_queues.pop_immediately(actualMinimizer,bucket_elt))
+
+ /* go through the flat_bucket_queues entry built by each thread,
+ * and iterate over its elements for the current minimizer. i don't even need a priority queue! */
- while (bucket_queues[actualMinimizer].try_dequeue(bucket_elt))
+ // this loop used to be in a lambda outside of this lambda; there was a bug, so it was moved here, but that didn't even solve the bug, hmm. i should have been more explicit about whether the bug still happens or not; i don't know now.
+ for (int thread = 0; thread < nb_threads; thread++)
{
- // for(uint i(0);i<number_elements;++i)
- // {
- // bucket_queues[actualMinimizer].try_dequeue(bucket_elt);
- // g.addleftmin(std::get<1>(bucket_elt));
- // g.addrightmin(std::get<2>(bucket_elt));
- // g.addvertex(FROM_BUCKET_STR(std::get<0>(bucket_elt)));
- if (debug)
- std::cout << " (debug) adding to graph: " << std::get<0>(bucket_elt) << std::endl;
- graphCompactor.addtuple(bucket_elt);
-
+ uint64_t pos = start_minimizers[thread][actualMinimizer];
+ unsigned int size = flat_bucket_queues[thread].size();
+ if (pos == size) continue;
+ while (actualMinimizer == get<0>(flat_bucket_queues[thread][pos]))
+ {
+ auto tupl = flat_bucket_queues[thread][pos]; // the tuple format in flat_bucket_queues is: (minimizer, seq, abundance, leftmin, rightmin)
+ std::tuple<BUCKET_STR_TYPE,uint,uint,uint> bucket_elt; // graph3<span> switch
+ // g.addleftmin(std::get<1>(bucket_elt));
+ // g.addrightmin(std::get<2>(bucket_elt));
+ // g.addvertex(FROM_BUCKET_STR(std::get<0>(bucket_elt)));
+ string seq = model.toString(get<1>(tupl));
+ uint32_t a = get<2>(tupl), b = get<3>(tupl), c = get<4>(tupl);
+ bucket_elt = make_tuple(seq,b,c,a);
+ //std::cout << " (debug) adding to graph: " << std::get<0>(bucket_elt) << std::endl;
+ graphCompactor.addtuple(bucket_elt); // addtuple wants that tuple: (seq, leftmin, rightmin, abundance)
+
+ pos++;
+ if (pos == size) break;
+ }
}
+
+
// cout<<"endaddtuple"<<endl;
auto end_nodes_t=get_wtime();
atomic_double_add(global_wtime_add_nodes, diff_wtime(start_nodes_t, end_nodes_t));
@@ -451,8 +656,7 @@ void bcalm2(Storage *storage,
//std::vector<unsigned int> abundances ; // graph3
std::vector<unsigned int>& abundances = graphCompactor.unitigs_abundances[i]; // graph3 // graph3<span> switch
#endif
- if (debug)
- std::cout << " (debug) got from compacted graph: " << seq << std::endl;
+ //std::cout << " (debug) got from compacted graph: " << seq << std::endl;
typename Model::Kmer kmmerBegin = modelK1.codeSeed(seq.substr(0, kmerSize - 1).c_str(), Data::ASCII);
uint leftMin(modelK1.getMinimizerValue(kmmerBegin.value()));
@@ -471,6 +675,8 @@ void bcalm2(Storage *storage,
nb_seqs_in_glue[thread_id]++;
}
}
+ nb_pretips[thread_id] += graphCompactor.nb_pretips;
+ graphCompactor.nb_pretips = 0;
graphCompactor.clear(); // frees memory allocated during graph3 constructor (sort of a destructor, if you will)
auto end_cdistribution_t=get_wtime();
atomic_double_add(global_wtime_cdistribution, diff_wtime(start_cdistribution_t, end_cdistribution_t));
@@ -496,34 +702,18 @@ void bcalm2(Storage *storage,
} // end for each bucket
pool.join();
-
- // flush glues
- for (unsigned int thread_id = 0; thread_id < (unsigned int)nb_threads; thread_id++)
+ //logging("done compactions");
+
+ // flush glues, clear flat_bucket_queues
+ for (int thread_id = 0; thread_id < nb_threads; thread_id++)
{
+ flat_bucket_queues[thread_id].clear();
out_to_glue[thread_id]->flush ();
}
-
if (partition[p].getNbItems() == 0)
continue; // no stats to print here
- // check if buckets are indeed empty
- for (unsigned int minimizer = 0; minimizer < rg; minimizer++)
- {
- if (bucket_queues[minimizer].size_approx() != 0)
- {
- printf("WARNING! bucket %d still has non-processed %d elements\n", minimizer, bucket_queues[minimizer].size_approx() );
- //std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t> bucket_elt;
- std::tuple<BUCKET_STR_TYPE,uint32_t,uint32_t,uint32_t> bucket_elt; // graph3<span> switch
- while (bucket_queues[minimizer].try_dequeue(bucket_elt))
- {
- printf(" %s leftmin %d rightmin %d abundance %d repartleft %d repartright %d repartmin %d\n", FROM_BUCKET_STR(std::get<0>(bucket_elt)).c_str(), std::get<1>(bucket_elt), std::get<2>(bucket_elt), std::get<3>(bucket_elt), repart(std::get<1>(bucket_elt)), repart(std::get<2>(bucket_elt)), repart(minimizer)); // graph3<span> switch
- }
-
- }
- }
-
-
/* compute and print timings */
{
auto end_foreach_bucket_t=get_wtime();
@@ -546,9 +736,9 @@ void bcalm2(Storage *storage,
double longest_lambda = lambda_timings.front();
- if (verbose)
+ if (verbose_partition)
{
- cout <<"\nIn this superbucket (containing " << active_minimizers.size() << " active minimizers)," <<endl;
+ cout <<"\nIn this superbucket (containing " << set_minimizers.size() << " active minimizers)," <<endl;
cout <<" sum of time spent in lambda's: "<< global_wtime_lambda / 1000000 <<" msecs" <<endl;
cout <<" longest lambda: "<< longest_lambda / 1000000 <<" msecs" <<endl;
cout <<" tot time of best scheduling of lambdas: "<< tot_time_best_sched_lambda / 1000000 <<" msecs" <<endl;
@@ -557,7 +747,7 @@ void bcalm2(Storage *storage,
double best_theoretical_speedup = global_wtime_lambda / longest_lambda;
double actual_theoretical_speedup = global_wtime_lambda / tot_time_best_sched_lambda;
- if (verbose)
+ if (verbose_partition)
{
cout <<" best theoretical speedup: "<< best_theoretical_speedup << "x" <<endl;
if (nb_threads_simulate > 1)
@@ -577,13 +767,10 @@ void bcalm2(Storage *storage,
}
}
- delete [] bucket_queues;
- memory_usage("Done with partition " + std::to_string(p), verbose);
+ if (verbose_partition)
+ logging("Done with partition " + std::to_string(p));
} // end iteration superbuckets
-
-
- // FIXME there may be a memory leak here, test it (saw it on spruce)
-
+
/*
*
* Finishing up
@@ -607,11 +794,29 @@ void bcalm2(Storage *storage,
}
list_of_glues.close();
- /* printing some timing stats */
+ // gather some stats
+ uint64_t nbSeqsInGlue = 0;
+ uint64_t nbPretips = 0;
+ for (int thread_id = 0; thread_id < nb_threads; thread_id++)
+ {
+ nbSeqsInGlue += nb_seqs_in_glue[thread_id];
+ nbPretips += nb_pretips[thread_id];
+ }
+
auto end_t=chrono::system_clock::now();
+ float wtime = chrono::duration_cast<chrono::nanoseconds>(end_t - start_buckets).count() / unit;
+
+ Group& bcalmGroup = storage->getGroup("bcalm");
+ bcalmGroup.setProperty ("nb_pretips_removed", Stringify::format("%ld", nbPretips)); // the summed total, not the per-thread nb_pretips array (which would format a pointer)
+ bcalmGroup.setProperty ("nb_sequences_in_glue", Stringify::format("%ld", nbSeqsInGlue));
+ bcalmGroup.setProperty ("wtime_compactions", Stringify::format("%f", wtime));
+
+ /* printing some timing stats */
if (verbose)
{
- cout<<"Buckets compaction and gluing : "<<chrono::duration_cast<chrono::nanoseconds>(end_t - start_buckets).count() / unit<<" secs"<<endl;
+ cout <<"Number of sequences in glue: "<< nbSeqsInGlue << std::endl;
+ cout <<"Number of pre-tips removed : "<< nbPretips << std::endl;
+ cout<<"Buckets compaction and gluing : "<< wtime <<" secs"<<endl;
cout<<"Within that, \n";
cout <<" creating buckets from superbuckets: "<< global_wtime_create_buckets / unit <<" secs"<<endl;
cout <<" bucket compaction (wall-clock during threads): "<< global_wtime_foreach_bucket / unit <<" secs" <<endl;
@@ -649,15 +854,16 @@ void bcalm2(Storage *storage,
if (minimizer_type == 1)
delete[] freq_order;
delete[] nb_seqs_in_glue;
+ delete[] nb_pretips;
for (unsigned int i = 0; i < (unsigned int)nb_threads; i++)
delete out_to_glue[i];
- delete[] traveller_kmers_save_mutex;
for (unsigned int i = 0; i < nb_partitions; i++)
delete traveller_kmers_files[i];
-
- memory_usage("Done with all compactions", verbose);
+
+ logging("Done with all compactions");
//delete storage; exit(0); // to stop after bcalm, before bglue
}
+
}}}}
diff --git a/gatb-core/src/gatb/bcalm2/bcalm_algo.hpp b/gatb-core/src/gatb/bcalm2/bcalm_algo.hpp
index 4dd4f35..6364f04 100644
--- a/gatb-core/src/gatb/bcalm2/bcalm_algo.hpp
+++ b/gatb-core/src/gatb/bcalm2/bcalm_algo.hpp
@@ -21,99 +21,13 @@
#ifndef _GATB_CORE_BCALM_ALGO_HPP_
#define _GATB_CORE_BCALM_ALGO_HPP_
-#include <assert.h>
-#include <iostream>
-#include <memory>
-#include <iostream>
-#include <iomanip>
-#include <algorithm>
-#include <chrono>
-#include <tuple>
-#include "ograph.h"
-
-#include <gatb/tools/designpattern/impl/Command.hpp>
-
-#include <gatb/system/impl/System.hpp>
-#include <gatb/tools/misc/impl/Property.hpp>
-
#include <gatb/tools/storage/impl/Storage.hpp>
-#include <gatb/tools/storage/impl/StorageTools.hpp>
-
-#include <gatb/tools/math/NativeInt64.hpp>
-#include <gatb/tools/math/NativeInt128.hpp>
-#include <gatb/tools/math/LargeInt.hpp>
-
-#include <gatb/bank/impl/Banks.hpp>
-#include <gatb/bank/impl/Bank.hpp>
-#include <gatb/bank/impl/BankHelpers.hpp>
-#include <gatb/bank/impl/BankConverterAlgorithm.hpp>
-
-#include <gatb/kmer/impl/Model.hpp>
-
-#include <gatb/kmer/impl/PartiInfo.hpp> // for repartitor
-#include <gatb/tools/misc/impl/Progress.hpp>
-#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
-
-#include <thread>
-#include <atomic>
-#include "lockstdqueue.h"
-
-#include "ThreadPool.h"
-
-
-using namespace gatb::core::system;
-using namespace gatb::core::system::impl;
-
-using namespace gatb::core::bank;
-using namespace gatb::core::bank::impl;
-
-using namespace gatb::core::kmer;
-using namespace gatb::core::kmer::impl;
-
-using namespace gatb::core::tools::storage;
-using namespace gatb::core::tools::storage::impl;
-using namespace gatb::core::tools::misc;
-using namespace gatb::core::tools::misc::impl;
-using namespace gatb::core::tools::dp;
-using namespace gatb::core::tools::dp::impl;
-
-
-
-
-#define get_wtime() chrono::system_clock::now()
-#ifndef diff_wtime
-#define diff_wtime(x,y) chrono::duration_cast<chrono::nanoseconds>(y - x).count()
-#endif
-
-//#define BINSEQ // "graph4 is not ready" according to antoine. also, initBinSeq provokes segfault at end of bcalm
-
-#ifdef BINSEQ
-#include "binSeq.h"
-#define BUCKET_STR_TYPE binSeq
-#define TO_BUCKET_STR(x) binSeq(x)
-#define FROM_BUCKET_STR(x) (x.str())
-#else
-#define BUCKET_STR_TYPE string
-#define TO_BUCKET_STR(x) x
-#define FROM_BUCKET_STR(x) x
-#endif
-
-
-// timing-related variables
-
-#define THREAD_SAFE_TIMING
-#ifdef THREAD_SAFE_TIMING
-typedef std::atomic<double> atomic_double;
-#else
-#define atomic_double_add(d1,d2) d1 += d2;
-typedef double atomic_double;
-#endif
namespace gatb { namespace core { namespace debruijn { namespace impl {
template<size_t SPAN>
-void bcalm2(Storage* storage,
+void bcalm2(gatb::core::tools::storage::impl::Storage* storage,
std::string prefix,
int kmerSize,
int abundance,
diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
index 6508550..136955d 100644
--- a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
+++ b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
@@ -1,13 +1,79 @@
/* remaining issue:
-- no more than 2^32 sequences to glue together (should be ok for spruce)
+- no more than 2^(32-1) sequences to glue together (should be ok for spruce)
*/
#include "bglue_algo.hpp"
+
+#include <unordered_map>
+#include "unionFind.hpp"
+#include <BooPHF/BooPHF.h>
+#include "ThreadPool.h"
+
+#include "logging.hpp"
+/*#include "buffer_allocator.tcc"
+#include "buffer_manager.tcc"*/
#include <sstream>
#include <iomanip>
+/*#include "ctpl_stl.h" // alternative to threadpool // https://github.com/vit-vit/CTPL/blob/master/ctpl_stl.h // didn't commit because didnt use
+#include "buffer_allocator.h" // memory pool from https://github.com/vincetse/allocator, didn't commit the files because didnt use
+#include "buffer_manager.h" // memory pool
+*/
+
+#include <gatb/tools/designpattern/impl/Command.hpp>
+
+#include <gatb/system/impl/System.hpp>
+#include <gatb/tools/misc/impl/Property.hpp>
+
+#include <gatb/tools/storage/impl/Storage.hpp>
+#include <gatb/tools/storage/impl/StorageTools.hpp>
+
+#include <gatb/tools/math/NativeInt64.hpp>
+#include <gatb/tools/math/NativeInt128.hpp>
+#include <gatb/tools/math/LargeInt.hpp>
+
+#include <gatb/bank/impl/Banks.hpp>
+#include <gatb/bank/impl/Bank.hpp>
+#include <gatb/bank/impl/BankHelpers.hpp>
+#include <gatb/bank/impl/BankConverterAlgorithm.hpp>
+
+#include <gatb/kmer/impl/Model.hpp>
+
+#include <gatb/kmer/impl/PartiInfo.hpp> // for repartitor
+#include <gatb/tools/misc/impl/Progress.hpp>
+#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
+#include <gatb/tools/collections/impl/BooPHF.hpp>
+
+#include <queue> // for priority_queue
+
+
+//heh at this point I could have maybe just included gatb_core.hpp but well, no circular dependencies, this file is part of gatb-core now.
+
+using namespace gatb::core::system;
+using namespace gatb::core::system::impl;
+
+using namespace gatb::core::bank;
+using namespace gatb::core::bank::impl;
+
+using namespace gatb::core::kmer;
+using namespace gatb::core::kmer::impl;
+
+using namespace gatb::core::tools::storage;
+using namespace gatb::core::tools::storage::impl;
+using namespace gatb::core::tools::misc;
+using namespace gatb::core::tools::misc::impl;
+using namespace gatb::core::tools::dp;
+using namespace gatb::core::tools::dp::impl;
+using namespace gatb::core::tools::collections;
+using namespace gatb::core::tools::collections::impl;
+
+
+
using namespace std;
-template <typename T>
+
+namespace gatb { namespace core { namespace debruijn { namespace impl {
+
+ template <typename T>
std::string to_string_with_precision(const T a_value, const int n = 1)
{
std::ostringstream out;
@@ -40,34 +106,6 @@ void free_memory_vector(std::vector<T> &vec)
}
-static bool logging_bglue_verbose = true;
-static unsigned long logging(string message="")
-{
- time_t t = time(0); // get time now
- struct tm * now = localtime( & t );
- if (logging_bglue_verbose)
- {
- cout << setiosflags(ios::right);
- cout << resetiosflags(ios::left);
- cout << setw(40) << left << message << " ";
- }
- char tmp[128];
- snprintf (tmp, sizeof(tmp), " %02d:%02d:%02d ",
- now->tm_hour, now->tm_min, now->tm_sec);
- if (logging_bglue_verbose)
- cout << tmp ;
-
- // using Progress.cpp of gatb-core
- u_int64_t mem = System::info().getMemorySelfUsed() / 1024;
- u_int64_t memMaxProcess = System::info().getMemorySelfMaxUsed() / 1024;
- snprintf (tmp, sizeof(tmp), " memory [current, maxRSS]: [%4lu, %4lu] MB ",
- mem, memMaxProcess);
-
- if (logging_bglue_verbose)
- cout << tmp << std::endl;
- return mem;
-}
-
static
char rc /*cheap desambiguation compared to GraphUnitigs because TemplateSpecialization8 complains */(char s) {
if (s == 'A') return 'T';
@@ -82,7 +120,7 @@ char rc /*cheap desambiguation compared to GraphUnitigs because TemplateSpeciali
}
-static string rc(string &s) {
+static string rc(const string &s) {
string rcs = "";
for (signed int i = s.length() - 1; i >= 0; i--) {rcs += rc(((char)s[i]));}
return rcs;
@@ -158,20 +196,21 @@ static string skip_first_abundance(const string& list)
}
-
+template<int SPAN>
struct markedSeq
{
- string seq;
- string abundances;
+ // there used to be "string seq; string abundance" but i noticed that i did not need that info for determining the chain of glues. not much space saved though (like 10-20%). I suppose the biggest memory-hog is the ks/ke unordered_map
+ uint64_t index;
+ bool rc;
bool lmark, rmark;
- string ks, ke; // [start,end] kmers of seq, in canonical form (redundant information with seq, but helpful)
+ typedef typename Kmer<SPAN>::Type Type;
+ Type ks, ke; // [start,end] kmers of seq, in canonical form (redundant information with seq, but helpful)
- markedSeq(string seq, string abundances, bool lmark, bool rmark, string ks, string ke) : seq(seq), abundances(abundances), lmark(lmark), rmark(rmark), ks(ks), ke(ke) {};
+ markedSeq(uint64_t index, bool lmark, bool rmark, const Type &ks, const Type &ke) : index(index), rc(false), lmark(lmark), rmark(rmark), ks(ks), ke(ke) {};
void revcomp()
{
- seq = rc(seq);
- abundances = reverse_abundances(abundances);
+ rc = !rc;
std::swap(lmark, rmark);
std::swap(ks, ke);
}
@@ -198,87 +237,87 @@ static uint32_t no_rev_index(uint32_t index)
return index & ((1LL<<31) - 1LL);
}
+//typedef lazy::memory::buffer_allocator<markedSeq> custom_allocator_t;
+//typedef std::allocator<markedSeq> custom_allocator_t;
-
-static vector<vector<uint32_t> > determine_order_sequences(vector<markedSeq> &sequences, int kmerSize)
+/* input: markedSequences, list of sequences in a partition
+ * output: res, a list of lists of sequences that will be glued together
+ */
+template<int SPAN>
+static void determine_order_sequences(vector<vector<uint32_t>> &res, const vector<markedSeq<SPAN>> &markedSequences, int kmerSize, bool debug=false)
{
- bool debug = false;
- unordered_map<string, set<uint32_t> > kmerIndex;
+ typedef typename Kmer<SPAN>::Type Type;
+ unordered_map<Type, set<uint32_t> > kmerIndex;
set<uint32_t> usedSeq;
- vector<vector<uint32_t>> res;
unsigned int nb_chained = 0;
// index kmers to their seq
- for (uint32_t i = 0; i < sequences.size(); i++)
+ // kmerIndex associates a kmer extremity to its index in markedSequences
+ for (uint32_t i = 0; i < markedSequences.size(); i++)
{
- kmerIndex[sequences[i].ks].insert(i);
- kmerIndex[sequences[i].ke].insert(i);
+ kmerIndex[markedSequences[i].ks].insert(i);
+ kmerIndex[markedSequences[i].ke].insert(i);
}
- auto glue_from_extremity = [&](markedSeq& current, uint32_t chain_index, int i)
+ auto glue_from_extremity = [&](markedSeq<SPAN> current, uint32_t chain_index, uint32_t markedSequence_index)
{
vector<uint32_t> chain;
chain.push_back(chain_index);
bool rmark = current.rmark;
- int current_index = i;
- usedSeq.insert(i);
+ usedSeq.insert(markedSequence_index);
while (rmark)
{
if (debug)
- std::cout << "current ke " << current.ke << " index " << current_index << " markings: " << current.lmark << current.rmark <<std::endl;
+ std::cout << "current ke " << current.ke << " index " << no_rev_index(chain_index) << " markings: " << current.lmark << current.rmark <<std::endl;
// this sequence has a rmark, so necessarily there is another sequence to glue it with. find it here.
set<uint32_t> candidateSuccessors = kmerIndex[current.ke];
- assert(candidateSuccessors.find(current_index) != candidateSuccessors.end()); // remove the current seq from our indexing data structure
- candidateSuccessors.erase(current_index);
+ assert(candidateSuccessors.find(markedSequence_index) != candidateSuccessors.end()); // remove the current seq from our indexing data structure
+ candidateSuccessors.erase(markedSequence_index);
assert(candidateSuccessors.size() == 1); // normally there is exactly one sequence to glue with
- int successor_index = *candidateSuccessors.begin(); // pop()
- assert(successor_index != current_index);
- markedSeq successor = sequences[successor_index];
+ uint32_t successor_index = *candidateSuccessors.begin(); // pop()
+ assert(successor_index != markedSequence_index);
+ markedSeq<SPAN> successor = markedSequences[successor_index];
- uint32_t chain_index = successor_index;
+ chain_index = markedSequences[successor_index].index;
if (successor.ks != current.ke || (!successor.lmark))
{
successor.revcomp();
- chain_index = rev_index(successor_index);
+ chain_index = rev_index(chain_index);
}
- if (debug)
- std::cout << "successor " << successor_index << " successor ks ke " << successor.ks << " "<< successor.ke << " markings: " << successor.lmark << successor.rmark << std::endl;
-
- assert(successor.lmark);
- assert(successor.ks == current.ke);
-
- // edge case where the seq to be glued starts and ends with itself.
- // it should be a kmer (is tested below with an assert())
- if (successor.ks == successor.ke)
+ // some checks
{
if (debug)
- std::cout << "successor seq loops: " << successor.seq << std::endl;
- assert(successor.seq.size() == (unsigned int) kmerSize);
- if (successor.lmark == false)
- assert(successor.rmark == true);
- else
- assert(successor.rmark == false);
- // it's the only possible cases I can think of
-
- // there is actually nothing to be done now, it's an extremity, so it will end.
- // on a side note, it's pointless to save this kmer in bcalm.
+ std::cout << "successor " << successor_index /*<<" successor ks ke " << successor.ks << " "<< successor.ke*/ /* need to convert Type to string to print, didn't bother writing that code yet */ << " markings: " << successor.lmark << successor.rmark << std::endl;
+ assert(successor.lmark);
+ assert(successor.ks == current.ke);
+ // edge case where the seq to be glued starts and ends with itself.
+ // it should be a kmer (no longer asserted here: markedSeq does not store the sequence anymore)
+ if (successor.ks == successor.ke)
+ {
+ if (successor.lmark == false)
+ assert(successor.rmark == true);
+ else
+ assert(successor.rmark == false);
+ // these are the only possible cases I can think of
+ // there is actually nothing to be done now, it's an extremity, so it will end.
+ // on a side note, it's pointless to save this kmer in bcalm.
+ }
}
-
current = successor;
+ markedSequence_index = successor_index;
chain.push_back(chain_index);
- current_index = successor_index;
rmark = current.rmark;
- assert((usedSeq.find(current_index) == usedSeq.end()));
- usedSeq.insert(current_index);
+ assert((usedSeq.find(markedSequence_index) == usedSeq.end()));
+ usedSeq.insert(markedSequence_index);
}
res.push_back(chain);
@@ -286,11 +325,10 @@ static vector<vector<uint32_t> > determine_order_sequences(vector<markedSeq> &se
};
- for (unsigned int i = 0; i < sequences.size(); i++)
+ // iterate over markedSequences and pick the extremities of each chain
+ for (unsigned int i = 0; i < markedSequences.size(); i++)
{
- if (debug)
- std::cout << "sequence in glue partition: " << sequences[i].seq << std::endl;
- markedSeq current = sequences[i];
+ markedSeq<SPAN> current = markedSequences[i];
if (usedSeq.find(i) != usedSeq.end())
{
if (debug)
@@ -298,7 +336,7 @@ static vector<vector<uint32_t> > determine_order_sequences(vector<markedSeq> &se
continue;
}
- if (current.lmark & current.rmark)
+ if (current.lmark && current.rmark)
{
if (debug)
std::cout << "not the extremity of a chain" << std::endl;
@@ -306,11 +344,11 @@ static vector<vector<uint32_t> > determine_order_sequences(vector<markedSeq> &se
}
/* normalize sequence so that lmark is false */
- uint32_t chain_index = i;
+ uint32_t chain_index = markedSequences[i].index;
if (current.lmark)
{
current.revcomp();
- chain_index = rev_index(i);
+ chain_index = rev_index(chain_index);
}
assert(current.lmark == false);
@@ -326,131 +364,70 @@ static vector<vector<uint32_t> > determine_order_sequences(vector<markedSeq> &se
// my fix plan: we pick an extremity at random, and chop the last nucleotide and mark it to not be glued. also find the corresponding kmer in other extremity, and mark it as not to be glued
vector<int> remaining_indices;
- for (uint32_t i = 0; i < sequences.size(); i++)
+ for (uint32_t i = 0; i < markedSequences.size(); i++)
{
if (usedSeq.find(i) != usedSeq.end())
remaining_indices.push_back(i);
}
uint32_t chain_index = remaining_indices[0];
- string kmer = sequences[chain_index].substr(0,kmerSize);
- // sequences[chain_index] = // TODO continue
+ string kmer = markedSequences[chain_index].substr(0,kmerSize);
+ // markedSequences[chain_index] = // TODO continue
}
*/
- if (nb_chained < sequences.size())
+ if (nb_chained < markedSequences.size())
{
- std::cout << " WARNING: " << sequences.size() - nb_chained << " sequence chunks not returned in output unitigs (likely small circular contigs)" << std::endl;
+ std::cout << " Note: " << markedSequences.size() - nb_chained << " sequence chunks not returned in output unitigs (likely small circular contigs)" << std::endl;
}
// assert(sequences.size() == nb_chained); // make sure we've scheduled to glue all sequences in this partition
- return res;
}
/* straightforward glueing of a chain
* sequences should be ordered and in the right orientation
* so, it' just a matter of chopping of the first kmer
*/
-static void glue_sequences(vector<uint32_t> &chain, vector<markedSeq> &sequences, int kmerSize, string &res_seq, string &res_abundances)
+static void glue_sequences(vector<uint32_t> &chain, std::vector<std::string> &sequences, std::vector<std::string> &abundances, int kmerSize, string &res_seq, string &res_abundances)
{
- string res;
- string abundances;
+ bool debug=false;
+
string previous_kmer = "";
unsigned int k = kmerSize;
- bool last_rmark = false;
-
+
+ if (debug) std::cout << "glueing new chain: ";
for (auto it = chain.begin(); it != chain.end(); it++)
{
-
uint32_t idx = *it;
- markedSeq ms = sequences[no_rev_index(idx)];
+ string seq = sequences[no_rev_index(idx)];
+ string abs = abundances[no_rev_index(idx)];
if (is_rev_index(idx))
{
- ms.revcomp();
+ seq = rc(seq);
+ abs = reverse_abundances(abs);
}
- string seq = ms.seq;
-
if (previous_kmer.size() == 0) // it's the first element in a chain
{
- assert(ms.lmark == false);
- res += seq;
- abundances += ms.abundances;
+ res_seq += seq;
+ res_abundances += abs;
}
else
{
assert(seq.substr(0, k).compare(previous_kmer) == 0);
- res += seq.substr(k);
- abundances += skip_first_abundance(ms.abundances);
+ res_seq += seq.substr(k);
+ res_abundances += skip_first_abundance(abs);
}
+
+ if (debug) std::cout << seq << " ";
previous_kmer = seq.substr(seq.size() - k);
assert(previous_kmer.size() == k);
- last_rmark = ms.rmark;
}
- assert(last_rmark == false);
- if (last_rmark) { cout<<"bad gluing, missed an element" << endl; exit(1); } // in case assert()'s are disabled
-
- res_seq = res;
- res_abundances = abundances;
+ if (debug) std::cout << std::endl;
}
-// is also thread-safe thank to a lock
-class BufferedFasta
-{
- std::mutex mtx;
- std::vector<pair<string,string> > buffer;
-// std::vector<string > buffer;
- unsigned long buffer_length;
-
- public:
- BankFasta *bank;
-
- unsigned long max_buffer;
- BufferedFasta(string filename, unsigned long given_max_buffer = 500000)
- {
- max_buffer = given_max_buffer; // that much of buffering will be written to the file at once (in bytes)
- buffer_length = 0;
- bank = new BankFasta(filename);
- }
-
- ~BufferedFasta()
- {
- flush(); // probably very useful
- delete bank;
- }
-
- void insert(string &seq, string &comment)
- {
- mtx.lock();
- buffer_length += seq.size() + comment.size();
- buffer.push_back(make_pair(seq,comment));
-// buffer.push_back(seq);
- if (buffer_length > max_buffer)
- flush();
- mtx.unlock();
- }
-
- void flush()
- {
- for (auto &p : buffer)
- {
- string seq = get<0>(p);
- string comment = get<1>(p);
- Sequence s (Data::ASCII);
- s.getData().setRef ((char*)seq.c_str(), seq.size());
- s._comment = comment;
- bank->insert(s);
- }
- bank->flush();
- buffer_length = 0;
- // std::cout << "buffer capacity" << buffer.capacity() << endl;
- buffer.clear();
- free_memory_vector(buffer);
- }
-};
-
-static void output(string &seq, BufferedFasta &out, string comment = "")
+static void output(const string &seq, gatb::core::debruijn::impl::BufferedFasta &out, const string comment = "")
{
out.insert(seq, comment);
// BufferedFasta takes care of the flush
@@ -467,8 +444,6 @@ struct Comp{
};
-//typedef boomphf::SingleHashFunctor<partition_t > hasher_t;
-
// taken from GATB's MPHF.hpp and BooPHF.hpp (except that we don't need the iteration stuff from that file)
template<typename Key>
class hasher_t
@@ -492,155 +467,278 @@ class hasher_t
// since I contrl BooPHF code's, I know it calls this function with 0x33333333CCCCCCCCULL as the second seed.
}
};
+
+typedef uint64_t partition_t;
-namespace gatb { namespace core { namespace debruijn { namespace impl {
-
-/* main */
-template<size_t SPAN>
-void bglue(Storage *storage,
- std::string prefix,
- int kmerSize,
- int minSize,
- int nb_threads,
- int minimizer_type,
- bool verbose
- )
+/* computes and uniquifies the hashes of marked kmers at extremities of all to-be-glued sequences */
+template <int SPAN>
+void prepare_uf(std::string prefix, IBank *in, const int nb_threads, int& kmerSize, int pass, int nb_passes, uint64_t &nb_elts, uint64_t estimated_nb_glue_sequences)
{
- //std::cout << "bglue_algo params, prefix:" << prefix << " k:" << kmerSize << " minsize:" << minSize << " threads:" << nb_threads << " mintype:" << minimizer_type << std::endl;
- logging_bglue_verbose = verbose;
- int nbGluePartitions=200; // TODO autodetect it or set it as a parameter.
- bool debug_uf_stats = false; // formerly cmdline parameter
- bool only_uf = false; // idem
+
+ std::atomic<unsigned long> nb_marked_extremities, nb_unmarked_extremities;
+ nb_marked_extremities = 0; nb_unmarked_extremities = 0;
- if (verbose)
- {
- std::cout << "Nb bglue threads: " << nb_threads << std::endl;
- }
+ std::vector<std::vector<partition_t >> uf_hashes_vectors(nb_threads);
+
+ // relatively accurate number of sequences to be inserted
+ for (int i = 0; i < nb_threads; i++)
+ uf_hashes_vectors[i].reserve(estimated_nb_glue_sequences/(nb_passes*nb_threads));
- size_t k = kmerSize;
-
- ifstream f((prefix + ".glue").c_str());
- if (f.peek() == std::ifstream::traits_type::eof())
+ /* class (formerly a simple lambda function) to process a kmer and decide which bucket(s) it should go to */
+ /* needed to make it a class because i want it to remember its thread index */
+ class UniquifyKeys
{
- std::cout << "Empty glue file (no unitigs), abort." << std::endl;
- exit(1);
- }
+ typedef typename Kmer<SPAN>::ModelCanonical ModelCanon;
+
+ int k;
+ int pass, nb_passes, nb_threads;
+ ModelCanon modelCanon;
+ Hasher_T<ModelCanon> hasher;
+ std::atomic<unsigned long> &nb_marked_extremities, &nb_unmarked_extremities;
+ std::vector<std::vector<partition_t >> &uf_hashes_vectors;
+ int _currentThreadIndex;
+
+ public:
+ UniquifyKeys(int k, int pass, int nb_passes, int nb_threads,
+ std::atomic<unsigned long> &nb_marked_extremities, std::atomic<unsigned long> & nb_unmarked_extremities,
+ std::vector<std::vector<partition_t >> &uf_hashes_vectors
+ ) : k(k), pass(pass), nb_passes(nb_passes), nb_threads(nb_threads), modelCanon(k), hasher(modelCanon),
+ nb_marked_extremities(nb_marked_extremities), nb_unmarked_extremities(nb_unmarked_extremities),
+ uf_hashes_vectors(uf_hashes_vectors), _currentThreadIndex(-1)
+ {}
+
+ void operator() (const Sequence& sequence) {
+ const string seq = sequence.toString();
+ const string comment = sequence.getComment();
+
+ const bool lmark = comment[0] == '1';
+ const bool rmark = comment[1] == '1';
+ int thread = getThreadIndex();
- IBank *in = Bank::open (prefix + ".glue");
+ if (lmark)
+ {
+ const string kmerBegin = seq.substr(0, k );
+ // UF of canonical kmers in ModelCanon form, then hashed
+ const typename ModelCanon::Kmer kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
+ const uint64_t h1 = hasher(kmmerBegin);
+ if (h1 % (uint64_t)nb_passes == (uint64_t)pass)
+ {
+ uf_hashes_vectors[thread].push_back(h1);
+ nb_marked_extremities++;
+ }
+ }
+ else
+ nb_unmarked_extremities++;
+ if (rmark)
+ {
+ const string kmerEnd = seq.substr(seq.size() - k , k );
+ const typename ModelCanon::Kmer kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
+ const uint64_t h2 = hasher(kmmerEnd);
-typedef typename Kmer<SPAN>::ModelCanonical ModelCanon;
-// unused
-//typedef typename Kmer<SPAN>::Count Count;
-//typedef typename Kmer<SPAN>::template ModelMinimizer <ModelCanon> Model;
-//typedef typename Kmer<SPAN>::Type Type;
+ if (h2 % (uint64_t)nb_passes == (uint64_t)pass)
+ {
+ uf_hashes_vectors[thread].push_back(h2);
+ nb_marked_extremities++;
+ }
+ }
+ else
+ nb_unmarked_extremities++;
+ }
-typedef uint64_t partition_t;
+ /* neat trick taken from erwan's later work in gatb to find the thread id of a dispatched function */
+ int getThreadIndex()
+ {
+ if (_currentThreadIndex < 0)
+ {
+ std::pair<IThread*,size_t> info;
+ if (ThreadGroup::findThreadInfo (System::thread().getThreadSelf(), info) == true)
+ {
+ _currentThreadIndex = info.second;
+ }
+ else
+ {
+ throw Exception("Unable to find thread index during InsertIntoQueues");
+ }
+ }
+ return _currentThreadIndex;
+ }
+ };
- std::atomic<unsigned long> nb_extremities;
- nb_extremities = 0;
- // create a hasher for UF
- ModelCanon modelCanon(kmerSize); // i'm a bit lost with those models.. I think GATB could be made more simple here.
- Hasher_T<ModelCanon> hasher(modelCanon);
+ Dispatcher dispatcher (nb_threads);
- Iterator<Sequence>* it = in->iterator();
+ UniquifyKeys uniquifyKeys(kmerSize, pass, nb_passes, nb_threads,
+ nb_marked_extremities, nb_unmarked_extremities,
+ uf_hashes_vectors);
+ dispatcher.iterate (in->iterator(), uniquifyKeys);
+ logging( std::to_string(nb_marked_extremities.load()) + " marked kmers, " + std::to_string(nb_unmarked_extremities.load()) + " unmarked kmers");
- int nb_uf_hashes_vectors = 1000;
- std::vector<std::vector<partition_t >> uf_hashes_vectors(nb_uf_hashes_vectors);
- // std::mutex uf_hashes_vectorsMutex[nb_uf_hashes_vectors];
- std::mutex *uf_hashes_vectorsMutex=new std::mutex [nb_uf_hashes_vectors];
- // prepare UF: create the set of keys
- auto prepareUF = [k, &modelCanon, \
- &uf_hashes_vectorsMutex, &uf_hashes_vectors, &hasher, nb_uf_hashes_vectors, &nb_extremities](const Sequence& sequence)
- {
- string seq = sequence.toString();
- string comment = sequence.getComment();
+ //
+ // single-threaded version
+ /* auto it = in->iterator();
+ for (it->first (); !it->isDone(); it->next())
+ prepareUF(it->item());*/
- bool lmark = comment[0] == '1';
- bool rmark = comment[1] == '1';
- if ((!lmark) && (!rmark)) // if both marks are 0, nothing to glue here
- return;
+ logging("created vector of hashes, size approx " + std::to_string( sizeof(partition_t)*nb_marked_extremities.load()/1024/1024) + " MB");
+ ThreadPool uf_sort_pool(nb_threads); // ThreadPool
+ // ctpl::thread_pool uf_merge_pool(nb_threads);
- string kmerBegin = seq.substr(0, k );
- string kmerEnd = seq.substr(seq.size() - k , k );
+ // sort and uniquify UF vectors (from uf_hashes_vector). the uniquify is actually optional but doesn't cost much
+ for (int i = 0; i < nb_threads; i++)
+ {
+ auto sortuniq = [&uf_hashes_vectors, i] (int thread_id)
+ {
+ std::vector<partition_t> &vec = uf_hashes_vectors[i];
+ sort( vec.begin(), vec.end() );
+ vec.erase( unique( vec.begin(), vec.end() ), vec.end() );
+ };
+ uf_sort_pool.enqueue(sortuniq); // ThreadPool
+ //uf_sort_pool.push(sortuniq); // ctpl
+ //sortuniq(0); // single-threaded
+ }
- // UF of canonical kmers in ModelCanon form, then hashed
- typename ModelCanon::Kmer kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
- typename ModelCanon::Kmer kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
+ uf_sort_pool.join(); // ThreadPool
+
+
+ // a single-threaded merge and write to file before they're loaded again in bglue
+ BagFile<uint64_t> * bagf = new BagFile<uint64_t>( prefix+".glue.hashes."+ to_string(pass)); LOCAL(bagf);
+ Bag<uint64_t> * currentbag = new BagCache<uint64_t> ( bagf, 10000 ); LOCAL(currentbag);// really? we have to through these hoops to do a simple binary file in gatb? gotta change this.
+ uint64_t nb_elts_pass = 0;
- uint64_t h1 = hasher(kmmerBegin);
- uint64_t h2 = hasher(kmmerEnd);
+ priority_queue<std::tuple<uint64_t,int>, std::vector<std::tuple<uint64_t,int>>, std::greater<std::tuple<uint64_t,int>> > pq; // http://stackoverflow.com/questions/2439283/how-can-i-create-min-stl-priority-queue
+ vector<uint64_t> hash_vector_idx(nb_threads);
+ vector<uint64_t> hash_vector_size(nb_threads);
- uf_hashes_vectorsMutex[h1%nb_uf_hashes_vectors].lock();
- uf_hashes_vectors[h1%nb_uf_hashes_vectors].push_back(h1);
- uf_hashes_vectorsMutex[h1%nb_uf_hashes_vectors].unlock();
+ // prime the pq
+ for (int i = 0; i < nb_threads; i++)
+ {
+ hash_vector_idx[i] = 0;
+ hash_vector_size[i] = uf_hashes_vectors[i].size();
+ if (hash_vector_size[i] > 0)
+ pq.emplace(make_tuple(uf_hashes_vectors[i][hash_vector_idx[i]++], i));
+ }
- uf_hashes_vectorsMutex[h2%nb_uf_hashes_vectors].lock();
- uf_hashes_vectors[h2%nb_uf_hashes_vectors].push_back(h2);
- uf_hashes_vectorsMutex[h2%nb_uf_hashes_vectors].unlock();
+ uint64_t prev = 0; bool has_prev = false; // has_prev: nothing emitted yet, so that a legitimate hash value of 0 is not mistaken for a duplicate
+ while (pq.size() > 0)
+ {
+ std::tuple<uint64_t, int> elt = pq.top(); pq.pop();
+ uint64_t cur = get<0>(elt);
+ //std::cout << "got " << cur << " queue " << get<1>(elt) << std::endl;
+ if (!has_prev || cur != prev) // the min-queue over sorted vectors yields hashes in increasing order, so duplicates are adjacent
+ {
+ currentbag->insert(cur);
+ nb_elts_pass ++;
+ }
+ prev = cur; has_prev = true;
+
+ int i = get<1>(elt);
+ if (hash_vector_idx[i] < hash_vector_size[i]) // refill the queue from the vector the popped element came from
+ pq.emplace(make_tuple(uf_hashes_vectors[i][hash_vector_idx[i]++], i));
+ }
- nb_extremities+=2;
- };
+ for (int i = 0; i < nb_threads; i++)
+ free_memory_vector(uf_hashes_vectors[i]);
- Dispatcher dispatcher (nb_threads);
- it = in->iterator(); // yeah so.. I think the old iterator cannot be reused
- dispatcher.iterate (it, prepareUF);
+
+ currentbag->flush();
+
+ free_memory_vector(uf_hashes_vectors);
- logging("created vector of redundant UF elements (" + std::to_string(nb_extremities.load()) + " kmers, approx " + std::to_string( sizeof(partition_t)*nb_extremities/1024/1024) + " MB)");
+ logging("pass " + to_string(pass+1) + "/" + to_string(nb_passes) + ", " + std::to_string(nb_elts_pass) + " unique hashes written to disk, size " + to_string(nb_elts_pass* sizeof(partition_t) / 1024/1024) + " MB");
- ThreadPool uf_merge_pool(nb_threads);
+ nb_elts += nb_elts_pass;
+}
- // uniquify UF vectors
- for (int i = 0; i < nb_uf_hashes_vectors; i++)
- {
- auto uniquify = [&uf_hashes_vectors, i] (int thread_id)
- {
- std::vector<partition_t> &vec = uf_hashes_vectors[i];
- //http://stackoverflow.com/questions/1041620/whats-the-most-efficient-way-to-erase-duplicates-and-sort-a-vector
- set<partition_t> s( vec.begin(), vec.end() );
- vec.assign( s.begin(), s.end() );
+/* main */
+template<size_t SPAN>
+void bglue(Storage *storage,
+ std::string prefix,
+ int kmerSize,
+ int nb_threads,
+ bool verbose
+ )
+{
+ auto start_t=chrono::system_clock::now();
+ double unit = 1000000000;
+ cout.setf(ios_base::fixed);
+ cout.precision(1);
- };
- uf_merge_pool.enqueue(uniquify);
- }
+ std::cout << "bglue_algo params, prefix:" << prefix << " k:" << kmerSize << " threads:" << nb_threads << std::endl;
+ bcalm_logging = verbose;
+ size_t k = kmerSize;
+ bool debug_uf_stats = false; // formerly cmdline parameter
+ bool only_uf = false; // idem
- uf_merge_pool.join();
+ //int nbGluePartitions=200; // no longer fixed
+ // autodetecting number of partitions
+ int max_open_files = System::file().getMaxFilesNumber() / 2;
+ int nbGluePartitions = std::min(2000, max_open_files); // ceil it at 2000 anyhow
- logging("sorted and unique UF elements");
+ logging("Starting bglue with " + std::to_string( nb_threads) + " threads");
- // compute number of UF elements from intermediate vectors
- unsigned long tmp_nb_uf_keys = 0;
- for (int i = 0; i < nb_uf_hashes_vectors; i++)
- tmp_nb_uf_keys += uf_hashes_vectors[i].size();
+ // create a hasher for UF
+ typedef typename Kmer<SPAN>::ModelCanonical ModelCanon;
+ ModelCanon modelCanon(kmerSize); // i'm a bit lost with those models.. I think GATB could be made more simple here.
+ Hasher_T<ModelCanon> hasher(modelCanon);
- // merge intermediate vectors into a single vector, to prepare MPHF (this step could be skipped if created a special iterator for boophf)
- std::vector<partition_t > uf_hashes;
- uf_hashes.reserve(tmp_nb_uf_keys);
- for (int i = 0; i < nb_uf_hashes_vectors; i++)
+ ifstream f((prefix + ".glue").c_str());
+ if (f.peek() == std::ifstream::traits_type::eof())
{
- uf_hashes.insert( uf_hashes.end(), uf_hashes_vectors[i].begin(), uf_hashes_vectors[i].end());
- free_memory_vector(uf_hashes_vectors[i]);
+ std::cout << "Empty glue file (no sequences)." << std::endl;
+ return;
}
- logging("merged UF elements (" + std::to_string(uf_hashes.size()) + ") into a single vector");
-
- unsigned long nb_uf_keys = uf_hashes.size();
- if (nb_uf_keys != tmp_nb_uf_keys) { std::cout << "Error during UF preparation, bad number of keys in merge: " << tmp_nb_uf_keys << " " << nb_uf_keys << std::endl; exit(1); }
+ IBank *in = Bank::open (prefix + ".glue");
+ LOCAL(in);
+
+ uint64_t nb_glue_sequences = 0;
+
+ if (storage != nullptr)
+ {
+ Group& bcalmGroup = storage->getGroup("bcalm");
+ nb_glue_sequences = atol(bcalmGroup.getProperty ("nb_sequences_in_glue").c_str());
+ }
- if (uf_hashes.size() == 0) // prevent an edge case when there's nothing to glue, boophf doesn't like it
+ if (nb_glue_sequences == 0)
{
- uf_hashes.push_back(0);
+ uint64_t estimated_nb_glue_sequences = in->estimateNbItems();
+ logging("estimating number of sequences to be glued (couldn't find true number)");
+ nb_glue_sequences = estimated_nb_glue_sequences;
}
+ logging("number of sequences to be glued: " + to_string(nb_glue_sequences) );
+
+ /*
+ * writes all the UF hashes to disk.
+ */
+ int nb_prepare_passes = 3;
+ uint64_t nb_elts = 0;
+ for (int pass = 0; pass < nb_prepare_passes; pass++)
+ prepare_uf<SPAN>(prefix, in, nb_threads, kmerSize, pass, nb_prepare_passes, nb_elts, nb_glue_sequences);
+
+ // load uf hashes from disk
+ std::vector<partition_t> uf_hashes;
+ uf_hashes.reserve(nb_elts);
+ for (int pass = 0; pass < nb_prepare_passes; pass++)
+ {
+ IteratorFile<uint64_t> file(prefix+".glue.hashes." + to_string(pass));
+ for (file.first(); !file.isDone(); file.next())
+ uf_hashes.push_back(file.item());
+ }
+ if (uf_hashes.size() == 0) // prevent an edge case when there's nothing to glue, boophf doesn't like it
+ uf_hashes.push_back(0);
+ for (int pass = 0; pass < nb_prepare_passes; pass++)
+ System::file().remove (prefix+".glue.hashes." + to_string(pass));
- auto data_iterator = boomphf::range(uf_hashes.begin(), uf_hashes.end());
+ unsigned long nb_uf_keys = uf_hashes.size();
+ logging("loaded all unique UF elements (" + std::to_string(nb_uf_keys) + ") into a single file vector of size " + to_string(nb_uf_keys* sizeof(partition_t) / 1024/1024) + " MB");
int gamma = 3; // make it even faster.
- boomphf::mphf<partition_t , hasher_t< partition_t> > uf_mphf(nb_uf_keys, data_iterator, nb_threads, gamma, verbose);
+ boomphf::mphf<partition_t , /*TODO we don't need hasher_t here now that we're not hashing kmers, but I forgot to change*/ hasher_t< partition_t> > uf_mphf(nb_uf_keys, uf_hashes, nb_threads, gamma, verbose);
free_memory_vector(uf_hashes);
@@ -652,14 +750,16 @@ typedef uint64_t partition_t;
// create a UF data structure
+ unionFind<uint32_t> ufkmers(nb_uf_keys);
+
#if 0
unionFind<unsigned int> ufmin;
unionFind<std::string> ufprefixes;
unsigned int prefix_length = 10;
unionFind<std::string> ufkmerstr;
#endif
- // those were toy one, here is the real one:
- unionFind<uint32_t> ufkmers(nb_uf_keys);
+// those were toy ones, here is the real one:
+
// instead of UF of kmers, we do a union find of hashes of kmers. less memory. will have collisions, but that's okay i think. let's see.
// actually, in the current implementation, partition_t is not used, but values are indeed hardcoded in 32 bits (the UF implementation uses a 64 bits hash table for internal stuff)
@@ -670,26 +770,26 @@ typedef uint64_t partition_t;
auto createUF = [k, &modelCanon, \
&uf_mphf, &ufkmers, &hasher](const Sequence& sequence)
{
- string seq = sequence.toString();
+ const string seq = sequence.toString();
+ const string comment = sequence.getComment();
if (seq.size() < k)
{
std::cout << "unexpectedly small sequence found ("<<seq.size()<<"). did you set k correctly?" <<std::endl; exit(1);
}
- string comment = sequence.getComment();
bool lmark = comment[0] == '1';
bool rmark = comment[1] == '1';
- if ((!lmark) && (!rmark)) // if both marks are 0, nothing to glue here
+ if ((!lmark) || (!rmark)) // if either mark is 0, no need to associate kmers in UF
return;
- string kmerBegin = seq.substr(0, k );
- string kmerEnd = seq.substr(seq.size() - k , k );
+ const string kmerBegin = seq.substr(0, k );
+ const string kmerEnd = seq.substr(seq.size() - k , k );
// UF of canonical kmers in ModelCanon form, then hashed
- typename ModelCanon::Kmer kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
- typename ModelCanon::Kmer kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
+ const typename ModelCanon::Kmer kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
+ const typename ModelCanon::Kmer kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
ufkmers.union_(uf_mphf.lookup(hasher(kmmerBegin)), uf_mphf.lookup(hasher(kmmerEnd)));
//ufkmers.union_((hasher(kmmerBegin)), (hasher(kmmerEnd)));
@@ -719,8 +819,8 @@ typedef uint64_t partition_t;
};
//setDispatcher (new SerialDispatcher()); // force single thread
- it = in->iterator(); // yeah so.. I think the old iterator cannot be reused
- dispatcher.iterate (it, createUF);
+ Dispatcher dispatcher (nb_threads);
+ dispatcher.iterate (in->iterator(), createUF);
#if 0
ufmin.printStats("uf minimizers");
@@ -743,63 +843,74 @@ typedef uint64_t partition_t;
return;
/* now we're mirroring the UF to a vector of uint32_t's, it will take less space, and strictly same information
- * this is to get rid of the rank (one uint32) per element in the current UF implementation */
+ * this is to get rid of the rank (one uint32) per element in the current UF implementation
+ * we're using the disk to save space of populating one vector from the other in memory. */
+
+ BagFile<uint64_t> *ufkmers_bagf = new BagFile<uint64_t>(prefix+".glue.uf"); LOCAL(ufkmers_bagf);
+ BagCache<uint64_t> *ufkmers_bag = new BagCache<uint64_t>( ufkmers_bagf, 10000 ); LOCAL(ufkmers_bag);
- std::vector<uint32_t > ufkmers_vector(nb_uf_keys);
for (unsigned long i = 0; i < nb_uf_keys; i++)
- ufkmers_vector[i] = ufkmers.find(i);
+ //ufkmers_vector[i] = ufkmers.find(i); // just in-memory without the disk
+ ufkmers_bag->insert(ufkmers.find(i));
- logging("UF to vector done");
-
+ uint64_t size_mdata = sizeof(std::atomic<uint64_t>) * ufkmers.mData.size();
free_memory_vector(ufkmers.mData);
- logging("freed original UF");
+ logging("freed original UF (" + to_string(size_mdata/1024/1024) + " MB)");
+ ufkmers_bag->flush();
+ std::vector<uint32_t > ufkmers_vector(nb_uf_keys);
+ IteratorFile<uint64_t> ufkmers_file(prefix+".glue.uf");
+ unsigned long i = 0;
+ for (ufkmers_file.first(); !ufkmers_file.isDone(); ufkmers_file.next())
+ ufkmers_vector[i++] = ufkmers_file.item();
+
+ System::file().remove (prefix+".glue.uf");
+
+ logging("loaded 32-bit UF (" + to_string(nb_uf_keys*sizeof(uint32_t)/1024/1024) + " MB)");
+
// setup output file
string output_prefix = prefix;
std::atomic<unsigned long> out_id; // identified for output sequences
out_id = 0;
- BufferedFasta out (output_prefix, 4000000 /* give it a large buffer*/);
- out.bank->setDataLineSize(0); // antoine wants one seq per line in output
+ BufferedFasta out (output_prefix, 100000);
- auto get_partition = [&modelCanon, &ufkmers_vector, &hasher, &uf_mphf]
- (string &kmerBegin, string &kmerEnd,
+ auto get_UFclass = [&modelCanon, &ufkmers_vector, &hasher, &uf_mphf]
+ (const string &kmerBegin, const string &kmerEnd,
bool lmark, bool rmark,
typename ModelCanon::Kmer &kmmerBegin, typename ModelCanon::Kmer &kmmerEnd, // those will be populated based on lmark and rmark
- bool &found_partition)
+ bool &found_class)
{
- found_partition = false;
- uint32_t partition = 0;
+ found_class = false;
+ uint32_t ufclass = 0;
if (lmark)
{
kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
- found_partition = true;
- partition = ufkmers_vector[uf_mphf.lookup(hasher(kmmerBegin))];
+ found_class = true;
+ ufclass = ufkmers_vector[uf_mphf.lookup(hasher(kmmerBegin))];
}
if (rmark)
{
kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
- if (found_partition) // just do a small check
+ if (found_class) // just do a small check
{
- if (ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))] != partition)
- { std::cout << "bad UF! left kmer has partition " << partition << " but right kmer has partition " << ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))] << std::endl; exit(1); }
+ if (ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))] != ufclass)
+ { std::cout << "bad UF! left kmer has partition " << ufclass << " but right kmer has partition " << ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))] << std::endl; exit(1); }
}
else
{
- partition = ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))];
- found_partition = true;
+ ufclass = ufkmers_vector[uf_mphf.lookup(hasher(kmmerEnd))];
+ found_class = true;
}
}
- return partition;
+ return ufclass;
};
- // std::mutex gluePartitionsLock[nbGluePartitions];
- std::mutex *gluePartitionsLock=new std::mutex [nbGluePartitions];
std::mutex outLock; // for the main output file
std::vector<BufferedFasta*> gluePartitions(nbGluePartitions);
std::string gluePartition_prefix = output_prefix + ".gluePartition.";
@@ -820,45 +931,40 @@ typedef uint64_t partition_t;
// partition the glue into many files, à la dsk
auto partitionGlue = [k, &modelCanon /* crashes if copied!*/, \
- &get_partition, &gluePartitions, &gluePartitionsLock,
+ &get_UFclass, &gluePartitions,
&out, &outLock, &nb_seqs_in_partition, &out_id, nbGluePartitions]
(const Sequence& sequence)
{
- string seq = sequence.toString();
+ const string &seq = sequence.toString();
+ const string &comment = sequence.getComment();
- string comment = sequence.getComment();
bool lmark = comment[0] == '1';
bool rmark = comment[1] == '1';
- string kmerBegin = seq.substr(0, k );
- string kmerEnd = seq.substr(seq.size() - k , k );
+ const string kmerBegin = seq.substr(0, k );
+ const string kmerEnd = seq.substr(seq.size() - k , k );
// make canonical kmer
typename ModelCanon::Kmer kmmerBegin;
typename ModelCanon::Kmer kmmerEnd;
- bool found_partition = false;
-
- uint32_t partition = get_partition(kmerBegin, kmerEnd, lmark, rmark, kmmerBegin, kmmerEnd, found_partition);
+ bool found_class = false;
- // compute kmer extremities if we have not already
- if (!lmark)
- kmmerBegin = modelCanon.codeSeed(kmerBegin.c_str(), Data::ASCII);
- if (!rmark)
- kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
+ uint32_t ufclass = get_UFclass(kmerBegin, kmerEnd, lmark, rmark, kmmerBegin, kmmerEnd, found_class);
- if (!found_partition) // this one doesn't need to be glued
+ if (!found_class) // this one doesn't need to be glued
{
- string abundances = comment.substr(3);
+ const string abundances = comment.substr(3);
float mean_abundance = get_mean_abundance(abundances);
uint32_t sum_abundances = get_sum_abundance(abundances);
- output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " KM:f:" + to_string_with_precision(mean_abundance));
+ output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
+ // km is not a standard GFA field so i'm putting it in lower case as per the spec
// maybe could optimize by writing to disk using queues, if that's ever a bottleneck
return;
}
- int index = partition % nbGluePartitions;
- //stringstream ss1; // to save partition later in the comment. (later: why? probably to avoid recomputing it)
+ int index = ufclass % nbGluePartitions;
+ //stringstream ss1; // to save partition later in the comment. [why? probably to avoid recomputing it]
//ss1 << blabla;
output(seq, *gluePartitions[index], comment);
@@ -866,12 +972,23 @@ typedef uint64_t partition_t;
};
logging("Disk partitioning of glue");
+ dispatcher.iterate (in->iterator(), partitionGlue); // multi-threaded
+ /*// single-threaded version
+ auto it = in->iterator();
+ for (it->first (); !it->isDone(); it->next())
+ partitionGlue(it->item());
+ */
- it = in->iterator(); // yeah so.. I think the old iterator cannot be reused
- dispatcher.iterate (it, partitionGlue);
+ for (int i = 0; i < nbGluePartitions; i++)
+ delete gluePartitions[i]; // takes care of the final flush (this doesn't delete the file, just closes it)
+ free_memory_vector(gluePartitions);
+ out.flush();
+
+
+ logging("Done disk partitioning of glue");
// get top10 largest glue partitions
- int top_n_glue_partition = 10;
+ int top_n_glue_partition = std::min(10,nbGluePartitions);
vector<unsigned long> vx, copy_nb_seqs_in_partition;
vx.resize(nb_seqs_in_partition.size());
copy_nb_seqs_in_partition.resize(nb_seqs_in_partition.size());
@@ -882,66 +999,59 @@ typedef uint64_t partition_t;
}
partial_sort( vx.begin(), vx.begin()+top_n_glue_partition, vx.end(), Comp<unsigned long>(copy_nb_seqs_in_partition) );
- if (logging_bglue_verbose)
+ if (verbose)
{
std::cout << "Top 10 glue partitions by size:" << std::endl;
for (int i = 0; i < top_n_glue_partition; i++)
std::cout << "Glue partition " << vx[i] << " has " << copy_nb_seqs_in_partition[vx[i]] << " sequences " << endl;
}
- for (int i = 0; i < nbGluePartitions; i++)
- {
- delete gluePartitions[i]; // takes care of the final flush (this doesn't delete the file, just closes it)
- }
-
- out.flush();
-
-
logging("Glueing partitions");
// glue all partitions using a thread pool
ThreadPool pool(nb_threads);
for (int partition = 0; partition < nbGluePartitions; partition++)
{
- auto glue_partition = [&modelCanon, &ufkmers, &hasher, partition, &gluePartition_prefix,
- &get_partition, &out, &outLock, &out_id, kmerSize]( int thread_id)
+ auto glue_partition = [&modelCanon, &ufkmers, partition, &gluePartition_prefix, nbGluePartitions, ©_nb_seqs_in_partition,
+ &get_UFclass, &out, &outLock, &out_id, kmerSize]( int thread_id)
{
int k = kmerSize;
string partitionFile = gluePartition_prefix + std::to_string(partition);
- BankFasta partitionBank (partitionFile);
+ BankFasta partitionBank (partitionFile); // BankFasta
+ BankFasta::Iterator it (partitionBank); // BankFasta
outLock.lock(); // should use a printlock..
- string message = "Gluing partition " +to_string(partition) + " (size: " +to_string(System::file().getSize(partitionFile)/1024/1024) + " MB)";
- logging(message);
+ if (partition % 20 == 0) // sparse printing
+ {
+ string message = "Gluing partition " +to_string(partition) + " (size: " +to_string(System::file().getSize(partitionFile)/1024/1024) + " MB)";
+ logging(message);
+ }
outLock.unlock();
- BankFasta::Iterator it (partitionBank);
-
- unordered_map<int,vector<markedSeq>> msInPart;
+ unordered_map<int, vector< markedSeq<SPAN> >> msInPart;
+ uint64_t seq_index = 0;
- for (it.first(); !it.isDone(); it.next())
+ for (it.first(); !it.isDone(); it.next()) // BankFasta
{
- string seq = it->toString();
+ const string seq = it->toString();
+ const string comment = it->getComment();
- string kmerBegin = seq.substr(0, k );
- string kmerEnd = seq.substr(seq.size() - k , k );
+ const string kmerBegin = seq.substr(0, k );
+ const string kmerEnd = seq.substr(seq.size() - k , k );
- uint32_t partition = 0;
- bool found_partition = false;
+ uint32_t ufclass = 0;
+ bool found_class = false;
- string comment = it->getComment();
bool lmark = comment[0] == '1';
bool rmark = comment[1] == '1';
- string abundances = comment.substr(3);
// todo speed improvement: get partition id from sequence header (so, save it previously)
// make canonical kmer
- typename ModelCanon::Kmer kmmerBegin;
- typename ModelCanon::Kmer kmmerEnd;
+ typename ModelCanon::Kmer kmmerBegin, kmmerEnd;
- partition = get_partition(kmerBegin, kmerEnd, lmark, rmark, kmmerBegin, kmmerEnd, found_partition);
+ ufclass = get_UFclass(kmerBegin, kmerEnd, lmark, rmark, kmmerBegin, kmmerEnd, found_class);
// compute kmer extremities if we have not already
if (!lmark)
@@ -949,56 +1059,94 @@ typedef uint64_t partition_t;
if (!rmark)
kmmerEnd = modelCanon.codeSeed(kmerEnd.c_str(), Data::ASCII);
- string ks = modelCanon.toString(kmmerBegin.value());
- string ke = modelCanon.toString(kmmerEnd .value());
- markedSeq ms(seq, abundances, lmark, rmark, ks, ke);
+ markedSeq<SPAN> ms(seq_index, lmark, rmark, kmmerBegin.value(), kmmerEnd.value());
- msInPart[partition].push_back(ms);
+ //if (ufclass == 38145) std::cout << " ufclass " << ufclass << " seq " << seq << " seq index " << seq_index << " " << lmark << rmark << " ks " << kmmerBegin.value() << " ke " << kmmerEnd.value() << std::endl; // debug specific partition
+ msInPart[ufclass].push_back(ms);
+ seq_index++;
}
-
- // now iterates all sequences in a partition to glue them in clever order (avoid intermediate gluing)
+ // now iterate over all sequences in a partition to determine the order in which they're going to be glued (avoid intermediate gluing)
+ vector<vector<uint32_t>> ordered_sequences_idxs ;
for (auto it = msInPart.begin(); it != msInPart.end(); it++)
{
+ bool debug = false; //debug = it->first == 38145; // debug specific partition
//std::cout << "1.processing partition " << it->first << std::endl;
- vector<vector<uint32_t>> ordered_sequences_idxs = determine_order_sequences(it->second, kmerSize); // return indices of markedSeq's inside it->second
- //std::cout << "2.processing partition " << it->first << " nb ordered sequences: " << ordered_sequences.size() << std::endl;
+ determine_order_sequences<SPAN>(ordered_sequences_idxs, it->second, kmerSize, debug); // return indices of markedSeq's inside it->second
+ //std::cout << "2.processing partition " << it->first << " nb ordered sequences: " << ordered_sequences_idxs.size() << std::endl;
+ free_memory_vector(it->second);
+ }
- for (auto itO = ordered_sequences_idxs.begin(); itO != ordered_sequences_idxs.end(); itO++)
- {
- string seq, abundances;
- glue_sequences(*itO, it->second, kmerSize, seq, abundances); // takes as input the indices of ordered sequences, and the markedSeq's themselves
+ msInPart.clear();
+ unordered_map<int,vector<markedSeq<SPAN>>>().swap(msInPart); // free msInPart
+
+ vector<string> sequences;
+ vector<string> abundances;
+ sequences.reserve(copy_nb_seqs_in_partition[partition]);
+ abundances.reserve(copy_nb_seqs_in_partition[partition]);
+
+ for (it.first(); !it.isDone(); it.next()) // BankFasta
+ {
+ const string seq = it->toString();
+ const string comment = it->getComment();
+ const string abundance_str = comment.substr(3);
+ sequences.push_back(seq);
+ abundances.push_back(abundance_str);
+ }
- float mean_abundance = get_mean_abundance(abundances);
- uint32_t sum_abundances = get_sum_abundance(abundances);
- output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " KM:f:" + to_string_with_precision(mean_abundance));
- }
+ for (auto itO = ordered_sequences_idxs.begin(); itO != ordered_sequences_idxs.end(); itO++)
+ {
+ string seq, abs;
+ glue_sequences(*itO, sequences, abundances, kmerSize, seq, abs); // takes as input the indices of ordered sequences, and the markedSeq's themselves
- free_memory_vector(it->second);
+ float mean_abundance = get_mean_abundance(abs);
+ uint32_t sum_abundances = get_sum_abundance(abs);
+ output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
}
+
+ free_memory_vector(ordered_sequences_idxs);
- partitionBank.finalize();
+ partitionBank.finalize(); // BankFasta
System::file().remove (partitionFile);
};
pool.enqueue(glue_partition);
+ //glue_partition(0); // single threaded
}
pool.join();
+
+ out.flush(); // not sure if necessary
logging("end");
- // cleanup glue files
- std::string line;
- std::ifstream infile(prefix + ".glue");
- while (std::getline(infile, line))
+ bool debug_keep_glue_files = true; // for debugging // TODO enable it if -redo-bglue param was provided (need some info from UnitigsConstructionAlgorithm).
+ if (debug_keep_glue_files)
+ {
+ std::cout << "debug: not deleting glue files" << std::endl;
+ }
+ else
+ {
+ // cleanup glue files
+ std::string line;
+ std::ifstream infile(prefix + ".glue");
+ while (std::getline(infile, line))
+ {
+ System::file().remove (line);
+ }
+ infile.close();
+ System::file().remove (prefix + ".glue");
+ }
+ auto end_t=chrono::system_clock::now();
+ float wtime = chrono::duration_cast<chrono::nanoseconds>(end_t - start_t).count() / unit;
+
+ if (storage != nullptr)
{
- System::file().remove (line);
+ Group& bcalmGroup = storage->getGroup("bcalm");
+ bcalmGroup.setProperty ("wtime_glue", Stringify::format("%f", wtime));
}
- infile.close();
- System::file().remove (prefix + ".glue");
}
}}}}
diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
index d94c511..849c7d2 100644
--- a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
+++ b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
@@ -21,75 +21,123 @@
#ifndef _GATB_CORE_BGLUE_ALGO_HPP_
#define _GATB_CORE_BGLUE_ALGO_HPP_
-#include "unionFind.hpp"
+#include <iostream>
+#include <fstream>
+#include <sstream>
#include <atomic>
#include <set>
#include <vector>
#include <string>
#include <mutex>
-#include <unordered_map>
-#include <BooPHF/BooPHF.h>
-#include <ctime> // for time
-#include <iostream> // for time (and maybe other things?)
-#include <iomanip> // for cout mods
-#include "ThreadPool.h"
-
-#include <gatb/tools/designpattern/impl/Command.hpp>
-
-#include <gatb/system/impl/System.hpp>
-#include <gatb/tools/misc/impl/Property.hpp>
-
#include <gatb/tools/storage/impl/Storage.hpp>
-#include <gatb/tools/storage/impl/StorageTools.hpp>
-
-#include <gatb/tools/math/NativeInt64.hpp>
-#include <gatb/tools/math/NativeInt128.hpp>
-#include <gatb/tools/math/LargeInt.hpp>
-
-#include <gatb/bank/impl/Banks.hpp>
-#include <gatb/bank/impl/Bank.hpp>
-#include <gatb/bank/impl/BankHelpers.hpp>
-#include <gatb/bank/impl/BankConverterAlgorithm.hpp>
-
-#include <gatb/kmer/impl/Model.hpp>
-
-#include <gatb/kmer/impl/PartiInfo.hpp> // for repartitor
-#include <gatb/tools/misc/impl/Progress.hpp>
-#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
-#include <gatb/tools/collections/impl/BooPHF.hpp>
-
-
-//heh at this point I could have maybe just included gatb_core.hpp but well, no circular dependencies, this file is part of gatb-core now.
-
-using namespace gatb::core::system;
-using namespace gatb::core::system::impl;
-
-using namespace gatb::core::bank;
-using namespace gatb::core::bank::impl;
-
-using namespace gatb::core::kmer;
-using namespace gatb::core::kmer::impl;
-
-using namespace gatb::core::tools::storage;
-using namespace gatb::core::tools::storage::impl;
-using namespace gatb::core::tools::misc;
-using namespace gatb::core::tools::misc::impl;
-using namespace gatb::core::tools::dp;
-using namespace gatb::core::tools::dp::impl;
-using namespace gatb::core::tools::collections;
-using namespace gatb::core::tools::collections::impl;
+namespace gatb { namespace core { namespace debruijn { namespace impl {
-namespace gatb { namespace core { namespace debruijn { namespace impl {
+// buffered and also thread-safe thanks to a lock
+// not using BankFasta because I don't want to be recording variable-length strings in a std::vector<>, potential memory fragmentation
+// so instead it's in a flat buffer
+class BufferedFasta // FASTA writer: accumulates ">comment\nseq\n" records in one string and writes them to a FILE* in batches
+{
+ std::mutex mtx; // taken by insert() when threadsafe is true; flush() itself does not take it
+ std::string buffer; // pending records that have not been written to the file yet
+ unsigned long buffer_length; // running byte count of the pending records, reset to 0 by flush()
+ FILE* _insertHandle; // output file: opened in the constructor, closed in the destructor
+
+ public:
+ unsigned long max_buffer; // insert() flushes first when a new record would push buffer_length past this
+ bool threadsafe; // set to true by the constructor; when false, insert() skips the mutex
+
+ BufferedFasta(const std::string filename, unsigned long given_max_buffer = 50000) // opens filename with mode "w"; prints an error and calls exit(1) if that fails
+ {
+ max_buffer = given_max_buffer; // that much of buffering will be written to the file at once (in bytes)
+ threadsafe = true;
+ buffer_length = 0;
+ _insertHandle = fopen (filename.c_str(), "w");
+ if (!_insertHandle) { std::cout << "error opening " << filename << " for writing." << std::endl; exit(1);}
+ buffer.reserve(max_buffer+1000/*security*/);
+ }
+
+ ~BufferedFasta() // writes out whatever is still buffered, then closes the file
+ {
+ flush();
+ fclose(_insertHandle);
+ std::string().swap(buffer); // swap with an empty temporary so the buffer's storage is handed to the temporary and freed
+ }
+
+ void insert(const std::string &seq, const std::string &comment) // appends one record; flushes first if the record would not fit under max_buffer
+ {
+ if (threadsafe)
+ mtx.lock();
+ unsigned int insert_size = seq.size() + comment.size() + 3; // +3 for the '>' and the two '\n' added below
+ if (buffer_length + insert_size > max_buffer)
+ flush();
+ buffer_length += insert_size;
+ buffer += ">" + comment + "\n" + seq + "\n";
+ if (threadsafe)
+ mtx.unlock();
+ }
+
+ void flush() // writes the pending text to the file, fflush()es it and empties the buffer; calls exit(1) on a short write
+ {
+ unsigned int res = fprintf (_insertHandle, "%s", buffer.c_str()); // NOTE(review): fprintf returns int (negative on error) and "%s" stops at the first NUL byte
+ if (res != buffer.size()) { std::cout << "couldn't flush, written " << res << " out of " << buffer.size() << std::endl; exit(1);}
+ if (_insertHandle != 0) { fflush (_insertHandle); } // NOTE(review): this null check comes after fprintf has already used the handle
+ buffer_length = 0;
+ buffer.clear();
+ }
+};
+
+// not using BankFasta because I suspect that it does some funky memory fragmentation. so this one is unbuffered
+class UnbufferedFastaIterator // sequential reader for FASTA records made of a header line followed by one sequence line
+{
+ std::ifstream *input; // heap-allocated stream: created in the constructor, deleted in the destructor
+ public:
+ UnbufferedFastaIterator(const std::string &filename) // NOTE(review): nothing here checks that the file was opened successfully
+ {
+ input = new std::ifstream(filename);
+ }
+
+ ~UnbufferedFastaIterator() { delete input;}
+
+ bool read(std::string &seq, std::string &comment) // reads the next record into comment/seq; returns false at end of input or on an empty header line
+ {
+ std::string line;
+ if (std::getline(*input, line))
+ {
+ if (line.empty())
+ return false;
+ if (line[0] == '>') // text after '>' becomes the comment; if the line has no '>', comment keeps its previous value
+ comment = line.substr(1);
+ }
+ else
+ return false;
+ if (std::getline(*input, line)) // the sequence is expected on the single line right after the header
+ {
+ if (line.empty())
+ {
+ std::cout << "unexpected entry of one-line fasta: " << comment << std::endl;
+ exit(1);
+ }
+ seq = line;
+ return true;
+ }
+ return false; // a header was read but no sequence line followed it
+ }
+
+ void restart() // rewinds the stream so the records can be read again from the start
+ {
+ input->clear(); // reset the stream's error flags before seeking
+ input->seekg(0);
+ }
+
+};
template<size_t SPAN>
-void bglue(Storage* storage,
+void bglue(gatb::core::tools::storage::impl::Storage* storage,
std::string prefix,
int kmerSize,
- int minSize,
int nb_threads,
- int minimizer_type,
bool verbose
);
diff --git a/gatb-core/src/gatb/bcalm2/lockstdqueue.h b/gatb-core/src/gatb/bcalm2/lockstdqueue.h
deleted file mode 100644
index 373d29c..0000000
--- a/gatb-core/src/gatb/bcalm2/lockstdqueue.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// just a thread safe queue, the most simple ever
-
-// adapted from:
-// https://raw.githubusercontent.com/cameron314/concurrentqueue/master/benchmarks/stdqueue.h
-// ©2014 Cameron Desrochers.
-
-#pragma once
-
-#include <queue>
-#include <mutex>
-
-
-// Simple wrapper around std::queue (not thread safe) - RC: made it thread safe
-template<typename T>
-class LockStdQueue
-{
-
-public:
- template<typename U>
- inline bool enqueue(U&& item)
- {
- std::lock_guard<std::mutex> guard(mutex);
- q.push(std::forward<U>(item));
- return true;
- }
-
- inline bool try_dequeue(T& item)
- {
- std::lock_guard<std::mutex> guard(mutex);
- if (q.empty()) {
- return false;
- }
-
- item = std::move(q.front());
- q.pop();
- return true;
- }
-
- unsigned long size_approx()
- {
- return q.size();
- }
-
- unsigned long overhead_per_element()
- {
- return 0; // I don't think anymore that's true. there must be some overhead
- }
-
-private:
- std::queue<T> q;
- mutable std::mutex mutex;
-};
diff --git a/gatb-core/src/gatb/bcalm2/logging.cpp b/gatb-core/src/gatb/bcalm2/logging.cpp
new file mode 100644
index 0000000..6846792
--- /dev/null
+++ b/gatb-core/src/gatb/bcalm2/logging.cpp
@@ -0,0 +1,40 @@
+#include <gatb/system/impl/System.hpp>
+#include <ctime> // for time
+#include <iostream> // for time (and maybe other things?)
+#include <iomanip> // for cout mods
+
+
+#include "logging.hpp"
+
+namespace gatb { namespace core { namespace debruijn { namespace impl {
+bool bcalm_logging = true;
+
+unsigned long logging(std::string message="") // when bcalm_logging is set, prints message, the local time and memory usage to stdout; always returns the current-memory figure
+{
+ time_t t = time(0); // get time now
+ struct tm * now = localtime( & t ); // NOTE(review): localtime returns a pointer to shared static storage, so concurrent callers may race
+ if (bcalm_logging)
+ {
+ std::cout << std::setiosflags(std::ios::right);
+ std::cout << std::resetiosflags(std::ios::left);
+ std::cout << std::setw(40) << std::left << message << " "; // message is left-aligned and padded to 40 columns
+ }
+ char tmp[128]; // scratch buffer reused for the time stamp and then for the memory report
+ snprintf (tmp, sizeof(tmp), " %02d:%02d:%02d ",
+ now->tm_hour, now->tm_min, now->tm_sec);
+ if (bcalm_logging)
+ std::cout << tmp ;
+
+ // using Progress.cpp of gatb-core
+ u_int64_t mem = gatb::core::system::impl::System::info().getMemorySelfUsed() / 1024; // reported as MB below; presumably the call returns KB -- TODO confirm
+ u_int64_t memMaxProcess = gatb::core::system::impl::System::info().getMemorySelfMaxUsed() / 1024;
+ snprintf (tmp, sizeof(tmp), " memory [current, maxRSS]: [%4lu, %4lu] MB ", // NOTE(review): %lu assumes u_int64_t is unsigned long; confirm on platforms where it is unsigned long long
+ mem, memMaxProcess);
+
+ if (bcalm_logging)
+ std::cout << tmp << std::endl;
+ return mem; // computed and returned even when logging is disabled
+}
+
+
+}}}}
diff --git a/gatb-core/src/gatb/bcalm2/logging.hpp b/gatb-core/src/gatb/bcalm2/logging.hpp
new file mode 100644
index 0000000..3e14cad
--- /dev/null
+++ b/gatb-core/src/gatb/bcalm2/logging.hpp
@@ -0,0 +1,6 @@
+#include <string>
+namespace gatb { namespace core { namespace debruijn { namespace impl {
+
+extern bool bcalm_logging;
+extern unsigned long logging(std::string message);
+}}}}
diff --git a/gatb-core/src/gatb/bcalm2/ograph.cpp b/gatb-core/src/gatb/bcalm2/ograph.cpp
index 2ec2034..a96a72b 100644
--- a/gatb-core/src/gatb/bcalm2/ograph.cpp
+++ b/gatb-core/src/gatb/bcalm2/ograph.cpp
@@ -123,45 +123,58 @@ typename graph3<span>::kmerType graph3<span>::rcb(typename graph3<span>::kmerTyp
template<size_t span>
-void graph3<span>::compaction(uint iL, uint iR){
+void graph3<span>::compaction(uint iL, uint iR,typename graph3<span>::kmerType kmmer){
if(iR!=iL){
+ typename graph3<span>::kmerType RC=rcb(kmmer);
+ uint s1(unitigs[iL].size()),s2(unitigs[iR].size());
bool b1(isNumber(unitigs[iL][0])),b2(isNumber(unitigs[iR][0]));
- if(b1 and b2){return compaction(stoi(unitigs[iL]),stoi(unitigs[iR]));}
- if(b1){return compaction(stoi(unitigs[iL]),iR);}
- if(b2){return compaction(iL,stoi(unitigs[iR]));}
-
+ if(b1 and b2){return compaction(stoi(unitigs[iL]),stoi(unitigs[iR]),kmmer);}
+ if(b1){return compaction(stoi(unitigs[iL]),iR,kmmer);}
+ if(b2){return compaction(iL,stoi(unitigs[iR]),kmmer);}
+ //~ cout<<unitigs[iR]<<"\n";
+ //~ cout<<unitigs[iL]<<"\n";
typename graph3<span>::kmerType beg1;//(beg2int128(unitigs[iL])); // that kind of initialization isn't supported in LargeInt.
- beg1.setVal(beg2int128(unitigs[iL]));
+ beg1.setVal(beg2int128(unitigs[iL]));
typename graph3<span>::kmerType end2;//(end2int128(unitigs[iR]));
- end2.setVal(end2int128(unitigs[iR]));
+ end2.setVal(end2int128(unitigs[iR]));
- if(beg1==end2){
+ if(beg1==end2 and (end2==kmmer or end2==RC)){
+ //~ if(beg1==end2 ){
unitigs[iR]+=(unitigs[iL].substr(k));
unitigs[iL]=to_string(iR);
-
+ indexed_right[iR]=indexed_right[iL];
+ connected_right[iR]=connected_right[iL];
compact_abundances(iR,iL);
return;
}
typename graph3<span>::kmerType endrc2;//(beg2int128rc(unitigs[iR]));
- endrc2.setVal(beg2int128rc(unitigs[iR]));
- if(beg1==endrc2){
+ endrc2.setVal(beg2int128rc(unitigs[iR]));
+ if(beg1==endrc2 and (beg1==kmmer or beg1==RC)){
+ //~ if(beg1==endrc2 ){
reverseinplace2(unitigs[iR]);
+ indexed_left[iR]=indexed_right[iR];
+ connected_left[iR]=connected_right[iR];
+
unitigs[iR]+=(unitigs[iL].substr(k));
unitigs[iL]=to_string(iR);
-
+ indexed_right[iR]=indexed_right[iL];
+ connected_right[iR]=connected_right[iL];
compact_abundances(iR,iL,true,false);
return;
}
typename graph3<span>::kmerType beg2;//(rcb(endrc2));
- beg2.setVal(rcb(endrc2));
+ beg2.setVal(rcb(endrc2));
typename graph3<span>::kmerType end1;//(end2int128(unitigs[iL]));
end1.setVal(end2int128(unitigs[iL]));
- if(end1==beg2){
+ if(end1==beg2 and (end1==kmmer or end1==RC)){
+ //~ if(end1==beg2 ){
unitigs[iL]+=(unitigs[iR].substr(k));
unitigs[iR]=to_string(iL);
+ indexed_right[iL]=indexed_right[iR];
+ connected_right[iL]=connected_right[iR];
compact_abundances(iL,iR);
return;
}
@@ -171,10 +184,13 @@ void graph3<span>::compaction(uint iL, uint iR){
/* std::cout << "a : " << rcb(end2).toString(31) << std::endl;
std::cout << "a=b: " << begrc2.toString(31) << std::endl;*/ // manifestation of a bug when LargeInt constructors are removed
- if(end1==begrc2){
+ if(end1==begrc2 and (end1==kmmer or end1==RC)){
+ //~ if(end1==begrc2 ){
unitigs[iL]+=(reverseinplace(unitigs[iR]).substr(k));
unitigs[iR]=to_string(iL);
+ indexed_right[iL]=indexed_left[iR];
+ connected_right[iL]=connected_left[iR];
compact_abundances(iL,iR, false, true);
return;
}
@@ -197,218 +213,184 @@ void graph3<span>::compact_abundances(uint i1, uint i2, bool reverse_first, bool
}
template<size_t span>
+inline void graph3<span>::update_connected(kmerIndiceT<span> &ki) // flags the extremity described by ki (unitig ki.indice, side ki.position) as connected
+{
+ if (ki.position == SEQ_LEFT)
+ connected_left[ki.indice] = true;
+ else
+ connected_right[ki.indice] = true; // any position other than SEQ_LEFT, i.e. SEQ_RIGHT
+}
+
+/*
+ * this function is the core one that decides what compactions need to be made
+ */
+template<size_t span>
void graph3<span>::debruijn(){
sort(left.begin(),left.end(),comparator<span>());
sort(right.begin(),right.end(),comparator<span>());
uint iL(0),iR(0),sizeLeft(left.size()),sizeRight(right.size());
typename graph3<span>::kmerType minusone;
minusone.setVal(-1);
- left.push_back({0,minusone});
- right.push_back({0,minusone});
-
- kmerIndiceT<span> kL,kR;
- while(iL!=sizeLeft and iR!=sizeRight){
+ left.push_back({0,minusone, SEQ_LEFT}); // sentinel appended after the sort (kmmer set from -1), so left[sizeLeft] can be read without a bounds check
+ right.push_back({0,minusone, SEQ_LEFT}); // matching sentinel for the right index
+ uint debug_index = 0; // set to a non-zero unitig index to enable the debug prints below
+ // clear the connection flags of every inserted sequence. NOTE(review): the graph3 constructor in ograph.h only reserve()s connected_left/right; confirm they have size >= indiceUnitigs before being indexed
+ for (uint32_t i = 0; i< indiceUnitigs; i++)
+ {
+ connected_left[i] = false;
+ connected_right[i] = false;
+ }
+
+ kmerIndiceT<span> kL,kR;
+ std::vector<std::pair<uint,uint>> to_compact; // NOTE(review): never filled, its only use is commented out below
+ // intended as a flagging pass: pairs would be recorded in to_compact and compacted afterwards, once connection info is complete for all unitigs.
+ // NOTE(review): as written, the push_back into to_compact is commented out and compaction() is still called on the fly inside this loop
+ while(iL < sizeLeft && iR < sizeRight){
kL=left[iL];
kR=right[iR];
+
+ if (debug_index > 0) if (kL.indice == debug_index || kR.indice == debug_index ) std::cout << " kl / kR " << kL.indice << " " << kR.indice << " " << kL.kmmer << " " << kR.kmmer << " unitigs " << unitigs[kL.indice] << " " << unitigs[kR.indice] << std::endl;
+
if(kL.kmmer==kR.kmmer){
+ if (debug_index > 0) if (kL.indice == debug_index || kR.indice == debug_index ) std::cout << " identical, kl / kR " << kL.indice << " " << kR.indice << " unitigs " << unitigs[kL.indice] << " " << unitigs[kR.indice] << " positions " << kL.position << " " << kR.position << std::endl;
+ update_connected(kL);
+ update_connected(kR);
+
+ // found the same (k-1)-mer in the left and right array, it means that two sequences end with those and could be potentially compacted
bool go(true);
++iL;++iR;
- if(left[iL].kmmer==kL.kmmer){
- go=false;
- while(left[++iL].kmmer==kL.kmmer){}
- }
- if(right[iR].kmmer==kL.kmmer){
- go=false;
- while(right[++iR].kmmer==kR.kmmer){}
+ if(left[iL].kmmer==kL.kmmer){
+ go=false; // more than one left entry carries this (k-1)-mer: ambiguous, do not compact
+ update_connected(left[iL]); // NOTE(review): only this second entry is flagged; further duplicates skipped by the loop below are not
+ while(left[++iL].kmmer<=kR.kmmer ){if(iL==sizeLeft){return;}} // skip past this (k-1)-mer; reaching the sentinel returns from debruijn() altogether
+ }
+ if(right[iR].kmmer==kL.kmmer){
+ go=false; // same ambiguity on the right side
+ update_connected(right[iR]); // NOTE(review): as on the left, later duplicates are skipped without being flagged
+ while(right[++iR].kmmer<=kL.kmmer ){if(iR==sizeRight){return;}} // skip past this (k-1)-mer; reaching the sentinel returns from debruijn() altogether
+ }
+ if(go){
+ compaction(kL.indice,kR.indice,kL.kmmer); // exactly one entry on each side: merge the two sequences now
+ //~ to_compact.push_back(std::make_pair(kL.indice,kR.indice));
}
- if(go){compaction(kL.indice,kR.indice);}
+
}else{
if(kL.kmmer<kR.kmmer){
- while(left[++iL].kmmer==kL.kmmer){}
+ while(left[++iL].kmmer<kR.kmmer){}
}else{
- while(right[++iR].kmmer==kR.kmmer){}
+ while(right[++iR].kmmer<kL.kmmer){}
}
}
}
+
+ //~ for (auto p: to_compact)
+ //~ {
+ //~ compaction(std::get<0>(p),std::get<1>(p));
+ //~ }
}
template<size_t span>
-bool graph3<span>::output(uint i){return !isNumber(unitigs[i][0]);}
+bool graph3<span>::output(uint i){ // returns whether entry i should be emitted
+ if (isNumber(unitigs[i][0])) // compaction() overwrites a merged-away entry with to_string(index of the surviving unitig); presumably isNumber detects that
+ return false;
+ // optional pre-filtering of short tips (pre_tip_cleaning is set to false by the constructor)
+ if (pre_tip_cleaning)
+ {
+ if (indexed_left[i] && indexed_right[i]) // both extremities are flagged as indexed
+ {
+ if ((connected_left[i] && (!connected_right[i])) ||
+ (connected_right[i] && (!connected_left[i]))) // connected on exactly one side
+ {
+
+ if (unitigs[i].size() < 3*(k+1)) // the spades tip length convention, to be tuned
+ {
+ nb_pretips++;
+ //std::cout << "filtering tip " << unitigs[i] << " indexing l/r " << indexed_left[i] << " " << indexed_right[i] << " connected l/r " << connected_left[i] << " " << connected_right[i] << std::endl;
+ return false; // dropped as a tip, counted in nb_pretips
+ }
+ }
+ }
+ }
+
+ //std::cout << "returning seq " << unitigs[i] << " indexing l/r " << indexed_left[i] << " " << indexed_right[i] << " connected l/r " << connected_left[i] << " " << connected_right[i] << std::endl;
+ return true;
+}
template<size_t span>
-bool graph3<span>::clear(){delete [] unitigs; delete [] unitigs_abundances; return true;}
+bool graph3<span>::clear(){delete [] unitigs; delete [] unitigs_abundances;
+ /* // nah, not needed. it wasn't the cause of the memory fragmentation, because even when the graph isn't constructed it still happens
+ left.clear(); right.clear(); left.shrink_to_fit(); right.shrink_to_fit(); */
+ return true;}
template<size_t span>
uint graph3<span>::size(){return indiceUnitigs;};
+/* this function inserts sequences into the structure
+ * while the code uses the term "unitigs", initially these sequences are just kmers (but later they will be unitigs)
+ * sequences and their abundances are stored in a plain list
+ * the index consists of two lists: left and right
+ * both indices store tuples of the form (sequence index, canonical kmer)
+ * the left index corresponds to kmers that are seen at the left of input sequence in forward strand, or on the right of unitigs in reverse strand
+ * the right index is, well, the other ones. useful schema:
+ *
+ * l r
+ * ---> --->
+ * ------------------------ input sequence (possibly a k-mer)
+ * <--- <---
+ * r l
+ */
template<size_t span>
void graph3<span>::addtuple(tuple<string,uint,uint,uint>& tuple){
- unitigs[indiceUnitigs]=move(get<0>(tuple));
+ // input tuple: <unitigs string, left minimizer, right minimizer, abundance>
+ unitigs[indiceUnitigs]=get<0>(tuple);
unitigs_abundances[indiceUnitigs].push_back(get<3>(tuple));
+
+ bool debug = false;
+ string debug_kmer = "GTTTTTTAGATTCTGAGTGGAACGATGAATG";
+
if(minimizer==(get<1>(tuple))){
+ indexed_left.push_back(true);
typename graph3<span>::kmerType kmer1(beg2int128(unitigs[indiceUnitigs]));
typename graph3<span>::kmerType kmer2(rcb(kmer1));
- if(kmer1<kmer2){
- left.push_back(kmerIndiceT<span>{indiceUnitigs,kmer1});
- }else{
- right.push_back(kmerIndiceT<span>{indiceUnitigs,kmer2});
+ if(kmer1<=kmer2){
+ if (debug) if (unitigs[indiceUnitigs].compare(debug_kmer) == 0) std::cout << "for that seq " << unitigs[indiceUnitigs] << ", left kmer1 is " << kmer1 << " index " << indiceUnitigs << std::endl;
+ left.push_back(kmerIndiceT<span>{indiceUnitigs,kmer1, SEQ_LEFT});
+ }
+ if(kmer2<=kmer1){
+ if (debug) if (unitigs[indiceUnitigs].compare(debug_kmer) == 0) std::cout << "for that seq " << unitigs[indiceUnitigs] << ", left kmer2 is " << kmer1 << " index " << indiceUnitigs << std::endl;
+ right.push_back(kmerIndiceT<span>{indiceUnitigs,kmer2, SEQ_LEFT});
+
}
+ // TODO probably to handle kmers that are their own reverse complement, do:
+ // if (kmer2 < kmer1) instead of the "else"
+ // but I didn't test it yet, was chasing another bug, so let's implement that later
}
+ else
+ indexed_left.push_back(false);
if(minimizer==get<2>(tuple)){
+ indexed_right.push_back(true);
typename graph3<span>::kmerType kmer1(end2int128(unitigs[indiceUnitigs]));
typename graph3<span>::kmerType kmer2(rcb(kmer1));
- if(kmer1<kmer2){
- right.push_back(kmerIndiceT<span>{indiceUnitigs,kmer1});
- }else{
- left.push_back(kmerIndiceT<span>{indiceUnitigs,kmer2});
+ if(kmer1<=kmer2){
+ if (debug) if (unitigs[indiceUnitigs].compare(debug_kmer) == 0) std::cout << "for that seq " << unitigs[indiceUnitigs] << ", right kmer1 is " << kmer1 << " index " << indiceUnitigs << std::endl;
+ right.push_back(kmerIndiceT<span>{indiceUnitigs,kmer1, SEQ_RIGHT});
+ }
+ if(kmer2<=kmer1){
+ if (debug) if (unitigs[indiceUnitigs].compare(debug_kmer) == 0) std::cout << "for that seq " << unitigs[indiceUnitigs] << ", right kmer2 is " << kmer2 << std::endl;
+ left.push_back(kmerIndiceT<span>{indiceUnitigs,kmer2, SEQ_RIGHT});
+
}
}
+ else
+ indexed_right.push_back(false);
++indiceUnitigs;
}
-// void compareUnitigs(const string& fileFa,const string& fileDot){
-// uint a(0),b(0),c(0),d(0);
-// unordered_set<string> setFa,setDot;
-// ifstream streamFa(fileFa),streamDot(fileDot);
-// string seq;
-// getline(streamFa,seq);
-// while (!streamFa.eof()) {
-// getline(streamFa,seq,'>');
-// seq=seq.substr(0,seq.size()-1);
-// setFa.insert(seq);
-// // cout<<seq<<endl;
-// // cin.get();
-// getline(streamFa,seq);
-// ++c;
-// }
-// cout<<1<<endl;
-// while (!streamDot.eof()){
-// getline(streamDot,seq);
-// transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
-// seq=seq.substr(0,seq.size()-1);
-// setDot.insert(seq);
-// // cout<<seq<<endl;
-// // cin.get();
-// ++d;
-// }
-// cout<<2<<endl;
-// for(auto it(setFa.begin());it!=setFa.end();++it){
-// if(setDot.count(*it)==0){
-// ++a;
-// }
-// }
-// cout<<3<<endl;
-// for(auto it(setDot.begin());it!=setDot.end();++it){
-// if(setFa.count(*it)==0){
-// ++a;
-// }
-// }
-// cout<<a<<" "<<b<<endl;
-// cout<<c<<" "<<d<<endl;
-// }
-//
-//
-// void compareKmers(const string& fileFa,const string& fileDot){
-// uint k(31);
-// string kmer;
-// uint a(0),b(0),c(0),d(0);
-// unordered_set<string> setFa,setDot;
-// ifstream streamFa(fileFa),streamDot(fileDot);
-// string seq,inter,nimp;
-//
-//
-//
-// // cout<<1<<endl;
-// while (!streamFa.eof()) {
-// getline(streamFa,nimp);
-// // cout<<"nimp"<<nimp<<endl;
-// getline(streamFa,seq);
-// // cout<<"seq"<<seq<<endl;
-// point:
-// char c=streamFa.peek();
-// if(c=='>'){
-// point2:
-// // seq=seq.substr(0,seq.size());
-// // for(uint j(0);(j)<seq.size();++j){
-// // if(seq[j]!='A' and seq[j]!='C' and seq[j]!='T' and seq[j]!='G'){
-// // cout<<seq<<endl;
-// // cout<<"lol"<<endl;
-// // exit(0);
-// // }
-// // }
-// for (uint i = 0; i+k <=seq.size(); ++i) {
-// kmer=seq.substr(i,k);
-// // cout<<kmer<<endl;
-// kmer=getRepresent(kmer);
-// // if(setDot.count(kmer)==0){
-// // ++a;
-// // }
-// setFa.insert(kmer);
-// }
-// }else{
-// if(!streamFa.eof()){
-// // cout<<"inter"<<endl;
-// // cout<<seq<<endl;
-// getline(streamFa,inter);
-// // cout<<inter<<endl;
-// seq+=inter;
-// goto point;
-// }else{
-// // cout<<"lol2"<<endl;
-// goto point2;
-// }
-// }
-// }
-// cout<<2<<endl;
-//
-// while (!streamDot.eof()){
-// getline(streamDot,seq);
-// seq=seq.substr(0,k);
-// // cout<<seq<<endl;
-// // cin.get();
-// if(setFa.count(getRepresent(seq))==0){
-// cout<<seq<<endl;
-// ++a;
-// }
-// }
-//
-// // while (!streamDot.eof()){
-// // getline(streamDot,seq);
-// // transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
-// // seq=seq.substr(0,seq.size()-1);
-// // // cout<<seq<<endl;
-// // for (uint i = 0; i+k <=seq.size(); ++i) {
-// // kmer=seq.substr(i,k);
-// // // cout<<kmer<<endl;
-// // kmer=getRepresent(kmer);
-// // // setDot.insert(kmer);
-// // if(setFa.count(kmer)==0){
-// // ++b;
-// // }
-// // }
-// // // cout<<seq<<endl;
-// // // cin.get();
-// // // ++d;
-// // }
-// // for(auto it(setFa.begin());it!=setFa.end();++it){
-// // if(setDot.count(*it)==0){
-// // ++a;
-// // }
-// // }
-// cout<<3<<endl;
-// // for(auto it(setDot.begin());it!=setDot.end();++it){
-// // if(setFa.count(*it)==0){
-// // ++b;
-// // }
-// // }
-// cout<<a<<" "<<b<<endl;
-// cout<<c<<" "<<d<<endl;
-// }
}}}}
diff --git a/gatb-core/src/gatb/bcalm2/ograph.h b/gatb-core/src/gatb/bcalm2/ograph.h
index 4db460c..e7976f5 100644
--- a/gatb-core/src/gatb/bcalm2/ograph.h
+++ b/gatb-core/src/gatb/bcalm2/ograph.h
@@ -20,18 +20,25 @@ using namespace gatb::core::kmer::impl;
namespace gatb { namespace core { namespace debruijn { namespace impl {
+enum seq_pos { SEQ_LEFT, SEQ_RIGHT } ; // indicates the location in the input sequence (regardless of orientation)
+
template<size_t span>
struct kmerIndiceT{
typedef typename Kmer<span>::Type Type;
//typedef __uint128_t Type;
uint32_t indice;
Type kmmer;
+ seq_pos position;
};
template<size_t span>
-struct comparator{bool operator()(const kmerIndiceT<span>& a , const kmerIndiceT<span>& b) { return a.kmmer < b.kmmer; }};
+struct comparator{bool operator()(const kmerIndiceT<span>& a , const kmerIndiceT<span>& b) {
+ if (a.kmmer != b.kmmer)
+ return a.kmmer < b.kmmer;
+ // it used to be just a kmmer comparison, i'm making it a bit less undefined on identical kmers now
+ return a.indice < b.indice; }};
template<size_t span>
@@ -40,15 +47,17 @@ class graph3{
typedef typename Kmer<span>::Type kmerType;
//typedef __uint128_t kmerType;
typedef kmerIndiceT<span> kmerIndice;
- uint k,indiceUnitigs,nbElement,minimizer,minsize;
+ uint k,indiceUnitigs,nbElement,minimizer,minsize,nb_pretips;
std::string* unitigs;
std::vector<uint>* unitigs_abundances;
std::vector<kmerIndice> left;
std::vector<kmerIndice> right;
+ std::vector<bool> connected_left, connected_right, indexed_left, indexed_right;
void addvertex(std::string& str);
void addtuple(std::tuple<std::string,uint,uint,uint>& tuple);
void addleftmin(unsigned int mini);
void addrightmin(unsigned int mini);
+ void update_connected(kmerIndiceT<span> &ki);
void debruijn();
void debruijn2();
void compaction2(uint iL, uint iR);
@@ -58,14 +67,17 @@ class graph3{
kmerType beg2int128rc(const std::string& str);
kmerType beg2int128(const std::string& str);
kmerType rcb(kmerType min);
- void compaction(uint iR, uint iL);
+ void compaction(uint iR, uint iL, kmerType kmmer);
void compact_abundances(uint i1, uint i2, bool reverse_first=false, bool reverse_second=false);
uint size();
bool output(uint i);
bool clear();
+ bool pre_tip_cleaning;
graph3(uint ka, uint min,uint size, uint nb){
indiceUnitigs=0;
+ pre_tip_cleaning = false;
+ nb_pretips=0;
minsize=size;
k=ka;
minimizer=min;
@@ -73,6 +85,10 @@ class graph3{
unitigs=new std::string [nbElement];
left.reserve(nbElement);
right.reserve(nbElement);
+ connected_left.reserve(nbElement);
+ connected_right.reserve(nbElement);
+ indexed_left.reserve(nbElement);
+ indexed_right.reserve(nbElement);
unitigs_abundances=new std::vector<uint> [nbElement];
}
};
diff --git a/gatb-core/src/gatb/debruijn/impl/ExtremityInfo.hpp b/gatb-core/src/gatb/debruijn/impl/ExtremityInfo.hpp
index 465ee1c..edc983e 100644
--- a/gatb-core/src/gatb/debruijn/impl/ExtremityInfo.hpp
+++ b/gatb-core/src/gatb/debruijn/impl/ExtremityInfo.hpp
@@ -42,6 +42,10 @@ namespace impl {
if ((int)pos > 2) { std::cout << "incorrect encoding for pos in packed ExtremityInfo: " << (int)pos << std::endl; exit(1); }
return (pos-1) + (rc << 1) + (unitig << 2);
}
+ uint64_t pack_norc() // same expression as pack() with the rc term forced to 0; possibly used in "merci.cpp"
+ {
+ return (pos-1) + ( 0 << 1) + (unitig << 2); // NOTE(review): unlike pack(), there is no "pos > 2" sanity check before packing
+ }
void unpack(uint64_t val)
{
diff --git a/gatb-core/src/gatb/debruijn/impl/Graph.cpp b/gatb-core/src/gatb/debruijn/impl/Graph.cpp
index 6325512..b33ac1b 100644
--- a/gatb-core/src/gatb/debruijn/impl/Graph.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/Graph.cpp
@@ -472,8 +472,9 @@ void build_visitor_postsolid<Node,Edge,GraphDataVariant>::operator() (GraphData<
Group& dskGroup = (*solidStorage)("dsk");
Partition<Count>* solidCounts = & dskGroup.getPartition<Count> ("solid");
- /** We create an instance of the MPHF Algorithm class (why is that a class, and not a function?) and execute it. */
- if ((!graph.checkState(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE)))
+ /** We create an instance of the MPHF Algorithm class (I was wondering: why is that a class, and not a function?) and execute it. */
+ bool noMphf = props->get("-no-mphf") != 0;
+ if ((!noMphf) && (!graph.checkState(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE)))
{
DEBUG ((cout << "build_visitor : MPHFAlgorithm BEGIN\n"));
@@ -642,6 +643,7 @@ IOptionsParser* GraphTemplate<Node, Edge, GraphDataVariant>::getOptionsParser (b
parser->push_back (SortingCountAlgorithm<>::getOptionsParser(includeMandatory));
parser->push_back (DebloomAlgorithm<>::getOptionsParser());
parser->push_back (BranchingAlgorithm<>::getOptionsParser());
+ parser->push_front (new OptionNoParam ("-no-mphf", "don't construct the MPHF"));
/** We create a "general options" parser. */
IOptionsParser* parserGeneral = new OptionsParser ("general");
@@ -649,9 +651,21 @@ IOptionsParser* GraphTemplate<Node, Edge, GraphDataVariant>::getOptionsParser (b
parserGeneral->push_front (new OptionOneParam (STR_VERBOSE, "verbosity level", false, "1" ));
parserGeneral->push_front (new OptionOneParam (STR_NB_CORES, "number of cores", false, "0" ));
parserGeneral->push_front (new OptionNoParam (STR_CONFIG_ONLY, "dump config only"));
+
+ parser->push_back (parserGeneral);
+
+ OptionsParser* parserDebug = new OptionsParser ("debug ");
+
+ // those are only valid for GraphUnitigs, but GraphUnitigs doesn't have custom options (yet) so i'm adding here
+ parserDebug->push_front (new OptionNoParam ("-skip-links", "same, but skip links"));
+ parserDebug->push_front (new OptionNoParam ("-redo-links", "same, but redo links"));
+ parserDebug->push_front (new OptionNoParam ("-skip-bglue", "same, but skip bglue"));
+ parserDebug->push_front (new OptionNoParam ("-redo-bglue", "same, but redo bglue "));
+ parserDebug->push_front (new OptionNoParam ("-skip-bcalm", "same, but skip bcalm"));
+ parserDebug->push_front (new OptionNoParam ("-redo-bcalm", "debug function, redo the bcalm algo"));
/** We add it to the root parser. */
- parser->push_back (parserGeneral);
+ parser->push_back (parserDebug);
return parser;
}
@@ -851,7 +865,8 @@ GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate (tools::misc::IProper
/** We create a storage instance. */
/* (this is actually loading, not creating, the storage at "uri") */
_storageMode = load_from_hdf5 ? STORAGE_HDF5 : STORAGE_FILE;
- setStorage (StorageFactory(_storageMode).create (input, false, false));
+ bool append = true; // special storagehdf5 which will open the hdf5 file as read&write
+ setStorage (StorageFactory(_storageMode).create (input, false, false, false, append));
/** We get some properties. */
_state = (typename GraphTemplate<Node, Edge, GraphDataVariant>::StateMask) atol (getGroup().getProperty ("state").c_str());
diff --git a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
index 643f23e..24a8308 100644
--- a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
@@ -46,6 +46,11 @@
#include <gatb/debruijn/impl/Simplifications.hpp>
+// for trim()
+#include <functional>
+#include <cctype>
+#include <locale>
+
using namespace std;
using namespace gatb::core::system::impl;
@@ -111,7 +116,7 @@ template<size_t span>
GraphUnitigsTemplate<span> GraphUnitigsTemplate<span>::create (bank::IBank* bank, const char* fmt, ...)
{
IOptionsParser* parser = BaseGraph::getOptionsParser (false); LOCAL(parser);
-
+
/** We build the command line from the format and the ellipsis. */
va_list args;
va_start (args, fmt);
@@ -136,13 +141,13 @@ GraphUnitigsTemplate<span> GraphUnitigsTemplate<span>::create (bank::IBank* ban
** INPUT :
** OUTPUT :
** RETURN :
-** REMARKS :
+** REMARKS : some massive code duplication with above here
*********************************************************************/
template<size_t span>
GraphUnitigsTemplate<span> GraphUnitigsTemplate<span>::create (const char* fmt, ...)
{
IOptionsParser* parser = BaseGraph::getOptionsParser (true); LOCAL (parser);
-
+
/** We build the command line from the format and the ellipsis. */
va_list args;
va_start (args, fmt);
@@ -196,26 +201,19 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (const std::string& uri)
** INPUT :
** OUTPUT :
** RETURN :
-** REMARKS :
+** REMARKS : quick hack, not supposed to be used outside of tests
*********************************************************************/
template<size_t span>
GraphUnitigsTemplate<span>::GraphUnitigsTemplate (bank::IBank* bank, tools::misc::IProperties* params)
{
- // quick hack, not supposed to be used outside o tests
/** We get the kmer size from the user parameters. */
BaseGraph::_kmerSize = params->getInt (STR_KMER_SIZE);
- modelK = new Model(BaseGraph::_kmerSize);
- modelKdirect= new ModelDirect(BaseGraph::_kmerSize);
size_t integerPrecision = params->getInt (STR_INTEGER_PRECISION);
/** We configure the data variant according to the provided kmer size. */
BaseGraph::setVariant (BaseGraph::_variant, BaseGraph::_kmerSize, integerPrecision);
string unitigs_filename = "dummy.unitigs.fa"; // because there's already a bank, but we don't know its name maybe? so just to be safe, i'm setting a dummy unitigs file. anyway, this constructor is only called in tests i think, not by minia for sure.
- /*(params->get(STR_URI_OUTPUT) ?
- params->getStr(STR_URI_OUTPUT) : System::file().getBaseName (input)
- )+ ".unitigs.fa"; */
-
params->setInt(STR_REPARTITION_TYPE, 1);
params->setInt(STR_MINIMIZER_TYPE, 1);
@@ -260,9 +258,20 @@ void GraphUnitigsTemplate<span>::build_unitigs_postsolid(std::string unitigs_fil
throw system::Exception ("Graph construction failure during build_visitor_postsolid, the input h5 file needs to contain at least solid kmers.");
}
- bool force_loading_unitigs = false; // debug option
+ bool redo_bcalm = props->get("-redo-bcalm");
+ bool redo_bglue = props->get("-redo-bglue");
+ bool redo_links = props->get("-redo-links");
+
+ bool skip_bcalm = props->get("-skip-bcalm");
+ bool skip_bglue = props->get("-skip-bglue");
+ bool skip_links = props->get("-skip-links");
- if (!checkState(STATE_BCALM2_DONE) && (!force_loading_unitigs /* for debug, if unitigs are made but the h5 didn't register it, stupid h5*/))
+ bool do_unitigs = !checkState(STATE_BCALM2_DONE);
+ bool do_bcalm = (redo_bcalm || do_unitigs) && (!skip_bcalm);
+ bool do_bglue = (redo_bglue || do_unitigs) && (!skip_bglue);
+ bool do_links = (redo_links || do_unitigs) && (!skip_links);
+
+ if (do_unitigs || do_bcalm || do_bglue || do_links)
{
int nb_threads =
props->getInt(STR_NB_CORES);
@@ -274,20 +283,20 @@ void GraphUnitigsTemplate<span>::build_unitigs_postsolid(std::string unitigs_fil
props->setInt(STR_KMER_SIZE, kmerSize);
- UnitigsConstructionAlgorithm<span> unitigs_algo(BaseGraph::getStorage(), unitigs_filename, nb_threads, props);
+ UnitigsConstructionAlgorithm<span> unitigs_algo(BaseGraph::getStorage(), unitigs_filename, nb_threads, props, do_bcalm, do_bglue, do_links);
BaseGraph::executeAlgorithm(unitigs_algo, &BaseGraph::getStorage(), props, BaseGraph::_info);
nb_unitigs = unitigs_algo.nb_unitigs;
+ BaseGraph::getGroup().setProperty ("nb_unitigs", Stringify::format("%d", nb_unitigs));
setState(STATE_BCALM2_DONE);
}
- else
- nb_unitigs = atol (BaseGraph::getGroup().getProperty ("nb_unitigs").c_str());
+
+ nb_unitigs = atol (BaseGraph::getGroup().getProperty ("nb_unitigs").c_str());
/** We save the state at storage root level. */
BaseGraph::getGroup().setProperty ("state", Stringify::format("%d", BaseGraph::_state));
- BaseGraph::getGroup().setProperty ("nb_unitigs", Stringify::format("%d", nb_unitigs));
}
static void
@@ -338,7 +347,7 @@ parse_unitig_header(string header, float& mean_abundance, vector<uint64_t>& inc,
}
else
{
- if (field == "KM")
+ if (field == "km")
{
mean_abundance = atof(tok.substr(tok.find_last_of(':')+1).c_str());
//std::cout << "unitig " << header << " mean abundance " << mean_abundance << std::endl;
@@ -347,15 +356,51 @@ parse_unitig_header(string header, float& mean_abundance, vector<uint64_t>& inc,
}
}
}
-
static void
-insert_navigational_vector(std::vector<uint64_t> &v, std::vector<uint64_t>& to_insert, uint64_t utig_counter, std::vector<uint64_t> &v_map)
+insert_navigational_vector(std::vector<uint64_t> &v, std::vector<uint64_t>& to_insert, std::vector<uint64_t> &v_map)
{
- v_map[utig_counter] = v.size();
+ v_map.push_back(v.size());
v.insert(v.end(), to_insert.begin(), to_insert.end());
}
+static void // appends one neighbour list (to_insert) to v and records its length in v_map
+insert_compressed_navigational_vector(std::vector<uint64_t> &v, std::vector<uint64_t>& to_insert, dag::dag_vector &v_map)
+{
+ v_map.push_back(to_insert.size()); // stores the list length; insert_navigational_vector stores the start offset v.size() instead
+ v.insert(v.end(), to_insert.begin(), to_insert.end());
+ /*for (auto x: to_insert) // that was for when v was a dag_vector
+ {
+ v.push_back(x);
+ }*/
+}
+
+// here we dynamically insert at the next open space in the vector after the position indicated by the v_map (pos variable)
+// since we precomputed v_map, we know that there is no way to overflow
+static void
+insert_navigational_vector_gfa(std::vector<uint64_t> &v, uint64_t to_insert, uint64_t pos)
+{
+ bool inserted = false;
+ uint64_t v_size = v.size();
+ for (uint i = 0 ; i < 16; i ++) // 16 is just some upper bound
+ {
+ if (pos+i < v_size && v[pos+i] == 0) // a 0 entry is treated as a free slot. NOTE(review): confirm 0 is never a valid value to insert
+ {
+ v[pos+i] = to_insert;
+ inserted = true;
+ break;
+ }
+ }
+ if (!inserted) // no free in-bounds slot among the 16 probed positions: report and abort the process
+ {
+ std::cout << "bad navigational vector insert at position " << pos << " / " << v_size << "! could not find a spot to insert. some debug: " << std::endl;
+ std::cout << v[pos] << " " << v[pos+1] << " " << v[pos+2] << std::endl; // NOTE(review): not bounds-checked; may read past the end of v when pos+2 >= v_size
+ exit(1);
+ }
+}
+
+
+
//http://stackoverflow.com/questions/30540101/iterator-for-a-subset-of-a-vector
template <class Iter>
class range {
@@ -373,6 +418,7 @@ make_range(Container& c, size_t b, size_t e) {
return range<typename Container::const_iterator> (c.begin()+b, c.begin()+e);
}
+
/* returns an iterator of all incoming or outcoming edges from an unitig */
static
range<std::vector<uint64_t>::const_iterator >
@@ -391,6 +437,80 @@ get_from_navigational_vector(const std::vector<uint64_t> &v, uint64_t utig, cons
}
+/* compressed counterpart of the function above: v_map holds one list length per unitig, so start offsets are obtained from v_map.prefix_sum() */
+static
+range<std::vector<uint64_t>::const_iterator >
+get_from_compressed_navigational_vector(const std::vector<uint64_t> &v, uint64_t utig, const dag::dag_vector &v_map)
+{
+ if (utig == v_map.size() /*total number of unitigs*/ - 1)
+ {
+ //std::cout << "get from nav vector " << to_string(utig) << " " << to_string(v_map[utig]) << " " << to_string(v.size()) << " last unitig" << std::endl;
+ return make_range(v,v_map.prefix_sum(utig),v.size()); // last unitig: its list runs to the end of v
+ }
+ else
+ {
+ if (utig == 0)
+ return make_range(v,0, v_map[0]); // first unitig: starts at offset 0, length v_map[0]
+
+ //std::cout << "get from nav vector " << to_string(utig) << " " << to_string(v_map[utig]) << " " << to_string(v_map[utig+1]) << " (utig " << utig << "/" << v_map.size() << ")" << std::endl;
+ uint64_t ps = v_map.prefix_sum(utig); // presumably the sum of v_map[0..utig-1], i.e. the start offset - TODO confirm against dag_vector
+ return make_range(v,ps,ps + v_map[utig]);
+ }
+}
+
+
+template<size_t span>
+void GraphUnitigsTemplate<span>::print_unitigs_mem_stats(uint64_t incoming_size, uint64_t outcoming_size, uint64_t total_unitigs_size, uint64_t nb_utigs_nucl, uint64_t nb_utigs_nucl_mem)
+{
+ uint64_t mem_vec_sizes = /*unitigs_sizes.get_alloc_byte_num(); // formerly */(unitigs_sizes.capacity() * sizeof(uint32_t));
+ // prints unitig statistics and an estimated per-structure memory breakdown to stdout
+ std::cout << "Stats:" << std::endl;
+ std::cout << "Number of unitigs: " << nb_unitigs << std::endl;
+ std::cout << "Average number of incoming/outcoming neighbors: " << incoming_size/(float)nb_unitigs << "/" << outcoming_size/(float)nb_unitigs << std::endl;
+ std::cout << "Total number of nucleotides in unitigs: " << total_unitigs_size << std::endl;
+ std::cout << std::endl;
+ std::cout << "Memory usage:" << std::endl;
+ std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming vector" << std::endl;
+ std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming vector" << std::endl;
+ uint64_t inc_out_size = (sizeof(uint64_t) * incoming.size()) + (sizeof(uint64_t) * outcoming.size()); // running byte total for the navigational structures
+ if (compress_navigational_vectors)
+ {
+ std::cout << " " << dag_incoming_map.get_alloc_byte_num() / 1024 / 1024 << " MB keys in dag_incoming_map vector" << std::endl;
+ std::cout << " " << dag_outcoming_map.get_alloc_byte_num() / 1024 / 1024 << " MB keys in dag_outcoming_map vector" << std::endl;
+ inc_out_size += dag_incoming_map.get_alloc_byte_num() + dag_outcoming_map.get_alloc_byte_num();
+ }
+ else
+ {
+ //std::cout << " " << dag_incoming.get_alloc_byte_num() / 1024 / 1024 << " MB keys in incoming vector" << std::endl;
+ //std::cout << " " << dag_outcoming.get_alloc_byte_num() / 1024 / 1024 << " MB keys in outcoming vector" << std::endl;
+ std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map vector" << std::endl;
+ std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map vector" << std::endl;
+ inc_out_size += (sizeof(uint64_t) * incoming_map.size()) + (sizeof(uint64_t) * outcoming_map.size());
+ }
+ uint64_t mem_unitigs; // assigned in both branches below
+ if (pack_unitigs)
+ {
+ uint64_t mem_packed_unitigs = packed_unitigs_sizes.prefix_sum(nb_unitigs) + packed_unitigs_sizes.get_alloc_byte_num();
+ std::cout << " " << mem_packed_unitigs /1024 /1024 << " MB packed unitigs (incl. " << packed_unitigs_sizes.get_alloc_byte_num()/1024/1024 << " MB delimiters)" << std::endl;
+ mem_unitigs = mem_packed_unitigs;
+ }
+ else
+ {
+ mem_unitigs = (unitigs.capacity() * sizeof(string) + nb_utigs_nucl_mem);
+ std::cout << " " << mem_unitigs /1024 /1024 << " MB unitigs nucleotides (" << unitigs.capacity() << " * " << sizeof(string) << " + " << nb_utigs_nucl_mem << ")" << std::endl;
+ }
+ std::cout << " " << mem_vec_sizes/1024 /1024 << " MB unitigs lengths" << std::endl;
+ uint64_t mem_unitig_mean_abundance = /*unitigs_mean_abundance.get_alloc_byte_num() ; // <- in dag_vector format; in vector<float> format -> */(nb_unitigs*sizeof(float));
+ std::cout << " " << mem_unitig_mean_abundance / 1024 / 1024 << " MB unitigs abundances" << std::endl;
+ std::cout << " " << (2*nb_unitigs/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl;
+ // summation of all of the above:
+ std::cout << "Estimated total: " << (mem_unitig_mean_abundance + (nb_unitigs*2.0/8.0) + inc_out_size + mem_unitigs + mem_vec_sizes) / 1024 / 1024 << " MB" << std::endl;
+
+ if (nb_utigs_nucl != nb_utigs_nucl_mem) // only reported when the two counts passed in by the caller differ
+ std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl;
+
+}
+
template<size_t span>
void GraphUnitigsTemplate<span>::load_unitigs(string unitigs_filename)
@@ -405,17 +525,18 @@ void GraphUnitigsTemplate<span>::load_unitigs(string unitigs_filename)
//ProgressIterator<bank::Sequence> itSeq (*inputBank, "loading unitigs");
BankFasta::Iterator itSeq (inputBank);
-
- incoming_map.resize(nb_unitigs);
- outcoming_map.resize(nb_unitigs);
-
- unsigned int kmerSize = BaseGraph::_kmerSize;
+ unsigned int kmerSize = BaseGraph::_kmerSize;
+
+ //compress_navigational_vectors = false;
+ compress_navigational_vectors = true; //only a 10% speed hit but 2x less incoming/outcoming/incoming_map/outcoming_map memory usage, so, quite worth it.
+ pack_unitigs = true;
nb_unitigs_extremities = 0; // will be used by NodeIterator (getNodes)
- uint64_t utig_counter = 0;
uint64_t nb_utigs_nucl = 0;
uint64_t nb_utigs_nucl_mem = 0;
- for (itSeq.first(); !itSeq.isDone(); itSeq.next()) // could be done in parallel, maybe, if we used many unordered_map's with a hash on the query kmer (TODO opt)
+ uint64_t total_unitigs_size = 0;
+ float incoming_size = 0, outcoming_size = 0;
+ for (itSeq.first(); !itSeq.isDone(); itSeq.next()) // could be done in parallel, maybe, if we used many unordered_map's with a hash on the query kmer (the same opt could be done for LinkTigs)
{
const string& seq = itSeq->toString();
const string& comment = itSeq->getComment();
@@ -424,54 +545,261 @@ void GraphUnitigsTemplate<span>::load_unitigs(string unitigs_filename)
vector<uint64_t> inc, outc; // incoming and outcoming unitigs
parse_unitig_header(comment, mean_abundance, inc, outc);
- insert_navigational_vector(incoming, inc, utig_counter, incoming_map);
- insert_navigational_vector(outcoming, outc, utig_counter, outcoming_map);
+ incoming_size += inc.size();
+ outcoming_size += outc.size();
- unitigs.push_back(seq);
+ if (compress_navigational_vectors)
+ {
+ // we won't use dag_incoming and dag_outcoming, there doesnt seem to be any performance gain. a bit surprising, though, because i was storing 64bit ints before. but gamma coding is, after all, 2-optimal and most numbers are close to 32 bits.
+ insert_compressed_navigational_vector(/*dag_incoming*/ incoming, inc, dag_incoming_map);
+ insert_compressed_navigational_vector(/*dag_outcoming*/ outcoming, outc, dag_outcoming_map);
+
+ }
+ else
+ {
+ insert_navigational_vector(incoming, inc, incoming_map); // "incoming_map" records the number of incoming links for an unitig. "incoming" records links explicitly
+ insert_navigational_vector(outcoming, outc, outcoming_map);
+ }
+
+ if (pack_unitigs)
+ {
+ packed_unitigs += internal_compress_unitig(seq);
+ packed_unitigs_sizes.push_back((seq.size()+3)/4);
+ }
+ else
+ unitigs.push_back(internal_compress_unitig(seq));
+
+ unitigs_sizes.push_back(seq.size());
+ total_unitigs_size += seq.size();
unitigs_mean_abundance.push_back(mean_abundance);
- utig_counter++;
- nb_utigs_nucl += seq.size();
- nb_utigs_nucl_mem += seq.capacity();
+ //std::cout << "decoded : " << internal_get_unitig_sequence(unitigs.size()-1) << std::endl;
+ //std::cout << "real : " << seq << std::endl;
+
+ if (!pack_unitigs)
+ {
+ nb_utigs_nucl += unitigs[unitigs.size()-1].size();
+ nb_utigs_nucl_mem += unitigs[unitigs.size()-1].capacity();
+ }
if (seq.size() == kmerSize)
nb_unitigs_extremities++;
else
nb_unitigs_extremities+=2;
}
+ nb_unitigs = unitigs_sizes.size();
unitigs_traversed.resize(0);
- unitigs_traversed.resize(unitigs.size(), false); // resize "traversed" bitvector, setting it to zero as well
+ unitigs_traversed.resize(nb_unitigs, false); // resize "traversed" bitvector, setting it to zero as well
unitigs_deleted.resize(0);
- unitigs_deleted.resize(unitigs.size(), false); // resize "traversed" bitvector, setting it to zero as well
+ unitigs_deleted.resize(nb_unitigs, false); // resize "traversed" bitvector, setting it to zero as well
// an estimation of memory usage
- uint64_t nb_kmers = unitigs.size();
- uint64_t mem_vec = (unitigs.capacity() * sizeof(string) + nb_utigs_nucl_mem);
if (verbose)
+ print_unitigs_mem_stats(incoming_size, outcoming_size, total_unitigs_size, nb_utigs_nucl, nb_utigs_nucl_mem);
+}
+
+//https://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring
+// trim from start: erases the leading whitespace of s in place and returns s itself (chars are tested as unsigned char, isspace() on a negative value is undefined)
+static inline std::string &ltrim(std::string &s) {
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+            [](unsigned char c) { return !std::isspace(c); }));
+    return s;
+}
+
+// trim from end: erases the trailing whitespace of s in place and returns s itself (chars are tested as unsigned char, isspace() on a negative value is undefined)
+static inline std::string &rtrim(std::string &s) {
+    s.erase(std::find_if(s.rbegin(), s.rend(),
+            [](unsigned char c) { return !std::isspace(c); }).base(), s.end());
+    return s;
+}
+
+static inline std::string &trim(std::string &s) {
+    rtrim(s);        // trim from both ends, in place: trailing whitespace goes first ...
+    return ltrim(s); // ... then the leading whitespace; the returned reference is s itself
+}
+
+//http://stackoverflow.com/questions/236129/split-a-string-in-c -- cuts s at every delim and trims whitespace around each piece
+static vector<string> string_split(string s, char delim){
+    stringstream fields(s);
+    vector<string> pieces;
+    for (string field; getline(fields, field, delim); ){
+        trim(field); // works in place on the local copy of the field
+        pieces.push_back(field);
+    }
+    return pieces;
+}
+
+
+// Loads unitigs and their links from a GFA file (two passes over the file); kmerSize is an output, set from the "k:i:[k value]" header field.
+// some code duplication with load_unitigs but nothing too major
+// inspired by gfakluge (https://github.com/edawson/gfakluge/blob/master/src/gfakluge.cpp), but quite limited compared to what kluge parses
+// Warning: several assumptions are made about the GFA file; most of them are not checked, so the code is not robust
+// - needs to have "k:i:[k value]" in header
+// - GFA has to be symmetric: (default of convertToGFA from bcalm), i.e. each link also has its symmetrical link
+// - segments identifiers need to be 0...|nb_unitigs|
+
+template<size_t span>
+void GraphUnitigsTemplate<span>::load_unitigs_from_gfa(string gfa_filename, unsigned int& kmerSize)
+{
+    // in load_unitigs these two are optional. but not here, i didn't make that function flexible
+    compress_navigational_vectors = false; // i dont think i can do random inserts in a dag_vector, so, wont use them
+    pack_unitigs = true;
+    bool verbose = true;
+    nb_unitigs_extremities = 0; // recounted from scratch in the loop below, the same way load_unitigs resets it
+    uint64_t incoming_size = 0, outcoming_size = 0, total_unitigs_size = 0;
+
+    ifstream gfi;
+    gfi.open(gfa_filename.c_str(), std::ifstream::in);
+    if (!gfi.good()){
+        cerr << "Cannot open GFA file. Exiting." << endl;
+        exit(1);
+    }
+
+    string line;
+    bool missing_mean_abundance = false;
+
+    nb_unitigs = 0; // we'll increase number of unitigs as we see them, the following loop
+    unordered_map<uint32_t, uint32_t> unitig_id_corresp; // correspondence between unitig id's seen in the GFA, and the mapping 0,...,|nb_unitigs| that we store in, e.g., the unitig_sizes array
+    while (getline(gfi, line)){
+        vector<string> tokens = string_split(line, '\t');
+        if (tokens.empty())
+            continue; // blank line: string_split() yields no token at all, so tokens[0] must not be read
+        if ((tokens[0] == "S" && tokens.size() < 3) || (tokens[0] == "L" && tokens.size() < 5)) // S reads tokens[1..2], L reads tokens[1..4]
+        {
+            std::cout << "Unsupported GFA file: truncated S or L line: " << line << std::endl;
+            exit(1);
+        }
+        if (tokens[0] == "H"){
+            // we expect k-mer to be at 2nd position of gfa header [SPECIFIC TO BCALM2 GFA FORMAT]
+            const vector<string> k_fields = (tokens.size() >= 3) ? string_split(tokens[2], ':') : vector<string>();
+            if (k_fields.size() < 3 || tokens[2].substr(0,1).compare("k") != 0)
+            {
+                std::cout << "unsupported GFA format - the header needs to contain the k-mer size (e.g. k:i:31)" << std::endl;
+                std::cout << "header: " << line << std::endl;
+                if (!k_fields.empty()) // the third header field may be missing altogether
+                    std::cout << "tokens[2]: " << k_fields[0] << std::endl;
+                exit(1);
+            }
+            kmerSize = atoi(k_fields[2].c_str());
+        }
+        else if (tokens[0] == "S"){
+            float mean_abundance = 0;
+            for (unsigned int i = 0; i < tokens.size(); i++){
+                if (tokens[i].substr(0,2).compare("km") != 0)
+                    continue;
+                const vector<string> km_fields = string_split(tokens[i], ':');
+                if (km_fields.size() >= 3) // the abundance is read from the third ':'-separated field
+                    mean_abundance = atof(km_fields[2].c_str());
+            }
+            if (mean_abundance == 0)
+                missing_mean_abundance = true;
+
+            string seq = tokens[2];
+            uint32_t unitig_id = atoi(tokens[1].c_str());
+            packed_unitigs += internal_compress_unitig(seq);
+            packed_unitigs_sizes.push_back((seq.size()+3)/4);
+            unitig_id_corresp[unitig_id] = (nb_unitigs++);
+            unitigs_sizes.push_back(seq.size());
+            total_unitigs_size += seq.size();
+            unitigs_mean_abundance.push_back(mean_abundance);
+
+            if (seq.size() == kmerSize)
+                nb_unitigs_extremities++;
+            else
+                nb_unitigs_extremities+=2;
+        }
+        else if (tokens[0] == "L"){ // do a first pass to get the number of in/out links
+            bool in = tokens[2] == "-";
+            uint64_t unitig_id = atoi(tokens[1].c_str());
+
+            // strong assumption, we never see a link from a node before its S definition (unitig is in unitig_id_corresp)
+            // to make it more general: do another pass, that first pass only loads nodes
+            if (unitig_id_corresp.find(unitig_id) == unitig_id_corresp.end())
+            {
+                std::cout << "Unsupported GFA file: L line from a certain node should always be after the S line of that node" << std::endl;
+                exit(1);
+            }
+            const uint64_t slot = static_cast<uint64_t>(unitig_id_corresp[unitig_id]) + 1; // the +1 is to make the later prefix sum correct
+            // linear resizing. 'slot' itself must fit: the former test compared size() with slot-1 using '<',
+            // so the increments below could write one or two entries past the end of the vectors
+            if (incoming_map.size() <= slot)
+                incoming_map.resize(slot+100000);
+            if (outcoming_map.size() <= slot)
+                outcoming_map.resize(slot+100000);
+
+            if (in)
+            {
+                incoming_map[slot]++;
+                incoming_size++;
+            }
+            else
+            {
+                outcoming_map[slot]++;
+                outcoming_size++;
+            }
+        }
+    }
+
+    if (missing_mean_abundance)
+        std::cout << "NOTE: no segment abundance information was found in the GFA file (missing 'km' field in segment)" << std::endl;
+
+    incoming_map.resize(nb_unitigs); // fix size
+    outcoming_map.resize(nb_unitigs);
+    incoming.resize(incoming_size); // set size
+    outcoming.resize(outcoming_size);
+
+    // compute proper prefix sums
+    for (uint64_t i = 1; i < nb_unitigs; i++)
{
- std::cout << "Memory usage:" << std::endl;
- std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming dict" << std::endl;
- std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming dict" << std::endl;
- std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map dict" << std::endl;
- std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map dict" << std::endl;
- std::cout << " " << mem_vec /1024 /1024 << " MB unitigs nucleotides" << std::endl;
- std::cout << " " << (nb_kmers*sizeof(float)) / 1024 / 1024 << " MB unitigs abundances" << std::endl;
- std::cout << " " << (2*nb_kmers/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl;
- std::cout << "Estimated total: " << (nb_kmers*(sizeof(float) + 2.0/8.0) + sizeof(uint64_t) * ( incoming.size() + outcoming.size() + incoming.size() + outcoming_map.size()) + mem_vec) / 1024 / 1024 << " MB" << std::endl;
-
- if (nb_utigs_nucl != nb_utigs_nucl_mem)
- std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl;
+        incoming_map[i] += incoming_map[i-1];
+        outcoming_map[i] += outcoming_map[i-1];
}
+
+    // in this second pass we actually load the links
+    gfi.close();
+    gfi.open(gfa_filename.c_str(), std::ifstream::in);
+    while (getline(gfi, line)){
+        vector<string> tokens = string_split(line, '\t');
+        if (tokens.size() >= 5 && tokens[0] == "L"){ // the size test also skips blank lines; tokens[1..4] are read below
+            bool in = tokens[2] == "-";
+            bool rc = tokens[4] == "-";
+            uint64_t from_unitig_id = atoi(tokens[1].c_str());
+            uint64_t to_unitig_id = atoi(tokens[3].c_str());
+            Unitig_pos pos = (rc)?UNITIG_END:UNITIG_BEGIN; // see load_unitigs for comments on that part
+            if (in)
+                rc = !rc;
+            ExtremityInfo li(to_unitig_id, rc, pos);
+            if (in)
+                insert_navigational_vector_gfa(incoming, li.pack(), incoming_map[unitig_id_corresp[from_unitig_id]]);
+            else
+                insert_navigational_vector_gfa(outcoming, li.pack(), outcoming_map[unitig_id_corresp[from_unitig_id]]);
+        }
+    }
+
+    assert(nb_unitigs == unitigs_sizes.size()); // not sure if this is enforced
+
+    // code dupl
+    unitigs_traversed.resize(0);
+    unitigs_traversed.resize(nb_unitigs, false); // resize "traversed" bitvector, setting it to zero as well
+    unitigs_deleted.resize(0);
+    unitigs_deleted.resize(nb_unitigs, false); // resize "traversed" bitvector, setting it to zero as well
+
+    if (verbose)
+        print_unitigs_mem_stats(incoming_size, outcoming_size, total_unitigs_size);
}
+
/*********************************************************************
** METHOD :
** PURPOSE : creates or completes a graph from parsed command line arguments.
** INPUT : a bank or a h5 file
-** remarks: this function looks similar to the one in Graph; there is some code duplication here
+** remarks: this function contains similar things to the one in Graph; there is some code duplication here. but not entirely
+SETS: the following variables are set:
+ BaseGraph::_kmerSize
+ BaseGraph::_state
*********************************************************************/
template<size_t span>
GraphUnitigsTemplate<span>::GraphUnitigsTemplate (tools::misc::IProperties* params, bool load_unitigs_after)
@@ -497,10 +825,6 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (tools::misc::IProperties* para
string input = params->getStr(STR_URI_INPUT);
- string unitigs_filename = (params->get(STR_URI_OUTPUT) ?
- params->getStr(STR_URI_OUTPUT) : System::file().getBaseName (input)
- )+ ".unitigs.fa";
-
// build_visitor_solid has the following defaults:
// minimizer size of 8. that one is okay
// the rest needs to be set!
@@ -518,9 +842,40 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (tools::misc::IProperties* para
//std::cout << "setting repartition type to 1" << std::endl;;
}
+ bool load_from_gfa = (system::impl::System::file().getExtension(input) == "gfa");
+
+ if (load_from_gfa)
+ {
+ // basically shortcut everything, we've got a badass gfa over here
+ unsigned int k = 0;
+ load_unitigs_from_gfa(input, k); // will set the kmer size
+ BaseGraph::_kmerSize = k;
+ return;
+ }
+
+
bool load_from_hdf5 = (system::impl::System::file().getExtension(input) == "h5");
bool load_from_file = (system::impl::System::file().isFolderEndingWith(input,"_gatb"));
bool load_graph = (load_from_hdf5 || load_from_file);
+
+ string unitigs_filename, prefix;
+
+ if (params->get(STR_URI_OUTPUT))
+ prefix = params->getStr(STR_URI_OUTPUT);
+ else
+ {
+ if (load_from_file)
+ {
+ string input_modified = input;
+ input_modified[input_modified.size()-6] = '.'; // replaces "_gatb" with ".gatb" for the purpose of getBaseName, to harmonize with ".h5"
+ prefix = System::file().getBaseName (input_modified);
+ }
+ else
+ prefix = System::file().getBaseName (input) + prefix;
+ }
+
+ unitigs_filename = prefix + ".unitigs.fa";
+
if (load_graph)
{
/* it's not a bank, but rather a h5 file (kmercounted or more), let's complete it to a graph */
@@ -543,9 +898,6 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (tools::misc::IProperties* para
if (BaseGraph::_kmerSize == 0) /* try the dsk group -> maybe it's a dsk h5 file, not a minia one */
BaseGraph::_kmerSize = atol (BaseGraph::getGroup("dsk").getProperty ("kmer_size").c_str());
- modelK = new Model(BaseGraph::_kmerSize);
- modelKdirect= new ModelDirect(BaseGraph::_kmerSize);
-
// also assume kmer counting is done
setState(GraphUnitigsTemplate<span>::STATE_SORTING_COUNT_DONE);
@@ -573,8 +925,6 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (tools::misc::IProperties* para
{
/** We get the kmer size from the user parameters. */
BaseGraph::_kmerSize = params->getInt (STR_KMER_SIZE);
- modelK = new Model(BaseGraph::_kmerSize);
- modelKdirect= new ModelDirect(BaseGraph::_kmerSize);
size_t integerPrecision = params->getInt (STR_INTEGER_PRECISION);
/** We configure the data variant according to the provided kmer size. */
@@ -605,7 +955,7 @@ GraphUnitigsTemplate<span>::GraphUnitigsTemplate (const GraphUnitigsTemplate<spa
{
// will call Graph's constructor
std::cout << "GraphU copy-constructor called" << std::endl;
-
+ // doesn't it need other stuff to be copied tho? like all that's in operator=. so weird.
}
/*********************************************************************
@@ -635,12 +985,19 @@ GraphUnitigsTemplate<span>& GraphUnitigsTemplate<span>::operator= (GraphUnitigsT
outcoming = graph.outcoming;
incoming_map = graph.incoming_map;
outcoming_map = graph.outcoming_map;
+ dag_incoming = graph.dag_incoming;
+ dag_outcoming = graph.dag_outcoming;
+ dag_incoming_map = graph.dag_incoming_map;
+ dag_outcoming_map = graph.dag_outcoming_map;
+ compress_navigational_vectors = graph.compress_navigational_vectors;
+ pack_unitigs = graph.pack_unitigs;
+ packed_unitigs = graph.packed_unitigs;
+ packed_unitigs_sizes = graph.packed_unitigs_sizes;
unitigs = graph.unitigs;
+ unitigs_sizes = graph.unitigs_sizes;
unitigs_mean_abundance = graph.unitigs_mean_abundance;
unitigs_traversed = graph.unitigs_traversed;
unitigs_deleted = graph.unitigs_deleted;
- modelK = graph.modelK;
- modelKdirect = graph.modelKdirect;
nb_unitigs = graph.nb_unitigs;
nb_unitigs_extremities = graph.nb_unitigs_extremities;
@@ -677,12 +1034,19 @@ GraphUnitigsTemplate<span>& GraphUnitigsTemplate<span>::operator= (GraphUnitigsT
outcoming = std::move(graph.outcoming);
incoming_map = std::move(graph.incoming_map);
outcoming_map = std::move(graph.outcoming_map);
+ dag_incoming = std::move(graph.dag_incoming);
+ dag_outcoming = std::move(graph.dag_outcoming);
+ dag_incoming_map = std::move(graph.dag_incoming_map);
+ dag_outcoming_map = std::move(graph.dag_outcoming_map);
+ compress_navigational_vectors = std::move(graph.compress_navigational_vectors);
+ pack_unitigs = std::move(graph.pack_unitigs);
+ packed_unitigs = std::move(graph.packed_unitigs);
+ packed_unitigs_sizes = std::move(graph.packed_unitigs_sizes);
unitigs = std::move(graph.unitigs);
+ unitigs_sizes = std::move(graph.unitigs_sizes);
unitigs_mean_abundance = std::move(graph.unitigs_mean_abundance);
unitigs_traversed = std::move(graph.unitigs_traversed);
unitigs_deleted = std::move(graph.unitigs_deleted);
- modelK = std::move(graph.modelK);
- modelKdirect = std::move(graph.modelKdirect);
nb_unitigs = std::move(graph.nb_unitigs);
nb_unitigs_extremities = std::move(graph.nb_unitigs_extremities);
@@ -751,7 +1115,7 @@ GraphVector<EdgeGU> GraphUnitigsTemplate<span>::getEdges (const NodeGU& source,
res.resize(0);
unsigned int kmerSize = BaseGraph::_kmerSize;
- unsigned int seqSize = unitigs[source.unitig].size();
+ unsigned int seqSize = internal_get_unitig_length(source.unitig);
bool same_orientation = node_in_same_orientation_as_in_unitig(source);
bool pos_begin = source.pos & UNITIG_BEGIN;
@@ -763,6 +1127,7 @@ GraphVector<EdgeGU> GraphUnitigsTemplate<span>::getEdges (const NodeGU& source,
exit(1);
}
+ // these cases are to handle getEdges() for nodes that are inside unitigs. I don't think we use them at all.
if ((unsigned int)seqSize > (unsigned int)kmerSize)
{
// unitig: [kmer]-------
@@ -822,6 +1187,10 @@ GraphVector<EdgeGU> GraphUnitigsTemplate<span>::getEdges (const NodeGU& source,
auto edge_packed = *it;
ExtremityInfo li(edge_packed);
+ if (li.unitig > unitigs_deleted.size())
+ {
+ std::cout << "unexpected error: li.unitig=" <<li.unitig<< ", unitig_deleted.size()=" << unitigs_deleted.size() << std::endl; exit(1);
+ }
if (unitigs_deleted[li.unitig])
{
if (debug)
@@ -854,17 +1223,22 @@ GraphVector<EdgeGU> GraphUnitigsTemplate<span>::getEdges (const NodeGU& source,
{
// nodes to the right of a unitig (outcoming)
Direction dir = same_orientation?DIR_OUTCOMING:DIR_INCOMING;
- functor(get_from_navigational_vector(outcoming, source.unitig, outcoming_map), dir);
+ if (compress_navigational_vectors)
+ functor(get_from_compressed_navigational_vector(outcoming, source.unitig, dag_outcoming_map), dir);
+ else
+ functor(get_from_navigational_vector(outcoming, source.unitig, outcoming_map), dir);
}
if (pos_begin && (((direction & DIR_INCOMING) && same_orientation) || ( (!same_orientation) && (direction & DIR_OUTCOMING)) ))
{
// nodes to the left of a unitig (incoming)
Direction dir = same_orientation?DIR_INCOMING:DIR_OUTCOMING;
- functor(get_from_navigational_vector(incoming, source.unitig, incoming_map), dir);
+ if (compress_navigational_vectors)
+ functor(get_from_compressed_navigational_vector(incoming, source.unitig, dag_incoming_map), dir);
+ else
+ functor(get_from_navigational_vector(incoming, source.unitig, incoming_map), dir);
}
// sanity check on output, due to limitation on GraphVector nmber of elements
- // TODO address that. i don't like the potential performance hit here.
if (res.size() > 16)
{
std::cout << "Error : more than 16 edges (" << res.size() << ") out of node, not supported (already more than 8 is strange)" << std::endl;
@@ -920,8 +1294,8 @@ GraphIterator<NodeGU> GraphUnitigsTemplate<span>::getNodes () const
class NodeIterator : public tools::dp::ISmartIterator<NodeGU>
{
public:
- NodeIterator (const std::vector<std::string>& unitigs /* just to get lengths*/, const std::vector<bool>& unitigs_deleted, unsigned int k, unsigned int nb_unitigs_extremities)
- : _nbItems(nb_unitigs_extremities), _rank(0), _isDone(true), unitigs(unitigs), unitigs_deleted(unitigs_deleted), k(k), nb_unitigs(unitigs.size()) {
+ NodeIterator (const /*dag::dag_vector*/ std::vector<uint32_t>& unitigs_sizes, const std::vector<bool>& unitigs_deleted, unsigned int k, unsigned int nb_unitigs_extremities)
+ : _nbItems(nb_unitigs_extremities), _rank(0), _isDone(true), unitigs_sizes(unitigs_sizes), unitigs_deleted(unitigs_deleted), k(k), nb_unitigs(unitigs_sizes.size()) {
this->_item->strand = STRAND_FORWARD; // iterated nodes are always in forward strand.
}
@@ -954,7 +1328,7 @@ GraphIterator<NodeGU> GraphUnitigsTemplate<span>::getNodes () const
do
{
it++;
- if ((it < 2*nb_unitigs) && unitigs[it/2].size() == k) // takes care of the case where the unitig is just a kmer
+ if ((it < 2*nb_unitigs) && unitigs_sizes[it/2] == k) // takes care of the case where the unitig is just a kmer
it++;
} while ((it < 2*nb_unitigs) && unitigs_deleted[it/2]);
_isDone = it >= (2*nb_unitigs);
@@ -983,13 +1357,13 @@ GraphIterator<NodeGU> GraphUnitigsTemplate<span>::getNodes () const
u_int64_t _nbItems;
u_int64_t _rank;
bool _isDone;
- const std::vector<std::string>& unitigs;
+ const /*dag::dag_vector*/ std::vector<uint32_t>& unitigs_sizes;
const std::vector<bool>& unitigs_deleted;
unsigned int k;
unsigned int nb_unitigs;
};
- return new NodeIterator(unitigs, unitigs_deleted, BaseGraph::_kmerSize, nb_unitigs_extremities);
+ return new NodeIterator(unitigs_sizes, unitigs_deleted, BaseGraph::_kmerSize, nb_unitigs_extremities);
}
template<size_t span>
@@ -1045,7 +1419,7 @@ node_in_same_orientation_as_in_unitig(const NodeGU& node) const
template<size_t span>
std::string GraphUnitigsTemplate<span>::toString (const NodeGU& node) const
{
- const std::string& seq = unitigs[node.unitig];
+ const std::string& seq = internal_get_unitig_sequence(node.unitig);
int kmerSize = BaseGraph::_kmerSize;
if (node.pos == UNITIG_INSIDE)
@@ -1068,7 +1442,7 @@ template<size_t span>
bool GraphUnitigsTemplate<span>::
isLastNode(const NodeGU& node, Direction dir) const
{
- if (unitigs[node.unitig].size() == BaseGraph::_kmerSize) // special case.
+ if (internal_get_unitig_length(node.unitig) == BaseGraph::_kmerSize) // special case.
return true;
bool same_orientation = node_in_same_orientation_as_in_unitig(node);
@@ -1088,7 +1462,7 @@ bool GraphUnitigsTemplate<span>::
isFirstNode(const NodeGU& node, Direction dir) const
{
// special case
- if (unitigs[node.unitig].size() == BaseGraph::_kmerSize)
+ if (internal_get_unitig_length(node.unitig) == BaseGraph::_kmerSize)
{
return true;
}
@@ -1106,7 +1480,7 @@ simplePathMeanAbundance (const NodeGU& node, Direction dir)
return 0;
else // single-k-mer unitig
return unitigMeanAbundance(node);
- }
+ }
float coverage = 0;
int endDegree;
@@ -1115,6 +1489,9 @@ simplePathMeanAbundance (const NodeGU& node, Direction dir)
return coverage / (float)seqLength;
}
+/* return the unitig's mean abundance as given by bcalm
+ * so, it's the mean abundance of all kmers inside that unitig
+ */
template<size_t span>
double GraphUnitigsTemplate<span>::
unitigMeanAbundance (const NodeGU& node) const
@@ -1141,7 +1518,7 @@ unitigLength (const NodeGU& node, Direction dir) const
if (isLastNode(node,dir))
length = 0;
else
- length = unitigs[node.unitig].size() - BaseGraph::_kmerSize;
+ length = internal_get_unitig_length(node.unitig) - BaseGraph::_kmerSize;
return length;
}
@@ -1149,8 +1526,6 @@ template<size_t span>
NodeGU GraphUnitigsTemplate<span>::
unitigLastNode (const NodeGU& node, Direction dir) const
{
- //const std::string& seq = unitigs[node.unitig];
-
//std::cout << "lastnode" << toString(node) << " dir " << dir << std::endl;
if (isLastNode(node,dir))
@@ -1222,7 +1597,7 @@ template<size_t span>
std::string GraphUnitigsTemplate<span>::
unitigSequence (const NodeGU& node, bool& isolatedLeft, bool& isolatedRight) const
{
- const string& seq = unitigs[node.unitig];
+ const string& seq = internal_get_unitig_sequence(node.unitig);
//std::cout << " seq " << seq << " node " << toString(node) << std::endl;
NodeGU left = NodeGU(node.unitig, UNITIG_BEGIN);
@@ -1236,7 +1611,7 @@ unitigSequence (const NodeGU& node, bool& isolatedLeft, bool& isolatedRight) con
// keep traversing unitigs as far as we can.
/* arguments:
* starting node
- * output sequence length (will be reset if previously set)
+ * output sequence length (will be reset if previously set), NOTE: is the number of traversed kmers, so (sequencelength-k). there might be a +-1 length issue here.
* output end degree
* whether to mark during traversal
* total coverage of simple path (NOT normalized!)
@@ -1249,7 +1624,7 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
{
bool debug = false;
if (debug)
- std::cout << "simplePathLongest_avance called, node " << toString(node) << " dir " << dir << std::endl;
+ std::cout << "simplePathLongest_avance called, node " << toString(node) << " strand " << node.strand << " dir " << dir << std::endl;
seqLength = 0;
unsigned int kmerSize = BaseGraph::_kmerSize;
@@ -1257,13 +1632,13 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
if (!isLastNode(cur_node,dir))
{
// first node in unitig may have in-branching, it's fine for a simple path traversal. we'll just go to last node and record the sequence of that unitig
- int unitigLength = unitigs[node.unitig].size();
+ int unitigLength = internal_get_unitig_length(node.unitig);
bool same_orientation = node_in_same_orientation_as_in_unitig(node);
if (seq != nullptr)
{
- string new_seq = unitigs[node.unitig];
+ string new_seq = internal_get_unitig_sequence(node.unitig);
if (!same_orientation)
new_seq = revcomp(new_seq);
@@ -1280,7 +1655,7 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
coverage += unitigMeanAbundance(cur_node) * (unitigLength - kmerSize + 1);
if (debug)
- std::cout << "simplePathLongest_avance was at a first node = " << toString(cur_node) << " strand " << cur_node.strand << " so traversed unitig of length " << unitigLength << std::endl;
+ std::cout << "simplePathLongest_avance was at a first node = " << toString(cur_node) << " strand " << cur_node.strand << " so traversed unitig of length " << unitigLength << " mean abundance: " << unitigMeanAbundance(cur_node) << ")" << std::endl;
cur_node = unitigLastNode(node,dir);
@@ -1323,13 +1698,13 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
bool same_orientation = node_in_same_orientation_as_in_unitig(neighbors[0].to);
- int unitigLength = unitigs[neighbor_unitig].size();
+ int unitigLength = internal_get_unitig_length(neighbor_unitig);
if (debug)
- std::cout << "simplePathLongest_avance continues now at a last node = " << toString(cur_node) << " strand " << cur_node.strand << " of unitig " << cur_node.unitig << " length " << unitigs[cur_node.unitig].size() << ", neighbor.to " << toString(neighbors[0].to) << " strand " << neighbors[0].to.strand << " of unitig " << neighbors[0].to.unitig << " new seq length: " << unitigLength << std::endl;
+ std::cout << "simplePathLongest_avance continues now at a last node = " << toString(cur_node) << " strand " << cur_node.strand << " of unitig " << cur_node.unitig << " length " << internal_get_unitig_length(cur_node.unitig) << ", neighbor.to " << toString(neighbors[0].to) << " strand " << neighbors[0].to.strand << " of unitig " << neighbors[0].to.unitig << " new seq length: " << unitigLength << std::endl;
// fix for 1-bit encoded unitig position. That fix could have happened in unitig_parse_header but i didn't want to encode pos in 2 bits. Also, could have happened in NodeGU constructor, but didn't want to waste time checking unitig size there
- if (unitigs[neighbors[0].to.unitig].size() == kmerSize)
+ if (internal_get_unitig_length(neighbors[0].to.unitig) == kmerSize)
neighbors[0].to.pos = UNITIG_BOTH;
int npos = neighbors[0].to.pos;
@@ -1385,7 +1760,7 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
// append the sequence (except the overlap part, of length k-1.
if (seq != nullptr)
{
- string new_seq = unitigs[cur_node.unitig];
+ string new_seq = internal_get_unitig_sequence(cur_node.unitig);
if (!same_orientation)
new_seq = revcomp(new_seq);
@@ -1397,6 +1772,8 @@ simplePathLongest_avance(const NodeGU& node, Direction dir, int& seqLength, int&
seqLength += unitigLength - (kmerSize-1);
coverage += unitigMeanAbundance(cur_node) * (unitigLength - kmerSize + 1); // here too, coverage is computed according to whole unitig
+
+ //if (debug) std::cout << "seqlength add " << (unitigLength - (kmerSize-1)) << " added cov " << (unitigMeanAbundance(cur_node) * (unitigLength - kmerSize + 1)) << " mean ab " << unitigMeanAbundance(cur_node) << std::endl;
if (markDuringTraversal&& unitigIsMarked(cur_node)) // just a debug, can be removed
{
@@ -1414,7 +1791,7 @@ template<size_t span>
std::string GraphUnitigsTemplate<span>::
simplePathBothDirections(const NodeGU& node, bool& isolatedLeft, bool& isolatedRight, bool markDuringTraversal, float &coverage)
{
- string seq = unitigs[node.unitig];
+ string seq = internal_get_unitig_sequence(node.unitig);
int kmerSize = BaseGraph::_kmerSize;
float midTotalCoverage = unitigMeanAbundance(node) * (seq.size() - kmerSize + 1);
@@ -1468,7 +1845,7 @@ debugPrintAllUnitigs() const
std::cout << "Debug: printing all graph unitigs and status" << std::endl;
for (unsigned int i = 0; i < nb_unitigs; i++)
{
- std::cout << "unitig " << i << " (length: " << unitigs[i].size() << ") " << (unitigs_deleted[i]?"[deleted]":"") << " links: ";
+ std::cout << "unitig " << i << " (length: " << internal_get_unitig_length(i) << ") " << (unitigs_deleted[i]?"[deleted]":"") << " links: ";
for (Direction dir=DIR_OUTCOMING; dir<DIR_END; dir = (Direction)((int)dir + 1) )
@@ -1503,9 +1880,11 @@ template<size_t span>
NodeGU GraphUnitigsTemplate<span>::
debugBuildNode(string startKmer) const
{
- for (unsigned int i = 0; i < unitigs.size(); i++)
+ bool debug=false;
+ for (unsigned int i = 0; i < nb_unitigs; i++)
{
- string unitig = unitigs[i];
+ string unitig = internal_get_unitig_sequence(i);
+ if (debug) std::cout << "debugBuildNode, testing unitig " << i << "/" << nb_unitigs << " : " << unitig << std::endl;
for (int rc = 0; rc < 2; rc++)
{
if (rc == 1)
@@ -1651,8 +2030,60 @@ void GraphUnitigsTemplate<span>::disableNodeState() const
std::cout << "GraphUnitigs::disableNodeState() not implemented" << std::endl;
}
+/*
+ *
+ * 2-bit compression of unitigs
+ *
+ */
+template<size_t span>
+std::string GraphUnitigsTemplate<span>::internal_get_unitig_sequence(unsigned int id) const
+{
+    // Decodes unitig 'id' from its 2-bit form (4 nucleotides per byte, see internal_compress_unitig) back to an A/C/G/T string.
+    const unsigned int length = unitigs_sizes[id]; // in nucleotides
+    std::string packed;
+    if (pack_unitigs)
+    {
+        // all unitigs are stored back to back in packed_unitigs; prefix_sum(id) is used as the byte offset of unitig 'id'
+        // (id 0 keeps its special case from the previous version and never calls prefix_sum)
+        const uint64_t offset = (id == 0) ? 0 : packed_unitigs_sizes.prefix_sum(id);
+        // take only the (length+3)/4 bytes that hold this unitig: the nucleotide count used to be passed here, copying up to 4x too much
+        packed = packed_unitigs.substr(offset, (length+3)/4);
+    }
+    else
+        packed = unitigs[id];
+    std::string res(length,'x');
+    for (unsigned int i = 0; i < length; i++) {
+        const unsigned char c = packed[i/4];
+        const unsigned char two_bits = (c >> (2*(i % 4))) & 3;
+        // the codes are A=0, C=1, T=2, G=3; 'A' | (code << 1) rebuilds A, C and G, only T needs its own case
+        if (two_bits == 2)
+            res[i] = 'T';
+        else
+            res[i] = 'A' | (two_bits << 1);
+    }
+    return res;
+}
+
+template<size_t span>
+unsigned int GraphUnitigsTemplate<span>::internal_get_unitig_length(unsigned int id) const
+{
+ return unitigs_sizes[id]; // length of unitig 'id' in nucleotides (the loaders push seq.size() here), not its packed size in bytes
+}
+template<size_t span>
+std::string GraphUnitigsTemplate<span>::internal_compress_unitig(std::string seq) const
+{
+    // Packs a nucleotide string 4 letters per byte: every letter becomes the 2-bit code (c >> 1) & 3
+    // (A=0, C=1, T=2, G=3, same for lowercase), letter i landing at bit 2*(i%4) of byte i/4.
+    // Other characters are not rejected, they silently alias one of the four codes.
+    // The buffer is a heap-backed std::string: the former 'unsigned char res[n]' was a variable-length array,
+    // which is not standard C++ and sits on the stack, so a very long unitig could overflow it.
+    std::string res((seq.size()+3)/4, '\0');
+    for (size_t i = 0; i < seq.size(); i++)
+        res[i / 4] |= ((seq[i] >> 1) & 3) << (2*(i % 4));
+    return res;
+}
/*
*
diff --git a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.hpp b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.hpp
index 4b07c49..f327ce6 100644
--- a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.hpp
+++ b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.hpp
@@ -28,6 +28,9 @@
#include <gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp>
#include <gatb/debruijn/impl/ExtremityInfo.hpp>
+#include <gatb/debruijn/impl/dag_vector.hpp> // TODO move it to 3rd party
+
+
/********************************************************************************/
namespace gatb {
namespace core {
@@ -282,7 +285,7 @@ public:
NodeGU unitigLastNode (const NodeGU& node, Direction dir) const;
NodeGU simplePathLastNode (const NodeGU& node, Direction dir) ; /* cannot be const becuse it called Longuest_avance that is sometimes not const.. grr. */
unsigned int unitigLength (const NodeGU& node, Direction dir) const;
- unsigned int simplePathLength (const NodeGU& node, Direction dir) ; /* same reason as above*/;
+ unsigned int simplePathLength (const NodeGU& node, Direction dir) ; /* same reason as above*/ /* NOTE: return number of traversed kmers, so (nucleotide length-k)*/
double unitigMeanAbundance (const NodeGU& node) const;
double simplePathMeanAbundance (const NodeGU& node, Direction dir) ;
void unitigDelete (NodeGU& node, Direction dir, NodesDeleter<NodeGU, EdgeGU, GraphUnitigsTemplate<span>>& nodesDeleter);
@@ -395,21 +398,41 @@ public: // was private: before, but had many compilation errors during the chang
void build_unitigs_postsolid(std::string unitigs_filename, tools::misc::IProperties* props);
void load_unitigs(std::string unitigs_filename);
+ void load_unitigs_from_gfa(std::string gfa_filename, unsigned int& kmerSize);
+ void print_unitigs_mem_stats(uint64_t avg_incoming_size, uint64_t avg_outcoming_size, uint64_t total_unitigs_size, uint64_t nb_utigs_nucl = 0, uint64_t nb_utigs_nucl_mem = 0);
+
bool node_in_same_orientation_as_in_unitig(const NodeGU& node) const;
-
+
+ // support for 2-bit compression of unitigs
+ std::string internal_get_unitig_sequence(unsigned int unitig_id) const;
+ unsigned int internal_get_unitig_length(unsigned int unitig_id) const;
+ std::string internal_compress_unitig(std::string seq) const;
+
typedef typename kmer::impl::Kmer<span>::ModelCanonical Model;
typedef typename kmer::impl::Kmer<span>::ModelDirect ModelDirect;
-
-
- // don't forget to copy those variables in operator= (and the move operator) !!
- Model *modelK;
- ModelDirect *modelKdirect;
+
+ // !!!!
+ // all member variables should be below this point, because i want you to also read this:
+ // don't forget to copy those variables in operator= (and the move operator)
+ // classic source of bugs but i couldn't find a foolproof way.
+ // !!!!
std::vector<uint64_t> incoming, outcoming, incoming_map, outcoming_map;
+ dag::dag_vector dag_incoming, dag_outcoming, dag_incoming_map, dag_outcoming_map;
std::vector<std::string> unitigs;
+ std::string packed_unitigs;
+ std::vector<uint32_t> unitigs_sizes;
+ dag::dag_vector packed_unitigs_sizes;
std::vector<float> unitigs_mean_abundance;
- std::vector<bool> unitigs_deleted; // could also be replaced by setting incoming and outcoming to all deleted.
+ //dag::dag_vector unitigs_sizes;// perf hit: from 45s to 74s in chr14; that's because unitigs_sizes is queried _a lot_ just to check if a unitig is just of length k. could save that space with a bit vector, and actually, just use packed_unitigs_sizes for the rest. so.. just to keep in mind that this is a "todo opt" in case we really want to save the space of unitigs_sizes
+ //dag::dag_vector unitigs_mean_abundance; // not a big gain and different assembly quality, so i'm keeping it as vector<float>
+ std::vector<bool> unitigs_deleted; // could also be replaced by modifying incoming and outcoming vectors. careful not to affect the prefix sum scheme tho.
std::vector<bool> unitigs_traversed;
uint64_t nb_unitigs, nb_unitigs_extremities;
+ bool compress_navigational_vectors;
+ bool pack_unitigs;
+ // !!!!
+ // read above
+ // !!!!
};
/********************************************************************************/
diff --git a/gatb-core/src/gatb/debruijn/impl/IterativeExtensions.cpp b/gatb-core/src/gatb/debruijn/impl/IterativeExtensions.cpp
index 412b934..9100f9d 100644
--- a/gatb-core/src/gatb/debruijn/impl/IterativeExtensions.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/IterativeExtensions.cpp
@@ -222,12 +222,34 @@ void IterativeExtensions<span, Node, Edge, Graph>::construct_linear_seqs (
if (swf)
{
- char* found = strstr (seq.getDataBuffer(), R.c_str());
+
+ /* old version
+ char* found = strstr (seq.getDataBuffer(), R.c_str());
if (found != NULL && ksd.depth > (int)sizeKmer)
{
INFO (("swf STOP \n"));
break;
}
+ */
+ // Apr 2017 : new version for MindTheGap fill with contigs
+ // The target R can be the concatenation of several target kmers (sizeKmer characters each).
+ // Each target kmer is searched for on its own; if one is found, stop extending this contig
+ // but continue other branches.
+ std::string subseed;
+ std::string target= R.c_str();
+ bool stopExtend= false;
+ for (unsigned i = 0; i < target.length(); i += sizeKmer)
+ {
+     subseed=target.substr(i, sizeKmer);
+     // look for the current target kmer: searching for the whole R here (as was done before)
+     // made the loop pointless as soon as R held more than one kmer
+     char* found = strstr (seq.getDataBuffer(), subseed.c_str());
+
+     if (found != NULL && ksd.depth > (int)sizeKmer)
+     {
+         stopExtend=true;
+         break;
+     }
+
+ }
+ if (stopExtend) continue; //one of the targets was found, stop this extension but keep extending other branches
+
}
if (nbNodes > max_nodes) //GR stop when too complex huum when to stop ?
diff --git a/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp b/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
new file mode 100644
index 0000000..a13668a
--- /dev/null
+++ b/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
@@ -0,0 +1,381 @@
+#include <gatb/tools/misc/impl/Progress.hpp>
+#include <gatb/tools/misc/impl/Stringify.hpp>
+#include <gatb/system/impl/System.hpp>
+#include <gatb/bank/impl/Bank.hpp>
+#include <gatb/bank/impl/Banks.hpp>
+#include <gatb/bank/impl/BankHelpers.hpp>
+#include <gatb/bcalm2/logging.hpp>
+#include <gatb/debruijn/impl/ExtremityInfo.hpp>
+#include <gatb/debruijn/impl/LinkTigs.hpp>
+#include <gatb/kmer/impl/Model.hpp> // for revcomp_4NT
+
+#include <bitset>
+#include <fstream>
+#include <functional>
+#include <queue>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+
+using namespace std;
+using namespace gatb::core::bank;
+using namespace gatb::core::bank::impl;
+using namespace gatb::core::debruijn;
+using namespace gatb::core::debruijn::impl;
+using namespace gatb::core::kmer;
+using namespace gatb::core::kmer::impl;
+
+using namespace gatb::core::tools::misc;
+using namespace gatb::core::tools::misc::impl;
+using namespace gatb::core::system;
+using namespace gatb::core::system::impl;
+
+namespace gatb { namespace core { namespace debruijn { namespace impl {
+
+ static constexpr int nb_passes = 8;
+ static void write_final_output(const string& unitigs_filename, bool verbose, BankFasta* out, uint64_t &nb_unitigs);
+ static bool get_link_from_file(std::ifstream& input, std::string &link, uint64_t &unitig_id);
+
+/* this procedure finds the overlaps between unitigs, using a hash table of all extremity (k-1)-mers
+ * I guess it's like AdjList in ABySS. It's also like contigs_to_fastg in MEGAHIT.
+ *
+ * could be optimized by keeping edges during the BCALM step and tracking kmers in unitigs, but it's not the case for now, because would need to modify ograph
+ *
+ * it uses the disk to store the links for extremities until they're merged into the final unitigs file.
+ *
+ * so the memory usage is just that of the hash tables that record kmers, not of the links
+ *
+ * parameters:
+ *   unitigs_filename : FASTA file of unitigs; in the end it is replaced by the ".indexed" version
+ *                      whose comments carry the links
+ *   kmerSize         : k (extremities are (k-1)-mers)
+ *   nb_threads       : not used by this function
+ *   nb_unitigs       : output, number of unitigs written to the final file
+ *   verbose          : copied into bcalm_logging
+ */
+template<size_t span>
+void link_tigs(string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose)
+{
+    bcalm_logging = verbose;
+
+    // checked first, so that nothing has been created yet when we bail out
+    // NOTE(review): the test rejects k<4 whereas the message says k<5 -- confirm which bound is intended
+    if (kmerSize < 4) { std::cout << "error, recent optimizations (specifically link_unitigs) don't support k<5 for now" << std::endl; exit(1); }
+
+    {
+        // The output bank lives in this scope instead of being new'ed/deleted by hand: it is
+        // destroyed at the closing brace (where `delete out` used to be), i.e. before the
+        // remove/rename below, and also if an exception propagates.
+        BankFasta out(unitigs_filename+".indexed");
+
+        logging("Finding links between unitigs");
+
+        for (int pass = 0; pass < nb_passes; pass++)
+            link_unitigs_pass<span>(unitigs_filename, verbose, pass, kmerSize);
+
+        write_final_output(unitigs_filename, verbose, &out, nb_unitigs);
+    }
+
+    // replace the original unitigs file by the one that contains the links
+    system::impl::System::file().remove (unitigs_filename);
+    system::impl::System::file().rename (unitigs_filename+".indexed", unitigs_filename);
+
+    logging("Done finding links between unitigs");
+}
+
+
+/*
+ * takes all the prefix.links.* files, sorted by unitigs.
+ * do a n-way merge to gather the links for each unitig in unitig order
+ * (single-threaded)
+ *
+ * each unitig of unitigs_filename is written to `out` with its comment extended by " " followed by
+ * the concatenation of its link records; nb_unitigs is set to the number of sequences written.
+ * the prefix.links.* files are removed at the end. `verbose` is not used here.
+ *
+ * the unitigs bank is walked in parallel with the merge, advancing by one sequence each time the
+ * unitig id changes: this relies on the links files mentioning every unitig, in increasing order,
+ * starting from 0 (link_unitigs_pass() writes a record for both extremities of every unitig).
+ */
+static void write_final_output(const string& unitigs_filename, bool verbose, BankFasta* out, uint64_t &nb_unitigs)
+{
+    logging("gathering links from disk");
+
+    typedef std::tuple<uint64_t /*unitig id*/, int /*pass*/, std::string /*links */> pq_elt_t;
+    priority_queue<pq_elt_t, vector<pq_elt_t>, std::greater<pq_elt_t> > pq;
+
+    // plain stream objects rather than new'ed ones: nothing to delete, nothing leaked on exception
+    std::ifstream inputLinks[nb_passes];
+    bitset<nb_passes> finished;
+
+    BankFasta inputBank (unitigs_filename);
+    BankFasta::Iterator itSeq (inputBank);
+    itSeq.first();
+    string cur_links;
+    string seq     = itSeq->toString();
+    string comment = itSeq->getComment();
+
+    nb_unitigs = 0; // passed variable
+
+    // writes the current unitig along with all the links gathered for it so far
+    auto write_current_unitig = [&]()
+    {
+        Sequence s (Data::ASCII);
+        s.getData().setRef ((char*)seq.c_str(), seq.size());
+        s._comment = comment + " " + cur_links;
+        out->insert(s);
+        nb_unitigs++;
+    };
+
+    // pushes the next record of the given links file into the pq, or marks that file as finished
+    auto read_next_link = [&](int pass)
+    {
+        string link; uint64_t unitig;
+        if (get_link_from_file(inputLinks[pass], link, unitig))
+            pq.emplace(unitig, pass, link);
+        else
+            finished[pass] = true;
+    };
+
+    for (int pass = 0; pass < nb_passes; pass++)
+    {
+        inputLinks[pass].open(unitigs_filename+ ".links." + to_string(pass));
+        // prime the pq with the first element in the file
+        read_next_link(pass);
+    }
+
+    uint64_t last_unitig = 0;
+
+    // nb_passes-way merge sort
+    // every file that isn't finished always has exactly one record in the pq, so "pq not empty"
+    // is the same condition as the former "(!finished.all()) || pq.size() > 0", and it makes
+    // sure that top() is never called on an empty queue
+    while (!pq.empty())
+    {
+        const pq_elt_t cur = pq.top(); pq.pop();
+        const uint64_t unitig = get<0>(cur);
+        const int pass = get<1>(cur);
+
+        if (unitig != last_unitig)
+        {
+            // all the links of the previous unitig have been seen: write it and move on to the next sequence
+            write_current_unitig();
+
+            cur_links = "";
+            last_unitig = unitig;
+            itSeq.next();
+            seq = itSeq->toString();
+            comment = itSeq->getComment();
+        }
+
+        cur_links += get<2>(cur);
+        //if (unitig < 10) std::cout << " popped " << pass << " " << unitig << " " << cur_links << std::endl; // debug
+
+        // read next entry in the inputLinks[pass] file that we just popped
+        if (!finished[pass])
+            read_next_link(pass);
+    }
+
+    // write the last element
+    write_current_unitig();
+
+    for (int pass = 0; pass < nb_passes; pass++)
+    {
+        // close before removing: deleting a file that is still open fails on some platforms
+        inputLinks[pass].close();
+        system::impl::System::file().remove (unitigs_filename + ".links." + to_string(pass));
+    }
+}
+
+
+// 2-bit code of a nucleotide: A=0, C=1, T=2, G=3; any other character is treated like 'A'.
+// (well well, some potential code duplication with Model.hpp in here (or rather, specialization), but sshh)
+static inline int nt2int(char nt)
+{
+    switch (nt)
+    {
+        case 'C': return 1;
+        case 'T': return 2;
+        case 'G': return 3;
+        default : return 0; // 'A' and everything else
+    }
+}
+
+/* packs four nucleotides into a single byte (2 bits each, c1 in the highest bits), then returns the
+ * smaller value between that byte and its entry in the revcomp_4NT table.
+ * is_in_pass() uses the result modulo nb_passes to assign an extremity to a pass.
+ * that code doesn't support more than 8 passes */
+static int normalized_smallmer(const unsigned char c1, const unsigned char c2, const unsigned char c3, const unsigned char c4)
+{
+    const unsigned char forward = (nt2int(c1)<<6) | (nt2int(c2)<<4) | (nt2int(c3)<<2) | nt2int(c4);
+    const unsigned char reverse = revcomp_4NT[forward];
+    return (reverse < forward) ? reverse : forward;
+}
+
+/* tells whether the extremity `p` (begin or end) of unitig `seq` is to be processed during pass `pass`.
+ * the decision is taken from the first two and the last two nucleotides of the extremity (k-1)-mer. */
+static bool
+is_in_pass (const std::string &seq, int pass, Unitig_pos p, int kmerSize) // TODO this is so un-even. should do more proper hashing..
+{
+    // position of the first nucleotide of the extremity (k-1)-mer.
+    // e.g. for a sequence 0123456789 and k = 5 (k-1 = 4), the last (k-1)-mer starts at 10-4 = 6
+    const int first = (p == UNITIG_END) ? seq.size()-(kmerSize-1) : 0;
+    // position of its last nucleotide
+    const int last = first + kmerSize - 2;
+
+    const int bucket = normalized_smallmer(seq[first], seq[first+1], seq[last-1], seq[last]) % nb_passes;
+    return bucket == pass;
+}
+
+/* reads one record (a line holding the unitig id, then a line holding its links) from `input`.
+ * returns true if it has read an element, false otherwise.
+ * note that unitig_id is already overwritten when the id line could be read but the links line couldn't. */
+static bool get_link_from_file(std::ifstream& input, std::string &link, uint64_t &unitig_id)
+{
+    std::string id_line;
+    if (!std::getline(input, id_line))
+        return false;
+
+    unitig_id = std::stoull(id_line);
+
+    return static_cast<bool>(std::getline(input, link));
+}
+
+
+/* appends one record to links_file: the unitig id on a line, then its links on the next line
+ * (this is the format that get_link_from_file() reads). `pass` is not used. */
+static void record_links(uint64_t utig_id, int pass, const std::string &link, std::ofstream &links_file)
+{
+    // maybe do a buffered thing here but it's not clear if it is bottleneck. in CAMI-medium it took 6 mins without if i don't write the links_file and 8 mins if i do..
+    std::string record = std::to_string(utig_id);
+    record += "\n";
+    record += link;
+    record += "\n";
+    links_file << record;
+}
+
+
+/*
+ * one pass of link finding. only the unitig extremities for which is_in_pass(.., pass, ..) holds are handled.
+ *
+ * step 1: read all unitigs and record, in a hash table indexed by the canonical (k-1)-mer of each selected
+ *         extremity, the packed ExtremityInfo (unitig id, orientation flag, begin/end) of that extremity.
+ * step 2: read all unitigs again and, for each selected extremity, look up the extremities that share its
+ *         (k-1)-mer, keep those having a compatible orientation, and write the resulting links
+ *         ("L:-:.." for in-neighbors, "L:+:.." for out-neighbors) to the file unitigs_filename.links.<pass>.
+ *         a record is written even when no link was found (the " " placeholder).
+ *
+ * `verbose` is not used here.
+ */
+template<size_t span>
+void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pass, const int kmerSize)
+{
+    typedef typename kmer::impl::Kmer<span>::ModelCanonical Model;
+    typedef typename kmer::impl::Kmer<span>::Type Type;
+
+    bool debug = false;
+
+    BankFasta inputBank (unitigs_filename);
+    BankFasta::Iterator itSeq (inputBank);
+    uint64_t utig_counter = 0;
+
+    Model modelKminusOne(kmerSize - 1); // it's canonical (defined in the .hpp file)
+
+    typedef typename std::unordered_map<Type, std::vector<uint64_t>> NodeLinksMap;
+
+    NodeLinksMap utigs_links_map;
+
+    logging("step 1 pass " + to_string(pass));
+
+    // this is the memory-limiting step, but can be lowered with larger nb_pass
+    for (itSeq.first(); !itSeq.isDone(); itSeq.next())
+    {
+        const string& seq = itSeq->toString();
+        if (debug) std::cout << "unitig: " << seq << std::endl;
+
+        if (is_in_pass(seq, pass, UNITIG_BEGIN, kmerSize))
+        {
+            if (debug) std::cout << "pass " << pass << " examining beginning" << std::endl;
+            const string beginSeq = seq.substr(0, kmerSize-1);
+            typename Model::Kmer kmerBegin = modelKminusOne.codeSeed(beginSeq.c_str(), Data::ASCII);
+            bool beginInSameOrientation = modelKminusOne.toString(kmerBegin.value()) == beginSeq;
+            ExtremityInfo eBegin(utig_counter, !beginInSameOrientation /* because we record rc*/, UNITIG_BEGIN);
+            utigs_links_map[kmerBegin.value()].push_back(eBegin.pack());
+        }
+        if (is_in_pass(seq, pass, UNITIG_END, kmerSize))
+        {
+            if (debug) std::cout << "pass " << pass << " examining end" << std::endl;
+            const string endSeq = seq.substr(seq.size() - kmerSize+1);
+            typename Model::Kmer kmerEnd = modelKminusOne.codeSeed(endSeq.c_str(), Data::ASCII);
+            bool endInSameOrientation = modelKminusOne.toString(kmerEnd.value()) == endSeq;
+            ExtremityInfo eEnd( utig_counter, !endInSameOrientation, UNITIG_END);
+            utigs_links_map[kmerEnd.value()].push_back(eEnd.pack());
+            // there is no UNITIG_BOTH here because we're taking (k-1)-mers.
+        }
+        utig_counter++;
+    }
+
+    std::ofstream links_file(unitigs_filename+".links." +to_string(pass));
+
+    uint64_t nb_hashed_entries = 0;
+    for (const auto& v : utigs_links_map) // by reference: a by-value loop variable copies each vector just to read its size
+        nb_hashed_entries += v.second.size();
+    logging("step 2 (" + to_string(utigs_links_map.size()) + "kmers/" + to_string(nb_hashed_entries) + "extremities)");
+
+    utig_counter = 0;
+    for (itSeq.first(); !itSeq.isDone(); itSeq.next())
+    {
+        const string& seq = itSeq->toString();
+
+        if (debug) std::cout << "unitig: " << seq << std::endl;
+
+        if (is_in_pass(seq, pass, UNITIG_BEGIN, kmerSize))
+        {
+            if (debug) std::cout << "pass " << pass << " examining beginning" << std::endl;
+            const string beginSeq = seq.substr(0, kmerSize-1);
+            typename Model::Kmer kmerBegin = modelKminusOne.codeSeed(beginSeq.c_str(), Data::ASCII);
+            bool beginInSameOrientation = modelKminusOne.toString(kmerBegin.value()) == beginSeq; // that could be optimized, revcomp was already computed during codeSeed
+
+            // treat special palindromic kmer cases
+            bool nevermindInOrientation = false;
+            if (((kmerSize - 1) % 2) == 0)
+                if (kmerBegin.isPalindrome()) nevermindInOrientation = true;
+
+            string in_links = " "; // necessary placeholder to indicate we have links for that unitig
+
+            // in-neighbors
+            // so, the current kmer is kmerBegin, and we're iterating all potential in-neighbors as e_in objects
+            for (auto in_packed : utigs_links_map[kmerBegin.value()])
+            {
+                ExtremityInfo e_in(in_packed);
+
+                if (debug) std::cout << "extremity " << modelKminusOne.toString(kmerBegin.value()) << " ";
+                if (debug) std::cout << "potential in-neighbor: " << e_in.toString() << " beginSameOrientation " << beginInSameOrientation;
+
+                // what we want are these four cases:
+                // ------[end same orientation] -> [begin same orientation]----
+                // [begin diff orientation]---- -> [begin same orientation]----
+                // ------[end diff orientation] -> [begin diff orientation]----
+                // [begin same orientation]---- -> [begin diff orientation]----
+                if (( ((beginInSameOrientation) && (e_in.pos == UNITIG_END ) && (e_in.rc == false)) ||
+                      ((beginInSameOrientation) && (e_in.pos == UNITIG_BEGIN) && (e_in.rc == true)) ||
+                      (((!beginInSameOrientation)) && (e_in.pos == UNITIG_END ) && (e_in.rc == true)) ||
+                      (((!beginInSameOrientation)) && (e_in.pos == UNITIG_BEGIN) && (e_in.rc == false)))
+                    || nevermindInOrientation)
+                {
+                    // this was for when i was wanting to save space while storing links in memory. now storing on disk
+                    //LinkInfo li(e_in.unitig, e_in.rc ^ beginInSameOrientation);
+                    //incoming[utig_number].push_back(li.pack());
+
+                    //bool rc = e_in.rc ^ (!beginInSameOrientation); // "rc" sets the destination strand // i don't think it's the right formula because of k-1-mers that are their self revcomp. see the mikko bug in the test folder, that provides a nice illustration of that
+                    bool rc = e_in.pos == UNITIG_END; // a better way to determine the rc flag is just looking at position of e_in k-1-mer
+
+                    in_links += "L:-:" + to_string(e_in.unitig) + ":" + (rc?"-":"+") + " ";
+
+                    /* what to do when kmerBegin is same as forward and reverse?
+                       used to have this:
+                           if (nevermindInOrientation)
+                               in_links += "L:-:" + to_string(e_in.unitig) + ":" + ((!rc)?"+":"-") + " "; // in that case, there is also another link with the reverse direction
+                       but it's bogus, because the reverse direction of the other sequence won't have the same start k-1-mer, even if it is just k-long
+                       so there isn't anything to do actually. the case is handled by the "|| nevermindInOrientation )" in the if above */
+
+                    if (debug) std::cout << " [valid] ";
+                }
+                if (debug) std::cout << std::endl;
+            }
+
+            record_links(utig_counter, pass, in_links, links_file);
+        }
+
+        if (is_in_pass(seq, pass, UNITIG_END, kmerSize))
+        {
+            if (debug) std::cout << "pass " << pass << " examining end" << std::endl;
+            const string endSeq = seq.substr(seq.size() - kmerSize+1);
+            typename Model::Kmer kmerEnd = modelKminusOne.codeSeed(endSeq.c_str(), Data::ASCII);
+            bool endInSameOrientation = modelKminusOne.toString(kmerEnd.value()) == endSeq;
+
+            bool nevermindOutOrientation = false;
+            if (((kmerSize - 1) % 2) == 0)
+                if (kmerEnd.isPalindrome()) nevermindOutOrientation = true;
+
+            string out_links = " "; // necessary placeholder to indicate we have links for that unitig
+
+            // out-neighbors
+            for (auto out_packed : utigs_links_map[kmerEnd.value()])
+            {
+                ExtremityInfo e_out(out_packed);
+
+                if (debug) std::cout << "extremity " << modelKminusOne.toString(kmerEnd.value()) << " ";
+                if (debug) std::cout << "potential out-neighbor: " << e_out.toString();
+
+                // what we want are these four cases:
+                // ------[end same orientation] -> [begin same orientation]----
+                // ------[end same orientation] -> ------[end diff orientation]
+                // ------[end diff orientation] -> [begin diff orientation]----
+                // ------[end diff orientation] -> ------[end same orientation]
+                if ((((endInSameOrientation) && (e_out.pos == UNITIG_BEGIN) && (e_out.rc == false)) ||
+                     ((endInSameOrientation) && (e_out.pos == UNITIG_END ) && (e_out.rc == true)) ||
+                     (((!endInSameOrientation)) && (e_out.pos == UNITIG_BEGIN) && (e_out.rc == true)) ||
+                     (((!endInSameOrientation)) && (e_out.pos == UNITIG_END ) && (e_out.rc == false)))
+                    ||nevermindOutOrientation)
+                {
+                    //LinkInfo li(e_out.unitig, e_out.rc ^ endInSameOrientation);
+                    //outcoming[utig_number].push_back(li.pack());
+
+                    bool rc = e_out.pos == UNITIG_END; // a better way to determine the rc flag is just looking at position of e_out k-1-mer
+
+                    out_links += "L:+:" + to_string(e_out.unitig) + ":" + (rc?"-":"+") + " ";
+
+                    if (debug) std::cout << " [valid] ";
+                }
+                if (debug) std::cout << std::endl;
+            }
+            record_links(utig_counter, pass, out_links, links_file);
+        }
+
+        utig_counter++;
+    }
+}
+
+}}}}
diff --git a/gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp b/gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp
new file mode 100644
index 0000000..3369215
--- /dev/null
+++ b/gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * GATB : Genome Assembly Tool Box
+ * Copyright (C) 2014-2016 INRIA
+ * Authors: R.Chikhi, G.Rizk, E.Drezen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*****************************************************************************/
+
+
+#ifndef _GATB_CORE_LINK_TIGS_HPP_
+#define _GATB_CORE_LINK_TIGS_HPP_
+
+#include <gatb/bank/impl/Bank.hpp>
+#include <gatb/bank/impl/Banks.hpp>
+#include <gatb/bank/impl/BankHelpers.hpp>
+#include <gatb/bank/api/IBank.hpp>
+
+namespace gatb { namespace core { namespace debruijn { namespace impl {
+
+
+ /** Finds the links between the unitigs of a FASTA file, using the (k-1)-mers at their extremities.
+  * Runs all the link_unitigs_pass() passes, then rewrites the unitigs file so that the comment
+  * of each unitig also holds its links.
+  * \param[in] prefix : file name of the unitigs (the definition calls it unitigs_filename); replaced by the version with links
+  * \param[in] kmerSize : size of the kmers
+  * \param[in] nb_threads : not used by the current implementation
+  * \param[out] nb_unitigs : number of unitigs written to the final file
+  * \param[in] verbose : enables logging
+  */
+ template<size_t SPAN>
+ void link_tigs( std::string prefix, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose);
+
+ /** One pass of link finding: handles the unitig extremities assigned to pass number `pass` and
+  * writes the links that were found to the file unitigs_filename + ".links." + pass.
+  * \param[in] unitigs_filename : file name of the unitigs
+  * \param[in] verbose : not used by the current implementation
+  * \param[in] pass : number of the pass
+  * \param[in] kmerSize : size of the kmers
+  */
+ template<size_t span>
+ void link_unitigs_pass(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize);
+
+}}}}
+
+#endif
diff --git a/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp b/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp
index 10d6de3..bbd59e5 100644
--- a/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp
@@ -28,7 +28,7 @@
#define DEBUG(a) //a
#define DEBUG_TIPS(a) //a
#define DEBUG_BULGES(a) //a
-#define DEBUG_EC(a) //a
+#define DEBUG_EC(a) //a
// the only time when you don't want to define this, is when debugging with gdb, because can't debug lambda's
#define SIMPLIFICATION_LAMBDAS
@@ -82,8 +82,25 @@ Simplifications<GraphType, Node, Edge>::Simplifications(GraphType& graph, int nb
// (before, the previous system was to do a fixed amount of passes)
cutoffEvents = std::max((uint64_t)(nbNodes / 10000), (uint64_t)1);
+ // maybe todo: estimate better and illustrate on sample genomes
+
+
+ // set some default parameters for aggressive graph simplifications (minia)
- // TODO: estimate better and illustrate on sample genomes
+ // tips
+ _tipLen_Topo_kMult = 2.5;
+ _tipLen_RCTC_kMult = 10;
+ _tipRCTCcutoff = 2; // SPAdes-like
+
+ // bulges
+ _bulgeLen_kMult = 3;
+ _bulgeLen_kAdd = 100;
+ _bulgeAltPath_kAdd = 50;
+ _bulgeAltPath_covMult = 1.1;
+
+ // EC
+ _ecLen_kMult = 9;
+ _ecRCTCcutoff = 4;
}
@@ -355,16 +372,16 @@ bool Simplifications<GraphType,Node,Edge>::satisfyRCTC(double pathAbundance, Nod
* for CAMI we were more loose than SPAdes: topological tip had no cov criterion, wherehas it should have had rctc (like spades)
* and long tc_lb10 tips should have the auto coverage bound, but instead they had rctc 2.
*
+ * here we also keep that philosophy: topological tips (<=3.5*k) do not need a coverage criterium to be removed (no rctc)
+ *
* so TODO: make it more strict. but for now I'm focusing on EC.
*/
template<typename GraphType, typename Node, typename Edge>
unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
{
unsigned int k = _graph.getKmerSize();
-
- unsigned int maxTipLengthTopological = (unsigned int)((float)k * (3.5 - 1.0)); // aggressive with SPAdes length threshold, but no coverage criterion
- unsigned int maxTipLengthRCTC = (unsigned int)(k * 10); // experimental, SPAdes-like
- double RCTCcutoff = 2; // SPAdes-like
+ unsigned int _maxTipLengthTopological = (unsigned int)((float)k * _tipLen_Topo_kMult); // aggressive with SPAdes length threshold, but no coverage criterion
+ unsigned int _maxTipLengthRCTC = (unsigned int)(k * _tipLen_RCTC_kMult); // experimental, SPAdes-like
unsigned long nbTipsRemoved = 0;
@@ -400,7 +417,8 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
dispatcher.iterate (*itNode, [&] (Node& node)
{
- /* just a quick note. It was observed in the context of flagging some node as uninteresting (not used anymore).
+ /* just a quick note, which was observed in the context of flagging some node as uninteresting (not used anymore).
+ * property: "a tip (detected at some point after some rounds of simplifications) is not necessarily a branching node initially in the original graph"
* here a strange dbg motif:
*
*
@@ -471,9 +489,9 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
TIME(auto end_simplepath_t=get_wtime());
TIME(__sync_fetch_and_add(&timeSimplePath, diff_wtime(start_simplepath_t,end_simplepath_t)));
- if (k + pathLen >= maxTipLengthTopological) // "k +" is to take into account that's we're actually traversing a path of extensions from "node"
+ if (k + pathLen >= _maxTipLengthTopological) // "k +" is to take into account that's we're actually traversing a path of extensions from "node"
isShortTopological = false;
- if (k + pathLen >= maxTipLengthRCTC)
+ if (k + pathLen >= _maxTipLengthRCTC)
isShortRCTC = false;
if (isShortTopological)
@@ -505,6 +523,8 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
isConnected |= (_graph.indegree(node) != 0 || _graph.outdegree(node) != 0);
}
+ // TODO would be worth it to check if the node where the tip is connected to, is also connected to the tip and something else. i.e existence of a V-shaped pattern.
+
bool isTopologicalShortTip = isShortTopological && isConnected;
bool isMaybeRCTCTip = isShortRCTC && isConnected;
@@ -515,7 +535,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
bool isRCTCTip = false;
if (!isTopologicalShortTip && isMaybeRCTCTip)
{
- isRCTCTip = this->satisfyRCTC(pathMeanAbundance, lastNode, RCTCcutoff, simplePathDir);
+ isRCTCTip = this->satisfyRCTC(pathMeanAbundance, lastNode, _tipRCTCcutoff, simplePathDir);
/* fun fact: not putting "this->" crashes gcc 4.7; was fun to debug :\ */
}
@@ -528,11 +548,18 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeTips()
if (isTip)
{
- // delete it
- _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
+ if (nodesDeleter.get(simplePathStart))
+ {
+ // not double-counting that delete
+ }
+ else
+ {
+ // delete it
+ _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
- __sync_fetch_and_add(&nbTipsRemoved, 1);
- DEBUG_TIPS(cout << endl << "TIP FOUND, deleting node : " << _graph.toString(simplePathStart) << endl);
+ __sync_fetch_and_add(&nbTipsRemoved, 1);
+ DEBUG_TIPS(cout << endl << "TIP FOUND, deleting node : " << _graph.toString(simplePathStart) << endl);
+ }
} // end if isTip
TIME(auto end_tip_processing_t=get_wtime());
@@ -642,6 +669,7 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path(
unsigned long nbCalls = 0;
mean_abundance = 0;
+ // so i've disabled the kmer version, see comment on hmcp_old() function below
#if 0
if (kmer_version)
{
@@ -979,13 +1007,14 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
bool most_covered, unsigned long &nbCalls)
{
bool debug = false;
- nbCalls++;
if (traversal_depth < -1)
{
success = HMCP_DIDNT_FIND_END;
return;
}
+
+ nbCalls++;
Node current_node = startNode;
if (debug)
@@ -1001,7 +1030,7 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
set<Node>& traversedNodes (usedNode);
int extra_depth = 1;
- auto processNode = [&](Node &node)
+ auto processNode = [&unitigs_lengths, &unitigs_abundances, &endNode, ¤t_node, &success, &mean_abundance, &traversedNodes](Node &node)
{
current_node = node;
if (current_node == endNode)
@@ -1032,7 +1061,6 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
Node& simplePathStart = current_node;
Direction simplePathDir = dir;
- Node lastNode = _graph.simplePathLastNode (simplePathStart,simplePathDir);
unsigned int pathLen = _graph.simplePathLength(simplePathStart,simplePathDir);
if (pathLen > 0)
{
@@ -1043,7 +1071,8 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
nbCalls += pathLen + 1;
extra_depth += pathLen + 1;
-
+
+ Node lastNode = _graph.simplePathLastNode (simplePathStart,simplePathDir);
if (processNode(lastNode)) // verify whether we're done
return;
@@ -1055,7 +1084,10 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
else
{
if (debug)
- std::cout << "HMCP last node was equal to first node: " << _graph.toString(lastNode) << " " << _graph.toString(current_node) << std::endl;;
+ {
+ Node lastNode = _graph.simplePathLastNode (simplePathStart,simplePathDir);
+ std::cout << "HMCP last node was equal to first node: " << _graph.toString(lastNode) << " " << _graph.toString(current_node) << std::endl;
+ }
}
// end of simple path, yet no out-branching? means there is in-branching
@@ -1166,26 +1198,80 @@ void Simplifications<GraphType,Node,Edge>::heuristic_most_covered_path_unitigs(
return;
}
+/* debugging helper for bulge removal: collects what was observed for each neighbor of a
+ * candidate bulge node, so that draw() can print everything in one go. */
+class DebugBR
+{
+    std::string start; // textual form of the candidate bulge node
+
+    // what is recorded for one neighbor of the start node
+    struct InfoNeighbor
+    {
+        int idx;
+        std::string neighbor;
+        std::string dir;
+        std::string lastNode;
+        int pathLen;
+        int out_degree_lastNode;
+        std::string branchTo;
+        bool isTopologicalBulge;
+    };
+
+    std::vector<InfoNeighbor> neighbors; // one slot per neighbor, filled by infoNeighbor()
+
+    public:
+
+    /* node: textual form of the candidate node; nb_neighbors: number of slots to reserve */
+    DebugBR(std::string node, int nb_neighbors) : start(node), neighbors(nb_neighbors)
+    {
+        // std::cout << "new candidate bulge: " << node << std::endl;
+    }
+
+    /* stores the information about neighbor number neighbor_idx (no bounds check) */
+    void infoNeighbor(int neighbor_idx, std::string neighbor, std::string direction, std::string lastNode, int pathLen, int nb_outneighbors, std::string branchTo, bool isTopologicalBulge)
+    {
+        neighbors[neighbor_idx] = InfoNeighbor{neighbor_idx, neighbor, direction, lastNode, pathLen, nb_outneighbors, branchTo, isTopologicalBulge};
+    }
+
+    /* prints the candidate node, then one line per neighbor slot */
+    void draw()
+    {
+        std::cout << start << " candidate bulge" << std::endl;
+        for (const InfoNeighbor& nfo : neighbors)
+        {
+            std::cout << " --> " << nfo.neighbor << "(" << nfo.dir << ") ...[len=" << nfo.pathLen << "]... " << nfo.lastNode;
+            std::cout << " [neighbors: " << nfo.out_degree_lastNode << "] -> " << nfo.branchTo;
+            std::cout << " istopobulge: " << nfo.isTopologicalBulge << std::endl;
+        }
+    }
+};
/* bulge removal algorithm. mimics spades, which doesnt remove bubbles, but only bulges. looks as effective.
- * it's slow to do heuristic_find_most_covered path so i'm testing it with no backtracking
*
* see a-b-c here for an explanation of bulge removal: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3791033/figure/f4/
*
* spades pops bulges based on something like the ratio between most examined simple path and a more covered path is (whether it is above 1.1).
* so i'm actually doing just that. I recall checking spades source code to implement this. this was during CAMI.
*
- * In SPAdes' source, a simple path isn't a non-branching one, it is rather the wikipedia definition: one where nodes aren't repeated (and also here, no node is its own reverse-complement). This makes me think that GATB's simplePath function is a bit of a misnomer, should be called nonBranchingPath.
+ * Small aside about simple paths: in SPAdes' source, a simple path isn't a non-branching one, it is rather the wikipedia definition: one where nodes aren't repeated (and also here, no node is its own reverse-complement). This makes me think that GATB's simplePath function is a bit of a misnomer, should be called nonBranchingPath.
+ *
+ * so.. this is what we did for CAMI and how the code was up to 2017.
+ * but, here's the twist, it turns out that this code doesn't remove any SNPs.
+ * kinda embarrassing. metagenomes don't have SNPs so that's why i never noticed so far.
+ * (maybe should have noticed that my human genomes assemblies were so bad)
+ *
+ * let me write the analysis of spades that i should have written earlier:
+ *
+ * in bulge_removed.hpp,
+ *
+ * if bulgecov > 1000, fail (i didn't implement this)
+ *
+ * delta = CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
+ * means that delta is set to max(bulgelen*0.1, 3)
+ *
+ * so we're looking for alternative paths which are of length [bulgelen-delta;bulgelen+delta]
+ *
*/
template<typename GraphType, typename Node, typename Edge>
unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
{
unsigned int k = _graph.getKmerSize();
- unsigned int coeff = 3;
- unsigned int additive_coeff = 100;
- unsigned int maxBulgeLength = std::max((unsigned int)((double)k * coeff), (unsigned int)(k + additive_coeff)); // SPAdes, exactly
-
- unsigned int backtrackingLimit = k+20;//maxBulgeLength; // arbitrary, but if too high it will take much time;
+ unsigned int maxBulgeLength = std::max((unsigned int)((double)k * _bulgeLen_kMult), (unsigned int)(k + _bulgeLen_kAdd)); // SPAdes, exactly
+ unsigned int backtrackingLimit = k+_bulgeAltPath_kAdd;//maxBulgeLength; // arbitrary, but if too high it will take much time; // with unitigs, no reason that it has to depend on k, but for some reason, setting it to just "k" doesnt remove nearly as many bulges as k=20. todo investigate that someday.
+ unsigned int altPathCovMult = _bulgeAltPath_covMult;
// stats
//
@@ -1263,10 +1349,11 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
TIME(auto start_various_overhead_t=get_wtime());
- DEBUG_BULGES(cout << "putative bulge node: " << _graph.toString (node) << endl);
-
/** We follow the outgoing simple paths to get their length and last neighbor */
GraphVector<Edge> neighbors = _graph.neighborsEdge(node, dir);
+
+ DEBUG_BULGES(DebugBR debugBR(_graph.toString(node), neighbors.size()););
+
TIME(auto end_various_overhead_t=get_wtime());
TIME(__sync_fetch_and_add(&timeVarious, diff_wtime(start_various_overhead_t,end_various_overhead_t)));
@@ -1287,7 +1374,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
TIME(auto end_various_overhead_t=get_wtime());
TIME(__sync_fetch_and_add(&timeVarious, diff_wtime(start_various_overhead_t,end_various_overhead_t)));
-
+ /* that's the putative bulge*/
TIME(auto start_simplepath_t=get_wtime());
Node& simplePathStart = neighbors[i].to;
Direction simplePathDir = dir;
@@ -1296,7 +1383,6 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
TIME(__sync_fetch_and_add(&timeSimplePath, diff_wtime(start_simplepath_t,end_simplepath_t)));
__sync_fetch_and_add(&nbSimplePaths, 1);
- DEBUG_BULGES(cout << "neighbors " << i+1 << "/" << neighbors.size() << " from: " << _graph.toString (neighbors[i].to) << " dir: " << DIR2STR(dir) << endl);
bool isShort = true;
if (k + pathLen >= maxBulgeLength) // "k +" is to take into account that's we're actually traversing a path of extensions from "node"
@@ -1315,15 +1401,17 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
Node lastNode = _graph.simplePathLastNode(simplePathStart,simplePathDir);
GraphVector<Edge> outneighbors = _graph.neighborsEdge(lastNode, dir);
- DEBUG_BULGES(cout << "last node of simple path: "<< _graph.toString(lastNode) << " has indegree/outdegree: " <<_graph.indegree(lastNode) << "/" << _graph.outdegree(lastNode) << " and " << outneighbors.size() << " neighbors in bubble direction" << endl);
if (outneighbors.size() == 0) // might still be a tip, unremoved for some reason
continue;
- // TODO: so here is a hidden assumption: maybe outneighbors is of size more than 1, why do we care about just one of the nodes after. it doesn't matter much, in the sense that just some bulges might remain
+ // so here is a hidden assumption: maybe outneighbors is of size more than 1, we used to care about the first node after.
+ // i could decide to enforce bulge popping only if the outneighbor has size 1.
+ //if (outneighbors.size() != 1)
+ // continue;
+ // but i've decided to do without for now. TODO: explore all the end nodes, not just the first one
Node endNode = outneighbors[0].to;
- DEBUG_BULGES(cout << "endNode: " << _graph.toString(endNode) << endl);
// at this point, the last node in "nodes" is the last node of a potential Bulge path, and endNode is hopefully a branching node right after.
// check if it's connected to something that has in-branching.
@@ -1331,7 +1419,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
bool isTopologicalBulge = isDoublyConnected;
- DEBUG_BULGES(cout << "pathlen: " << pathLen << " istopobulge: " << isTopologicalBulge << endl);
+ DEBUG_BULGES(debugBR.infoNeighbor(i, _graph.toString(neighbors[i].to), DIR2STR(dir), _graph.toString(lastNode), pathLen, outneighbors.size(), _graph.toString(endNode), isTopologicalBulge););
TIME(end_various_overhead_t=get_wtime());
TIME(__sync_fetch_and_add(&timeVarious, diff_wtime(start_various_overhead_t,end_various_overhead_t)));
@@ -1343,7 +1431,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
__sync_fetch_and_add(&nbTopologicalBulges, 1);
- unsigned int depth = std::max((unsigned int)(pathLen * 1.1),(unsigned int) 3); // following SPAdes
+ unsigned int maxlen = std::max((unsigned int)(pathLen * 1.1),(unsigned int) (pathLen + 3)); // following SPAdes
double mean_abundance_most_covered;
int success;
Node startNode = node;
@@ -1353,7 +1441,9 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
Path_t<Node> heuristic_p_most; // actually won't be used.. (it's just for debug) so would be nice to get rid of it someday, but i don't want to deal with pointers.
/* startNode is branching, because we want to find alternative paths, except the one that go through (neighbors[i].to)*/
- this->heuristic_most_covered_path(dir, startNode, endNode, depth+2, success, mean_abundance_most_covered,
+ this->heuristic_most_covered_path(dir, startNode, endNode,
+ maxlen,
+ success, mean_abundance_most_covered,
heuristic_p_most,
backtrackingLimit, // avoid too much backtracking
&(neighbors[i].to), // avoid that node
@@ -1369,7 +1459,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
DEBUG_BULGES(cout << "HMCP failed: " << hmcpstatus2ascii(success) << endl);
TIME(__sync_fetch_and_add(&timeFailedPathFinding, diff_wtime(start_pathfinding_t,end_pathfinding_t)));
TIME(if (diff_wtime(start_pathfinding_t,end_pathfinding_t) > timeLongestFailure) { timeLongestFailure = diff_wtime(start_pathfinding_t,end_pathfinding_t); });
- longestFailureDepth = depth;
+ longestFailureDepth = maxlen;
if (success == HMCP_LOOP)
__sync_fetch_and_add(&nbNoAltPathBulgesLoop, 1);
@@ -1388,17 +1478,17 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
double mean_abundance_least_covered;
Path_t<Node> heuristic_p_least, heuristic_p_most;
- this->heuristic_most_covered_path(dir, startNode, endNode, depth+2, success, mean_abundance_most_covered, heuristic_p_most, backtrackingLimit, &(neighbors[i].to), true, true /* old version */);
- this->heuristic_most_covered_path(dir, startNode, endNode, depth+2, success, mean_abundance_least_covered, heuristic_p_least, backtrackingLimit, &(neighbors[i].to), false, true /* old version */);
+ this->heuristic_most_covered_path(dir, startNode, endNode, maxlen, success, mean_abundance_most_covered, heuristic_p_most, backtrackingLimit, &(neighbors[i].to), true, true /* old version */);
+ this->heuristic_most_covered_path(dir, startNode, endNode, maxlen, success, mean_abundance_least_covered, heuristic_p_least, backtrackingLimit, &(neighbors[i].to), false, true /* old version */);
cout << "alternative path is: "<< this->path2string(dir, heuristic_p_most, endNode)<< " abundance: "<< mean_abundance_most_covered <<endl;
DEBUG_BULGES(cout << endl << "alternative least is: "<< this->path2string(dir, heuristic_p_least, endNode)<< " abundance: "<< mean_abundance_least_covered <<endl);
}
double simplePathCoverage = _graph.simplePathMeanAbundance(simplePathStart, simplePathDir);
- bool isBulge = simplePathCoverage * 1.1 <= mean_abundance_most_covered;
+ bool isBulge = simplePathCoverage <= mean_abundance_most_covered * altPathCovMult /*typically 1.1 in genome assembly, SPAdes*/;
- DEBUG_BULGES(cout << "bulge coverages: " << simplePathCoverage << " (path: " << _graph.toString(simplePathStart) << " vs most covered:" << mean_abundance_most_covered << endl);
+ DEBUG_BULGES(cout << "bulge coverages: " << simplePathCoverage << " (path: " << _graph.toString(simplePathStart) << ") vs most covered:" << mean_abundance_most_covered << endl);
if (!isBulge)
{
@@ -1410,17 +1500,28 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeBulges()
continue;
}
- // delete the bulge
- //
- DEBUG_BULGES(cout << endl << "BULGE of length " << pathLen << " FOUND: " << _graph.toString (simplePathStart) << endl);
- _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
+ if (nodesDeleter.get(simplePathStart))
+ {
+ // not double-counting that delete
+ }
+ else
+ {
+ // delete the bulge
+ //
+ DEBUG_BULGES(cout << endl << "BULGE of length " << pathLen << " FOUND: " << _graph.toString (simplePathStart) << endl);
+ _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
- __sync_fetch_and_add(&nbBulgesRemoved, 1);
+ __sync_fetch_and_add(&nbBulgesRemoved, 1);
+ }
TIME(auto end_post_t=get_wtime());
TIME(__sync_fetch_and_add(&timePost, diff_wtime(start_post_t,end_post_t)));
+ break; // quite important to break here: don't try to remove the other neighbor (which might also satisfy the bulge condition)
+
} // for neighbors
+
+ DEBUG_BULGES(debugBR.draw(););
} // if outdegree
} // for direction
TIME(auto end_thread_t=get_wtime());
@@ -1497,8 +1598,8 @@ template<typename GraphType, typename Node, typename Edge>
unsigned long Simplifications<GraphType,Node,Edge>::removeErroneousConnections()
{
unsigned int k = _graph.getKmerSize();
- unsigned int maxECLength = (unsigned int)((float)k * (10 - 1.0)) ; // SPAdes mode
- double RCTCcutoff = 4.0;
+ unsigned int maxECLength = (unsigned int)((float)k * _ecLen_kMult) ; // SPAdes mode
+ double RCTCcutoff = _ecRCTCcutoff;
unsigned long nbSimplePaths = 0;
unsigned long nbLongSimplePaths = 0;
@@ -1622,7 +1723,7 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeErroneousConnections()
double pathMeanAbundance = _graph.simplePathMeanAbundance(simplePathStart,simplePathDir);
GraphVector<Edge> outneighbors = _graph.neighborsEdge(lastNode, dir);
- DEBUG_EC(cout << "last simple path node: "<< _graph.toString(lastNode) << " has " << outneighbors.size() << " outneighbors" << endl);
+ DEBUG_EC(cout << "last simple path node: "<< _graph.toString(lastNode) << " has " << outneighbors.size() << " outneighbors; mean abundance: " << pathMeanAbundance << endl);
if (outneighbors.size() == 0) // might still be a tip, unremoved for some reason
continue;
@@ -1655,16 +1756,25 @@ unsigned long Simplifications<GraphType,Node,Edge>::removeErroneousConnections()
bool isEC = isRCTC;
+ DEBUG_EC(cout << "isRCTC:" << isRCTC << endl);
+
TIME(auto start_ec_processing_t=get_wtime());
if (isEC)
{
- // delete it
- //
- _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
- DEBUG_EC(cout << endl << "EC of length " << pathLen << " FOUND: " << _graph.toString (node) << endl);
-
- __sync_fetch_and_add(&nbECRemoved, 1);
+ if (nodesDeleter.get(simplePathStart))
+ {
+ // not double-counting that delete
+ }
+ else
+ {
+ // delete it
+ //
+ _graph.simplePathDelete(simplePathStart, simplePathDir, nodesDeleter);
+ DEBUG_EC(cout << endl << "EC of length " << pathLen << " FOUND: " << _graph.toString (node) << endl);
+
+ __sync_fetch_and_add(&nbECRemoved, 1);
+ }
}
TIME(auto end_ec_processing_t=get_wtime());
diff --git a/gatb-core/src/gatb/debruijn/impl/Simplifications.hpp b/gatb-core/src/gatb/debruijn/impl/Simplifications.hpp
index 3ff6401..2cf942d 100644
--- a/gatb-core/src/gatb/debruijn/impl/Simplifications.hpp
+++ b/gatb-core/src/gatb/debruijn/impl/Simplifications.hpp
@@ -57,7 +57,20 @@ public:
std::string tipRemoval, bubbleRemoval, ECRemoval;
bool _doTipRemoval, _doBulgeRemoval, _doECRemoval;
-
+
+ /* now exposing some parameters */
+ double _tipLen_Topo_kMult;
+ double _tipLen_RCTC_kMult;
+ double _tipRCTCcutoff;
+
+ double _bulgeLen_kMult;
+ unsigned int _bulgeLen_kAdd;
+ unsigned int _bulgeAltPath_kAdd;
+ unsigned int _bulgeAltPath_covMult;
+
+ double _ecLen_kMult;
+ double _ecRCTCcutoff;
+
protected:
/*const*/ GraphType & _graph;
int _nbCores;
diff --git a/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp b/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp
index ee71cb5..267ba32 100644
--- a/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp
@@ -18,14 +18,16 @@
*****************************************************************************/
#include <gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp>
+#include <gatb/kmer/impl/Configuration.hpp>
+#include <gatb/kmer/impl/PartiInfo.hpp>
#include <gatb/system/impl/System.hpp>
#include <gatb/tools/designpattern/impl/Command.hpp>
#include <gatb/tools/misc/impl/Progress.hpp>
#include <gatb/tools/misc/impl/Stringify.hpp>
#include <gatb/bcalm2/bcalm_algo.hpp>
#include <gatb/bcalm2/bglue_algo.hpp>
+#include <gatb/debruijn/impl/LinkTigs.hpp>
-#include <queue>
// We use the required packages
using namespace std;
@@ -60,9 +62,13 @@ UnitigsConstructionAlgorithm<span>::UnitigsConstructionAlgorithm (
tools::storage::impl::Storage& storage,
std::string unitigs_filename,
size_t nb_cores,
- tools::misc::IProperties* options
+ tools::misc::IProperties* options,
+ bool do_bcalm,
+ bool do_bglue,
+ bool do_links
)
- : Algorithm("bcalm2-wrapper", nb_cores, options), _storage(storage), unitigs_filename(unitigs_filename)
+ : Algorithm("bcalm2-wrapper", nb_cores, options), _storage(storage), unitigs_filename(unitigs_filename),
+ do_bcalm(do_bcalm), do_bglue(do_bglue), do_links(do_links)
{
}
@@ -85,10 +91,10 @@ UnitigsConstructionAlgorithm<span>::~UnitigsConstructionAlgorithm ()
template <size_t span>
void UnitigsConstructionAlgorithm<span>::execute ()
{
- int kmerSize =
+ kmerSize =
getInput()->getInt(STR_KMER_SIZE);
int abundance =
- getInput()->getInt(STR_KMER_ABUNDANCE_MIN);
+ getInput()->getInt(STR_KMER_ABUNDANCE_MIN); // note: doesn't work when it's "auto"
int minimizerSize =
getInput()->getInt(STR_MINIMIZER_SIZE);
int nb_threads =
@@ -96,19 +102,17 @@ void UnitigsConstructionAlgorithm<span>::execute ()
int minimizer_type =
getInput()->getInt(STR_MINIMIZER_TYPE);
bool verbose = getInput()->getInt(STR_VERBOSE);
-
+
unsigned int nbThreads = this->getDispatcher()->getExecutionUnitsNumber();
if ((unsigned int)nb_threads > nbThreads)
- {
std::cout << "Uh. Unitigs graph construction called with nb_threads " << nb_threads << " but dispatcher has nbThreads " << nbThreads << std::endl;
- }
-
- bcalm2<span>(&_storage, unitigs_filename, kmerSize, abundance, minimizerSize, nbThreads, minimizer_type, verbose);
- bglue<span> (&_storage, unitigs_filename, kmerSize, minimizerSize, nbThreads, minimizer_type, verbose);
- link_unitigs(unitigs_filename, kmerSize, verbose);
+ if (do_bcalm) bcalm2<span>(&_storage, unitigs_filename, kmerSize, abundance, minimizerSize, nbThreads, minimizer_type, verbose);
+ if (do_bglue) bglue<span> (&_storage, unitigs_filename, kmerSize, nbThreads, verbose);
+ if (do_links) link_tigs<span>(unitigs_filename, kmerSize, nbThreads, nb_unitigs, verbose);
/** We gather some statistics. */
+ // nb_unitigs will be used in GraphUnitigs
//getInfo()->add (1, "stats");
//getInfo()->add (2, "nb_unitigs", "%ld", /* */);
@@ -116,161 +120,23 @@ void UnitigsConstructionAlgorithm<span>::execute ()
//getInfo()->add (2, "build", "%.3f", /* */);
}
-/* this procedure finds the overlaps between unitigs, using a hash table of all extremity (k-1)-mers
- *
- * I guess it's like AdjList in ABySS. It's also like contigs_to_fastg in MEGAHIT.
- *
- * could be replaced by keeping edges during BCALM2, but it's not the case for now */
-template<size_t span>
-void UnitigsConstructionAlgorithm<span>::
-link_unitigs(string unitigs_filename, int kmerSize, bool verbose)
+// unused but nifty: rough estimate, in bytes, of the memory held by a vector of strings
+static uint64_t sizeof_string_vector(std::vector<std::string>& v)
{
- bool debug = false;
-
- BankFasta inputBank (unitigs_filename);
- BankFasta::Iterator itSeq (inputBank);
- uint64_t utig_counter = 0;
-
- Model modelKminusOne(kmerSize - 1); // it's canonical (defined in the .hpp file)
-
- if (verbose)
- std::cout << "Finding links between unitigs, pass 1, mem current/maxRSS: " << system::impl::System::info().getMemorySelfUsed() / 1024 << "/" << system::impl::System::info().getMemorySelfMaxUsed() / 1024 << std::endl;
-
- for (itSeq.first(); !itSeq.isDone(); itSeq.next())
- {
- const string& seq = itSeq->toString();
-
- typename Model::Kmer kmerBegin = modelKminusOne.codeSeed(seq.substr(0, kmerSize-1).c_str(), Data::ASCII);
- typename Model::Kmer kmerEnd = modelKminusOne.codeSeed(seq.substr(seq.size() - kmerSize+1).c_str(), Data::ASCII);
-
- bool beginInSameOrientation = modelKminusOne.toString(kmerBegin.value()) == seq.substr(0,kmerSize-1);
- bool endInSameOrientation = modelKminusOne.toString(kmerEnd.value()) == seq.substr(seq.size() - kmerSize+1);
-
- ExtremityInfo eBegin(utig_counter, !beginInSameOrientation /* because we record rc*/, UNITIG_BEGIN);
- ExtremityInfo eEnd( utig_counter, !endInSameOrientation, UNITIG_END);
- // there is no UNITIG_BOTH here because we're taking (k-1)-mers.
-
- utigs_links_map[kmerBegin.value()].push_back(eBegin.pack());
- utigs_links_map[kmerEnd.value()].push_back(eEnd.pack());
- utig_counter++;
- }
-
- BankFasta* out = new BankFasta(unitigs_filename+".indexed");
-
- if (verbose)
- std::cout << "Finding links between unitigs, pass 2, mem current/maxRSS: " << system::impl::System::info().getMemorySelfUsed() / 1024 << "/" << system::impl::System::info().getMemorySelfMaxUsed() / 1024 << std::endl;
-
- uint64_t utigs_number = 0;
- for (itSeq.first(); !itSeq.isDone(); itSeq.next())
- {
- const string& seq = itSeq->toString();
- const string& comment = itSeq->getComment();
-
- typename Model::Kmer kmerBegin = modelKminusOne.codeSeed(seq.substr(0, kmerSize-1).c_str(), Data::ASCII);
- typename Model::Kmer kmerEnd = modelKminusOne.codeSeed(seq.substr(seq.size() - kmerSize+1).c_str(), Data::ASCII);
- bool beginInSameOrientation = modelKminusOne.toString(kmerBegin.value()) == seq.substr(0,kmerSize-1); // that could be optimized, revcomp was already computed during codeSeed
- bool endInSameOrientation = modelKminusOne.toString(kmerEnd.value()) == seq.substr(seq.size() - kmerSize+1);
-
- // treat special palindromic kmer cases
- bool nevermindInOrientation = false;
- bool nevermindOutOrientation = false;
- if (((kmerSize - 1) % 2) == 0)
- {
- if (kmerBegin.isPalindrome()) nevermindInOrientation = true;
- if (kmerEnd.isPalindrome()) nevermindOutOrientation = true;
-
- }
-
- if (debug) std::cout << "unitig: " << seq << std::endl;
-
- string links;
-
- // in-neighbors
- for (auto in_packed : utigs_links_map[kmerBegin.value()])
- {
- ExtremityInfo e_in(in_packed);
-
-
- if (debug) std::cout << "extremity " << modelKminusOne.toString(kmerBegin.value()) << " ";
- if (debug) std::cout << "potential in-neighbor: " << e_in.toString() << " beginSameOrientation " << beginInSameOrientation;
-
- // what we want are these four cases:
- // ------[end same orientation] -> [begin same orientation]----
- // [begin diff orientation]---- -> [begin same orientation]----
- // ------[end diff orientation] -> [begin diff orientation]----
- // [begin same orientation]---- -> [begin diff orientation]----
- if ((((beginInSameOrientation) && (e_in.pos == UNITIG_END ) && (e_in.rc == false)) ||
- ((beginInSameOrientation) && (e_in.pos == UNITIG_BEGIN) && (e_in.rc == true)) ||
- (((!beginInSameOrientation)) && (e_in.pos == UNITIG_END ) && (e_in.rc == true)) ||
- (((!beginInSameOrientation)) && (e_in.pos == UNITIG_BEGIN) && (e_in.rc == false)))
- || nevermindInOrientation)
- {
- if (nevermindInOrientation && (e_in.unitig == utigs_number)) continue; // don't consider the same extremity
-
- //LinkInfo li(e_in.unitig, e_in.rc ^ beginInSameOrientation);
- //incoming[utig_number].push_back(li.pack());
- bool rc = e_in.rc ^ (!beginInSameOrientation);
- links += "L:-:" + to_string(e_in.unitig) + ":" + (rc?"+":"-") + " "; /* invert-reverse because of incoming orientation. it's very subtle and i'm still not sure i got it right */
-
- if (nevermindInOrientation)
- links += "L:-:" + to_string(e_in.unitig) + ":" + ((!rc)?"+":"-") + " "; /* in that case, there is also another link with the reverse direction*/
-
- if (debug) std::cout << " [valid] ";
- }
-
- if (debug) std::cout << std::endl;
- }
-
- // out-neighbors
- for (auto out_packed : utigs_links_map[kmerEnd.value()])
- {
- ExtremityInfo e_out(out_packed);
-
- if (debug) std::cout << "extremity " << modelKminusOne.toString(kmerEnd.value()) << " ";
- if (debug) std::cout << "potential out-neighbor: " << e_out.toString();
-
- // what we want are these four cases:
- // ------[end same orientation] -> [begin same orientation]----
- // ------[end same orientation] -> ------[end diff orientation]
- // ------[end diff orientation] -> [begin diff orientation]----
- // ------[end diff orientation] -> ------[end same orientation]
- if ((((endInSameOrientation) && (e_out.pos == UNITIG_BEGIN) && (e_out.rc == false)) ||
- ((endInSameOrientation) && (e_out.pos == UNITIG_END ) && (e_out.rc == true)) ||
- (((!endInSameOrientation)) && (e_out.pos == UNITIG_BEGIN) && (e_out.rc == true)) ||
- (((!endInSameOrientation)) && (e_out.pos == UNITIG_END ) && (e_out.rc == false)))
- ||nevermindOutOrientation)
- {
- if (nevermindOutOrientation && (e_out.unitig == utigs_number)) continue; // don't consider the same extremity
-
- //LinkInfo li(e_out.unitig, e_out.rc ^ endInSameOrientation);
- //outcoming[utig_number].push_back(li.pack());
- bool rc = e_out.rc ^ (!endInSameOrientation);
- links += "L:+:" + to_string(e_out.unitig) + ":" + (rc?"-":"+") + " "; /* logically this is going to be opposite of the line above */
-
- if (nevermindOutOrientation)
- links += "L:+:" + to_string(e_out.unitig) + ":" + ((!rc)?"-":"+") + " "; /* in that case, there is also another link with the reverse direction*/
-
- if (debug) std::cout << " [valid] ";
- }
- if (debug) std::cout << std::endl;
- }
-
- Sequence s (Data::ASCII);
- s.getData().setRef ((char*)seq.c_str(), seq.size());
- s._comment = comment + " " + links;
- out->insert(s);
- utigs_number++;
- }
- nb_unitigs = utigs_number;
-
- delete out;
- system::impl::System::file().remove (unitigs_filename);
- system::impl::System::file().rename (unitigs_filename+".indexed", unitigs_filename);
+    // see http://stackoverflow.com/questions/29868622/memory-consumed-by-a-string-vector-in-c
+    uint64_t sum = 0;
+    for (const auto& s : v) // by reference: a copy is not guaranteed to keep the stored string's capacity
+        sum += s.capacity();
+
+    return sizeof(std::vector<std::string>) // The size of the vector basics.
+        + sizeof(std::string) * v.capacity() // Size of the string object, not the text
+        // One string object for each item in the vector.
+        // **The multiplier may want to be the capacity of the vector,
+        // **the reserved quantity.
+        // + sum of each string's capacity (the reserved text storage);
+        + sum;
}
-
-
-
/********************************************************************************/
} } } } /* end of namespaces. */
/********************************************************************************/
diff --git a/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp b/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp
index a0f21a3..1b8c493 100644
--- a/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp
+++ b/gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.hpp
@@ -23,15 +23,10 @@
/********************************************************************************/
#include <gatb/tools/misc/impl/Algorithm.hpp>
-#include <gatb/bank/api/IBank.hpp>
-#include <gatb/kmer/impl/Model.hpp>
-#include <gatb/kmer/impl/Configuration.hpp>
-#include <gatb/kmer/impl/PartiInfo.hpp>
+#include <gatb/kmer/impl/Model.hpp> // for KMER_DEFAULT_SPAN and so on
+
#include <gatb/tools/storage/impl/Storage.hpp>
#include <gatb/bcalm2/bcalm_algo.hpp>
-#include <gatb/debruijn/impl/ExtremityInfo.hpp>
-
-#include <unordered_map>
/********************************************************************************/
@@ -48,12 +43,6 @@ template <size_t span=KMER_DEFAULT_SPAN>
class UnitigsConstructionAlgorithm : public gatb::core::tools::misc::impl::Algorithm
{
public:
-
- /** Shortcuts. */
- typedef typename kmer::impl::Kmer<span>::ModelCanonical Model;
- typedef typename kmer::impl::Kmer<span>::Type Type;
- typedef typename kmer::impl::Kmer<span>::Count Count;
-
/** Constructor.
* \param[in] graph : graph from which we look for branching nodes
* \param[in] nb_cores : number of cores to be used; 0 means all available cores
@@ -63,7 +52,10 @@ public:
tools::storage::impl::Storage& storage,
std::string unitigs_filename,
size_t nb_cores = 0,
- tools::misc::IProperties* options = 0
+ tools::misc::IProperties* options = 0,
+ bool do_bcalm = true,
+ bool do_bglue = true,
+ bool do_links = true
);
/** Destructor. */
@@ -77,20 +69,15 @@ public:
/** \copydoc tools::misc::impl::Algorithm::execute */
void execute ();
- // structure that links each kmer to an unitig
- // also used to enumerate kmers
- typedef typename std::unordered_map<Type, std::vector<uint64_t>> NodeLinksMap;
+ int kmerSize;
- NodeLinksMap utigs_links_map;
-
- void link_unitigs(std::string unitigs_filename, int kmerSize, bool verbose);
-
uint64_t nb_unitigs;
private:
tools::storage::impl::Storage& _storage;
std::string unitigs_filename;
+ bool do_bcalm, do_bglue, do_links;
};
/********************************************************************************/
diff --git a/gatb-core/src/gatb/debruijn/impl/dag_vector.hpp b/gatb-core/src/gatb/debruijn/impl/dag_vector.hpp
new file mode 100644
index 0000000..c8609a2
--- /dev/null
+++ b/gatb-core/src/gatb/debruijn/impl/dag_vector.hpp
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2011 Daisuke Okanohara
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above Copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above Copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the authors nor the names of its contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ */
+
+#ifndef DAG_VECTOR_HPP_
+#define DAG_VECTOR_HPP_
+
+#include <vector>
+#include <stdint.h>
+#include "rank_vector.hpp"
+
+namespace dag{
+
+/**
+ * Direct Accessible Gamma code Vector (each value is stored as val+1, one bit per level)
+ */
+class dag_vector{
+public:
+  /**
+   * Constructor: creates an empty vector
+   */
+  dag_vector(): size_(0), sum_(0), max_shift_num_(0){
+  }
+
+  /**
+   * Destructor
+   */
+  ~dag_vector(){
+  }
+
+  /**
+   * Add element in a gamma code (val+1 is what gets encoded)
+   * @param val an element to be added
+   */
+  void push_back(uint64_t val){
+    uint64_t val1 = val+1;
+    uint64_t shift = binary_len(val1);
+    resize(shift);
+    for (size_t i = 0; i < shift; ++i){
+      bitvals_[i].push_back((val1 >> i) & 0x1LLU);
+      bitunaries_[i].push_back(1);
+    }
+    if (shift == bitunaries_.size()){
+      max_shift_num_++;
+    } else {
+      bitunaries_[shift].push_back(0);
+    }
+    size_++;
+    sum_ += val;
+  }
+
+  /**
+   * Lookup the ind-th element
+   * @param ind the index
+   * @return the ind-th element
+   */
+  uint64_t operator[] (uint64_t ind) const{
+    uint64_t val = 0;
+    for (uint64_t shift = 0; shift < bitunaries_.size(); ++shift){
+      if (!bitunaries_[shift].get_bit(ind)){
+        return val + ((1LLU) << shift) -1;
+      }
+      ind = bitunaries_[shift].rank(ind);
+      val += bitvals_[shift].get_bit(ind) << shift;
+    }
+    return val + ((1LLU) << bitunaries_.size()) - 1;
+  }
+
+  /**
+   * Compute the prefix sum: the sum of [0...ind-1] values.
+   * O(log max_val) time.
+   * @param ind the index
+   * @return the sum of v[0] v[1] ... v[ind-1]
+   */
+  uint64_t prefix_sum(uint64_t ind) const{
+    uint64_t orig_ind = ind;
+    uint64_t ret = 0;
+    for (uint64_t shift = 0; shift < bitunaries_.size(); ++shift){
+      uint64_t ones = bitunaries_[shift].rank(ind);
+      ret += (ind - ones) << shift;
+      ret += bitvals_[shift].rank(ones) << shift;
+      ind = ones;
+    }
+    return ret + (ind << bitunaries_.size()) - orig_ind; // stored codes are val+1, hence "- orig_ind"
+  }
+
+  /**
+   * Compute the prefix sum and the value in O(log max_val) time.
+   * @param ind the index
+   * @return the pair of the prefix sum (sum of v[0] v[1] ... v[ind-1]) and v[ind]
+   */
+  std::pair<uint64_t, uint64_t> prefix_sum_val(uint64_t ind) const{
+    uint64_t orig_ind = ind;
+    uint64_t sum = 0;
+    uint64_t val = 0;
+    bool val_finish = false;
+    for (uint64_t shift = 0; shift < bitunaries_.size(); ++shift){
+      uint64_t ones = bitunaries_[shift].rank(ind);
+      sum += (ind - ones) << shift;
+      if (!val_finish && !bitunaries_[shift].get_bit(ind)){
+        val += (1LLU) << shift;
+        val_finish = true;
+      }
+      sum += bitvals_[shift].rank(ones) << shift;
+      if (!val_finish){
+        val += bitvals_[shift].get_bit(ones) << shift;
+      }
+      ind = ones;
+    }
+    if (!val_finish ){
+      val += (1LLU) << bitunaries_.size();
+    }
+    sum += ind << bitunaries_.size();
+    return std::make_pair(sum - orig_ind, val-1);
+  }
+
+  /**
+   * Return the number of elements
+   * @return the number of elements
+   */
+  uint64_t size() const{
+    return size_;
+  }
+
+  /**
+   * Return the sum of values
+   * @return the sum of values
+   */
+  uint64_t sum() const{
+    return sum_;
+  }
+
+  /**
+   * Swap the content
+   * @param dagv the dag_vector to be swapped
+   */
+  void swap(dag_vector& dagv){
+    bitvals_.swap(dagv.bitvals_);
+    bitunaries_.swap(dagv.bitunaries_);
+    std::swap(size_, dagv.size_);
+    std::swap(sum_, dagv.sum_);
+    std::swap(max_shift_num_, dagv.max_shift_num_);
+  }
+
+  /**
+   * Clear the content (size() and sum() are both 0 afterwards)
+   */
+  void clear() {
+    // Swap with a fresh instance: the old bit vectors are destroyed with it and
+    // every counter is reset -- including sum_, which clear() used to leave stale.
+    dag_vector fresh;
+    swap(fresh);
+  }
+
+  /**
+   * Get the number of allocated bytes
+   */
+  uint64_t get_alloc_byte_num() const{
+    uint64_t byte_num = 0;
+    for (size_t i = 0; i < bitvals_.size(); ++i){
+      uint64_t block_num = (bitvals_[i].size() + 64 - 1) / 64;
+      byte_num += sizeof(uint64_t) * block_num
+        + sizeof(uint64_t) * (block_num / 4)
+        + sizeof(uint8_t) * block_num;
+    }
+    for (size_t i = 0; i < bitunaries_.size(); ++i){
+      uint64_t block_num = (bitunaries_[i].size() + 64 - 1) / 64;
+      byte_num += sizeof(uint64_t) * block_num
+        + sizeof(uint64_t) * (block_num / 4)
+        + sizeof(uint8_t) * block_num;
+    }
+    return byte_num;
+  }
+
+  class const_iterator : public std::iterator<std::random_access_iterator_tag, uint64_t, size_t> {
+  public:
+    const_iterator(const dag_vector& dagv) : bitunaries_(dagv.bitunaries_), bitvals_(dagv.bitvals_) {
+      bitunary_poses_.resize(bitunaries_.size()+1);
+      bitval_poses_.resize(bitvals_.size());
+      set_cur_val();
+    }
+
+    const_iterator& end(const dag_vector& dagv){ // moves this iterator to the past-the-end position
+      for (size_t i = 0; i < bitval_poses_.size(); ++i){
+        bitval_poses_[i] = bitvals_[i].size();
+      }
+      for (size_t i = 1; i < bitunary_poses_.size(); ++i){
+        bitunary_poses_[i-1] = bitunaries_[i-1].size();
+      }
+      bitunary_poses_.back() = dagv.max_shift_num_;
+      cur_val_ = 0;
+      cur_shift_ = bitval_poses_.size();
+      return *this;
+    }
+
+    const_iterator& operator++(){
+      for (size_t i = 0; ; ++i){
+        ++bitunary_poses_[i];
+        if (i == cur_shift_){
+          break;
+        }
+        ++bitval_poses_[i];
+      }
+      set_cur_val();
+      return *this;
+    }
+
+    const_iterator operator++(int){
+      const_iterator tmp(*this);
+      ++*this;
+      return tmp;
+    }
+
+    const_iterator& operator--(){
+      for (size_t i = 0; ; ++i){
+        --bitunary_poses_[i];
+        if (i == cur_shift_){
+          break;
+        }
+        --bitval_poses_[i];
+      }
+
+      set_cur_val();
+      return *this;
+    }
+
+    const_iterator operator--(int){
+      const_iterator tmp(*this);
+      --*this;
+      return tmp;
+    }
+
+    size_t operator-(const const_iterator& it) const{
+      return bitunary_poses_[0] - it.bitunary_poses_[0];
+    }
+
+    bool operator==(const const_iterator& it) const{
+      // Compare element positions (as operator- does): bitval_poses_ does not advance on zero values.
+      return bitunary_poses_[0] == it.bitunary_poses_[0];
+    }
+
+    bool operator!=(const const_iterator& it) const{
+      return !(*this == it);
+    }
+
+    uint64_t operator*() const {
+      return cur_val_;
+    }
+
+  private:
+    void set_cur_val() { // decodes the element at the current positions into cur_val_ / cur_shift_
+      uint64_t val = 0;
+      cur_shift_ = 0;
+      for (; cur_shift_ < bitunaries_.size(); ++cur_shift_){
+        if (!bitunaries_[cur_shift_].get_bit(bitunary_poses_[cur_shift_])){
+          break;
+        }
+        val += bitvals_[cur_shift_].get_bit(bitval_poses_[cur_shift_]) << cur_shift_;
+      }
+      cur_val_ = val + (1LLU << cur_shift_) - 1;
+    }
+
+    const std::vector<rank_vector>& bitunaries_;
+    const std::vector<rank_vector>& bitvals_;
+    std::vector<uint64_t> bitval_poses_;
+    std::vector<uint64_t> bitunary_poses_; // entry 0 advances once per element, i.e. it is the element index
+    uint64_t cur_shift_;
+    uint64_t cur_val_;
+  };
+
+  const_iterator begin() const{
+    return const_iterator(*this);
+  }
+
+  const_iterator end() const{
+    const_iterator it = const_iterator(*this);
+    return it.end(*this);
+  }
+
+  static uint64_t binary_len(uint64_t val){ // index of the highest set bit (0 for val <= 1)
+    uint64_t shift = 0;
+    for (; (val >> shift) > 1; ++shift){}
+    return shift;
+  }
+
+private:
+  void resize(uint64_t shift){ // grows the number of levels to at least `shift`
+    uint64_t old_shift = bitunaries_.size();
+    if (shift <= old_shift){
+      return;
+    }
+    bitunaries_.resize(shift);
+    bitvals_.resize(shift);
+    for (size_t i = 0; i < max_shift_num_; ++i){
+      bitunaries_[old_shift].push_back(0);
+    }
+    max_shift_num_ = 0;
+  }
+
+  std::vector<rank_vector> bitunaries_; /// unary codes
+  std::vector<rank_vector> bitvals_; /// value codes
+  uint64_t size_; /// the number of codes
+  uint64_t sum_; /// the sum of values
+  uint64_t max_shift_num_; /// the number of codes that have the largest length
+};
+
+}
+
+#endif // DAG_VECTOR_HPP_
+
diff --git a/gatb-core/src/gatb/debruijn/impl/rank_vector.hpp b/gatb-core/src/gatb/debruijn/impl/rank_vector.hpp
new file mode 100644
index 0000000..b4dbd78
--- /dev/null
+++ b/gatb-core/src/gatb/debruijn/impl/rank_vector.hpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2011 Daisuke Okanohara
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above Copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above Copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the authors nor the names of its contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ */
+
+
+#ifndef RANK_VECTOR_HPP_
+#define RANK_VECTOR_HPP_
+
+#include <vector>
+#include <stdint.h>
+
+namespace dag{
+
+/**
+ * Bit vector supporting the rank operation
+ */
+class rank_vector{
+public:
+
+  /**
+   * Constructor: builds an empty bit vector
+   */
+  rank_vector(): size_(0), one_num_(0){
+    bits_.push_back(0);    // first, still empty, 64-bit word
+    lblocks_.push_back(0); // no ones before the first large block
+    sblocks_.push_back(0); // no ones before the first small block
+  }
+
+  /*
+   * No destructor is declared on purpose: every member releases itself, and a
+   * user-declared destructor would suppress the implicit move operations, so
+   * that growing a std::vector<rank_vector> would deep-copy every element.
+   */
+
+  /**
+   * Add bit to the end of the vector
+   * @param bit a bit to be added (any non-zero value is stored as 1)
+   */
+  void push_back(uint64_t bit){
+    if (bit){
+      bits_[size_ / BLOCKSIZE] |= (1LLU << (size_ % BLOCKSIZE));
+    }
+    size_++;
+    if ((size_ % BLOCKSIZE) == 0){ // current word is full: open the next one
+      add_block();
+    }
+  }
+
+  /**
+   * Get the pos-th bit
+   * @param pos the index (not range-checked)
+   * @return the pos-th bit
+   */
+  uint64_t get_bit(uint64_t pos) const{
+    return (bits_[pos/BLOCKSIZE] >> (pos % BLOCKSIZE)) & 0x1LLU;
+  }
+
+  /**
+   * Calculate the number of ones in bits_[0...pos-1] in O(1) time.
+   * @param pos the position in the bit array
+   * @return the number of ones in bits_[0...pos-1]
+   */
+  uint64_t rank(uint64_t pos) const{
+    return lblocks_[pos/LBLOCKSIZE]
+      + sblocks_[pos/BLOCKSIZE]
+      + pop_count(bits_[pos/BLOCKSIZE] & ((1LLU << (pos % BLOCKSIZE)) - 1));
+  }
+
+  /**
+   * Return the size of bit array in bits.
+   * @return the number of bits
+   */
+  uint64_t size() const{
+    return size_;
+  }
+
+  /**
+   * Swap the content in bit vector
+   * @param rv the rank_vector to be swapped
+   */
+  void swap(rank_vector& rv){
+    bits_.swap(rv.bits_);
+    lblocks_.swap(rv.lblocks_);
+    sblocks_.swap(rv.sblocks_);
+    std::swap(size_, rv.size_);
+    std::swap(one_num_, rv.one_num_);
+  }
+
+ private:
+  static const uint64_t LBLOCKSIZE = 256; // bits covered by one lblocks_ entry
+  static const uint64_t BLOCKSIZE = 64;   // bits per word of bits_ and per sblocks_ entry
+  // Called each time a word fills up: updates the rank tables and appends a fresh word.
+  void add_block(){
+    if (bits_.size() > 0){
+      one_num_ += pop_count(bits_.back());
+    }
+
+    if (size_ % LBLOCKSIZE == 0){
+      lblocks_.push_back(one_num_);
+    }
+    sblocks_.push_back(one_num_ - lblocks_[size_ / LBLOCKSIZE]); // ones since the start of the enclosing large block
+    bits_.push_back(0LLU);
+  }
+  // Returns the number of set bits in x, computed with bit-parallel partial sums.
+  inline static uint64_t pop_count(uint64_t x){
+    x = x - ((x & 0xAAAAAAAAAAAAAAAALLU) >> 1);
+    x = (x & 0x3333333333333333LLU) + ((x >> 2) & 0x3333333333333333LLU);
+    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FLLU;
+    return x * 0x0101010101010101LLU >> 56;
+  }
+
+  std::vector<uint64_t> bits_; /// bit array
+  std::vector<uint64_t> lblocks_; /// rank results for large blocks
+  std::vector<uint8_t> sblocks_; /// rank results for small blocks
+  uint64_t size_; /// the length of bit array
+  uint64_t one_num_; /// the number of ones in the bit array
+};
+
+
+}
+
+#endif // RANK_VECTOR_HPP_
diff --git a/gatb-core/src/gatb/gatb_core.hpp b/gatb-core/src/gatb/gatb_core.hpp
index 4eac4f9..26cbae2 100644
--- a/gatb-core/src/gatb/gatb_core.hpp
+++ b/gatb-core/src/gatb/gatb_core.hpp
@@ -81,7 +81,10 @@
#include <gatb/debruijn/impl/Frontline.hpp>
#include <gatb/debruijn/impl/IterativeExtensions.hpp>
#include <gatb/debruijn/impl/BranchingAlgorithm.hpp>
-
+#include <gatb/debruijn/impl/LinkTigs.hpp>
+#include <gatb/debruijn/impl/ExtremityInfo.hpp>
+#include <gatb/bcalm2/bcalm_algo.hpp>
+#include <gatb/bcalm2/bglue_algo.hpp>
#include <gatb/tools/compression/RangeCoder.hpp>
#include <gatb/tools/compression/CompressionUtils.hpp>
diff --git a/gatb-core/src/gatb/kmer/.DS_Store b/gatb-core/src/gatb/kmer/.DS_Store
deleted file mode 100644
index 7cb14a2..0000000
Binary files a/gatb-core/src/gatb/kmer/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/kmer/impl/.DS_Store b/gatb-core/src/gatb/kmer/impl/.DS_Store
deleted file mode 100644
index f67c141..0000000
Binary files a/gatb-core/src/gatb/kmer/impl/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/kmer/impl/ConfigurationAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/ConfigurationAlgorithm.cpp
index e94c2fc..e55c062 100644
--- a/gatb-core/src/gatb/kmer/impl/ConfigurationAlgorithm.cpp
+++ b/gatb-core/src/gatb/kmer/impl/ConfigurationAlgorithm.cpp
@@ -337,7 +337,7 @@ void ConfigurationAlgorithm<span>::execute ()
assert (_config._max_disk_space > 0);
- _config._nb_passes = ( (_config._volume/3) / _config._max_disk_space ) + 1; //minim, approx volume /3
+ _config._nb_passes = ( (_config._volume/4) / _config._max_disk_space ) + 1; //minim, approx volume /switched to approx /4 (was/3) because of more efficient superk storage
//_nb_passes = 1; //do not constrain nb passes on disk space anymore (anyway with minim, not very big)
//increase it only if ram issue
@@ -347,7 +347,7 @@ void ConfigurationAlgorithm<span>::execute ()
if (_config._storage_type == tools::storage::impl::STORAGE_FILE)
{
- std::cout << "using less max_open_open files (" << max_open_files << "), by 4x, due to storage file setting" << std::endl;
+ std::cout << "using less max_open_open files (" << max_open_files << "), by 3x, due to storage file setting" << std::endl;
max_open_files /= 3; // will need to open twice in STORAGE_FILE instead of HDF5, so this adjustment is needed. needs to be fixed later by putting partitions inside the same file. but i'd rather not do it in the current messy collection/group/partition hdf5-inspired system. overall, that's a FIXME
}
diff --git a/gatb-core/src/gatb/kmer/impl/CountProcessorHistogram.hpp b/gatb-core/src/gatb/kmer/impl/CountProcessorHistogram.hpp
index 814f62d..d74c14c 100644
--- a/gatb-core/src/gatb/kmer/impl/CountProcessorHistogram.hpp
+++ b/gatb-core/src/gatb/kmer/impl/CountProcessorHistogram.hpp
@@ -135,6 +135,9 @@ public:
result.add (0, "histogram");
result.add (1, "cutoff", "%ld", _histogram->get_solid_cutoff());
result.add (1, "nb_ge_cutoff", "%ld", _histogram->get_nbsolids_auto());
+ result.add (1, "ratio_weak_volume", "%.2f", _histogram->get_ratio_weak());
+
+
// result->add (1, "percent_ge_cutoff", "%.1f", nbSolids > 0 ? 100.0 * (double)_histogram->get_nbsolids_auto() / (double)_bankStats.kmersNbValid : 0);
result.add (1, "first_peak", "%ld", _histogram->get_first_peak());
diff --git a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp
index cfe06de..82190d6 100644
--- a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp
+++ b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp
@@ -68,7 +68,7 @@ static const char* messages[] = {
* => http://stackoverflow.com/questions/2738435/using-numeric-limitsmax-in-constant-expressions
*/
template<size_t span, typename Abundance_t, typename NodeState_t>
-const Abundance_t MPHFAlgorithm<span,Abundance_t,NodeState_t>::MAX_ABUNDANCE = std::numeric_limits<Abundance_t>::max();
+ const Abundance_t MPHFAlgorithm<span,Abundance_t,NodeState_t>::MAX_ABUNDANCE = std::numeric_limits<Abundance_t>::max();
/*********************************************************************
** METHOD :
@@ -233,6 +233,8 @@ void MPHFAlgorithm<span,Abundance_t,NodeState_t>::populate ()
// TODO parallize that
+ std::vector<int> & _abundanceDiscretization = _abundanceMap->_abundanceDiscretization ;
+ int max_abundance_discrete = _abundanceDiscretization[_abundanceDiscretization.size()-2];
// set counts and at the same time, test the mphf
for (itKmers->first(); !itKmers->isDone(); itKmers->next())
{
@@ -247,14 +249,18 @@ void MPHFAlgorithm<span,Abundance_t,NodeState_t>::populate ()
/** We get the abundance of the current kmer. */
int abundance = itKmers->item().abundance;
- if (abundance > MAX_ABUNDANCE)
+ if (abundance > max_abundance_discrete)
{
_nb_abundances_above_precision++;
- abundance = MAX_ABUNDANCE;
+ abundance = max_abundance_discrete;
}
+ //get first cell strictly greater than abundance
+ std::vector<int>::iterator up = std::upper_bound(_abundanceDiscretization.begin(), _abundanceDiscretization.end(), abundance);
+ up--; // get previous cell
+ int idx = up- _abundanceDiscretization.begin() ;
/** We set the abundance of the current kmer. */
- _abundanceMap->at (h) = abundance;
+ _abundanceMap->at (h) = idx;
nb_iterated ++;
}
@@ -303,15 +309,15 @@ void MPHFAlgorithm<span,Abundance_t,NodeState_t>::check ()
/** We get the current abundance. */
Abundance_t abundance = (*_abundanceMap)[count.value];
- // sanity check (thank god i wrote this, was useful for spruce)
- if (abundance!=count.abundance && abundance<MAX_ABUNDANCE)
+ // sanity check (thank god i wrote this, was useful for spruce) //todo change this now that abundance is discretized
+ /* if (abundance!=count.abundance && abundance<MAX_ABUNDANCE)
{
std::cout << "debug info: " << (int)abundance << " " << (int)count.abundance << std::endl;
typename AbundanceMap::Hash::Code h = _abundanceMap->getCode (count.value);
size_t n = _abundanceMap->size();
std::cout << "debug info: " << h << " / " << n << std::endl;
throw Exception ("ERROR: MPHF isn't injective (abundance population failed)");
- }
+ }*/
nb_iterated ++;
}
diff --git a/gatb-core/src/gatb/kmer/impl/Model.hpp b/gatb-core/src/gatb/kmer/impl/Model.hpp
index 4235a43..d40e8e9 100644
--- a/gatb-core/src/gatb/kmer/impl/Model.hpp
+++ b/gatb-core/src/gatb/kmer/impl/Model.hpp
@@ -31,7 +31,6 @@
#include <gatb/system/api/Exception.hpp>
#include <gatb/kmer/api/IModel.hpp>
-
#include <gatb/tools/collections/api/Bag.hpp>
#include <gatb/tools/designpattern/api/Iterator.hpp>
@@ -41,6 +40,8 @@
#include <gatb/tools/math/Integer.hpp>
+#include <gatb/tools/storage/impl/Storage.hpp>
+
#include <vector>
#include <algorithm>
#include <iostream>
@@ -600,7 +601,7 @@ struct Kmer
/** Set the data to be iterated.
* \param[in] d : the data as information source for the iterator
*/
- void setData (tools::misc::Data& d)
+ void setData (tools::misc::Data& d) // TODO: should this be const? I feel like it should
{
/** We fill the vector with the items to be iterated. */
_ref.build (d, this->_items);
@@ -915,19 +916,52 @@ struct Kmer
has_frequency = false;
}
- void include_frequency (uint32_t *freq_order)
+ void include_frequency (uint32_t *freq_order)
{
_freq_order = freq_order;
has_frequency = true;
}
- bool operator() (const Type& a_t, const Type& b_t) const {
+ template<class Model> Type computeLargest (const Model& model,int mmersize)
+ { // Returns the mmer that this comparator ranks last: scanned from the frequency table when one is set, model.getKmerMax() otherwise.
+   Type largest;
+   if(has_frequency)
+   {
+     u_int64_t nbminims_total = ((u_int64_t)1 << (2*mmersize)); // 4^mmersize candidate mmers
+     // linear scan over all candidates, starting with mmer 0 as the provisional maximum
+     Type mmer_max;
+     mmer_max.setVal(0);
+     uint32_t _freq_max = _freq_order[mmer_max.getVal()];
+     for(u_int64_t ii=0; ii< nbminims_total; ii++) // 64-bit counter: a 32-bit one can never reach the bound when mmersize >= 16
+     {
+       Type Tii;
+       Tii.setVal(ii);
+       // "not (Tii < mmer_max)" means Tii ranks at least as high: it becomes the new maximum
+       if( ! (*this)(Tii,mmer_max ))
+       {
+         mmer_max.setVal(ii);
+         _freq_max = _freq_order[ii];
+       }
+     }
+     printf("largest freq is %u for %s\n",_freq_max,mmer_max.toString(mmersize).c_str()); // %u: _freq_max is unsigned
+     largest = mmer_max;
+   }
+   else
+   {
+     largest = model.getKmerMax();
+   }
+
+   return largest;
+ }
+
+ bool operator() (const Type& a_t, const Type& b_t) const {
u_int64_t a = a_t.getVal();
u_int64_t b = b_t.getVal();
if (has_frequency)
{
//printf("testing freq order of %d %d: %d %d, min is gonna be: %d\n",a,b,_freq_order[a], _freq_order[b], (_freq_order[a] < _freq_order[b]) ? a : b);
+ //printf("freq order %llu %llu %i %i %i\n",a,b,_freq_order[a],_freq_order[b], _freq_order[a] > _freq_order[b]);
if (_freq_order[a] == _freq_order[b])
return a < b;
return _freq_order[a] < _freq_order[b];
@@ -980,7 +1014,7 @@ struct Kmer
if (kmerSize < minimizerSize) { throw system::Exception ("Bad values for kmer %d and minimizer %d", kmerSize, minimizerSize); }
_minimizerSize = minimizerSize;
-
+
/** We compute the number of mmers found in a kmer. */
_nbMinimizers = _kmerModel.getKmerSize() - minimizerSize + 1;
@@ -993,7 +1027,7 @@ struct Kmer
* The value is actually set by the Comparator instance provided as a template of the class. */
Type tmp;
_cmp.template init<ModelType> (getMmersModel(), tmp);
- _minimizerDefault.set (tmp);
+ _minimizerDefault.set (tmp); //////////max value of minim
u_int64_t nbminims_total = ((u_int64_t)1 << (2*_minimizerSize));
_mmer_lut = (Type *) MALLOC(sizeof(Type) * nbminims_total ); //free that in destructor
@@ -1006,27 +1040,33 @@ struct Kmer
for(u_int64_t ii=0; ii< nbminims_total; ii++)
{
Type mmer;
- mmer.setVal(ii);
+ mmer.setVal(ii);
+
+ // if(!is_allowed(mmer.getVal(),minimizerSize)) mmer = _mask;
+ // if(!is_allowed(rev_mmer.getVal(),minimizerSize)) rev_mmer = _mask;
+
+ if (isModelCanonical)
+ {
+ Type rev_mmer = revcomp(mmer, minimizerSize);
+ if(rev_mmer < mmer) mmer = rev_mmer;
+
+ //may be cleaner with this
+ //if (_cmp (rev_mmer, mmer ) == true)
+ // mmer = rev_mmer;
+ }
+
+ //std:: cout << "ii " << ii << " is allowed " << is_allowed(mmer.getVal(),minimizerSize) << " mmer getval " << mmer.getVal() << " is model canonical " << isModelCanonical << std::endl;
+
+ if (!is_allowed(mmer.getVal(),minimizerSize))
+ mmer = _mask;
- // if(!is_allowed(mmer.getVal(),minimizerSize)) mmer = _mask;
- // if(!is_allowed(rev_mmer.getVal(),minimizerSize)) rev_mmer = _mask;
-
- if (isModelCanonical)
- {
- Type rev_mmer = revcomp(mmer, minimizerSize);
- if(rev_mmer < mmer) mmer = rev_mmer;
- }
-
- //std:: cout << "ii " << ii << " is allowed " << is_allowed(mmer.getVal(),minimizerSize) << " mmer getval " << mmer.getVal() << " is model canonical " << isModelCanonical << std::endl;
-
- if (!is_allowed(mmer.getVal(),minimizerSize))
- mmer = _mask;
-
_mmer_lut[ii] = mmer;
}
if (freq_order)
setMinimizersFrequency(freq_order);
+
+
}
@@ -1141,9 +1181,13 @@ struct Kmer
justSweepForAA(km.value(0), _nbMinimizers, dummy);
}
+ //return value of larger mmer in the freq order
void setMinimizersFrequency (uint32_t *freq_order)
{
_cmp.include_frequency(freq_order);
+
+ //Type tmp =_cmp.template computeLargest<ModelType>(getMmersModel(),_minimizerSize);
+ //_minimizerDefault.set (tmp);
}
// hack to access compare int's, for bcalm, needs to be made cleaner later
@@ -1169,42 +1213,43 @@ struct Kmer
bool _defaultFast;
uint32_t *_freq_order;
+
/** Tells whether a minimizer is valid or not, in order to skip minimizers
* that are too frequent. */
bool is_allowed (uint32_t mmer, uint32_t len)
- {
- if (_freq_order) return true; // every minimizer is allowed in freq order
-
- u_int64_t _mmask_m1 ;
- u_int64_t _mask_0101 ;
- u_int64_t _mask_ma1 ;
+ {
+ if (_freq_order) return true; // every minimizer is allowed in freq order
+
+ u_int64_t _mmask_m1 ;
+ u_int64_t _mask_0101 ;
+ u_int64_t _mask_ma1 ;
//code to ban mmer with AA inside except if at the beginnning
// A C T G 00 01 10 11
- _mmask_m1 = (1 << ((len-2)*2)) -1 ; //vire 2 premieres lettres m = 8 donne 00 00 11 11 11 11 11 11
- _mask_0101 = 0x5555555555555555 ; // 01 01 01 01 01 01 01 01
- _mask_ma1 = _mask_0101 & _mmask_m1;// 00 00 01 01 01 01 01 01
-
- u_int64_t a1 = mmer; //
- a1 = ~(( a1 ) | ( a1 >>2 )); //
- a1 =((a1 >>1) & a1) & _mask_ma1 ; //
-
- if(a1 != 0) return false;
-
- // if ((mmer & 0x3f) == 0x2a) return false; // TTT suffix
- // if ((mmer & 0x3f) == 0x2e) return false; // TGT suffix
- // if ((mmer & 0x3c) == 0x28) return false; // TT* suffix
- // for (uint32_t j = 0; j < len - 3; ++j) // AA inside
- // if ((mmer & 0xf) == 0) return false;
- // else mmer >>= 2;
- // if (mmer == 0) return false; // AAA prefix
- // if (mmer == 0x04) return false; // ACA prefix
- // if ((mmer & 0xf) == 0) return false; // *AA prefix
-
- return true;
- }
-
+ _mmask_m1 = (1 << ((len-2)*2)) -1 ; //vire 2 premieres lettres m = 8 donne 00 00 11 11 11 11 11 11
+ _mask_0101 = 0x5555555555555555 ; // 01 01 01 01 01 01 01 01
+ _mask_ma1 = _mask_0101 & _mmask_m1;// 00 00 01 01 01 01 01 01
+
+ u_int64_t a1 = mmer; //
+ a1 = ~(( a1 ) | ( a1 >>2 )); //
+ a1 =((a1 >>1) & a1) & _mask_ma1 ; //
+
+ if(a1 != 0) return false;
+
+ // if ((mmer & 0x3f) == 0x2a) return false; // TTT suffix
+ // if ((mmer & 0x3f) == 0x2e) return false; // TGT suffix
+ // if ((mmer & 0x3c) == 0x28) return false; // TT* suffix
+ // for (uint32_t j = 0; j < len - 3; ++j) // AA inside
+ // if ((mmer & 0xf) == 0) return false;
+ // else mmer >>= 2;
+ // if (mmer == 0) return false; // AAA prefix
+ // if (mmer == 0x04) return false; // ACA prefix
+ // if ((mmer & 0xf) == 0) return false; // *AA prefix
+
+ return true;
+ }
+
/** Returns the minimizer of the provided vector of mmers. */
void computeNewMinimizerOriginal(Kmer& kmer) const
{
@@ -1220,15 +1265,17 @@ struct Kmer
Type kmer_minimizer_value = kmer._minimizer.value();
Type val = kmer.value(0);
-
+
for (int16_t idx=_nbMinimizers-1; idx>=0; idx--)
{
+
/** We extract the most left mmer in the kmer. */
Type candidate_minim = _mmer_lut[(val & _mask).getVal()];
-
+
+
/** We check whether this mmer is the new minimizer. */
if (_cmp (candidate_minim, kmer_minimizer_value ) == true)
- {
+ {
mmer.set(candidate_minim);
kmer._minimizer = mmer;
kmer._position = idx;
@@ -1304,7 +1351,10 @@ struct Kmer
SuperKmer (size_t kmerSize, size_t miniSize)
: minimizer(DEFAULT_MINIMIZER), kmerSize(kmerSize), miniSize(miniSize)
{
+ _max_size_sk = 1000;
kmers.clear();
+ _sk_buffer = (u_int8_t *) malloc(_max_size_sk);
+ _sk_buffer_idx=0;
// if (kmers.empty()) { kmers.resize(kmerSize); range.second = kmers.size()-1; }
}
@@ -1328,8 +1378,99 @@ struct Kmer
void reset()
{
kmers.clear();
+ //binrep.clear();
+ _sk_buffer_idx =0;
+ }
+
+ //save superkmer to CacheSuperKmerBinFiles
+ void save(tools::storage::impl::CacheSuperKmerBinFiles & cacheSuperkFile, int file_id)
+ {
+   // Packs this superkmer into _sk_buffer at 2 bits per nucleotide (4 per byte)
+   // and passes the packed bytes to cacheSuperkFile.insertSuperkmer().
+   // Layout: the forward value of the first kmer, lowest bits first, followed
+   // by the 2 lowest bits of the forward value of each subsequent kmer.
+   //  - cacheSuperkFile : cache receiving the packed superkmer
+   //  - file_id         : forwarded unchanged to insertSuperkmer()
+   // Precondition: at least one kmer is stored ((*this)[0] is read unconditionally).
+   //               (kmerSize + superKmerLen - 1) nucleotides are written in total.
+   //
+   size_t superKmerLen = size();
+
+   // byte count that is always sufficient for the nucleotides written below
+   int required_bytes = (superKmerLen + kmerSize +3) /4 ;
+   if(required_bytes > _max_size_sk)
+   {
+     _max_size_sk = required_bytes; // record the new capacity first ...
+     _sk_buffer = (u_int8_t *) realloc(_sk_buffer, _max_size_sk); // ... so the buffer really grows (it used to be reallocated to its old size)
+   }
+
+
+   // restart writing at the beginning of the buffer
+   _sk_buffer_idx =0;
+
+   Type basekmer = (*this)[0].forward();
+
+   int rem_size = kmerSize; // nucleotides of the first kmer not written yet
+   u_int8_t newbyte=0;
+   u_int64_t mask4nt = 255; // 4 nucleotides = 8 bits
+   u_int64_t mask1nt = 3;   // 1 nucleotide = 2 bits
+   // whole bytes of the first kmer
+   while(rem_size>=4)
+   {
+     newbyte = basekmer.getVal() & mask4nt ; // maybe a getVal and a cast to u_int8_t
+     rem_size -= 4;
+     basekmer = basekmer >> 8;
+     _sk_buffer[_sk_buffer_idx++]= newbyte;
+     //binrep.push_back(newbyte);
+//   //debug pushing
+//   Type dd; dd.setVal(newbyte);
+//   printf("pushing %s\n", (dd.toString(4)).c_str());
+//   //
+   }
+
+   // remainder of the first kmer (fewer than 4 nucleotides, possibly none)
+   newbyte = basekmer.getVal() & mask4nt;
+   int uid = rem_size; // uid = number of nucleotides already used in this newbyte
+
+   // the rest of newbyte, and every following byte, is filled from the other kmers
+
+   int skid =1;
+
+   while(true)
+   {
+
+     while(uid<4 && skid < superKmerLen)
+     {
+
+       u_int8_t newnt = ((*this)[skid].forward()).getVal() & mask1nt ;
+
+       newbyte |= newnt << (uid*2);
+       uid++; skid++;
+     }
+     // an empty byte (uid == 0) is not emitted
+     if(uid > 0)
+       _sk_buffer[_sk_buffer_idx++]= newbyte;
+
+     //binrep.push_back(newbyte);
+//   //debug pushing
+//   Type dd; dd.setVal(newbyte);
+//   printf("pushing %s\n", (dd.toString(4)).c_str());
+//   //
+
+     if(skid >= superKmerLen) break;
+
+     newbyte=0; uid=0;
+   }
+
+
+   //printf("insert superK %i _sk_buffer_idx %i \n",kmers.size(),_sk_buffer_idx);
+
+   // cacheSuperkFile.insertSuperkmer(binrep.data(), binrep.size(), kmers.size(), file_id);
+   cacheSuperkFile.insertSuperkmer(_sk_buffer, _sk_buffer_idx, kmers.size(), file_id);
+
}
+
/** */
void save (tools::collections::Bag<Type>& bag)
{
@@ -1399,10 +1540,19 @@ struct Kmer
}
#endif
+ ~SuperKmer() // frees the byte buffer obtained with malloc() in the constructor (and possibly resized with realloc() in save())
+ { // NOTE(review): no copy constructor/assignment is visible in this hunk; a copied SuperKmer would free _sk_buffer twice -- confirm copies cannot happen
+ free(_sk_buffer);
+ }
+
private:
size_t kmerSize;
size_t miniSize;
std::vector<Kmer> kmers;
+ //std::vector<u_int8_t> binrep;
+ u_int8_t * _sk_buffer;
+ int _sk_buffer_idx;
+ int _max_size_sk;
};
/************************************************************/
diff --git a/gatb-core/src/gatb/kmer/impl/PartiInfo.hpp b/gatb-core/src/gatb/kmer/impl/PartiInfo.hpp
index bdc9f07..8331daf 100644
--- a/gatb-core/src/gatb/kmer/impl/PartiInfo.hpp
+++ b/gatb-core/src/gatb/kmer/impl/PartiInfo.hpp
@@ -58,6 +58,8 @@ public:
//increases both superk and kmer count
inline void incSuperKmer_per_minimBin(int numbin, int superksize, u_int64_t val=1)
{
+ _nb_superk_total+=val;
+ _nb_kmer_total += val*superksize;
_superk_per_mmer_bin[numbin]+= val ;
_kmer_per_mmer_bin[numbin]+= val *superksize;
}
@@ -102,6 +104,10 @@ public:
__sync_fetch_and_add (_kxmer_per_mmer_bin + ii, other.getNbKxmer_per_minim (ii));
}
+ __sync_fetch_and_add(&_nb_superk_total , other._nb_superk_total);
+ __sync_fetch_and_add(&_nb_kmer_total , other._nb_kmer_total);
+
+
return *this;
}
@@ -122,7 +128,16 @@ public:
{
return _nb_kxmers_per_parti[numpart];
}
-
+
+ inline u_int64_t getNbSuperKmerTotal() const // total accumulated by incSuperKmer_per_minimBin() (and when another PartiInfo is added to this one)
+ {
+ return _nb_superk_total;
+ }
+
+ inline u_int64_t getNbKmerTotal() const // sum of val*superksize accumulated by incSuperKmer_per_minimBin() (and when another PartiInfo is added to this one)
+ {
+ return _nb_kmer_total;
+ }
/** */
inline u_int64_t getNbSuperKmer_per_minim(int numbin) const
{
@@ -198,7 +213,8 @@ public:
Typem cur;
cur.setVal(np);
- printf("Bin[%5i (%s) ]= %lli %lli\n",np,cur.toString(_mm).c_str(), this->getNbSuperKmer_per_minim(np),this->getNbKmer_per_minim(np) );
+ if(this->getNbSuperKmer_per_minim(np)!=0 || this->getNbKmer_per_minim(np) !=0 )
+ printf("Bin[%5i (%s) ]= %lli %lli\n",np,cur.toString(_mm).c_str(), this->getNbSuperKmer_per_minim(np),this->getNbKmer_per_minim(np) );
sumk += this->getNbKmer_per_minim(np);
sumsuperk += this->getNbSuperKmer_per_minim(np);
@@ -211,6 +227,8 @@ public:
/** Constructor. */
PartiInfo(int nbpart, int minimsize) : _nbpart(nbpart), _mm(minimsize)
{
+ _nb_superk_total =0;
+ _nb_kmer_total =0;
_nb_kmers_per_parti = (u_int64_t*) CALLOC (nbpart, sizeof(u_int64_t));
_nb_kxmers_per_parti = (u_int64_t*) CALLOC (nbpart, sizeof(u_int64_t));
_num_mm_bins = 1 << (2*_mm);
@@ -234,7 +252,9 @@ public:
_num_mm_bins = cr._num_mm_bins;
_nbpart = cr._nbpart;
_mm = cr._mm;
-
+ _nb_superk_total = cr._nb_superk_total;
+ _nb_kmer_total = cr._nb_kmer_total;
+
_nb_kmers_per_parti = (u_int64_t*) CALLOC (_nbpart, sizeof(u_int64_t));
_nb_kxmers_per_parti = (u_int64_t*) CALLOC (_nbpart, sizeof(u_int64_t));
_superk_per_mmer_bin = (u_int64_t*) CALLOC (_num_mm_bins, sizeof(u_int64_t));
@@ -272,6 +292,8 @@ private:
u_int64_t* _nb_kmers_per_parti;
u_int64_t* _nb_kxmers_per_parti; //now used to store number of kxmers per parti
u_int64_t* _superk_per_mmer_bin;
+ u_int64_t _nb_superk_total;
+ u_int64_t _nb_kmer_total;
u_int64_t* _kmer_per_mmer_bin;
u_int64_t* _kxmer_per_mmer_bin;
diff --git a/gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp b/gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp
index 1e46086..18ab0e8 100644
--- a/gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp
+++ b/gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp
@@ -20,6 +20,8 @@
#include <gatb/kmer/impl/PartitionsCommand.hpp>
#include <gatb/tools/collections/impl/OAHash.hpp>
#include <gatb/tools/collections/impl/Hash16.hpp>
+#include <gatb/tools/misc/impl/Stringify.hpp>
+
using namespace std;
@@ -55,7 +57,7 @@ namespace gatb { namespace core { namespace kmer { namespace impl {
*********************************************************************/
template<size_t span>
PartitionsCommand<span>:: PartitionsCommand (
- Iterable<Type>& partition,
+ // Iterable<Type>& partition,
CountProcessor* processor,
size_t cacheSize,
IteratorListener* progress,
@@ -65,10 +67,12 @@ PartitionsCommand<span>:: PartitionsCommand (
int parti,
size_t nbCores,
size_t kmerSize,
- MemAllocator& pool
+ MemAllocator& pool,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
+
)
:
- _partition(partition),
+ // _partition(partition),
_progress(progress),
_pInfo(pInfo),
_pass_num(passi),
@@ -78,7 +82,8 @@ PartitionsCommand<span>:: PartitionsCommand (
_cacheSize(cacheSize),
_pool(pool),
_globalTimeInfo(timeInfo),
- _processor(0)
+ _processor(0),
+ _superKstorage(superKstorage)
{
setProcessor (processor);
}
@@ -114,6 +119,57 @@ void PartitionsCommand<span>::insert (const Type& kmer, const CounterBuilder& co
_processor->process (_parti_num, kmer, counter.get());
}
+
+
+///////// multibank version with old partition system//////////////
+template<size_t span> // constructor: stores its arguments in the matching members, then registers the count processor
+PartitionsCommand_multibank<span>:: PartitionsCommand_multibank (
+ Iterable<Type>& partition,
+ CountProcessor* processor,
+ size_t cacheSize,
+ IteratorListener* progress,
+ TimeInfo& timeInfo,
+ PartiInfo<5>& pInfo,
+ int passi,
+ int parti,
+ size_t nbCores,
+ size_t kmerSize,
+ MemAllocator& pool
+ )
+:
+_partition(partition),
+_progress(progress),
+_pInfo(pInfo),
+_pass_num(passi),
+_parti_num(parti),
+_nbCores(nbCores),
+_kmerSize(kmerSize),
+_cacheSize(cacheSize),
+_pool(pool),
+_globalTimeInfo(timeInfo),
+_processor(0) // starts null; the real value is installed through setProcessor() below
+{
+ setProcessor (processor);
+}
+
+template<size_t span> // destructor
+PartitionsCommand_multibank<span>::~PartitionsCommand_multibank()
+{
+ _globalTimeInfo += _timeInfo; // add this command's timings to the TimeInfo received at construction
+ // reset the processor to null through the same setter the constructor used
+ setProcessor (0);
+}
+
+template<size_t span> // forwards one counted kmer of the current partition to the count processor
+void PartitionsCommand_multibank<span>::insert (const Type& kmer, const CounterBuilder& counter)
+{
+ /** We call the count processor instance with the information collected for the current kmer. */
+ _processor->process (_parti_num, kmer, counter.get());
+}
+/////////////////
+
+
+
/*********************************************************************
# # # ##### # #
# # # # # # # #
@@ -135,7 +191,7 @@ void PartitionsCommand<span>::insert (const Type& kmer, const CounterBuilder& co
/** in this scheme we count k-mers inside a partition by a hash table */
template<size_t span>
PartitionsByHashCommand<span>:: PartitionsByHashCommand (
- Iterable<Type>& partition,
+ // Iterable<Type>& partition,
CountProcessor* processor,
size_t cacheSize,
IteratorListener* progress,
@@ -146,13 +202,164 @@ PartitionsByHashCommand<span>:: PartitionsByHashCommand (
size_t nbCores,
size_t kmerSize,
MemAllocator& pool,
- u_int64_t hashMemory
+ u_int64_t hashMemory,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
+
)
- : PartitionsCommand<span> (partition, processor, cacheSize, progress, timeInfo, pInfo, passi, parti,nbCores,kmerSize,pool),
+ : PartitionsCommand<span> (/*partition,*/ processor, cacheSize, progress, timeInfo, pInfo, passi, parti,nbCores,kmerSize,pool,superKstorage),
_hashMemory(hashMemory)
{
}
+
+//will take N sorted files, will merge them to M files, by chunks of T files at a time
+
+ template<size_t span>
+ class TempCountFileMerger
+ {
+   typedef typename Kmer<span>::Type Type;
+   typedef tools::misc::Abundance<Type> abundance_t;
+   typedef std::pair< int , Type> ptcf; // id pointer , kmer value
+   struct ptcfcomp { bool operator() (ptcf l,ptcf r) { return ((r.second) < (l.second)); } } ; // reversed order: the queue yields the smallest kmer first
+
+ public:
+   TempCountFileMerger(int reduceTarget, int chunksize) :_reduceTarget(reduceTarget), _chunksize(chunksize),_idx(0)
+   {
+   }
+   // Merges sorted count files (summing the abundances of equal kmers) until at most _reduceTarget remain; returns the remaining names.
+   std::vector<string> mergeFiles(std::vector<string> filenames)
+   {
+     ptcf best_elem;
+     int best_p;
+     int current_ab = 0;
+     int previous_ab = 0;
+     Type current_kmer,previous_kmer;
+     // Each round replaces up to 'chunk' files by one; at least 2 are needed for the list to shrink.
+     const size_t chunk = (_chunksize < 2) ? 2 : _chunksize;
+     while(filenames.size() > 1 && filenames.size() > _reduceTarget)
+     {
+       // take the files of this round from the back of the list, never more than are left
+       std::vector<string> currentFiles;
+       for(size_t ii=0; ii<chunk && !filenames.empty(); ii++)
+       {
+         currentFiles.push_back(filenames.back()); filenames.pop_back();
+       }
+
+       //the new file containing the merged counts
+       std::string newfname = currentFiles[0] + Stringify::format ("_merged_%i", _idx++) ;
+       BagFile<abundance_t> * bagf = new BagFile<abundance_t>(newfname); LOCAL(bagf);
+       Bag<abundance_t> * currentbag = new BagCache<abundance_t> ( bagf, 10000 ); LOCAL(currentbag);
+
+       filenames.push_back(newfname);
+
+       std::vector<Iterator<abundance_t>*> _tmpCountIterators;
+
+       for(int ii=0; ii< currentFiles.size(); ii++)
+       {
+         _tmpCountIterators.push_back( new IteratorFile<abundance_t> (currentFiles[ii]) );
+       }
+       std::priority_queue< ptcf, std::vector<ptcf>,ptcfcomp > pq;
+
+
+       //// init all iterators ////
+       for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+       {
+         _tmpCountIterators[ii]->first();
+       }
+
+       ////// init pq ////
+       for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+       {
+         if( ! _tmpCountIterators[ii]->isDone()) {
+           pq.push(ptcf(ii,_tmpCountIterators[ii]->item().value) );
+         }
+       }
+
+       //now merge the n sorted iterators and merge their kmer counts.
+       if(pq.size() != 0)
+       {
+         //get first pointer
+         best_elem = pq.top() ; pq.pop();
+         best_p = best_elem.first;
+         previous_ab = _tmpCountIterators[best_p]->item().abundance;
+         previous_kmer = best_elem.second;
+
+         //go forward in this list
+         _tmpCountIterators[best_p]->next();
+         if (! _tmpCountIterators[best_p]->isDone())
+         {
+           pq.push(ptcf( best_p,_tmpCountIterators[best_p]->item().value) );
+         }
+
+         while (pq.size() != 0)
+         {
+
+           //get first pointer
+           best_elem = pq.top() ; pq.pop();
+           best_p = best_elem.first;
+           current_ab = _tmpCountIterators[best_p]->item().abundance;
+           current_kmer = best_elem.second;
+
+           //go forward in this list
+           _tmpCountIterators[best_p]->next();
+           if (! _tmpCountIterators[best_p]->isDone())
+           {
+             pq.push(ptcf( best_p,_tmpCountIterators[best_p]->item().value) );
+           }
+
+
+           if(current_kmer != previous_kmer)
+           {
+             //output previous kmer
+             currentbag->insert( abundance_t(previous_kmer,previous_ab) );
+             previous_kmer = current_kmer;
+             previous_ab = current_ab;
+           }
+           else
+           {
+             //merge counter
+             previous_ab += current_ab;
+           }
+         }
+
+         //output last one
+         currentbag->insert( abundance_t(previous_kmer,previous_ab) );
+       }
+
+
+       currentbag->flush();
+
+
+       //cleanup
+
+       for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+       {
+         delete _tmpCountIterators[ii];
+       }
+
+
+       //erase used files
+       for(int ii=0; ii< currentFiles.size(); ii++)
+       {
+         std::string fname = currentFiles[ii];
+         system::impl::System::file().remove(fname);
+       }
+
+     }
+
+
+     return filenames;
+   }
+
+ private :
+
+   int _reduceTarget; // merging stops once no more than this many files remain
+   int _chunksize;    // number of files merged into one per round
+   int _idx;          // counter used to build the "_merged_%i" file names
+
+ };
+
+
/*********************************************************************
** METHOD :
** PURPOSE :
@@ -164,107 +371,371 @@ PartitionsByHashCommand<span>:: PartitionsByHashCommand (
template<size_t span>
void PartitionsByHashCommand<span>:: execute ()
{
- this->_processor->beginPart (this->_pass_num, this->_parti_num, this->_cacheSize, this->getName());
-
- CounterBuilder solidCounter;
+ typedef typename tools::collections::impl::Hash16<Type>::cell cell_t;
+ this->_superKstorage->openFile("r",this->_parti_num); // open the superkmer file of this partition for reading
+ // Overview: decode the superkmer blocks of this partition, count kmers in a Hash16 table, dump the table to a temporary file whenever it exceeds _hashMemory, then merge the dumped files with the table and pass each kmer count to insert().
+ this->_processor->beginPart (this->_pass_num, this->_parti_num, this->_cacheSize, this->getName());
+
+ CounterBuilder solidCounter;
+
/** We need a map for storing part of solid kmers. */
//OAHash<Type> hash (_hashMemory);
-
- Hash16<Type> hash16 (_hashMemory/MBYTE); // now use hash 16 to ensure always finish
- /** We directly fill the vector from the current partition file. */
- Iterator<Type>* it = this->_partition.iterator(); LOCAL(it);
+ Hash16<Type> hash16 (_hashMemory/MBYTE); // now use hash 16 to ensure always finish. needs more ram than OAHash but seems faster
+
+
+
// If the partition holds kmers (and not superkmers), it would be :
// for (it->first(); !it->isDone(); it->next()) { hash.increment (it->item()); }
-
+
DEBUG (("PartitionsByHashCommand::execute: fillsolid parti num %i by oahash --- mem %llu MB\n",
- this->_parti_num,_hashMemory/MBYTE
- ));
+ this->_parti_num,_hashMemory/MBYTE
+));
- //with decompactage
- //superk
- u_int8_t nbK, rem ;
- Type compactedK;
- int ks = this->_kmerSize;
- Type un; un.setVal(1);
- size_t _shift_val = Type::getSize() -8;
- Type kmerMask = (un << (ks*2)) - un;
- size_t shift = 2*(ks-1);
-
- /** We iterate the superkmers from the table. */
- for (it->first(); !it->isDone(); it->next())
- {
- /** A superkmer is encoded with two successive Type objects, so we read both of them. */
- Type superk = it->item();
- it->next();
- Type seedk = it->item();
+
+ typedef tools::misc::Abundance<Type> abundance_t;
+ std::vector<string> _tmpCountFileNames;
+
+
+ //with decompaction (each superkmer is unpacked into its kmers)
+ //superk
+ int ks = this->_kmerSize;
+ Type un; un.setVal(1);
+ //size_t _shift_val = Type::getSize() -8;
+ Type kmerMask = (un << (ks*2)) - un; // keeps the low 2*ks bits
+ size_t shift = 2*(ks-1); // bit offset of the highest nucleotide of a kmer
- compactedK = superk;
- nbK = (compactedK >> _shift_val).getVal() & 255; // 8 bits poids fort = cpt //todo for large k values
- rem = nbK;
+ Type _seedk;
+
+ int _fileId = this->_parti_num;
+ unsigned char * _buffer = 0 ;
+ unsigned int _buffer_size = 0;
- Type temp = seedk;
- Type rev_temp = revcomp(temp,ks);
- Type newnt ;
- Type mink;
- /** We loop over each kmer of the current superkmer. */
- for (int ii=0; ii<nbK; ii++,rem--)
- {
+
+ unsigned int nb_bytes_read;
+ while(this->_superKstorage->readBlock(&_buffer, &_buffer_size, &nb_bytes_read, _fileId)) // one iteration per block returned by readBlock
+ {
+ unsigned char * ptr = _buffer;
+ u_int8_t nbK; //number of kmers in the superkmer
+ u_int8_t newbyte=0;
+
+ while(ptr < (_buffer+nb_bytes_read)) //decode whole block
+ {
+ //decode a superkmer
+ nbK = *ptr; ptr++; // first byte of a superkmer: its kmer count
+ //int nb_bytes_superk = (this->_kmerSize + nbK -1 +3) /4 ;
+
+ int rem_size = this->_kmerSize;
+
+ Type Tnewbyte;
+ int nbr=0;
+ _seedk.setVal(0);
+ while(rem_size>=4) // full bytes of the seed kmer, 4 nucleotides (8 bits) per byte
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+ _seedk = _seedk | (Tnewbyte << (8*nbr)) ;
+ rem_size -= 4; nbr++;
+ }
+
+ int uid = 4; //uid = nb nt used in current newbyte
+
+ //remainder of the seed kmer (fewer than 4 nucleotides left)
+ if(rem_size>0)
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+
+ _seedk = ( _seedk | (Tnewbyte << (8*nbr)) ) ;
+ uid = rem_size;
+ }
+ _seedk = _seedk & kmerMask;
+
+
+
+ u_int8_t rem = nbK;
+ Type temp = _seedk;
+ Type rev_temp = revcomp(temp,this->_kmerSize);
+ Type newnt ;
+ Type mink;
+
+
+ //iterate over kmers of this superk
+ for (int ii=0; ii< nbK; ii++,rem--)
+ {
+
#ifdef NONCANONICAL
- mink = temp;
+ mink = temp;
#else
- mink = std::min (rev_temp, temp);
+ mink = std::min (rev_temp, temp);
#endif
+
+
+ /** We insert the kmer into the hash. */
+ hash16.insert(mink);
+
+
+ if(rem < 2) break; //no more kmers in this superkmer, the last one has just been eaten
+
+ ////////now decode next kmer of this superkmer ///////
+
+ if(uid>=4) //read next byte
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+ uid =0;
+ }
+
+ newnt = (Tnewbyte >> (2*uid))& 3; uid++;
+ temp = ((temp << 2 ) | newnt ) & kmerMask;
+
+ newnt.setVal(comp_NT[newnt.getVal()]) ;
+ rev_temp = ((rev_temp >> 2 ) | (newnt << shift) ) & kmerMask;
+ }
+
+ //now go to next superk of this block, ptr should point to beginning of next superk
+ }
- /** We insert the kmer into the hash. */
- //hash.increment (mink);
- hash16.insert(mink);
-
- if(rem < 2) break;
- newnt = ( superk >> ( 2*(rem-2)) ) & 3 ;
-
- temp = ((temp << 2 ) | newnt ) & kmerMask;
- newnt.setVal(comp_NT[newnt.getVal()]) ;
- rev_temp = ((rev_temp >> 2 ) | (newnt << shift) ) & kmerMask;
+ //check if hashtable is getting too big : in that case dump it to disk and resume with the emptied hashtable
+ //at the end merge-sort all the dumped files with the content of hash table
+ if(hash16.getByteSize() > _hashMemory) // to be improved (can be slightly larger than maxmemory by a block size)
+ //if(_tmpCountFileNames.size()<20) //force dumps for testing
+ {
+ //printf("splitting into subparts %lli KB / %lli KB parti %i subpart %i \n",hash16.getByteSize()/1024,_hashMemory/1024 ,this->_parti_num,_tmpCountFiles.size() );
+ //dump partial count to disk file
+
+
+ Iterator < cell_t >* itKmerAbundancePartial = hash16.iterator(true);
+ LOCAL (itKmerAbundancePartial);
+
+
+ std::string fname = this->_superKstorage->getFileName(this->_parti_num) + Stringify::format ("_subpart_%i", _tmpCountFileNames.size()) ; // NOTE(review): size() is a size_t but the format uses %i - confirm Stringify::format handles this
+ _tmpCountFileNames.push_back(fname);
+
+ BagFile<abundance_t> * bagf = new BagFile<abundance_t>(fname); LOCAL(bagf);
+ Bag<abundance_t> * currentbag = new BagCache<abundance_t> ( bagf, 10000 ); LOCAL(currentbag);
+
+
+ for (itKmerAbundancePartial->first(); !itKmerAbundancePartial->isDone(); itKmerAbundancePartial->next())
+ {
+ cell_t & cell = itKmerAbundancePartial->item();
+ currentbag->insert( abundance_t(cell.graine,cell.val) );
+ }
+
+ currentbag->flush();
+ hash16.clear();
+ }
}
- }
+
+
+ if(_buffer!=0)
+ free(_buffer);
+
+
/** We loop over the solid kmers map.
* NOTE !!! we want the items to be sorted by kmer values (see finalize part of debloom). */
//Iterator < Abundance<Type> >* itKmerAbundance = hash.iterator(true);
-
//shortcut
- typedef typename tools::collections::impl::Hash16<Type>::cell cell_t;
Iterator < cell_t >* itKmerAbundance = hash16.iterator(true);
-
-
LOCAL (itKmerAbundance);
-
- for (itKmerAbundance->first(); !itKmerAbundance->isDone(); itKmerAbundance->next())
+
+
+ //now merge sort over current hash and over the sorted _tmpCountFiles
+ // : simple merge sort of n sorted iterators
+
+ if(_tmpCountFileNames.size()!=0)
{
- /** Shortcut. */
- // Abundance<Type>& current = itKmerAbundance->item();
- /** We update the solid counter. */
- //solidCounter.set (current.getAbundance());
+ TempCountFileMerger<span> tempCountFileMerger (10,10);
+ //will merge by chunk of 10 files at a time, until reach less than 10 files
+ _tmpCountFileNames = tempCountFileMerger.mergeFiles(_tmpCountFileNames);
+ //then will use code below to merge remaining files with the contents of the hash table
- /** We may add this kmer to the solid kmers bag. */
- //this->insert (current.getValue(), solidCounter);
+ std::vector<Iterator<abundance_t>*> _tmpCountIterators;
+
+ //how to make sure there are not too many subpart files ? and that we'll not reach the max open files limit ?
+ //we *could* merge only some of them at a time .. todo ? --> done with TempCountFileMerger above
+ for(int ii=0; ii< _tmpCountFileNames.size(); ii++)
+ {
+ std::string fname = _tmpCountFileNames[ii];
+ _tmpCountIterators.push_back( new IteratorFile<abundance_t> (fname) );
+ }
+
+ // Note (guillaume) : code below is ugly because I have to manage itKmerAbundance (iterator over cell_t)
+ // and _tmpCountIterators (iterators over abundance_t) differently since they have different types
+ // I would have liked to transform Iterator<cell_t> to an Iterator<abundance_t> with the following adaptor :
+ //
+ // struct cell2AbAdaptor { abundance_t operator() (cell_t& c) { return abundance_t(c.graine,c.val) ; } };
+ //Iterator<abundance_t>* hashAbundance = new IteratorAdaptor<cell_t,abundance_t,cell2AbAdaptor> (itKmerAbundance);
+ // but it turns out to be impossible because of the return by reference of the item() function.
+ // Another solution would be to dump contents of hash to a file then read it, but inefficient
+ // So, ugly code it is. (see all the if(best_p==-1) below)
+
+
+ //setup the priority queue for merge sorting
+ typedef std::pair< int , Type> ptcf; // id pointer , kmer value
+ struct ptcfcomp { bool operator() (ptcf l,ptcf r) { return ((r.second) < (l.second)); } } ; // reversed comparison so that pq.top() is the smallest kmer
+ std::priority_queue< ptcf, std::vector<ptcf>,ptcfcomp > pq;
+
+ //// init all iterators ////
+ itKmerAbundance->first();
+ for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+ {
+ _tmpCountIterators[ii]->first();
+ }
+
+ ////// init pq ////
+
+ if(!itKmerAbundance->isDone())
+ {
+ pq.push(ptcf(-1,itKmerAbundance->item().graine) ); // -1 will mean in the itKmerAbundance
+ }
+
+ for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+ {
+ if( ! _tmpCountIterators[ii]->isDone()) {
+ abundance_t &ab = _tmpCountIterators[ii]->item();
+ pq.push(ptcf(ii,ab.value) );
+ }
+ }
- cell_t & cell = itKmerAbundance->item();
- solidCounter.set (cell.val);
- this->insert (cell.graine, solidCounter);
+ ptcf best_elem;
+ int best_p;
+ int current_ab = 0;
+ int previous_ab = 0;
+ Type current_kmer,previous_kmer;
+
+
+ //now merge the n sorted iterators and merge their kmer counts.
+ if(pq.size() != 0)
+ {
+ //get first pointer
+ best_elem = pq.top() ; pq.pop();
+ best_p = best_elem.first;
+ if(best_p==-1)
+ {
+ previous_ab = itKmerAbundance->item().val;
+ }
+ else
+ {
+ previous_ab = _tmpCountIterators[best_p]->item().abundance;
+ }
+
+ previous_kmer = best_elem.second;
+ //go forward in this list
+ if(best_p==-1)
+ {
+ itKmerAbundance->next();
+ if (! itKmerAbundance->isDone())
+ {
+ pq.push(ptcf(-1,itKmerAbundance->item().graine) );
+ }
+ }
+ else
+ {
+ _tmpCountIterators[best_p]->next();
+ if (! _tmpCountIterators[best_p]->isDone())
+ {
+ pq.push(ptcf( best_p,_tmpCountIterators[best_p]->item().value) );
+ }
+ }
+
+ while (pq.size() != 0)
+ {
+
+ //get first pointer
+ best_elem = pq.top() ; pq.pop();
+ best_p = best_elem.first;
+
+ if(best_p==-1)
+ {
+ current_ab = itKmerAbundance->item().val;
+ }
+ else
+ {
+ current_ab = _tmpCountIterators[best_p]->item().abundance;
+ }
+ current_kmer = best_elem.second;
+
+ //go forward in this list
+ if(best_p==-1)
+ {
+ itKmerAbundance->next();
+ if (! itKmerAbundance->isDone())
+ {
+ pq.push(ptcf(-1,itKmerAbundance->item().graine) );
+ }
+ }
+ else
+ {
+ _tmpCountIterators[best_p]->next();
+ if (! _tmpCountIterators[best_p]->isDone())
+ {
+ pq.push(ptcf( best_p,_tmpCountIterators[best_p]->item().value) );
+ }
+ }
+
+ if(current_kmer != previous_kmer)
+ {
+ //output previous kmer
+ solidCounter.set (previous_ab);
+ this->insert (previous_kmer, solidCounter);
+ previous_kmer = current_kmer;
+ previous_ab = current_ab;
+ }
+ else
+ {
+ //merge counter
+ previous_ab += current_ab;
+ }
+
+ }
+
+ //output last one
+ solidCounter.set (previous_ab);
+ this->insert (previous_kmer, solidCounter);
+ }
+
+
+ //cleanup
+ for(int ii=0; ii< _tmpCountIterators.size(); ii++)
+ {
+ delete _tmpCountIterators[ii];
+ }
+
+
+ //erase sub files
+ for(int ii=0; ii< _tmpCountFileNames.size(); ii++)
+ {
+ std::string fname = _tmpCountFileNames[ii];
+ system::impl::System::file().remove(fname);
+ }
+
}
+ else // in that case no merging needed, just iterate the hash table and output kmer counts
+ {
+ for (itKmerAbundance->first(); !itKmerAbundance->isDone(); itKmerAbundance->next())
+ {
+
+ cell_t & cell = itKmerAbundance->item();
+ solidCounter.set (cell.val);
+ this->insert (cell.graine, solidCounter);
+ }
+ }
+
+
+ this->_superKstorage->closeFile(this->_parti_num);
this->_progress->inc (this->_pInfo.getNbKmer(this->_parti_num) ); // this->_pInfo->getNbKmer(this->_parti_num) kmers.size()
-
- this->_processor->endPart (this->_pass_num, this->_parti_num);
+
+ this->_processor->endPart (this->_pass_num, this->_parti_num);
};
/*********************************************************************
@@ -391,58 +862,293 @@ public:
kinsert = first_revk;
}
- //record kxmer
- rid = radix_kxmer.getVal();
- //idx = _r_idx[IX(kx_size,rid)]++;
- idx = __sync_fetch_and_add( _r_idx + IX(kx_size,rid) ,1); // si le sync fetch est couteux, faire un mini buffer par thread
-
-
-
-// if (idx >= _radix_sizes[IX(kx_size,rid)] || _radix_sizes[IX(kx_size,rid)] == 0)
-// { cout << "error, accessing _radix_kmers beyond bound" << endl; exit(1); }
-// cout << "tbl ref " << IX(kx_size,rid) << " idx " << idx << " radix sizes " << _radix_sizes[IX(kx_size,rid)] << endl;
-
- _radix_kmers [IX(kx_size,rid)][ idx] = kinsert << ((4-kx_size)*2); // [kx_size][rid]
- //cout << "went okay " << idx << endl;
-
- if (_bankIdMatrix) { _bankIdMatrix[IX(kx_size,rid)][ idx] = _bankId; }
-
- _first = true;
+ //record kxmer
+ rid = radix_kxmer.getVal();
+ //idx = _r_idx[IX(kx_size,rid)]++;
+ idx = __sync_fetch_and_add( _r_idx + IX(kx_size,rid) ,1); // si le sync fetch est couteux, faire un mini buffer par thread
+
+
+
+// if (idx >= _radix_sizes[IX(kx_size,rid)] || _radix_sizes[IX(kx_size,rid)] == 0)
+// { cout << "error, accessing _radix_kmers beyond bound" << endl; exit(1); }
+// cout << "tbl ref " << IX(kx_size,rid) << " idx " << idx << " radix sizes " << _radix_sizes[IX(kx_size,rid)] << endl;
+
+ _radix_kmers [IX(kx_size,rid)][ idx] = kinsert << ((4-kx_size)*2); // [kx_size][rid]
+ //cout << "went okay " << idx << endl;
+
+ if (_bankIdMatrix) { _bankIdMatrix[IX(kx_size,rid)][ idx] = _bankId; }
+
+ _first = true;
+ }
+ }
+
+ SuperKReader (size_t kmerSize, uint64_t * r_idx, Type** radix_kmers, uint64_t* radix_sizes, bank::BankIdType** bankIdMatrix, size_t bankId=0) // stores the table pointers and precomputes the masks and shifts set below
+ : _kmerSize (kmerSize), _kx(4), _radix_kmers(radix_kmers), _radix_sizes(radix_sizes), _bankIdMatrix(bankIdMatrix), _r_idx (r_idx), _first(true), _bankId(bankId)
+ {
+ Type un;
+ un.setVal(1);
+ _kmerMask = (un << (kmerSize*2)) - 1; // low 2*kmerSize bits set
+ _mask_radix.setVal((int64_t) 255); // 8 bits, i.e. the 4 nucleotides of a radix
+ _mask_radix = _mask_radix << ((_kmerSize - 4)*2); // moved onto the 4 highest nucleotides of a kmer
+ _shift = 2*(kmerSize-1); // bit offset of the highest nucleotide of a kmer
+ _shift_val = un.getSize() -8; // presumably the offset of the top 8 bits of Type - TODO confirm the unit returned by getSize()
+ _shift_radix = ((kmerSize - 4)*2); // radix is 4 nt long
+ }
+
+private :
+
+ size_t _kmerSize;
+ size_t _shift ;
+ size_t _shift_val ;
+ size_t _shift_radix ;
+ int _kx;
+ Type** _radix_kmers;
+ uint64_t* _radix_sizes;
+
+ bank::BankIdType** _bankIdMatrix;
+ uint64_t* _r_idx ;
+ bool _first;
+ Type _superk, _seedk;
+ Type _radix, _mask_radix ;
+ Type _kmerMask;
+ size_t _bankId;
+};
+
+
+
+
+//For now this only works in simple counting mode, not with several separate banks:
+//separate banks require passing a bank separation counter (to be checked), and the buffers must preserve the input order (to be checked as well).
+//Read command used for the parallel reading of one superkmer partition.
+//
+//Each block returned by SuperKmerBinFiles::readBlock is decoded as a sequence of superkmers:
+// - one byte holding the number of kmers of the superkmer,
+// - the seed kmer, packed 4 nucleotides per byte (byte number n goes to bits 8*n and above),
+// - the following nucleotides, 2 bits each, starting in the unused part of the last seed byte.
+//The kx-mers found in each superkmer are written into the radix tables; the slot of each
+//write is reserved with an atomic increment of the matching _r_idx counter.
+template<size_t span>
+class ReadSuperKCommand : public gatb::core::tools::dp::ICommand, public system::SmartPointer
+{
+ typedef typename Kmer<span>::Type Type;
+
+public:
+ //superKstorage : storage the blocks are read from
+ //fileId : id passed to readBlock to select the file to read
+ //kmerSize : number of nucleotides per kmer
+ //r_idx : per table fill counters, incremented atomically
+ //radix_kmers : destination tables, indexed with IX(kx_size,rid)
+ //radix_sizes : stored but not read by this class
+ //bankIdMatrix : optional per kx-mer bank ids (may be null)
+ //No bank id is received, so _bankId is set to 0 (the default used by SuperKReader);
+ //it was previously left uninitialised although it is read when bankIdMatrix is not null.
+ ReadSuperKCommand(tools::storage::impl::SuperKmerBinFiles* superKstorage, int fileId, int kmerSize,
+ uint64_t * r_idx, Type** radix_kmers, uint64_t* radix_sizes, bank::BankIdType** bankIdMatrix)
+ : _superKstorage(superKstorage), _fileId(fileId),_buffer(0),_buffer_size(0), _kmerSize(kmerSize),_radix_kmers(radix_kmers), _radix_sizes(radix_sizes), _bankIdMatrix(bankIdMatrix), _r_idx (r_idx), _bankId(0)
+ {
+ _kx=4;
+ Type un;
+ un.setVal(1);
+ _kmerMask = (un << (_kmerSize*2)) - 1; // low 2*_kmerSize bits set
+ _mask_radix.setVal((int64_t) 255);
+ _mask_radix = _mask_radix << ((_kmerSize - 4)*2); // selects the 4 highest nucleotides of a kmer
+ _shift = 2*(_kmerSize-1); // bit offset of the highest nucleotide of a kmer
+ _shift_val = un.getSize() -8;
+ _shift_radix = ((_kmerSize - 4)*2); // radix is 4 nt long
+ }
+
+ //Reads blocks until readBlock returns false, decodes every superkmer of each block
+ //and records its kx-mers in the radix tables. The read buffer is released at the end.
+ void execute ()
+ {
+ unsigned int nb_bytes_read;
+ while(_superKstorage->readBlock(&_buffer, &_buffer_size, &nb_bytes_read, _fileId))
+ {
+ //decode block and iterate through its superkmers
+ unsigned char * ptr = _buffer;
+ u_int8_t nbK; //number of kmers in the superkmer
+ int nbsuperkmer_read =0;
+ u_int8_t newbyte=0;
+
+ while(ptr < (_buffer+nb_bytes_read)) //decode whole block
+ {
+ //decode a superkmer
+ nbK = *ptr; ptr++;
+ //int nb_bytes_superk = (_kmerSize + nbK -1 +3) /4 ;
+
+ int rem_size = _kmerSize;
+
+ Type Tnewbyte;
+ int nbr=0;
+ _seedk.setVal(0);
+ while(rem_size>=4)
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+
+
+ _seedk = _seedk | (Tnewbyte << (8*nbr)) ;
+ rem_size -= 4; nbr++;
+ }
+
+ int uid = 4; //uid = nb nt used in current newbyte
+
+ //remainder of the seed kmer (fewer than 4 nucleotides left)
+ if(rem_size>0)
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+
+ _seedk = ( _seedk | (Tnewbyte << (8*nbr)) ) ;
+ uid = rem_size;
+ }
+ _seedk = _seedk & _kmerMask;
+
+
+ //std::string pt = _seedk.toString(_kmerSize);
+ //printf("seedk \n%s \n",pt.c_str());
+
+ ///////////////////////// seedk should be ready here , now parse kx-mers ////////////////////////////////
+
+ u_int8_t rem = nbK;
+ Type temp = _seedk;
+ Type rev_temp = revcomp(temp,_kmerSize);
+ Type newnt ;
+ Type mink, prev_mink; prev_mink.setVal(0);
+ uint64_t idx;
+
+#ifdef NONCANONICAL
+ bool prev_which = true;
+#else
+ bool prev_which = (temp < rev_temp );
+#endif
+
+ int kx_size = -1; //next loop start at ii=0, first kmer will put it at 0
+ Type radix_kxmer_forward = (temp & _mask_radix) >> ((_kmerSize - 4)*2);
+ Type first_revk, kinsert,radix_kxmer;
+ first_revk.setVal(0);
+
+ if(!prev_which) first_revk = rev_temp;
+
+ u_int8_t rid;
+
+ for (int ii=0; ii< nbK; ii++,rem--)
+ {
+#ifdef NONCANONICAL
+ bool which = true;
+ mink = temp;
+#else
+ bool which = (temp < rev_temp );
+ mink = which ? temp : rev_temp;
+#endif
+
+ if (which != prev_which || kx_size >= _kx) // kxmer_size = 1
+ {
+ //output kxmer size kx_size,radix_kxmer
+ //kx mer is composed of superKp[ii-1] superKp[ii-2] .. superKp[ii-n] with nb elems n == kxmer_size +1 (a single kmer == k+0)
+
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ kinsert = prev_mink;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ //previous mink
+ radix_kxmer = (prev_mink & _mask_radix) >> _shift_radix;
+ kinsert = first_revk;
+ }
+
+ //record kxmer
+ rid = radix_kxmer.getVal();
+ //idx = _r_idx[IX(kx_size,rid)]++;
+ idx = __sync_fetch_and_add( _r_idx + IX(kx_size,rid) ,1); // if the sync fetch is expensive, use a small buffer per thread
+
+ _radix_kmers [IX(kx_size,rid)][ idx] = kinsert << ((4-kx_size)*2); //[kx_size][rid]
+ if (_bankIdMatrix) { _bankIdMatrix[IX(kx_size,rid)][ idx] = _bankId; }
+
+ radix_kxmer_forward = (mink & _mask_radix) >> _shift_radix;
+ kx_size =0;
+
+ if(!which) first_revk = rev_temp;
+ }
+ else
+ {
+ kx_size++;
+ }
+
+ prev_which = which ;
+ prev_mink = mink;
+
+ if(rem < 2) break; //no more kmers in this superkmer, the last one has just been eaten
+
+ //////////////////////////////now decode next kmer of this superkmer //////////////////////////////////////////////
+
+ if(uid>=4) //read next byte
+ {
+ newbyte = *ptr ; ptr++;
+ Tnewbyte.setVal(newbyte);
+ uid =0;
+ }
+
+ newnt = (Tnewbyte >> (2*uid))& 3; uid++;
+
+ temp = ((temp << 2 ) | newnt ) & _kmerMask;
+
+ newnt.setVal(comp_NT[newnt.getVal()]) ;
+ rev_temp = ((rev_temp >> 2 ) | (newnt << _shift) ) & _kmerMask;
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ }
+
+ //record last kxmer (prev_mink and mink ?)
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ kinsert = prev_mink;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ //previous mink
+ radix_kxmer = (prev_mink & _mask_radix) >> _shift_radix;
+ kinsert = first_revk;
+ }
+
+ //record kxmer
+ rid = radix_kxmer.getVal();
+ //idx = _r_idx[IX(kx_size,rid)]++;
+ idx = __sync_fetch_and_add( _r_idx + IX(kx_size,rid) ,1); // if the sync fetch is expensive, use a small buffer per thread
+
+
+ _radix_kmers [IX(kx_size,rid)][ idx] = kinsert << ((4-kx_size)*2); // [kx_size][rid]
+ //cout << "went okay " << idx << endl;
+
+ if (_bankIdMatrix) { _bankIdMatrix[IX(kx_size,rid)][ idx] = _bankId; }
+
+
+
+
+ //////////////////////////////////////////////////////////
+ //ptr+=nb_bytes_superk;
+
+ //now go to next superk of this block, ptr should point to beginning of next superk
+ nbsuperkmer_read++;
+ /////////
+ }
+
+ //printf("nb superk in this block %i parti %i\n",nbsuperkmer_read,_fileId);
+
+
}
+
+ if(_buffer!=0)
+ {
+ free(_buffer);
+ //back to the state set by the constructor, so that a second call to execute()
+ //neither hands the released buffer to readBlock nor frees it again
+ _buffer = 0;
+ _buffer_size = 0;
+ }
}
-
- SuperKReader (size_t kmerSize, uint64_t * r_idx, Type** radix_kmers, uint64_t* radix_sizes, bank::BankIdType** bankIdMatrix, size_t bankId=0)
- : _kmerSize (kmerSize), _kx(4), _radix_kmers(radix_kmers), _radix_sizes(radix_sizes), _bankIdMatrix(bankIdMatrix), _r_idx (r_idx), _first(true), _bankId(bankId)
- {
- Type un;
- un.setVal(1);
- _kmerMask = (un << (kmerSize*2)) - 1;
- _mask_radix.setVal((int64_t) 255);
- _mask_radix = _mask_radix << ((_kmerSize - 4)*2);
- _shift = 2*(kmerSize-1);
- _shift_val = un.getSize() -8;
- _shift_radix = ((kmerSize - 4)*2); // radix is 4 nt long
- }
-
-private :
-
- size_t _kmerSize;
- size_t _shift ;
- size_t _shift_val ;
- size_t _shift_radix ;
+private:
+ tools::storage::impl::SuperKmerBinFiles* _superKstorage;
+ int _fileId;
+ unsigned char * _buffer;
+ unsigned int _buffer_size;
+ int _kmerSize;
int _kx;
+
Type** _radix_kmers;
- uint64_t* _radix_sizes;
-
+ uint64_t* _radix_sizes;
bank::BankIdType** _bankIdMatrix;
uint64_t* _r_idx ;
- bool _first;
+
Type _superk, _seedk;
Type _radix, _mask_radix ;
Type _kmerMask;
+ size_t _shift ;
+ size_t _shift_val ;
+ size_t _shift_radix ;
size_t _bankId;
+
};
-
+
/*********************************************************************
** METHOD :
** PURPOSE :
@@ -454,7 +1160,7 @@ private :
/** in this scheme we count k-mers in a partition by sorting a vector*/
template<size_t span>
PartitionsByVectorCommand<span>:: PartitionsByVectorCommand (
- Iterable<Type>& partition,
+ // Iterable<Type>& partition,
CountProcessor* processor,
size_t cacheSize,
IteratorListener* progress,
@@ -465,9 +1171,10 @@ PartitionsByVectorCommand<span>:: PartitionsByVectorCommand (
size_t nbCores,
size_t kmerSize,
MemAllocator& pool,
- vector<size_t>& offsets
+ vector<size_t>& offsets,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
)
- : PartitionsCommand<span> (partition, processor, cacheSize, progress, timeInfo, pInfo, passi, parti,nbCores,kmerSize,pool),
+ : PartitionsCommand<span> (/*partition,*/ processor, cacheSize, progress, timeInfo, pInfo, passi, parti,nbCores,kmerSize,pool,superKstorage),
_radix_kmers (0), _bankIdMatrix(0), _radix_sizes(0), _r_idx(0), _nbItemsPerBankPerPart(offsets)
{
_dispatcher = new Dispatcher (this->_nbCores);
@@ -501,7 +1208,9 @@ void PartitionsByVectorCommand<span>::execute ()
this->_processor->beginPart (this->_pass_num, this->_parti_num, this->_cacheSize, this->getName());
/** We check that we got something. */
- if (this->_partition.getNbItems() == 0) { return; }
+
+ if (this->_superKstorage->getNbItems(this->_parti_num) == 0) { return; }
+
/** We configure tables. */
_radix_kmers = (Type**) MALLOC (256*(KX+1)*sizeof(Type*)); //make the first dims static ? 5*256
@@ -529,6 +1238,9 @@ void PartitionsByVectorCommand<span>::execute ()
this->_processor->endPart (this->_pass_num, this->_parti_num);
};
+
+
+
/*********************************************************************
** METHOD :
** PURPOSE :
@@ -542,6 +1254,8 @@ void PartitionsByVectorCommand<span>::executeRead ()
{
TIME_INFO (this->_timeInfo, "1.read");
+ this->_superKstorage->openFile("r",this->_parti_num);
+
/** Recall that the attribute _offsets has a size equals to the number of banks + 1 as input
* and for each bank, it holds the number of items found for the currently processed partition.
*
@@ -584,16 +1298,20 @@ void PartitionsByVectorCommand<span>::executeRead ()
* On MacOs, we got some crashes with uint128 that were not aligned on 16 bytes
*/
if (_bankIdMatrix)
- {
- for (size_t xx=0; xx< (KX+1); xx++)
- {
- for (int ii=0; ii< 256; ii++)
- {
- size_t nbKmers = this->_pInfo.getNbKmer(this->_parti_num,ii,xx);
- _bankIdMatrix [IX(xx,ii)] = (bank::BankIdType*) this->_pool.pool_malloc (nbKmers * sizeof(bank::BankIdType), "bank ids alloc");
- }
- }
- }
+ {
+ throw Exception ("PartitionsByVectorCommand: multi-bank unsupported with new superk storage");
+
+ /*
+ for (size_t xx=0; xx< (KX+1); xx++)
+ {
+ for (int ii=0; ii< 256; ii++)
+ {
+ size_t nbKmers = this->_pInfo.getNbKmer(this->_parti_num,ii,xx);
+ _bankIdMatrix [IX(xx,ii)] = (bank::BankIdType*) this->_pool.pool_malloc (nbKmers * sizeof(bank::BankIdType), "bank ids alloc");
+ }
+ }
+ */
+ }
}
DEBUG (("PartitionsByVectorCommand<span>::executeRead: fillsolid parti num %i by vector nb kxmer / nbkmers %lli / %lli %f with %zu nbcores \n",
@@ -608,33 +1326,58 @@ void PartitionsByVectorCommand<span>::executeRead ()
* of _offsets). */
if (_bankIdMatrix)
+ {
+ throw Exception ("PartitionsByVectorCommand: multi-bank unsupported with new superk storage");
+// /** We create an iterator over all the items. */
+// Iterator<Type>* itGlobal = this->_partition.iterator();
+// LOCAL (itGlobal);
+//
+// /** We iterate the banks. */
+// for (size_t b=0; b<_nbItemsPerBankPerPart.size(); b++)
+// {
+// /** We truncate the global iterator.
+// * NB : we initialize (ie call 'first') the global iterator only at first call (for b==0). */
+// Iterator<Type>* itLocal = new TruncateIterator<Type> (*itGlobal, _nbItemsPerBankPerPart[b], b==0 ? true : false);
+// LOCAL (itLocal);
+//
+// /** We iterate this local iterator. */
+// _dispatcher->iterate (itLocal, SuperKReader<span> (this->_kmerSize, _r_idx, _radix_kmers, _radix_sizes, _bankIdMatrix, b), 10000); //must be even , reading by pairs
+// }
+//
+// /** We check that the global iterator is finished. */
+// if (itGlobal->isDone() == false) { throw Exception ("PartitionsByVectorCommand: iteration should be finished"); }
+
+ }
+ else
{
- /** We create an iterator over all the items. */
- Iterator<Type>* itGlobal = this->_partition.iterator();
- LOCAL (itGlobal);
+ /** We iterate the superkmers. */
- /** We iterate the banks. */
- for (size_t b=0; b<_nbItemsPerBankPerPart.size(); b++)
- {
- /** We truncate the global iterator.
- * NB : we initialize (ie call 'first') the global iterator only at first call (for b==0). */
- Iterator<Type>* itLocal = new TruncateIterator<Type> (*itGlobal, _nbItemsPerBankPerPart[b], b==0 ? true : false);
- LOCAL (itLocal);
+ vector<ICommand*> cmds;
+ for (size_t tid=0; tid < this->_nbCores; tid++)
+ {
+ cmds.push_back(new ReadSuperKCommand<span> (
+ this->_superKstorage,
+ this->_parti_num,
+ this->_kmerSize,
+ _r_idx, _radix_kmers, _radix_sizes, 0
+ )
+ );
+ }
+
+ _dispatcher->dispatchCommands (cmds, 0);
- /** We iterate this local iterator. */
- _dispatcher->iterate (itLocal, SuperKReader<span> (this->_kmerSize, _r_idx, _radix_kmers, _radix_sizes, _bankIdMatrix, b), 10000); //must be even , reading by pairs
- }
+
+
+
+// printf("-----done ReadSuperKCommand ---\n");
- /** We check that the global iterator is finished. */
- if (itGlobal->isDone() == false) { throw Exception ("PartitionsByVectorCommand: iteration should be finished"); }
- }
- else
- {
- /** We iterate the superkmers. */
- _dispatcher->iterate (this->_partition.iterator(), SuperKReader<span> (this->_kmerSize, _r_idx, _radix_kmers, _radix_sizes, 0, 0), 10000); //must be even , reading by pairs
}
+
+ this->_superKstorage->closeFile(this->_parti_num);
+
}
+
/*********************************************************************
** METHOD :
** PURPOSE :
@@ -1017,7 +1760,7 @@ void PartitionsByVectorCommand<span>::executeDump ()
//merge-scan all 'virtual' arrays and output counts
while (1)
{
- //go forward in this array or in new array of reaches end of this one
+ //go forward in this array or in new array if reaches end of this one
if (! vec_pointer[best_p]->next())
{
//reaches end of one array
@@ -1061,6 +1804,440 @@ void PartitionsByVectorCommand<span>::executeDump ()
for (int ii=0; ii<nbkxpointers; ii++) { delete vec_pointer[ii]; }
}
+
+
+////////////////// Multi-bank version of PartitionsByVectorCommand //////////////////
+// Constructor: forwards the common arguments to PartitionsCommand_multibank, zero-initializes the radix tables, copies 'offsets' and creates the dispatcher.
+template<size_t span>
+PartitionsByVectorCommand_multibank<span>:: PartitionsByVectorCommand_multibank (
+ Iterable<Type>& partition,
+ CountProcessor* processor,
+ size_t cacheSize,
+ IteratorListener* progress,
+ TimeInfo& timeInfo,
+ PartiInfo<5>& pInfo,
+ int passi,
+ int parti,
+ size_t nbCores,
+ size_t kmerSize,
+ MemAllocator& pool,
+ vector<size_t>& offsets // copied into _nbItemsPerBankPerPart (number of items per bank for this partition)
+ )
+: PartitionsCommand_multibank<span> (partition, processor, cacheSize, progress, timeInfo, pInfo, passi, parti,nbCores,kmerSize,pool),
+_radix_kmers (0), _bankIdMatrix(0), _radix_sizes(0), _r_idx(0), _nbItemsPerBankPerPart(offsets)
+{
+ _dispatcher = new Dispatcher (this->_nbCores); // one dispatcher sized on the core count; deleted in the destructor
+}
+
+/*********************************************************************
+ ** METHOD : PartitionsByVectorCommand_multibank<span>::~PartitionsByVectorCommand_multibank
+ ** PURPOSE : Destructor; deletes the dispatcher allocated by the constructor.
+ ** INPUT :
+ ** OUTPUT :
+ ** RETURN :
+ ** REMARKS : The top-level radix tables are not released here; execute() allocates and frees them itself.
+ *********************************************************************/
+template<size_t span>
+PartitionsByVectorCommand_multibank<span>:: ~PartitionsByVectorCommand_multibank ()
+{
+ if (_dispatcher) { delete _dispatcher; }
+}
+
+/*********************************************************************
+ ** METHOD : PartitionsByVectorCommand_multibank<span>::execute
+ ** PURPOSE : Processes one partition: allocates the radix tables, runs the read / sort / dump phases, frees the tables.
+ ** INPUT : none (works on the partition and settings given to the constructor)
+ ** OUTPUT : kmer counts are emitted through insert() during the dump phase
+ ** RETURN :
+ ** REMARKS : beginPart()/endPart() of the count processor bracket the work; see the note on the empty-partition case.
+ *********************************************************************/
+template<size_t span>
+void PartitionsByVectorCommand_multibank<span>::execute ()
+{
+ this->_processor->beginPart (this->_pass_num, this->_parti_num, this->_cacheSize, this->getName());
+
+ /** We check that we got something. */
+ // NOTE(review): this early return happens after beginPart() but skips endPart() and the progress update - confirm this is intended.
+ if (this->_partition.getNbItems() == 0) { return; }
+
+ /** We configure tables: each has 256*(KX+1) slots, i.e. one slot per (kxmer size, radix) pair, addressed with IX(). */
+ _radix_kmers = (Type**) MALLOC (256*(KX+1)*sizeof(Type*)); //make the first dims static ? 5*256
+ _radix_sizes = (uint64_t*) MALLOC (256*(KX+1)*sizeof(uint64_t));
+ _r_idx = (uint64_t*) CALLOC (256*(KX+1),sizeof(uint64_t)); // the only table that is zero-initialized (CALLOC)
+
+ /** We need extra information for kmers counting in case of several input banks. */
+ if (_nbItemsPerBankPerPart.size() > 1) { _bankIdMatrix = (bank::BankIdType**) MALLOC (256*(KX+1)*sizeof(bank::BankIdType*)); }
+ else { _bankIdMatrix = 0; }
+
+ /** We have 3 phases here: read, sort and dump. */
+ executeRead ();
+ executeSort ();
+ executeDump ();
+
+ /** We cleanup tables (only the top-level tables; the per-slot arrays were taken from the memory pool in executeRead). */
+ FREE (_radix_sizes) ;
+ FREE (_radix_kmers);
+ FREE (_r_idx);
+ if (_bankIdMatrix) { FREE (_bankIdMatrix); }
+
+ /** We update the progress bar. */
+ this->_progress->inc (this->_pInfo.getNbKmer(this->_parti_num) );
+
+ this->_processor->endPart (this->_pass_num, this->_parti_num);
+};
+
+template<size_t span>
+void PartitionsByVectorCommand_multibank<span>::executeRead ()
+{
+ TIME_INFO (this->_timeInfo, "1.read");
+ // Read phase: takes from the shared memory pool one kmer array (and, in multi-bank mode, one bank-id array)
+ // per (kxmer size, radix) slot, then dispatches SuperKReader over the partition to fill these arrays.
+ // When _bankIdMatrix is set, the partition is consumed bank by bank so that each reader is given its bank index.
+ /** Recall that the attribute _nbItemsPerBankPerPart (called '_offsets' in these comments and in the debug traces)
+ * holds, for each bank, the number of items found for the currently processed partition.
+ * NOTE(review): this comment used to say its size is "number of banks + 1"; the loops below use size() as the bank count - confirm.
+ * bank0 bank1 ... bankI
+ * offsets : xxx xxx xxx
+ * <------------------------->
+ * current partition content
+ */
+
+ DEBUG (("_offsets.size=%d OFFSETS: ", _nbItemsPerBankPerPart.size() ));
+ for (size_t j=0; j<_nbItemsPerBankPerPart.size(); j++) { DEBUG (("%6d ", _nbItemsPerBankPerPart[j])); } DEBUG (("\n"));
+
+ uint64_t sum_nbxmer =0; // total number of array entries reserved below; only reported in the DEBUG trace
+
+ /** We synchronize this block of statements because the pool is accessed concurrently by several threads. */
+ {
+ LocalSynchronizer synchro (this->_pool.getSynchro());
+
+ /** We align the pool with a big alignment constraint (see below macos issue with uint128) */
+ this->_pool.align (16);
+
+ /** FIRST: allocation for the kmers. */
+ for (size_t xx=0; xx< (KX+1); xx++)
+ {
+ for (int ii=0; ii< 256; ii++)
+ {
+ /** Shortcut: number of entries expected for this (partition, radix ii, kxmer size xx) slot. */
+ size_t nbKmers = this->_pInfo.getNbKmer(this->_parti_num,ii,xx);
+
+ //use memory pool here to avoid memory fragmentation
+ _radix_kmers [IX(xx,ii)] = (Type*) this->_pool.pool_malloc (nbKmers * sizeof(Type), "kmers alloc");
+ _radix_sizes [IX(xx,ii)] = nbKmers;
+
+ sum_nbxmer += nbKmers;
+ }
+ }
+
+ /** SECOND: allocation for the bank ids if needed.
+ * => NEED TO BE DONE AFTER THE KMERS BECAUSE OF MEMORY ALIGNMENT CONCERNS.
+ * On MacOs, we got some crashes with uint128 that were not aligned on 16 bytes
+ */
+ if (_bankIdMatrix)
+ {
+ // Same slot sizes as the kmer arrays: one bank id per kmer entry.
+ for (size_t xx=0; xx< (KX+1); xx++)
+ {
+ for (int ii=0; ii< 256; ii++)
+ {
+ size_t nbKmers = this->_pInfo.getNbKmer(this->_parti_num,ii,xx);
+ _bankIdMatrix [IX(xx,ii)] = (bank::BankIdType*) this->_pool.pool_malloc (nbKmers * sizeof(bank::BankIdType), "bank ids alloc");
+ }
+ }
+ }
+ }
+
+ DEBUG (("PartitionsByVectorCommand<span>::executeRead: fillsolid parti num %i by vector nb kxmer / nbkmers %lli / %lli %f with %zu nbcores \n",
+ this->_parti_num, sum_nbxmer, this->_pInfo.getNbKmer(this->_parti_num),
+ (double) sum_nbxmer / this->_pInfo.getNbKmer(this->_parti_num),this->_nbCores
+));
+
+ /** HOW TO COUNT KMERS BY SET OF READS ?
+ * Now, we are going to read the temporary partition built during the previous phase and fill
+ * the _radix_kmers attribute. We also need to know in _radix_kmers what is the contribution of
+ * each bank. We therefore need to iterate the current partition by bank (using the information
+ * of _nbItemsPerBankPerPart, a.k.a. '_offsets'). */
+
+ if (_bankIdMatrix)
+ {
+ /** We create an iterator over all the items. */
+ Iterator<Type>* itGlobal = this->_partition.iterator();
+ LOCAL (itGlobal);
+
+ /** We iterate the banks. */
+ for (size_t b=0; b<_nbItemsPerBankPerPart.size(); b++)
+ {
+ /** We truncate the global iterator to the number of items of bank b.
+ * NB : we initialize (ie call 'first') the global iterator only at first call (for b==0). */
+ Iterator<Type>* itLocal = new TruncateIterator<Type> (*itGlobal, _nbItemsPerBankPerPart[b], b==0 ? true : false);
+ LOCAL (itLocal);
+
+ /** We iterate this local iterator; the reader receives the bank-id matrix and the bank index b. */
+ _dispatcher->iterate (itLocal, SuperKReader<span> (this->_kmerSize, _r_idx, _radix_kmers, _radix_sizes, _bankIdMatrix, b), 10000); //must be even , reading by pairs
+ }
+
+ /** We check that the global iterator is finished, i.e. the per-bank counts covered the whole partition. */
+ if (itGlobal->isDone() == false) { throw Exception ("PartitionsByVectorCommand: iteration should be finished"); }
+ }
+ else
+ {
+ /** We iterate the superkmers (single-bank case: no bank-id matrix, bank index 0). */
+
+ _dispatcher->iterate (this->_partition.iterator(), SuperKReader<span> (this->_kmerSize, _r_idx, _radix_kmers, _radix_sizes, 0, 0), 10000); //must be even , reading by pairs
+
+
+
+ // printf("-----done ReadSuperKCommand ---\n");
+
+ }
+
+
+
+}
+
+template<size_t span>
+void PartitionsByVectorCommand_multibank<span>::executeSort ()
+{
+ TIME_INFO (this->_timeInfo, "2.sort");
+ // Sort phase: for each kxmer size, the 256 radix slots are split into contiguous ranges, one SortCommand per core.
+ vector<ICommand*> cmds;
+ // Number of radixes per thread; the last thread also takes the remainder (its range always ends at 255).
+ int nwork = 256 / this->_nbCores;
+ // NOTE(review): if _nbCores > 256, nwork is 0 and every thread but the last gets fin = -1 - confirm SortCommand tolerates such a range.
+ for (size_t xx=0; xx < (KX+1); xx++)
+ {
+ cmds.clear();
+
+ //fill cmd work vector
+ for (size_t tid=0; tid < this->_nbCores; tid++)
+ {
+ int deb = 0 + tid * nwork;
+ int fin = (tid+1) * nwork -1; // thread will do inclusive range [begin -- end ]
+ if(tid == this->_nbCores-1) { fin = 255; }
+
+ // give the SortCommand the master radix_kmers table and the range to process
+ cmds.push_back (new SortCommand<span> (
+ _radix_kmers+ IX(xx,0),
+ (_bankIdMatrix ? _bankIdMatrix+ IX(xx,0) : 0),
+ deb, fin,
+ _radix_sizes + IX(xx,0)
+ ));
+ }
+ // Run the commands built for this kxmer size before moving on to the next one.
+ _dispatcher->dispatchCommands (cmds, 0);
+ }
+}
+
+template<size_t span>
+void PartitionsByVectorCommand_multibank<span>::executeDump ()
+{
+ TIME_INFO (this->_timeInfo, "3.dump");
+ // Dump phase: merge-scan of the radix arrays through 453 KxmerPointer cursors, calling insert() once per distinct kmer.
+ int nbkxpointers = 453; //6 for k1 mer, 27 for k2mer, 112 for k3mer 453 for k4mer (cumulative: 1 + 5 + 21 + 85 + 341)
+ vector< KxmerPointer<span>*> vec_pointer (nbkxpointers);
+ int best_p;
+ // kxpcomp compares with 'r.second < l.second', so pq.top() is the entry holding the smallest kmer value.
+ std::priority_queue< kxp, std::vector<kxp>,kxpcomp > pq;
+ // An empty per-bank vector is treated as a single bank.
+ size_t nbBanks = _nbItemsPerBankPerPart.size();
+ if (nbBanks == 0) nbBanks = 1;
+
+ CounterBuilder solidCounter (nbBanks);
+
+ Type previous_kmer ;
+
+ //init the pointers (the original "6 arrays" wording presumably dates from a k1-only layout; 453 pointers are created below)
+ int pidx =0;
+ // NOTE(review): _bankIdMatrix is passed without an IX(x,0) offset below, unlike _radix_kmers/_radix_sizes (executeSort does offset it) - confirm KxmerPointer expects the base pointer.
+ ////////////////////////////////////////////////
+ ////-------------k0 pointers-----------/////////
+ ////////////////////////////////////////////////
+
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(0,0) ,0,0,0,255,this->_kmerSize, _radix_sizes + IX(0,0), _bankIdMatrix); // vec, prefix size, kxsize , radix min, radix max ,ksize
+
+ ////////////////////////////////////////////////
+ ////-------------k1 pointers-----------/////////
+ ////////////////////////////////////////////////
+
+ //prefix0: one pointer over the whole radix range [0,255]
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(1,0) ,0,1,0,255,this->_kmerSize, _radix_sizes + IX(1, 0), _bankIdMatrix);
+ int lowr = 0;
+ int maxr = 63;
+
+ //prefix1: 4 pointers, each covering 64 consecutive radixes
+ for(unsigned int ii=0; ii<4; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(1,0) ,1,1,lowr,maxr,this->_kmerSize, _radix_sizes + IX(1, 0), _bankIdMatrix);
+ lowr += 64;
+ maxr += 64;
+ }
+
+ ////////////////////////////////////////////////
+ ////-------------k2 pointers-----------/////////
+ ////////////////////////////////////////////////
+
+ //prefix0: one pointer over the whole radix range [0,255]
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(2,0),0,2,0,255,this->_kmerSize, _radix_sizes + IX(2, 0), _bankIdMatrix);
+
+ //prefix1: 4 pointers, each covering 64 consecutive radixes
+ lowr = 0; maxr = 63;
+ for(unsigned int ii=0; ii<4; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(2,0),1,2,lowr,maxr,this->_kmerSize, _radix_sizes + IX(2, 0), _bankIdMatrix);
+ lowr += 64;
+ maxr += 64;
+ }
+
+ //prefix2: 16 pointers, each covering 16 consecutive radixes
+ lowr = 0; maxr = 15;
+ for(unsigned int ii=0; ii<16; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(2,0),2,2,lowr,maxr,this->_kmerSize, _radix_sizes + IX(2, 0), _bankIdMatrix);
+ lowr += 16;
+ maxr += 16;
+ }
+
+ ////////////////////////////////////////////////
+ ////-------------k3 pointers-----------/////////
+ ////////////////////////////////////////////////
+
+ //prefix0: one pointer over the whole radix range [0,255]
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(3,0),0,3,0,255,this->_kmerSize, _radix_sizes + IX(3, 0), _bankIdMatrix);
+
+ //prefix1: 4 pointers, each covering 64 consecutive radixes
+ lowr = 0; maxr = 63;
+ for(unsigned int ii=0; ii<4; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(3,0),1,3,lowr,maxr,this->_kmerSize, _radix_sizes + IX(3, 0), _bankIdMatrix);
+ lowr += 64;
+ maxr += 64;
+ }
+
+ //prefix2: 16 pointers, each covering 16 consecutive radixes
+ lowr = 0; maxr = 15;
+ for(unsigned int ii=0; ii<16; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(3,0),2,3,lowr,maxr,this->_kmerSize, _radix_sizes + IX(3, 0), _bankIdMatrix);
+ lowr += 16;
+ maxr += 16;
+ }
+
+ //prefix3: 64 pointers, each covering 4 consecutive radixes
+ lowr = 0; maxr = 3;
+ for(unsigned int ii=0; ii<64; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(3,0),3,3,lowr,maxr,this->_kmerSize, _radix_sizes + IX(3, 0), _bankIdMatrix);
+ lowr += 4;
+ maxr += 4;
+ }
+
+ ////////////////////////////////////////////////
+ ////-------------k4 pointers-----------/////////
+ ////////////////////////////////////////////////
+
+ //prefix0: one pointer over the whole radix range [0,255]
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(4,0),0,4,0,255,this->_kmerSize, _radix_sizes + IX(4, 0), _bankIdMatrix);
+
+ //prefix1: 4 pointers, each covering 64 consecutive radixes
+ lowr = 0; maxr = 63;
+ for(unsigned int ii=0; ii<4; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(4,0),1,4,lowr,maxr,this->_kmerSize, _radix_sizes + IX(4, 0), _bankIdMatrix);
+ lowr += 64;
+ maxr += 64;
+ }
+
+ //prefix2: 16 pointers, each covering 16 consecutive radixes
+ lowr = 0; maxr = 15;
+ for(unsigned int ii=0; ii<16; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(4,0),2,4,lowr,maxr,this->_kmerSize, _radix_sizes + IX(4, 0), _bankIdMatrix);
+ lowr += 16;
+ maxr += 16;
+ }
+
+ //prefix3: 64 pointers, each covering 4 consecutive radixes
+ lowr = 0; maxr = 3;
+ for(unsigned int ii=0; ii<64; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(4,0),3,4,lowr,maxr,this->_kmerSize, _radix_sizes + IX(4, 0), _bankIdMatrix);
+ lowr += 4;
+ maxr += 4;
+ }
+
+ //prefix4: 256 pointers, one per radix
+ lowr = 0; maxr = 0;
+ for(unsigned int ii=0; ii<256; ii++)
+ {
+ vec_pointer[pidx++] = new KxmerPointer<span> (_radix_kmers+ IX(4,0),4,4,lowr,maxr,this->_kmerSize, _radix_sizes + IX(4, 0), _bankIdMatrix);
+ lowr += 1;
+ maxr += 1;
+ }
+
+ //fill the priority queue with the first elems (pointers whose first next() fails are simply not queued)
+ for (int ii=0; ii<nbkxpointers; ii++)
+ {
+ if(vec_pointer[ii]->next()) { pq.push(kxp(ii,vec_pointer[ii]->value())); }
+ }
+
+ if (pq.size() != 0) // otherwise everything is empty, no kmer at all
+ {
+ //get first pointer
+ best_p = pq.top().first ; pq.pop();
+
+ previous_kmer = vec_pointer[best_p]->value();
+
+ solidCounter.init (vec_pointer[best_p]->getBankId());
+
+ //merge-scan all 'virtual' arrays and output counts
+ while (1)
+ {
+ //go forward in this array or in new array if reaches end of this one
+ if (! vec_pointer[best_p]->next())
+ {
+ //reaches end of one array
+ if(pq.size() == 0) break; //everything done
+
+ //otherwise get new best
+ best_p = pq.top().first ; pq.pop();
+ }
+
+ if (vec_pointer[best_p]->value() != previous_kmer )
+ {
+ //if diff, changes to new array, get new min pointer
+ pq.push(kxp(best_p,vec_pointer[best_p]->value())); //push new val of this pointer in pq, will be counted later
+
+ best_p = pq.top().first ; pq.pop();
+
+ //if new best is diff, this is the end of this kmer
+ if(vec_pointer[best_p]->value()!=previous_kmer )
+ {
+ this->insert (previous_kmer, solidCounter);
+
+ solidCounter.init (vec_pointer[best_p]->getBankId());
+ previous_kmer = vec_pointer[best_p]->value();
+ }
+ else
+ {
+ solidCounter.increase (vec_pointer[best_p]->getBankId());
+ }
+ }
+ else
+ {
+ solidCounter.increase (vec_pointer[best_p]->getBankId());
+ }
+ }
+
+ //last elem: the loop exits before the final kmer has been emitted
+ this->insert (previous_kmer, solidCounter);
+ }
+
+ /** Cleanup. */
+ for (int ii=0; ii<nbkxpointers; ii++) { delete vec_pointer[ii]; }
+}
+
+
/********************************************************************************/
} } } } /* end of namespaces. */
/********************************************************************************/
diff --git a/gatb-core/src/gatb/kmer/impl/PartitionsCommand.hpp b/gatb-core/src/gatb/kmer/impl/PartitionsCommand.hpp
index 2cc559b..c7bc178 100644
--- a/gatb-core/src/gatb/kmer/impl/PartitionsCommand.hpp
+++ b/gatb-core/src/gatb/kmer/impl/PartitionsCommand.hpp
@@ -96,6 +96,14 @@ private:
CountVector _abundancePerBank;
};
+
+ // not really nice, but right now two implementations co-exist: the old one with multi-bank support, the new one with more efficient disk usage
+
+ // not really nice, but a temporary solution:
+ // -- one implementation with the new efficient superkmer storage
+ // -- another implementation with the old storage (2 kmers per superKmer) that supports multi-bank counting
+ // todo: multi-bank kmer counting with the new storage, or do multi-bank counting externally (merge results like in simka)
+
/********************************************************************************/
template<size_t span>
class PartitionsCommand : public gatb::core::tools::dp::ICommand, public system::SmartPointer
@@ -109,7 +117,7 @@ public:
/** Constructor. */
PartitionsCommand (
- gatb::core::tools::collections::Iterable<Type>& partition,
+ // gatb::core::tools::collections::Iterable<Type>& partition,
CountProcessor* processor,
size_t cacheSize,
gatb::core::tools::dp::IteratorListener* progress,
@@ -119,7 +127,8 @@ public:
int parti,
size_t nbCores,
size_t kmerSize,
- gatb::core::tools::misc::impl::MemAllocator& pool
+ gatb::core::tools::misc::impl::MemAllocator& pool,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
);
/** Destructor. */
@@ -129,7 +138,7 @@ public:
virtual const char* getName() const = 0;
protected:
- gatb::core::tools::collections::Iterable<Type>& _partition;
+ // gatb::core::tools::collections::Iterable<Type>& _partition;
gatb::core::tools::dp::IteratorListener* _progress;
PartiInfo<5>& _pInfo;
int _pass_num;
@@ -146,6 +155,9 @@ protected:
CountProcessor* _processor;
void setProcessor (CountProcessor* processor) { SP_SETATTR(processor); }
+
+ tools::storage::impl::SuperKmerBinFiles* _superKstorage;
+
};
/********************************************************************************/
@@ -162,7 +174,7 @@ public:
/** Constructor. */
PartitionsByHashCommand (
- gatb::core::tools::collections::Iterable<Type>& partition,
+ // gatb::core::tools::collections::Iterable<Type>& partition,
CountProcessor* processor,
size_t cacheSize,
gatb::core::tools::dp::IteratorListener* progress,
@@ -173,7 +185,8 @@ public:
size_t nbCores,
size_t kmerSize,
gatb::core::tools::misc::impl::MemAllocator& pool,
- u_int64_t hashMemory
+ u_int64_t hashMemory,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
);
/** Get the class name (for statistics). */
@@ -188,65 +201,192 @@ private:
/********************************************************************************/
/** */
+
+
+
+
+
+
template<size_t span>
class PartitionsByVectorCommand : public PartitionsCommand<span>
{
public:
+
+ /** Shortcut. */ /* R: don't know how to avoid this code duplication */
+ typedef typename Kmer<span>::Type Type;
+ typedef typename Kmer<span>::Count Count;
+ typedef ICountProcessor<span> CountProcessor;
+
+ static const size_t KX = 4 ;
+
+private:
+ //used for the priority queue
+ typedef std::pair<int, Type> kxp; //id pointer in vec_pointer , value
+ struct kxpcomp { bool operator() (kxp l,kxp r) { return ((r.second) < (l.second)); } } ;
+
+public:
+ /** Constructor. */
+ PartitionsByVectorCommand (
+ // gatb::core::tools::collections::Iterable<Type>& partition,
+ CountProcessor* processor,
+ size_t cacheSize,
+ gatb::core::tools::dp::IteratorListener* progress,
+ tools::misc::impl::TimeInfo& timeInfo,
+ PartiInfo<5>& pInfo,
+ int passi,
+ int parti,
+ size_t nbCores,
+ size_t kmerSize,
+ gatb::core::tools::misc::impl::MemAllocator& pool,
+ std::vector<size_t>& offsets,
+ tools::storage::impl::SuperKmerBinFiles* superKstorage
+
+ );
+
+ /** Destructor. */
+ ~PartitionsByVectorCommand ();
+
+ /** Get the class name (for statistics). */
+ const char* getName() const { return "vector"; }
+
+ /** */
+ void execute ();
+
+private:
+
+ Type** _radix_kmers;
+ bank::BankIdType** _bankIdMatrix;
+ uint64_t* _radix_sizes;
+ uint64_t* _r_idx;
+
+ tools::dp::IDispatcher* _dispatcher;
+
+ void executeRead ();
+ void executeSort ();
+ void executeDump ();
+
+ std::vector<size_t> _nbItemsPerBankPerPart;
+};
- /** Shortcut. */ /* R: don't know how to avoid this code duplication */
- typedef typename Kmer<span>::Type Type;
- typedef typename Kmer<span>::Count Count;
- typedef ICountProcessor<span> CountProcessor;
- static const size_t KX = 4 ;
+
+////// ugly duplicated code below to keep support for multi-bank counting
+// it is a temporary solution
+// todo : support multi-bank with either
+// -- multi-bank support within the new efficient superkmer storage
+// -- "external" multi-bank support through result merging (a la simka)
-private:
- //used for the priority queue
- typedef std::pair<int, Type> kxp; //id pointer in vec_pointer , value
- struct kxpcomp { bool operator() (kxp l,kxp r) { return ((r.second) < (l.second)); } } ;
+template<size_t span>
+class PartitionsCommand_multibank : public gatb::core::tools::dp::ICommand, public system::SmartPointer
+{
public:
- /** Constructor. */
- PartitionsByVectorCommand (
- gatb::core::tools::collections::Iterable<Type>& partition,
- CountProcessor* processor,
- size_t cacheSize,
- gatb::core::tools::dp::IteratorListener* progress,
- tools::misc::impl::TimeInfo& timeInfo,
- PartiInfo<5>& pInfo,
- int passi,
- int parti,
- size_t nbCores,
- size_t kmerSize,
- gatb::core::tools::misc::impl::MemAllocator& pool,
- std::vector<size_t>& offsets
- );
-
- /** Destructor. */
- ~PartitionsByVectorCommand ();
-
- /** Get the class name (for statistics). */
- const char* getName() const { return "vector"; }
+
+ /** Shortcut. */
+ typedef typename Kmer<span>::Type Type;
+ typedef typename Kmer<span>::Count Count;
+ typedef ICountProcessor<span> CountProcessor;
+
+ /** Constructor. */
+ PartitionsCommand_multibank (
+ gatb::core::tools::collections::Iterable<Type>& partition,
+ CountProcessor* processor,
+ size_t cacheSize,
+ gatb::core::tools::dp::IteratorListener* progress,
+ tools::misc::impl::TimeInfo& timeInfo,
+ PartiInfo<5>& pInfo,
+ int passi,
+ int parti,
+ size_t nbCores,
+ size_t kmerSize,
+ gatb::core::tools::misc::impl::MemAllocator& pool
+ );
+
+ /** Destructor. */
+ ~PartitionsCommand_multibank();
+
+ /** Get the class name (for statistics). */
+ virtual const char* getName() const = 0;
+
+protected:
+ gatb::core::tools::collections::Iterable<Type>& _partition;
+ gatb::core::tools::dp::IteratorListener* _progress;
+ PartiInfo<5>& _pInfo;
+ int _pass_num;
+ int _parti_num;
+ size_t _nbCores;
+ size_t _kmerSize;
+ size_t _cacheSize;
+ gatb::core::tools::misc::impl::MemAllocator& _pool;
+
+ void insert (const Type& kmer, const CounterBuilder& count);
+
+ tools::misc::impl::TimeInfo& _globalTimeInfo;
+ tools::misc::impl::TimeInfo _timeInfo;
+
+ CountProcessor* _processor;
+ void setProcessor (CountProcessor* processor) { SP_SETATTR(processor); }
+};
- /** */
- void execute ();
+template<size_t span>
+class PartitionsByVectorCommand_multibank : public PartitionsCommand_multibank<span>
+{
+public:
+
+ /** Shortcut. */ /* R: don't know how to avoid this code duplication */
+ typedef typename Kmer<span>::Type Type;
+ typedef typename Kmer<span>::Count Count;
+ typedef ICountProcessor<span> CountProcessor;
+
+ static const size_t KX = 4 ;
+
private:
-
- Type** _radix_kmers;
- bank::BankIdType** _bankIdMatrix;
+ //used for the priority queue
+ typedef std::pair<int, Type> kxp; //id pointer in vec_pointer , value
+ struct kxpcomp { bool operator() (kxp l,kxp r) { return ((r.second) < (l.second)); } } ;
+
+public:
+ /** Constructor. */
+ PartitionsByVectorCommand_multibank (
+ gatb::core::tools::collections::Iterable<Type>& partition,
+ CountProcessor* processor,
+ size_t cacheSize,
+ gatb::core::tools::dp::IteratorListener* progress,
+ tools::misc::impl::TimeInfo& timeInfo,
+ PartiInfo<5>& pInfo,
+ int passi,
+ int parti,
+ size_t nbCores,
+ size_t kmerSize,
+ gatb::core::tools::misc::impl::MemAllocator& pool,
+ std::vector<size_t>& offsets
+ );
+
+ /** Destructor. */
+ ~PartitionsByVectorCommand_multibank ();
+
+ /** Get the class name (for statistics). */
+ const char* getName() const { return "vector"; }
+
+ /** */
+ void execute ();
+
+private:
+
+ Type** _radix_kmers;
+ bank::BankIdType** _bankIdMatrix;
uint64_t* _radix_sizes;
uint64_t* _r_idx;
-
- tools::dp::IDispatcher* _dispatcher;
-
+
+ tools::dp::IDispatcher* _dispatcher;
+
void executeRead ();
- void executeSort ();
- void executeDump ();
-
- std::vector<size_t> _nbItemsPerBankPerPart;
+ void executeSort ();
+ void executeDump ();
+
+ std::vector<size_t> _nbItemsPerBankPerPart;
};
-
/********************************************************************************/
} } } } /* end of namespaces. */
/********************************************************************************/
diff --git a/gatb-core/src/gatb/kmer/impl/RepartitionAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/RepartitionAlgorithm.cpp
index 22eab5d..631bb70 100644
--- a/gatb-core/src/gatb/kmer/impl/RepartitionAlgorithm.cpp
+++ b/gatb-core/src/gatb/kmer/impl/RepartitionAlgorithm.cpp
@@ -329,7 +329,8 @@ void RepartitorAlgorithm<span>::computeFrequencies (Repartitor& repartitor)
_bank->estimate (estimateSeqNb, estimateSeqTotalSize, estimateSeqMaxSize);
- u_int64_t nbseq_sample = std::max ( u_int64_t (estimateSeqNb * 0.05) ,u_int64_t( 1000000ULL) ) ;
+ u_int64_t nbseq_sample = std::min ( u_int64_t (estimateSeqNb * 0.05) ,u_int64_t( 50000000ULL) ) ;
+ // TODO: it would be better to stop estimating minimizer frequencies once they become stable, rather than after a fixed number of reads
u_int64_t rg = ((u_int64_t)1 << (2*_config._minim_size));
//cout << "\nAllocating " << ((rg*sizeof(uint32_t))/1024) << " KB for " << _minim_size <<"-mers frequency counting (" << rg << " elements total)" << endl;
@@ -337,7 +338,9 @@ void RepartitorAlgorithm<span>::computeFrequencies (Repartitor& repartitor)
Model model (_config._kmerSize, _config._minim_size);
- CancellableIterator<Sequence>* cancellable_it = new CancellableIterator<Sequence> (*(_bank->iterator()));
+ Iterator<Sequence>* bank_it = _bank->iterator();
+ LOCAL(bank_it);
+ CancellableIterator<Sequence>* cancellable_it = new CancellableIterator<Sequence> (*bank_it);
LOCAL(cancellable_it);
/** We create a sequence iterator and give it a progress message */
@@ -420,7 +423,7 @@ void RepartitorAlgorithm<span>::computeRepartition (Repartitor& repartitor)
u_int64_t nbseq_sample = (_config._estimateSeqNb / _config._nb_banks) * 0.01;
nbseq_sample = max((u_int64_t)nbseq_sample, (u_int64_t)100000);
- Iterator<Sequence>* it = _bank->iterator(); //LOCAL (it);
+ Iterator<Sequence>* it = _bank->iterator(); LOCAL (it);
std::vector<Iterator<Sequence>*> itBanks = it->getComposition();
/*
@@ -481,7 +484,6 @@ void RepartitorAlgorithm<span>::computeRepartition (Repartitor& repartitor)
//it_all_reads->finalize() ;
//cancellable_it->finalize() ;
itBanks[i]->finalize();
- delete itBanks[i];
//delete cancellable_it;
}
}
diff --git a/gatb-core/src/gatb/kmer/impl/Sequence2SuperKmer.hpp b/gatb-core/src/gatb/kmer/impl/Sequence2SuperKmer.hpp
index 387648c..0311876 100644
--- a/gatb-core/src/gatb/kmer/impl/Sequence2SuperKmer.hpp
+++ b/gatb-core/src/gatb/kmer/impl/Sequence2SuperKmer.hpp
@@ -89,8 +89,11 @@ public:
void operator() (const KType& kmer, size_t idx) {
+ //kmer.toString(_kmerSize)
if (kmer.isValid() == false)
{
+ // printf("non valid kmer %s \n", kmer.value().toString(31).c_str());
+ //caller->_model.toString(kmer)
// on invalid kmer : output previous superk utput prev
caller->processSuperkmer (superKmer);
superKmer.reset();
@@ -107,6 +110,12 @@ public:
/** We get the value of the current minimizer. */
u_int64_t h = kmer.minimizer().value().getVal();
+ if(DEFAULT_MINIMIZER == h)
+ {
+ printf("__ non valid kmer %s \n", kmer.value().toString(31).c_str());
+
+ }
+
/** We have to set minimizer value if not defined. */
if (superKmer.isValid() == false) { superKmer.minimizer = h; }
diff --git a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
index 100a393..34c832a 100644
--- a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
+++ b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
@@ -95,7 +95,7 @@ template<size_t span>
SortingCountAlgorithm<span>::SortingCountAlgorithm (IProperties* params)
: Algorithm("dsk", -1, params),
_bank(0), _repartitor(0),
- _progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0)
+ _progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0),_superKstorage(0)
{
}
@@ -111,7 +111,7 @@ template<size_t span>
SortingCountAlgorithm<span>::SortingCountAlgorithm (IBank* bank, IProperties* params)
: Algorithm("dsk", -1, params),
_bank(0), _repartitor(0),
- _progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0)
+ _progress (0),_tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0),_superKstorage(0)
{
setBank (bank);
}
@@ -135,7 +135,7 @@ SortingCountAlgorithm<span>::SortingCountAlgorithm (
)
: Algorithm("dsk", config._nbCores, params),
_config(config), _bank(0), _repartitor(0),
- _progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0)
+ _progress (0),_tmpPartitionsStorage(0), _tmpPartitions(0), _storage(0),_superKstorage(0)
{
setBank (bank);
setRepartitor (repartitor);
@@ -157,8 +157,8 @@ SortingCountAlgorithm<span>::~SortingCountAlgorithm ()
setBank (0);
setRepartitor (0);
setProgress (0);
- setPartitionsStorage (0);
- setPartitions (0);
+ // setPartitionsStorage (0);
+ // setPartitions (0);
setStorage (0);
for (size_t i=0; i<_processors.size(); i++) { _processors[i]->forget(); }
@@ -183,7 +183,8 @@ SortingCountAlgorithm<span>& SortingCountAlgorithm<span>::operator= (const Sorti
setRepartitor (s._repartitor);
setProgress (s._progress);
setPartitionsStorage (s._tmpPartitionsStorage);
- setPartitions (s._tmpPartitions);
+ setPartitions (s._tmpPartitions);
+ _superKstorage = s._superKstorage;
setStorage (s._storage);
}
return *this;
@@ -479,7 +480,7 @@ void SortingCountAlgorithm<span>::configure ()
}
/** We check that the processor is ok, otherwise we build one. */
- if (_processors.size() == 0) { addProcessor (getDefaultProcessor(getInput(), storage)); };
+ if (_processors.size() == 0) { _processors = getDefaultProcessorVector(_config, getInput(), storage, storage); };
DEBUG (("SortingCountAlgorithm<span>::configure END _bank=%p _config.isComputed=%d _repartitor=%p storage=%p\n",
_bank, _config._isComputed, _repartitor, storage
@@ -562,9 +563,26 @@ void SortingCountAlgorithm<span>::execute ()
_progress->finish ();
+
+// pInfo.printInfo();
+
+
/** We want to remove physically the partitions. */
- _tmpPartitions->remove ();
+ if(_config._solidityKind != KMER_SOLIDITY_SUM)
+ _tmpPartitions->remove ();
+
+ u_int64_t totaltmp, biggesttmp, smallesttmp;
+ float meantmp;
+ if(_config._solidityKind == KMER_SOLIDITY_SUM)
+ _superKstorage->getFilesStats(totaltmp,biggesttmp,smallesttmp, meantmp);
+
+ if(_superKstorage!=0)
+ {
+ delete _superKstorage; //delete files and containing dir
+ _superKstorage =0;
+ }
+
/*************************************************************/
/* STATISTICS */
/*************************************************************/
@@ -585,10 +603,29 @@ void SortingCountAlgorithm<span>::execute ()
getInfo()->add (2, "kmers");
getInfo()->add (3, "kmers_nb_valid", "%lld", _bankStats.kmersNbValid);
getInfo()->add (3, "kmers_nb_invalid", "%lld", _bankStats.kmersNbInvalid);
+
+
}
- getInfo()->add (1, "stats");
+
+ u_int64_t nbtotalsuperk = pInfo.getNbSuperKmerTotal();
+ u_int64_t nbtotalk = pInfo.getNbKmerTotal();
+
+ getInfo()->add (1, "stats");
+
+ getInfo()->add (2, "temp_files");
+ getInfo()->add (3, "nb_superkmers","%lld",nbtotalsuperk);
+ getInfo()->add (3, "avg_superk_length","%.2f",(nbtotalk/(float) nbtotalsuperk));
+ getInfo()->add (3, "minimizer_density","%.2f",(nbtotalsuperk/(float)nbtotalk)*(_config._kmerSize - _config._minim_size +2));
+
+ if(_config._solidityKind == KMER_SOLIDITY_SUM)
+ {
+ getInfo()->add (3, "total_size_(MB)","%lld",totaltmp/1024LL/1024LL);
+ getInfo()->add (3, "tmp_file_biggest_(MB)","%lld",biggesttmp/1024LL/1024LL);
+ getInfo()->add (3, "tmp_file_smallest_(MB)","%lld",smallesttmp/1024LL/1024LL);
+ getInfo()->add (3, "tmp_file_mean_(MB)","%.1f",meantmp/1024LL/1024LL);
+ }
/** We dump information about count processors. */
if (_processors.size()==1) { getInfo()->add (2, _processors[0]->getProperties()); }
else
@@ -615,10 +652,151 @@ void SortingCountAlgorithm<span>::execute ()
* a lookup table that has computed the minimizers distribution on a subset of the
* processed bank.
*/
-template<size_t span>
+
+template<size_t span, bool newSuperKmerStorage=true> // newSuperKmerStorage: true saves superkmers into _superkmerFiles, false into _partition
class FillPartitions : public Sequence2SuperKmer<span>
{
public:
+ /** Shortcuts for the types used by this functor. */
+ typedef typename Sequence2SuperKmer<span>::Type Type;
+ typedef typename Sequence2SuperKmer<span>::Model Model;
+ typedef typename Model::Kmer KmerType;
+ typedef typename Kmer<span>::SuperKmer SuperKmer;
+
+ /** Saves a valid superkmer of the current pass into its partition and records statistics about its kx-mers. */
+ void processSuperkmer (SuperKmer& superKmer)
+ {
+ if ((superKmer.minimizer % this->_nbPass) == this->_pass && superKmer.isValid()) //check if falls into pass
+ {
+ /** We get the hash code for the current minimizer.
+ * => this will give us the partition where to dump the superkmer. */
+ size_t p = this->_repartition (superKmer.minimizer);
+
+ /** We save the superkmer into the right partition. */
+ if(newSuperKmerStorage)
+ superKmer.save (_superkmerFiles,p);
+ else
+ superKmer.save ((this->_partition)[p]);
+
+
+ //for debug purposes
+ _local_pInfo.incSuperKmer_per_minimBin (superKmer.minimizer, superKmer.size()); //tocheck
+
+ /*********************************************/
+ /** Now, we compute statistics about kxmers. */
+ /*********************************************/
+
+ Type radix_kxmer_forward ,radix_kxmer ;
+ bool prev_which = superKmer[0].which();
+ size_t kx_size =0;
+
+ radix_kxmer_forward = getHeavyWeight (superKmer[0].value());
+
+ for (size_t ii=1 ; ii < superKmer.size(); ii++)
+ {
+ //compute here stats on kx mer
+ //as long as size <= xmer and which kmer[ii] == which kmer [ii-1] --> it is a kxmer
+ //do the same in sampling : gives ram estimation
+ if (superKmer[ii].which() != prev_which || kx_size >= this->_kx) // kxmer_size = 1 //cost should diminish with larger kxmer
+ {
+ //output kxmer size kx_size,radix_kxmer
+ //kx mer is composed of _superKp[ii-1] _superKp[ii-2] .. _superKp[ii-n] with nb elems n == kxmer_size +1 (a single kmer ==k+0)
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ radix_kxmer = getHeavyWeight (superKmer[ii-1].value());
+ }
+
+ this->_local_pInfo.incKmer_and_rad (p, radix_kxmer.getVal(), kx_size); //nb of superkmer per x per parti per radix
+
+ radix_kxmer_forward = getHeavyWeight (superKmer[ii].value());
+ kx_size =0;
+ }
+ else
+ {
+ kx_size++;
+ }
+
+ prev_which = superKmer[ii].which() ;
+ }
+
+ //record last kx mer
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ radix_kxmer = getHeavyWeight (superKmer[superKmer.size()-1].value());
+ }
+
+ this->_local_pInfo.incKmer_and_rad(p, radix_kxmer.getVal(),kx_size );
+
+ /** We update progression information. */
+ this->_nbWrittenKmers += superKmer.size();
+ }
+ }
+
+ /** Constructor. Initializes both storage caches (_partition and _superkmerFiles) and builds the radix mask. */
+ FillPartitions (
+ Model& model,
+ size_t nbPasses,
+ size_t currentPass,
+ size_t nbPartitions,
+ size_t nbCacheItems,
+ IteratorListener* progress,
+ BankStats& bankStats,
+ Partition<Type>* partition,
+ Repartitor& repartition,
+ PartiInfo<5>& pInfo,
+ SuperKmerBinFiles* superKstorage
+ )
+ : Sequence2SuperKmer<span> (model, nbPasses, currentPass, nbPartitions, progress, bankStats),
+ _kx(4),
+ _extern_pInfo(pInfo) , _local_pInfo(nbPartitions,model.getMmersModel().getKmerSize()),
+ _repartition (repartition)
+ ,_partition (*partition, nbCacheItems, 0), _superkmerFiles(superKstorage,nbCacheItems* sizeof(Type))
+ {
+ _mask_radix.setVal((int64_t) 255);
+ _mask_radix = _mask_radix << ((this->_kmersize - 4)*2); //get first 4 nt of the kmers (heavy weight)
+ }
+
+
+ /** Destructor. Adds the statistics gathered by this instance to the PartiInfo given to the constructor. */
+ ~FillPartitions ()
+ {
+ //printf("destruc fillparti _superkmerFiles %p \n",_superkmerFiles);
+
+ //add to global parti_info
+ _extern_pInfo += _local_pInfo;
+ }
+
+private:
+
+ size_t _kx; // a kx-mer is closed once kx_size reaches this value (see processSuperkmer)
+ PartiInfo<5>& _extern_pInfo; // PartiInfo received by the constructor; updated in the destructor
+ PartiInfo<5> _local_pInfo; // statistics gathered by this instance
+ Type _mask_radix;
+ Repartitor& _repartition;
+
+ /** Shared resources (must support concurrent accesses). */
+ PartitionCacheType <Type> _partition;
+ CacheSuperKmerBinFiles _superkmerFiles;
+
+
+ // Returns the bits of the kmer selected by _mask_radix, shifted down by (_kmersize - 4)*2 bits.
+ Type getHeavyWeight (const Type& kmer) const { return (kmer & this->_mask_radix) >> ((this->_kmersize - 4)*2); }
+};
+
+
+//specialization when newSuperKmerStorage = false
+template<size_t span>
+class FillPartitions <span, false>: public Sequence2SuperKmer<span>
+{
+public:
/** Shortcut. */
typedef typename Sequence2SuperKmer<span>::Type Type;
typedef typename Sequence2SuperKmer<span>::Model Model;
@@ -635,7 +813,14 @@ public:
size_t p = this->_repartition (superKmer.minimizer);
/** We save the superkmer into the right partition. */
- superKmer.save (this->_partition[p]);
+// if(newSuperKmerStorage)
+// superKmer.save (_superkmerFiles,p);
+// else
+ superKmer.save ((this->_partition)[p]);
+
+
+ //for debug purposes
+ _local_pInfo.incSuperKmer_per_minimBin (superKmer.minimizer, superKmer.size()); //tocheck
/*********************************************/
/** Now, we compute statistics about kxmers. */
@@ -706,20 +891,25 @@ public:
BankStats& bankStats,
Partition<Type>* partition,
Repartitor& repartition,
- PartiInfo<5>& pInfo
+ PartiInfo<5>& pInfo,
+ SuperKmerBinFiles* superKstorage
)
: Sequence2SuperKmer<span> (model, nbPasses, currentPass, nbPartitions, progress, bankStats),
_kx(4),
_extern_pInfo(pInfo) , _local_pInfo(nbPartitions,model.getMmersModel().getKmerSize()),
- _repartition (repartition), _partition (*partition, nbCacheItems, 0)
+ _repartition (repartition)
+ ,_partition (*partition, nbCacheItems, 0)/*, _superkmerFiles(superKstorage,nbCacheItems* sizeof(Type))*/
{
_mask_radix.setVal((int64_t) 255);
_mask_radix = _mask_radix << ((this->_kmersize - 4)*2); //get first 4 nt of the kmers (heavy weight)
}
+
/** Destructor. */
~FillPartitions ()
{
+ //printf("destruc fillparti _superkmerFiles %p \n",_superkmerFiles);
+
//add to global parti_info
_extern_pInfo += _local_pInfo;
}
@@ -731,13 +921,152 @@ private:
PartiInfo<5> _local_pInfo;
Type _mask_radix;
Repartitor& _repartition;
-
+
/** Shared resources (must support concurrent accesses). */
PartitionCacheType <Type> _partition;
+// CacheSuperKmerBinFiles _superkmerFiles;
+
+
Type getHeavyWeight (const Type& kmer) const { return (kmer & this->_mask_radix) >> ((this->_kmersize - 4)*2); }
};
+
+ //specialization when newSuperKmerStorage = true
+ template<size_t span>
+ class FillPartitions<span, true> : public Sequence2SuperKmer<span>
+ {
+ public:
+ /** Shortcuts for the types used by this functor. */
+ typedef typename Sequence2SuperKmer<span>::Type Type;
+ typedef typename Sequence2SuperKmer<span>::Model Model;
+ typedef typename Model::Kmer KmerType;
+ typedef typename Kmer<span>::SuperKmer SuperKmer;
+
+ /** Saves a valid superkmer of the current pass into _superkmerFiles and records statistics about its kx-mers. */
+ void processSuperkmer (SuperKmer& superKmer)
+ {
+ if ((superKmer.minimizer % this->_nbPass) == this->_pass && superKmer.isValid()) //check if falls into pass
+ {
+ /** We get the hash code for the current minimizer.
+ * => this will give us the partition where to dump the superkmer. */
+ size_t p = this->_repartition (superKmer.minimizer);
+
+ /** We save the superkmer into the right partition. */
+ superKmer.save (_superkmerFiles,p);
+
+ //for debug purposes
+ _local_pInfo.incSuperKmer_per_minimBin (superKmer.minimizer, superKmer.size()); //tocheck
+
+ /*********************************************/
+ /** Now, we compute statistics about kxmers. */
+ /*********************************************/
+
+ Type radix_kxmer_forward ,radix_kxmer ;
+ bool prev_which = superKmer[0].which();
+ size_t kx_size =0;
+
+ radix_kxmer_forward = getHeavyWeight (superKmer[0].value());
+
+ for (size_t ii=1 ; ii < superKmer.size(); ii++)
+ {
+ //compute here stats on kx mer
+ //as long as size <= xmer and which kmer[ii] == which kmer [ii-1] --> it is a kxmer
+ //do the same in sampling : gives ram estimation
+ if (superKmer[ii].which() != prev_which || kx_size >= this->_kx) // kxmer_size = 1 //cost should diminish with larger kxmer
+ {
+ //output kxmer size kx_size,radix_kxmer
+ //kx mer is composed of _superKp[ii-1] _superKp[ii-2] .. _superKp[ii-n] with nb elems n == kxmer_size +1 (a single kmer ==k+0)
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ radix_kxmer = getHeavyWeight (superKmer[ii-1].value());
+ }
+
+ this->_local_pInfo.incKmer_and_rad (p, radix_kxmer.getVal(), kx_size); //nb of superkmer per x per parti per radix
+
+ radix_kxmer_forward = getHeavyWeight (superKmer[ii].value());
+ kx_size =0;
+ }
+ else
+ {
+ kx_size++;
+ }
+
+ prev_which = superKmer[ii].which() ;
+ }
+
+ //record last kx mer
+ if(prev_which)
+ {
+ radix_kxmer = radix_kxmer_forward;
+ }
+ else // if revcomp, the radix of the kxmer is the beginning of the last kmer
+ {
+ radix_kxmer = getHeavyWeight (superKmer[superKmer.size()-1].value());
+ }
+
+ this->_local_pInfo.incKmer_and_rad(p, radix_kxmer.getVal(),kx_size );
+
+ /** We update progression information. */
+ this->_nbWrittenKmers += superKmer.size();
+ }
+ }
+
+ /** Constructor. The 'partition' argument is not used by this specialization (its _partition member is commented out). */
+ FillPartitions (
+ Model& model,
+ size_t nbPasses,
+ size_t currentPass,
+ size_t nbPartitions,
+ size_t nbCacheItems,
+ IteratorListener* progress,
+ BankStats& bankStats,
+ Partition<Type>* partition,
+ Repartitor& repartition,
+ PartiInfo<5>& pInfo,
+ SuperKmerBinFiles* superKstorage
+ )
+ : Sequence2SuperKmer<span> (model, nbPasses, currentPass, nbPartitions, progress, bankStats),
+ _kx(4),
+ _extern_pInfo(pInfo) , _local_pInfo(nbPartitions,model.getMmersModel().getKmerSize()),
+ _repartition (repartition)
+ ,/*_partition (*partition, nbCacheItems, 0),*/ _superkmerFiles(superKstorage,nbCacheItems* sizeof(Type))
+ {
+ _mask_radix.setVal((int64_t) 255);
+ _mask_radix = _mask_radix << ((this->_kmersize - 4)*2); //get first 4 nt of the kmers (heavy weight)
+ }
+
+
+ /** Destructor. Adds the statistics gathered by this instance to the PartiInfo given to the constructor. */
+ ~FillPartitions ()
+ {
+ //printf("destruc fillparti _superkmerFiles %p \n",_superkmerFiles);
+
+ //add to global parti_info
+ _extern_pInfo += _local_pInfo;
+ }
+
+ private:
+
+ size_t _kx; // a kx-mer is closed once kx_size reaches this value (see processSuperkmer)
+ PartiInfo<5>& _extern_pInfo; // PartiInfo received by the constructor; updated in the destructor
+ PartiInfo<5> _local_pInfo; // statistics gathered by this instance
+ Type _mask_radix;
+ Repartitor& _repartition;
+
+ /** Shared resources (must support concurrent accesses). */
+ //PartitionCacheType <Type> _partition;
+ CacheSuperKmerBinFiles _superkmerFiles;
+
+
+ // Returns the bits of the kmer selected by _mask_radix, shifted down by (_kmersize - 4)*2 bits.
+ Type getHeavyWeight (const Type& kmer) const { return (kmer & this->_mask_radix) >> ((this->_kmersize - 4)*2); }
+ };
+
/*********************************************************************
** METHOD :
** PURPOSE :
@@ -748,86 +1077,121 @@ private:
*********************************************************************/
template<size_t span>
void SortingCountAlgorithm<span>::fillPartitions (size_t pass, Iterator<Sequence>* itSeq, PartiInfo<5>& pInfo)
-{
- TIME_INFO (getTimeInfo(), "fill_partitions");
-
- DEBUG (("SortingCountAlgorithm<span>::fillPartitions _kmerSize=%d _minim_size=%d \n", _config._kmerSize, _config._minim_size));
-
- /** We delete the previous partitions storage. */
- if (_tmpPartitionsStorage) { _tmpPartitionsStorage->remove (); }
-
- /** We build the temporary storage name from the output storage name. */
- string tmpStorageName = getInput()->getStr(STR_URI_OUTPUT_TMP) + "/" + System::file().getTemporaryFilename("dsk_partitions");
- //string tmpStorageName = "./" + System::file().getTemporaryFilename("dsk_partitions");
-
- /** We create the partition files for the current pass. */
- setPartitionsStorage (StorageFactory(STORAGE_TYPE).create (tmpStorageName, true, false));
- setPartitions (0); // close the partitions first, otherwise new files are opened before closing parti from previous pass
- setPartitions ( & (*_tmpPartitionsStorage)().getPartition<Type> ("parts", _config._nb_partitions));
-
- /** We update the message of the progress bar. */
- _progress->setMessage (Stringify::format(progressFormat1, pass+1, _config._nb_passes));
-
- /** We create a kmer model; using the frequency order if we're in that mode */
- uint32_t* freq_order = NULL;
-
- /** We may have to retrieve the minimizers frequencies computed in the RepartitorAlgorithm. */
- if (_config._minimizerType == 1) { freq_order = _repartitor->getMinimizerFrequencies (); }
-
- Model model( _config._kmerSize, _config._minim_size, typename kmer::impl::Kmer<span>::ComparatorMinimizerFrequencyOrLex(), freq_order);
-
- /** We have to reinit the progress instance since it may have been used by SampleRepart before. */
- _progress->init();
-
- /** We may have several input banks instead of a single one. */
- std::vector<Iterator<Sequence>*> itBanks = itSeq->getComposition();
-
- /** We first reset the vector holding the kmers number for each partition and for each bank.
- * It can be seen as the following matrix:
- *
- * part0 part1 part2 ... partJ
- * bank0 xxx xxx xxx xxx
- * bank1 xxx xxx xxx xxx
- * ...
- * bankI xxx xxx xxx xxx
- *
- * Here xxx is the number of items found for the bank I in the partition J
- */
- _nbKmersPerPartitionPerBank.clear();
-
- /** We launch the iteration of the sequences iterator with the created functors. */
- for (size_t i=0; i<itBanks.size(); i++)
- {
- size_t groupSize = 1000;
- bool deleteSynchro = true;
-
- /** We fill the partitions. Each thread will read synchronously and will call FillPartitions
- * in a synchronous way (in order to have global BanksStats correctly computed). */
-
-
- getDispatcher()->iterate (itBanks[i], FillPartitions<span> (
- model, _config._nb_passes, pass, _config._nb_partitions, _config._nb_cached_items_per_core_per_part, _progress, _bankStats, _tmpPartitions, *_repartitor, pInfo
- ), groupSize, deleteSynchro);
-
-
- /** We flush the partitions in order to be sure to have the exact number of items per partition. */
- _tmpPartitions->flush();
-
- /** We get a snapshot of items number in each partition. */
- vector<size_t> nbItems;
- for (size_t p=0; p<_config._nb_partitions; p++)
- {
- nbItems.push_back ((*_tmpPartitions)[p].getNbItems());
- }
-
- /** We add the current number of kmers in each partition for the reached ith bank. */
- _nbKmersPerPartitionPerBank.push_back (nbItems);
+ {
+ TIME_INFO (getTimeInfo(), "fill_partitions");
+ DEBUG (("SortingCountAlgorithm<span>::fillPartitions _kmerSize=%d _minim_size=%d \n", _config._kmerSize, _config._minim_size));
- //GR: close the input bank here with call to finalize
- itBanks[i]->finalize();
- }
-}
+ if(_config._solidityKind != KMER_SOLIDITY_SUM) // every solidity kind except SUM goes through the Partition<Type> storage
+ {
+ /** We delete the previous partitions storage. */
+ if (_tmpPartitionsStorage) { _tmpPartitionsStorage->remove (); }
+
+ /** We build the temporary storage name from the output storage name. */
+ string tmpStorageName = getInput()->getStr(STR_URI_OUTPUT_TMP) + "/" + System::file().getTemporaryFilename("dsk_partitions");
+
+ /** We create the partition files for the current pass. */
+ setPartitionsStorage (StorageFactory(STORAGE_TYPE).create (tmpStorageName, true, false));
+ setPartitions (0); // close the partitions first, otherwise new files are opened before closing parti from previous pass
+ setPartitions ( & (*_tmpPartitionsStorage)().getPartition<Type> ("parts", _config._nb_partitions));
+
+ }
+ else // KMER_SOLIDITY_SUM: superkmers are written through a SuperKmerBinFiles object instead
+ {
+ /** We build the temporary storage name from the output storage name. */
+ _tmpStorageName_superK = getInput()->getStr(STR_URI_OUTPUT_TMP) + "/" + System::file().getTemporaryFilename("superK_partitions");
+
+ /** We delete the existing superkmer storage object (if any) before creating a new one. */
+ if(_superKstorage!=0)
+ {
+ delete _superKstorage;
+ _superKstorage =0;
+ }
+
+ _superKstorage = new SuperKmerBinFiles(_tmpStorageName_superK,"superKparts", _config._nb_partitions) ;
+
+ }
+ /** We update the message of the progress bar. */
+ _progress->setMessage (Stringify::format(progressFormat1, pass+1, _config._nb_passes));
+
+ /** We create a kmer model; using the frequency order if we're in that mode */
+ uint32_t* freq_order = NULL;
+
+ /** We may have to retrieve the minimizers frequencies computed in the RepartitorAlgorithm. */
+ if (_config._minimizerType == 1) { freq_order = _repartitor->getMinimizerFrequencies (); }
+
+ Model model( _config._kmerSize, _config._minim_size, typename kmer::impl::Kmer<span>::ComparatorMinimizerFrequencyOrLex(), freq_order);
+
+ /** We have to reinit the progress instance since it may have been used by SampleRepart before. */
+ _progress->init();
+
+ /** We may have several input banks instead of a single one. */
+ std::vector<Iterator<Sequence>*> itBanks = itSeq->getComposition();
+
+ /** We first reset the vector holding the kmers number for each partition and for each bank.
+ * It can be seen as the following matrix:
+ *
+ * part0 part1 part2 ... partJ
+ * bank0 xxx xxx xxx xxx
+ * bank1 xxx xxx xxx xxx
+ * ...
+ * bankI xxx xxx xxx xxx
+ *
+ * Here xxx is the number of items found for the bank I in the partition J
+ */
+ _nbKmersPerPartitionPerBank.clear();
+
+ /** We launch the iteration of the sequences iterator with the created functors. */
+ for (size_t i=0; i<itBanks.size(); i++)
+ {
+ size_t groupSize = 1000;
+ bool deleteSynchro = true;
+
+ /** We fill the partitions. Each thread will read synchronously and will call FillPartitions
+ * in a synchronous way (in order to have global BanksStats correctly computed). */
+
+ if(_config._solidityKind == KMER_SOLIDITY_SUM)
+ {
+ getDispatcher()->iterate (itBanks[i], FillPartitions<span,true> (
+ model, _config._nb_passes, pass, _config._nb_partitions, _config._nb_cached_items_per_core_per_part, _progress, _bankStats, _tmpPartitions, *_repartitor, pInfo,_superKstorage
+ ), groupSize, deleteSynchro);
+ }
+ else
+ {
+ getDispatcher()->iterate (itBanks[i], FillPartitions<span,false> (
+ model, _config._nb_passes, pass, _config._nb_partitions, _config._nb_cached_items_per_core_per_part, _progress, _bankStats, _tmpPartitions, *_repartitor, pInfo,_superKstorage
+ ), groupSize, deleteSynchro);
+ }
+
+
+ /** We flush the partitions in order to be sure to have the exact number of items per partition. */
+ if(_config._solidityKind != KMER_SOLIDITY_SUM)
+ {
+ _tmpPartitions->flush();
+
+ /** We get a snapshot of items number in each partition. */
+ vector<size_t> nbItems;
+ for (size_t p=0; p<_config._nb_partitions; p++)
+ {
+ nbItems.push_back ((*_tmpPartitions)[p].getNbItems()); //todo for multi count
+ }
+
+ /** We add the current number of kmers in each partition for the reached ith bank. */
+ _nbKmersPerPartitionPerBank.push_back (nbItems);
+ }
+
+ //GR: close the input bank here with call to finalize
+ itBanks[i]->finalize();
+ }
+
+ if(_config._solidityKind == KMER_SOLIDITY_SUM) // once every bank has been iterated, flush and close the superkmer files
+ {
+ _superKstorage->flushFiles();
+ _superKstorage->closeFiles();
+ }
+
+
+ }
/*********************************************************************
** METHOD :
@@ -899,6 +1263,7 @@ void SortingCountAlgorithm<span>::fillSolidKmers_aux (ICountProcessor<span>* pro
/** We update the message of the progress bar. */
_progress->setMessage (Stringify::format (progressFormat2, pass+1, _config._nb_passes));
+
/** We retrieve the list of cores number for dispatching N partitions in N threads.
* We need to know these numbers for allocating the N maps according to the maximum allowed memory.
*/
@@ -970,11 +1335,11 @@ void SortingCountAlgorithm<span>::fillSolidKmers_aux (ICountProcessor<span>* pro
{
if (pool.getCapacity() != 0) { pool.reserve(0); }
- // also allow to use mem pool for oahash ? ou pas la peine
- cmd = new PartitionsByHashCommand<span> (
- (*_tmpPartitions)[p], processorClone, cacheSize, _progress, _fillTimeInfo,
- pInfo, pass, p, _config._nbCores_per_partition, _config._kmerSize, pool, mem
- );
+
+ cmd = new PartitionsByHashCommand<span> (
+ processorClone, cacheSize, _progress, _fillTimeInfo,
+ pInfo, pass, p, _config._nbCores_per_partition, _config._kmerSize, pool, mem,_superKstorage
+ );
}
else
{
@@ -1045,10 +1410,21 @@ void SortingCountAlgorithm<span>::fillSolidKmers_aux (ICountProcessor<span>* pro
}
}
- cmd = new PartitionsByVectorCommand<span> (
- (*_tmpPartitions)[p], processorClone, cacheSize, _progress, _fillTimeInfo,
- pInfo, pass, p, _config._nbCores_per_partition, _config._kmerSize, pool, nbItemsPerBankPerPart
- );
+ if ( _config._solidityKind == KMER_SOLIDITY_SUM)
+ {
+ cmd = new PartitionsByVectorCommand<span> (
+ processorClone, cacheSize, _progress, _fillTimeInfo,
+ pInfo, pass, p, _config._nbCores_per_partition, _config._kmerSize, pool, nbItemsPerBankPerPart,_superKstorage
+ );
+ }
+ else
+ {
+ cmd = new PartitionsByVectorCommand_multibank<span> (
+ (*_tmpPartitions)[p], processorClone, cacheSize, _progress, _fillTimeInfo,
+ pInfo, pass, p, _config._nbCores_per_partition, _config._kmerSize, pool, nbItemsPerBankPerPart
+ );
+ }
+
}
cmds.push_back (cmd);
@@ -1068,6 +1444,11 @@ void SortingCountAlgorithm<span>::fillSolidKmers_aux (ICountProcessor<span>* pro
// free internal memory of pool here
pool.free_all();
}
+
+
+ if(_config._solidityKind == KMER_SOLIDITY_SUM)
+ _superKstorage->closeFiles();
+
}
/*********************************************************************
diff --git a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.hpp b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.hpp
index 8fdbf75..b6937f8 100644
--- a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.hpp
+++ b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.hpp
@@ -229,6 +229,7 @@ private:
/** Handle on the count processor object. */
std::vector<CountProcessor*> _processors;
+
/** Handle on the progress information. */
gatb::core::tools::dp::IteratorListener* _progress;
void setProgress (gatb::core::tools::dp::IteratorListener* progress) { SP_SETATTR(progress); }
@@ -254,6 +255,11 @@ private:
tools::storage::impl::StorageMode_e _storage_type;
tools::storage::impl::Storage* _storage;
void setStorage (tools::storage::impl::Storage* storage) { SP_SETATTR(storage); }
+
+
+ //superkmer efficient storage
+ tools::storage::impl::SuperKmerBinFiles* _superKstorage;
+ std::string _tmpStorageName_superK;
};
/********************************************************************************/
diff --git a/gatb-core/src/gatb/system/api/Exception.hpp b/gatb-core/src/gatb/system/api/Exception.hpp
index 3579b81..4fdbdee 100644
--- a/gatb-core/src/gatb/system/api/Exception.hpp
+++ b/gatb-core/src/gatb/system/api/Exception.hpp
@@ -137,8 +137,23 @@ namespace system {
{
*buffer = 0;
- strerror_r (errno, buffer, BUFSIZ);
- { _message += std::string(" (") + std::string(buffer) + std::string(")"); }
+#ifdef __CYGWIN__
+        // strerror_r doesn't seem to be declared in cygwin, so plain strerror is used instead.
+        // strerror only takes the error number and returns the message itself; it does not fill 'buffer'.
+        const char* ret_buffer = strerror (errno);
+        if (ret_buffer != NULL)
+#else
+#if !defined(__linux__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && !defined(_GNU_SOURCE)) || !defined(__GLIBC__) // XSI-Compliant strerror_r
+        int ret_code = strerror_r (errno, buffer, BUFSIZ);
+        const char* ret_buffer = buffer;
+        if (ret_code == 0)
+#else // GNU's strerror_r might return a static string instead of filling buffer
+        const char* ret_buffer = strerror_r (errno, buffer, BUFSIZ);
+        if (ret_buffer != NULL)
+#endif
+#endif
+        // Whichever branch was compiled, its 'if' guards this block: the message is appended only when it could be retrieved.
+        { _message += std::string(" (") + std::string(ret_buffer) + std::string(")"); }
free(buffer);
}
}
diff --git a/gatb-core/src/gatb/system/impl/FileSystemLinux.cpp b/gatb-core/src/gatb/system/impl/FileSystemLinux.cpp
index 267aea5..a53e909 100644
--- a/gatb-core/src/gatb/system/impl/FileSystemLinux.cpp
+++ b/gatb-core/src/gatb/system/impl/FileSystemLinux.cpp
@@ -57,7 +57,7 @@ public:
u_int64_t tell () { return (isOpen() ? ftello64 (_handle) : 0); }
/** \copydoc IFile::seeko */
- int seeko (u_int64_t offset, int whence) { return (isOpen() ? fseeko64 (_handle, offset, whence) : -1); }
+ int seeko (u_int64_t offset, int whence) { return (isOpen() ? fseek /* cygwin doesn't like fseeko, so fseek is used. NOTE(review): fseek takes a 'long' offset while tell() still uses ftello64 -- confirm offsets beyond LONG_MAX are not needed where long is 32 bits */(_handle, offset, whence) : -1); }
};
/*********************************************************************
diff --git a/gatb-core/src/gatb/system/impl/FileSystemMacos.cpp b/gatb-core/src/gatb/system/impl/FileSystemMacos.cpp
index af7acdc..ac806b3 100644
--- a/gatb-core/src/gatb/system/impl/FileSystemMacos.cpp
+++ b/gatb-core/src/gatb/system/impl/FileSystemMacos.cpp
@@ -57,7 +57,7 @@ public:
u_int64_t tell () { return (isOpen() ? ftello (_handle) : 0); }
/** \copydoc IFile::seeko */
- int seeko (u_int64_t offset, int whence) { return (isOpen() ? fseeko (_handle, offset, whence) : -1); }
+ int seeko (u_int64_t offset, int whence) { return (isOpen() ? fseek /* cygwin doesn't like fseeko, so fseek is used. NOTE(review): fseek takes a 'long' offset while tell() still uses ftello -- confirm offsets beyond LONG_MAX are not needed where long is 32 bits */ (_handle, offset, whence) : -1); }
};
/*********************************************************************
diff --git a/gatb-core/src/gatb/system/impl/SystemInfoCommon.cpp b/gatb-core/src/gatb/system/impl/SystemInfoCommon.cpp
index 0a6e790..64b4b49 100644
--- a/gatb-core/src/gatb/system/impl/SystemInfoCommon.cpp
+++ b/gatb-core/src/gatb/system/impl/SystemInfoCommon.cpp
@@ -121,7 +121,6 @@ std::string SystemInfoCommon::getBuildSystem () const { return STR_OPERATING_SYS
#include <unistd.h>
#include <sys/resource.h>
#include <sys/times.h>
-#include <sys/vtimes.h>
/********************************************************************************/
size_t SystemInfoLinux::getNbCores () const
diff --git a/gatb-core/src/gatb/system/impl/ThreadLinux.cpp b/gatb-core/src/gatb/system/impl/ThreadLinux.cpp
index 79cef64..544a3ab 100644
--- a/gatb-core/src/gatb/system/impl/ThreadLinux.cpp
+++ b/gatb-core/src/gatb/system/impl/ThreadLinux.cpp
@@ -44,7 +44,20 @@ namespace gatb { namespace core { namespace system { namespace impl {
class ThreadLinux : public IThread, public system::SmartPointer
{
public:
- ThreadLinux (void* (mainloop) (void*), void* data) { pthread_create (&_thread, NULL, mainloop, data); }
+ ThreadLinux (void* (mainloop) (void*), void* data) {
+     // Creates the thread running mainloop(data) with an explicitly requested stack size.
+     //set stack size to 8 MB
+     pthread_attr_t tattr;
+     int ret = pthread_attr_init ( &tattr ) ;
+     size_t size = 4096*2000 ; // must be multiple of page size
+     ret = pthread_attr_setstacksize(&tattr, size);
+     // The attributes have to be passed to pthread_create, otherwise the stack size set above is never applied.
+     pthread_create (&_thread, &tattr, mainloop, data);
+     // The attribute object is destroyed right after the creation call.
+     pthread_attr_destroy(&tattr);
+
+ }
+
~ThreadLinux () { /* pthread_detach (_thread); */ }
void join () { pthread_join (_thread, NULL); }
Id getId () const { return (Id) _thread; }
diff --git a/gatb-core/src/gatb/system/impl/ThreadMacos.cpp b/gatb-core/src/gatb/system/impl/ThreadMacos.cpp
index 2abfdcf..2d770ed 100644
--- a/gatb-core/src/gatb/system/impl/ThreadMacos.cpp
+++ b/gatb-core/src/gatb/system/impl/ThreadMacos.cpp
@@ -44,7 +44,21 @@ namespace gatb { namespace core { namespace system { namespace impl {
class ThreadMacos : public IThread, public system::SmartPointer
{
public:
- ThreadMacos (void* (mainloop) (void*), void* data) { pthread_create (&_thread, NULL, mainloop, data); }
+ ThreadMacos (void* (mainloop) (void*), void* data) {
+ // Creates the thread running mainloop(data) with an explicitly requested stack size.
+ // NOTE(review): the values stored in 'ret' are never checked.
+ //set stack size to 8 MB
+ pthread_attr_t tattr;
+ int ret = pthread_attr_init ( &tattr ) ;
+ size_t size = 4096*2000 ; // must be multiple of page size
+ ret = pthread_attr_setstacksize(&tattr, size);
+ // The attributes are passed to pthread_create so that the requested stack size is used.
+ pthread_create (&_thread,&tattr, mainloop, data);
+ // The attribute object is destroyed right after the creation call.
+ pthread_attr_destroy(&tattr);
+
+ }
+
~ThreadMacos () { /* pthread_detach (_thread); */ }
void join () { pthread_join (_thread, NULL); }
Id getId () const { return (Id) _thread; }
diff --git a/gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in b/gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in
index ec4477f..30c249f 100644
--- a/gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in
+++ b/gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in
@@ -1,6 +1,7 @@
#include <gatb/bcalm2/bcalm_algo.cpp>
#include <gatb/bcalm2/bglue_algo.cpp>
#include <gatb/bcalm2/ograph.cpp>
+#include <gatb/debruijn/impl/LinkTigs.cpp>
using namespace gatb::core::kmer;
using namespace gatb::core::kmer::impl;
@@ -22,14 +23,17 @@ template void bcalm2<${KSIZE}>(Storage* storage,
template void bglue<${KSIZE}>(Storage* storage,
std::string prefix,
int kmerSize,
- int minSize,
int nb_threads,
- int minimizer_type,
bool verbose
);
template class graph3<${KSIZE}>; // graph3<span> switch
+template void link_tigs<${KSIZE}>
+ (std::string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose);
+
+template void link_unitigs_pass<${KSIZE}>(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize);
+
/********************************************************************************/
} } } } /* end of namespaces. */
diff --git a/gatb-core/src/gatb/tools/.DS_Store b/gatb-core/src/gatb/tools/.DS_Store
deleted file mode 100644
index e3670ef..0000000
Binary files a/gatb-core/src/gatb/tools/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/collections/impl/Hash16.hpp b/gatb-core/src/gatb/tools/collections/impl/Hash16.hpp
index 6e1ce68..e2792a7 100644
--- a/gatb-core/src/gatb/tools/collections/impl/Hash16.hpp
+++ b/gatb-core/src/gatb/tools/collections/impl/Hash16.hpp
@@ -137,6 +137,11 @@ public:
_memory.memset (datah,0, tai * sizeof(cell_ptr_t));
}
+ u_int64_t getByteSize() // Returns the byte size reported by the 'storage' member.
+ {
+ return storage.getByteSize();
+ }
+
/** Destructor */
~Hash16()
{
diff --git a/gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp b/gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp
index 9dd4112..cab31e5 100644
--- a/gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp
+++ b/gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp
@@ -169,7 +169,7 @@ public:
*/
IterableFile (const std::string& filename, size_t cacheItemsNb=10000)
: _filename(filename), _cacheItemsNb (cacheItemsNb),
- _file(0) // hacking my own iterator, for getItems, separate from IteratorFile. dirty, but nothing used to work at all
+ _file(0) // hacking my own iterator, for getItems, separate from IteratorFile. dirty, but nothing used to work at all. _file is used in getItems() only
{
// if the file doesn't exist (meaning that BagFile hasn't created it yet), let's create it just for the sake of it. but then we'll open it just for reading
if (!system::impl::System::file().doesExist(filename))
@@ -177,8 +177,9 @@ public:
auto _file2 = system::impl::System::file().newFile (filename, "wb");
delete _file2;
}
-
- _file = system::impl::System::file().newFile (filename, "rb");
+ /* _file should be initialized here but actually, the iterator() method will also create its own file.
+ * so, instead of opening _file here, let's wait until getItems() is actually called (sometimes it won't).
+ */
}
/** Destructor. */
@@ -211,7 +212,8 @@ public:
*/
size_t getItems (Item*& buffer, size_t start, size_t nb)
{
- if (_file == 0) {std::cout << "cannot call getItems when _file is null" << std::endl; exit(1); }
+ if (_file == 0)
+ _file = system::impl::System::file().newFile (_filename, "rb");
DEBUG_ITERATORFILE(std::cout << "want to read " << nb << " elements of size " << sizeof(Item) << " at position " << _file->tell() << " file size " << _file->getSize() /*<< " then write them to buffer at position " << (sizeof(Item) * start) << std::endl*/;)
size_t n = _file->fread (buffer /*+ sizeof(Item) * start*/, sizeof(Item), nb);
DEBUG_ITERATORFILE(std::cout << "read " << n << " elements" << std::endl;)
diff --git a/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp b/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp
index 5aaefad..69a7a10 100644
--- a/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp
+++ b/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp
@@ -15,7 +15,7 @@
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
-*****************************************************************************/
+ *****************************************************************************/
/** \file MapMPHF.hpp
* \date 01/03/2013
@@ -35,113 +35,208 @@
/********************************************************************************/
namespace gatb {
-namespace core {
-namespace tools {
-namespace collections {
-namespace impl {
-/********************************************************************************/
-
-/** \brief hash table implementation
- *
- * This hash table implementation uses a minimal perfect hash function (MPHF) for
- * identifying the keys with a unique number in [0..N-1] where N is the number of items.
- *
- * Using BooPHF, the memory usage is about 3-4 bits per key.
- *
- * The values can be stored in a simple vector. The keys are not stored in memory, only
- * the mphf is needed.
- *
- * Note that such an implementation can't afford to add items into the map (it's static).
- */
-template <class Key, class Value, class Adaptator=AdaptatorDefault<Key> >
-class MapMPHF : public system::SmartPointer
-{
-public:
-
- /** Hash type. */
- typedef BooPHF<Key, Adaptator> Hash;
-
- /** Default constructor. */
- MapMPHF () : hash() {}
-
- /** Build the hash function from a set of items.
- * \param[in] keys : iterable over the keys of the hash table
- * \param[in] progress : listener called during the building of the MPHF
- */
- void build (tools::collections::Iterable<Key>& keys, int nbThreads = 1, tools::dp::IteratorListener* progress=0)
- {
- /** We build the hash function. */
- hash.build (&keys, nbThreads, progress);
-
- /** We resize the vector of Value objects. */
- data.resize (keys.getNbItems());
- clearData();
- }
-
- /* use the hash from another MapMPHF class. hmm is this smartpointer legit?
- * also allocate n/x data elements
- */
- void useHashFrom (MapMPHF *other, int x = 1)
- {
- hash = other->hash;
-
- /** We resize the vector of Value objects. */
- data.resize ((unsigned long)((hash.size()) / (unsigned long)x) + 1LL); // that +1 and not (hash.size+x-1) / x
-
- clearData();
- }
-
- /** Save the hash function into a Group object.
- * \param[out] group : group where to save the MPHF
- * \param[in] name : name of the saved MPHF
- * \return the number of bytes of the saved data.
- */
- size_t save (tools::storage::impl::Group& group, const std::string& name) { return hash.save (group, name); }
-
- /** Load hash function from a Group
- * \param[in] group : group where to load the MPHF from
- * \param[in] name : name of the MPHF
- */
- void load (tools::storage::impl::Group& group, const std::string& name)
- {
- /** We load the hash function. */
- size_t nbKeys = hash.load (group, name);
-
- /** We resize the vector of Value objects. */
- data.resize (nbKeys);
- clearData();
- }
-
- /** Get the value for a given key
- * \param[in] key : the key
- * \return the value associated to the key. */
- Value& operator[] (const Key& key) { return data[hash(key)]; }
-
- /** Get the value for a given index
- * \param[in] code : the key
- * \return the value associated to the key. */
- Value& at (typename Hash::Code code) { return data[code]; }
-
- /** Get the hash code of the given key. */
- typename Hash::Code getCode (const Key& key) { return hash(key); }
-
- /** Get the number of keys.
- * \return keys number. */
- size_t size() const { return hash.size(); }
-
- void clearData() {
- for (unsigned long i = 0; i < data.size(); i ++)
- data[i] = 0;
- }
-
-private:
-
- Hash hash;
- std::vector<Value> data;
-};
-
-/********************************************************************************/
-} } } } } /* end of namespaces. */
+ namespace core {
+ namespace tools {
+ namespace collections {
+ namespace impl {
+ /********************************************************************************/
+
+ /** \brief hash table implementation
+ *
+ * This hash table implementation uses a minimal perfect hash function (MPHF) for
+ * identifying the keys with a unique number in [0..N-1] where N is the number of items.
+ *
+ * Using BooPHF, the memory usage is about 3-4 bits per key.
+ *
+ * The values can be stored in a simple vector. The keys are not stored in memory, only
+ * the mphf is needed.
+ *
+ * Note that such an implementation can't afford to add items into the map (it's static).
+ */
+ template <class Key, class Value, class Adaptator=AdaptatorDefault<Key> >
+ class MapMPHF : public system::SmartPointer
+ {
+ public:
+
+ /** Hash type. */
+ typedef BooPHF<Key, Adaptator> Hash;
+
+ /** Default constructor. */
+ MapMPHF () : hash() {}
+
+ /** Build the hash function from a set of items, then size and zero the value vector.
+ * \param[in] keys : iterable over the keys of the hash table
+ * \param[in] nbThreads : passed unchanged to hash.build() (default 1)
+ * \param[in] progress : listener called during the building of the MPHF */
+ void build (tools::collections::Iterable<Key>& keys, int nbThreads = 1, tools::dp::IteratorListener* progress=0)
+ {
+ /** We build the hash function. */
+ hash.build (&keys, nbThreads, progress);
+ // One value slot per item reported by keys.getNbItems().
+ /** We resize the vector of Value objects. */
+ data.resize (keys.getNbItems());
+ clearData();
+ initDiscretizationScheme(); // fills _abundanceDiscretization, which operator[] reads
+ }
+
+
+
+ // discretization scheme to store abundance values from 0 to 50000 on 8 bits
+ // with 5% error maximum
+ // from 0 to 70 : step = 1 (70 buckets)
+ // from 70 to 100 : step = 2 (15 buckets)
+ // from 100 to 500 : step = 10 (40 buckets)
+ // from 500 to 1000 : step = 20 (25 buckets)
+ // from 1000 to 5000 : step = 100 (40 buckets)
+ // from 5000 to 10000 : step = 200 (25 buckets)
+ // from 10000 to 50000 : step = 1000 (40 buckets)
+ //
+ //to change discretization scheme, change the values in _abundanceDiscretization below
+
+ void initDiscretizationScheme() // fills the 257-entry table of bucket bounds read by operator[]
+ {
+ _abundanceDiscretization.resize(257);
+ // total = running bucket bound, idx = next slot to fill (slot 0 holds bound 0)
+ int total =0;
+ _abundanceDiscretization[0] = 0;
+ int idx=1;
+ for(int ii=1; ii<= 70; ii++,idx++ )
+ {
+ total += 1;
+ _abundanceDiscretization[idx] = total ;
+ }
+ // slots 1..70 now hold 1..70; the next loops continue with steps 2, 10, 20, 100, 200, 1000
+ for(int ii=1; ii<= 15; ii++,idx++ )
+ {
+ total += 2;
+ _abundanceDiscretization[idx] = total ;
+ }
+
+ for(int ii=1; ii<= 40; ii++,idx++ )
+ {
+ total += 10;
+ _abundanceDiscretization[idx] = total ;
+ }
+ for(int ii=1; ii<= 25; ii++,idx++ )
+ {
+ total += 20;
+ _abundanceDiscretization[idx] = total ;
+ }
+ for(int ii=1; ii<= 40; ii++,idx++ )
+ {
+ total += 100;
+ _abundanceDiscretization[idx] = total ;
+ }
+ for(int ii=1; ii<= 25; ii++,idx++ )
+ {
+ total += 200;
+ _abundanceDiscretization[idx] = total ;
+ }
+ for(int ii=1; ii<= 40; ii++,idx++ )
+ {
+ total += 1000;
+ _abundanceDiscretization[idx] = total ;
+ }
+ // 70+15+40+25+40+25+40 = 255 slots filled: idx is 256 and total is 50000 at this point
+ // Slot 256 is the upper sentinel read by operator[] as the end of the last bucket.
+ // NOTE(review): the table is a std::vector<int>, so UINT_MAX is converted to int on
+ // assignment (usually -1) - confirm that this is the intended sentinel value.
+ _abundanceDiscretization[256] = UINT_MAX;
+
+
+
+
+
+ /*
+ for(int ii=0; ii<_abundanceDiscretization.size(); ii++ )
+ {
+ printf("disc[%i] = %i \n", ii,_abundanceDiscretization[ii] );
+ }
+ */
+
+ }
+
+	/** Reuse the hash function of another MapMPHF and allocate hash.size()/x + 1 values.
+	 * \param[in] other : map whose hash function is copied (its values are not copied)
+	 * \param[in] x : divisor applied to the number of keys when sizing the value vector */
+	void useHashFrom (MapMPHF *other, int x = 1)
+	{
+		hash = other->hash;
+		/** We resize the vector of Value objects. */
+		data.resize ((unsigned long)((hash.size()) / (unsigned long)x) + 1LL); // that +1 and not (hash.size+x-1) / x
+		clearData();
+		// build() and load() fill the table read by operator[]; do it here too so it is never empty
+		initDiscretizationScheme();
+	}
+
+ /** Save the hash function into a Group object.
+ * \param[out] group : group where to save the MPHF
+ * \param[in] name : name of the saved MPHF
+ * \return the number of bytes of the saved data.
+ */
+ size_t save (tools::storage::impl::Group& group, const std::string& name) { return hash.save (group, name); }
+
+ /** Load hash function from a Group, then size and zero the value vector.
+ * \param[in] group : group where to load the MPHF from
+ * \param[in] name : name of the MPHF
+ */
+ void load (tools::storage::impl::Group& group, const std::string& name)
+ {
+ /** We load the hash function. */
+ size_t nbKeys = hash.load (group, name);
+ // The value returned by hash.load() is used as the number of value slots.
+ /** We resize the vector of Value objects. */
+ data.resize (nbKeys);
+ clearData();
+ initDiscretizationScheme(); // fills _abundanceDiscretization, which operator[] reads
+ }
+
+	/** Get the decoded abundance for a given key.
+	 * \param[in] key : the key
+	 * \return midpoint (rounded down) of the abundance bucket whose index is stored for the key. */
+	int operator[] (const Key& key) {
+		const size_t bucket = data[hash(key)]; // the stored value is a bucket index; hash the key only once
+		return floorf((_abundanceDiscretization [bucket] + _abundanceDiscretization [bucket+1])/2.0);
+	}
+
+ /** Get the raw stored value for a given hash code (no abundance decoding, unlike operator[])
+ * \param[in] code : index into the value vector, of the type returned by getCode()
+ * \return reference to the value stored at that index. */
+ Value& at (typename Hash::Code code) {
+ return data[code];
+
+ }
+
+ // Same raw access, but the index is computed from the key with the hash function.
+ Value& at (const Key& key) {
+ return data[hash(key)];
+ }
+
+
+ /** Get the hash code of the given key. */
+ typename Hash::Code getCode (const Key& key) { return hash(key); }
+
+ /** Get the number of keys.
+ * \return keys number. */
+ size_t size() const { return hash.size(); }
+
+ void clearData() { // assigns 0 to every stored value; the vector keeps its size
+ for (unsigned long i = 0; i < data.size(); i ++)
+ data[i] = 0;
+ }
+
+ std::vector<int> _abundanceDiscretization;
+
+ private:
+
+ Hash hash;
+ std::vector<Value> data;
+
+
+ };
+
+ /********************************************************************************/
+ } } } } } /* end of namespaces. */
/********************************************************************************/
#endif /* _GATB_CORE_TOOLS_COLLECTION_MAP_MPHF_HPP_ */
diff --git a/gatb-core/src/gatb/tools/compression/DnaCoder.cpp b/gatb-core/src/gatb/tools/compression/DnaCoder.cpp
new file mode 100644
index 0000000..07ee663
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/DnaCoder.cpp
@@ -0,0 +1,1784 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+#include "DnaCoder.hpp"
+
+/*
+#define PRINT_DEBUG_EXTMUTA
+#define PRINT_DEBUG_ENCODER
+#define PRINT_DEBUG_DECODER
+*/
+
+
+char bin2NTrev[4] = {'T','G','A','C'};
+//char bin2NT[4] = {'A','C','T','G'};
+
+/*
+GACGCGCCGATATAACGCGCTTTCCCGGCTTTTACCACGTCGTTGAGGGCTTCCAGCGTCTCTTCGATCGGCGTGTTGTAATCCCAGCGATGAATTTG6:2308q
+ Anchor pos: 8
+ Anchor: GATATAACGCGCTTTCCCGGCTTTTACCACG
+ § Anchor: 8
+*/
+
+//====================================================================================
+// ** AbstractDnaCoder
+//====================================================================================
+AbstractDnaCoder::AbstractDnaCoder(Leon* leon) : // sets up the coding models shared by encoder and decoder
+_kmerModel(leon->_kmerSize),
+_readTypeModel(2), //only 2 value in this model: read with anchor or without anchor
+//_isPrevReadAnchorableModel(2),
+_noAnchorReadModel(5), _bifurcationModel(5), //5value: A, C, G, T, N
+_bifurcationBinaryModel(2), //0 or 1 (lowest or highest bifurcation by alphabetical order)
+_readAnchorRevcompModel(2),
+_readSizeDeltaTypeModel(3),
+_anchorPosDeltaTypeModel(3),
+_anchorAddressDeltaTypeModel(3),
+_NposDeltaTypeModel(3),
+_errorPosDeltaTypeModel(3),_seqId(0)
+{
+ _leon = leon;
+ _bloom = _leon->_bloom; // bloom filter and k-mer size are taken from the Leon instance
+ _kmerSize = _leon->_kmerSize;
+ // Every numeric model vector receives NB_MODELS_PER_NUMERIC instances of Order0Model(256).
+
+ for(int i=0; i<CompressionUtils::NB_MODELS_PER_NUMERIC; i++){
+ _anchorAddressModel.push_back(Order0Model(256));
+ //_isPrevReadAnchorablePosModel.push_back(Order0Model(256));
+ _anchorPosModel.push_back(Order0Model(256));
+ _noAnchorReadSizeValueModel.push_back(Order0Model(256));
+ _readSizeValueModel.push_back(Order0Model(256));
+ _NposModel.push_back(Order0Model(256));
+ _leftErrorPosModel.push_back(Order0Model(256));
+ //_rightErrorPosModel.push_back(Order0Model(256));
+ _numericModel.push_back(Order0Model(256));
+ _leftErrorModel.push_back(Order0Model(256));
+ //_rightErrorModel.push_back(Order0Model(256));
+ }
+
+
+}
+
+void AbstractDnaCoder::startBlock(){
+ //_prevSequences = NULL;
+ // Clears every model, then resets the previous-value trackers and the per-block read counter.
+ for(int i=0; i<CompressionUtils::NB_MODELS_PER_NUMERIC; i++){
+ _anchorAddressModel[i].clear();
+ //_isPrevReadAnchorablePosModel[i].clear();
+ _anchorPosModel[i].clear();
+ _noAnchorReadSizeValueModel[i].clear();
+ _readSizeValueModel[i].clear();
+ _NposModel[i].clear();
+ _leftErrorPosModel[i].clear();
+ //_rightErrorPosModel[i].clear();
+ _numericModel[i].clear();
+ _leftErrorModel[i].clear();
+ //_rightErrorModel[i].clear();
+ }
+ _readTypeModel.clear();
+ //_isPrevReadAnchorableModel.clear();
+ _noAnchorReadModel.clear();
+ _bifurcationModel.clear();
+ _bifurcationBinaryModel.clear();
+ _readAnchorRevcompModel.clear();
+ _readSizeDeltaTypeModel.clear();
+ _anchorPosDeltaTypeModel.clear();
+ _anchorAddressDeltaTypeModel.clear();
+ _NposDeltaTypeModel.clear();
+ _errorPosDeltaTypeModel.clear();
+ _prevReadSize = 0;
+ _prevAnchorPos = 0;
+ _prevAnchorAddress = 0;
+ _prevNpos = 0;
+ _prevErrorPos = 0;
+ // number of reads handled since the block started (incremented by endRead)
+ _processedSequenceCount = 0;
+}
+
+void AbstractDnaCoder::endRead(){
+	++_processedSequenceCount; // one more read counted in the block opened by startBlock()
+}
+
+void AbstractDnaCoder::codeSeedBin(KmerModel* model, kmer_type* kmer, int nt, bool right){
+	// Updates *kmer in place with the result of model->codeSeedRight() for the binary
+	// nucleotide code nt, working on the k-mer itself (right) or on its reverse complement.
+	KmerModel::Kmer seed;
+
+	if(!right)
+	{
+		// Left side: go through the reverse complement, use binrev[nt] as the nucleotide,
+		// then take the reverse complement of the result again.
+		seed.set (revcomp(*kmer, _kmerSize));
+		kmer_type extended = model->codeSeedRight (seed, binrev[nt], Data::INTEGER).value();
+		*kmer = revcomp(extended, _kmerSize);
+		return;
+	}
+
+	// Right side: the k-mer and the nucleotide code are used as they are.
+	seed.set (*kmer);
+	*kmer = model->codeSeedRight (seed, nt, Data::INTEGER).value();
+}
+
+void AbstractDnaCoder::codeSeedNT(KmerModel* model, kmer_type* kmer, char nt, bool right){
+	// Character front-end of codeSeedBin: nt is converted with Leon::nt2bin first.
+	codeSeedBin(model, kmer, Leon::nt2bin(nt), right);
+}
+
+
+void AbstractDnaCoder::addErrorPos(int pos, bool rightExtend){
+ // rightExtend is not used: every position is appended to _leftErrorPos.
+ _leftErrorPos.push_back(pos);
+}
+
+//====================================================================================
+// ** DnaEncoder
+//====================================================================================
+DnaEncoder::DnaEncoder(Leon* leon) :
+AbstractDnaCoder(leon), _itKmer(_kmerModel), _totalDnaSize(0), _readCount(0), _MCtotal(0), _readWithoutAnchorCount(0),
+_MCuniqSolid (0), _MCuniqNoSolid(0), _MCnoAternative(0), _MCmultipleSolid(0)//, _MCmultipleNoSolid(0)
+{
+ _maxSequenceSize = 0;
+ _minSequenceSize = INT_MAX;
+ // The id is the value of Leon's shared counter before this atomic increment.
+ _thread_id = __sync_fetch_and_add (&_leon->_nb_thread_living, 1);
+ // NOTE(review): unlike the copy constructor, startBlock() is not called here - confirm this is intended.
+#ifdef PRINT_DISTRIB
+ _distrib.resize(maxSequences);
+ _outDistrib = 0;
+#endif
+
+ //for quals
+ if(! leon->_isFasta)
+ {
+ _max_read_size = 10000;
+ _nb_solids = (int *) malloc(_max_read_size * sizeof(int) );
+ _qualseq = (char *) malloc(_max_read_size*sizeof(char ));
+ _bufferQuals_size = _leon->getReadPerBlock()* 200;
+ _bufferQuals = (char *) malloc(_bufferQuals_size * sizeof(char ));
+ _bufferQuals_idx=0;
+ // initial values of the two settings read by smoothQuals()
+ _trunc_mode = true;
+ _smoothing_threshold = 2;
+
+ }
+}
+
+DnaEncoder::DnaEncoder(const DnaEncoder& copy) :
+AbstractDnaCoder(copy._leon), _itKmer(_kmerModel),
+ _totalDnaSize(0), _readCount(0), _MCtotal(0), _readWithoutAnchorCount(0),
+_MCuniqSolid (0), _MCuniqNoSolid(0), _MCnoAternative(0), _MCmultipleSolid(0)//, _MCmultipleNoSolid(0)
+{
+ _maxSequenceSize = 0;
+ _minSequenceSize = INT_MAX;
+ // Only the Leon pointer is taken from `copy`: every counter above starts at 0.
+#ifdef PRINT_DISTRIB
+ _distrib.resize(maxSequences);
+ _outDistrib = 0;
+#endif
+ // The id is the value of Leon's shared counter before this atomic increment.
+ _thread_id = __sync_fetch_and_add (&_leon->_nb_thread_living, 1);
+
+ startBlock();
+
+ // Quality buffers exist only when the input is not FASTA.
+ //for quals
+ if(! _leon->_isFasta)
+ {
+ _max_read_size = 10000;
+ _nb_solids = (int *) malloc(_max_read_size * sizeof(int) );
+ _qualseq = (char *) malloc(_max_read_size*sizeof(char ));
+ _bufferQuals_size = _leon->getReadPerBlock()* 200;
+ _bufferQuals = (char *) malloc(_bufferQuals_size * sizeof(char ));
+ //printf("initial buffer qual size %i \n",_bufferQuals_size );
+
+ _bufferQuals_idx =0;
+
+ _trunc_mode = true;
+ _smoothing_threshold = 2;
+ }
+
+ ///
+
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder1.updateModel = false;
+ _rangeEncoder2.updateModel = false;
+ _rangeEncoder3.updateModel = false;
+ _rangeEncoder4.updateModel = false;
+ _rangeEncoder5.updateModel = false;
+ _rangeEncoder6.updateModel = false;
+ #endif
+
+}
+
+DnaEncoder::~DnaEncoder(){
+ // Writes the pending block when _thread_id is non-zero and (_seqId+1) is not a multiple of the block size.
+ if(_thread_id!=0 && (_seqId+1) % _leon->getReadPerBlock() != 0 ){
+ writeBlock();
+ }
+ //int nb_remaining =
+ __sync_fetch_and_add (&_leon->_nb_thread_living, -1);
+ // The counters of this instance are added atomically to the totals held by Leon.
+ //printf("\~ this decoder %lli seq %lli mctotal %lli mltnos %p tid %i \n",_readCount,_MCtotal,_MCmultipleNoSolid,this,_thread_id);
+ __sync_fetch_and_add(&_leon->_readCount, _readCount);
+ __sync_fetch_and_add(&_leon->_MCtotal, _MCtotal);
+ __sync_fetch_and_add(&_leon->_readWithoutAnchorCount, _readWithoutAnchorCount);
+ __sync_fetch_and_add(&_leon->_totalDnaSize, _totalDnaSize);
+ __sync_fetch_and_add(&_leon->_MCuniqSolid, _MCuniqSolid);
+ __sync_fetch_and_add(&_leon->_MCuniqNoSolid, _MCuniqNoSolid);
+ __sync_fetch_and_add(&_leon->_MCnoAternative, _MCnoAternative);
+ __sync_fetch_and_add(&_leon->_MCmultipleSolid, _MCmultipleSolid);
+
+ _leon->updateMinMaxSequenceSize(_minSequenceSize,_maxSequenceSize);
+
+
+ //__sync_fetch_and_add(&_leon->_MCmultipleNoSolid, _MCmultipleNoSolid);
+
+ #ifdef LEON_PRINT_STAT
+ __sync_fetch_and_add(&_leon->_anchorAdressSize, _rangeEncoder3.getBufferSize());
+ __sync_fetch_and_add(&_leon->_anchorPosSize, _rangeEncoder2.getBufferSize());
+ __sync_fetch_and_add(&_leon->_readSizeSize, _rangeEncoder1.getBufferSize());
+ __sync_fetch_and_add(&_leon->_bifurcationSize, _rangeEncoder4.getBufferSize());
+ __sync_fetch_and_add(&_leon->_otherSize, _rangeEncoder6.getBufferSize());
+ __sync_fetch_and_add(&_leon->_noAnchorSize, _rangeEncoder5.getBufferSize());
+
+
+ _rangeEncoder1.clear();
+ _rangeEncoder2.clear();
+ _rangeEncoder3.clear();
+ _rangeEncoder4.clear();
+ _rangeEncoder5.clear();
+ _rangeEncoder6.clear();
+ #endif
+
+ // Releases the buffers that the constructors allocate under the same condition.
+ //for quals
+ if(! _leon->_isFasta)
+ {
+ free(_nb_solids);
+ free(_qualseq);
+ free(_bufferQuals);
+ }
+}
+
+void DnaEncoder::operator()(Sequence& sequence){
+ // Encodes one sequence and writes the block out once getReadPerBlock() reads have been processed.
+#ifdef PRINT_DISTRIB
+ if(_sequences.size() > maxSequences){
+ _sequences.pop_back();
+ }
+#endif
+ // Cache what execute() and its helpers read from the sequence.
+ _sequence = &sequence;
+ //cout << _sequence->getIndex() << endl;
+ _seqId = _sequence->getIndex() ;
+ _readSize = _sequence->getDataSize();
+ _readseq = _sequence->getDataBuffer();
+
+ _totalDnaSize += _readSize ;
+
+ // running min/max read size of this instance (reported to Leon by the destructor)
+ _minSequenceSize = std::min(_minSequenceSize, (int) _readSize);
+ _maxSequenceSize = std::max(_maxSequenceSize, (int)_readSize);
+
+
+ //_lastSequenceIndex = sequence->getIndex();
+
+// if(_sequence->getIndex() % Leon::READ_PER_BLOCK == 0){
+
+ execute();
+
+ //_prevSequences = _sequence;
+#ifdef PRINT_DISTRIB
+ _sequences.insert(_sequences.begin(), _sequence);
+#endif
+ // Block complete: write it, then reset the models and the per-block state.
+ if(_processedSequenceCount >= _leon->getReadPerBlock() ){
+
+ writeBlock();
+ startBlock();
+ }
+
+}
+
+void DnaEncoder::writeBlock(){
+ if(_processedSequenceCount == 0) return;
+ // Nothing is written for an empty block (early return above).
+ if(_rangeEncoder.getBufferSize() > 0){
+ _rangeEncoder.flush();
+ }
+ // The block id is derived from the index of the last sequence seen by this encoder.
+ int blockId = ( _seqId / _leon->getReadPerBlock()) ;
+ //printf("\nTid %i WB : blockid %i sid %llu size: %llu _processedSequenceCount %i\n",_thread_id, blockId, _seqId, _rangeEncoder.getBufferSize(),_processedSequenceCount );
+
+ //_leon->_realDnaCompressedSize += _rangeEncoder.getBufferSize();
+ _leon->writeBlock(_rangeEncoder.getBuffer(), _rangeEncoder.getBufferSize(), _processedSequenceCount,blockId,false);
+ _rangeEncoder.clear();
+ // When the input is not FASTA, the quality buffer is written under the same block id, then rewound.
+ if(! _leon->_isFasta)
+ {
+ _leon->writeBlockLena((u_int8_t*) _bufferQuals, _bufferQuals_idx ,_processedSequenceCount, blockId);
+ _bufferQuals_idx = 0;
+ }
+
+#ifdef PRINT_DISTRIB
+ cout << "----------------------------------------------------" << endl;
+ for(int i=0; i<_distrib.size(); i++){
+ cout << i << " " << _distrib[i] << endl;
+ }
+ cout << "Adressed: " << _outDistrib << endl;
+#endif
+
+
+}
+
+void DnaEncoder::execute(){
+	// Encodes the current read (set up by operator()): reads shorter than a k-mer and reads
+	// without any anchor go through encodeNoAnchorRead(), the others through encodeAnchorRead().
+	#ifdef PRINT_DEBUG_ENCODER
+		cout << endl << "\tEncoding seq " << _sequence->getIndex() << endl;
+		cout << "\t\t" << _readseq << endl;
+	#endif
+
+	//cout << _readseq << endl;
+
+	_readCount +=1;
+	_Npos.clear();
+
+	if(_readSize < _kmerSize){
+		encodeNoAnchorRead();
+		// The quality buffers are only allocated when the input is not FASTA (see the
+		// constructors), so smoothQuals() must not run on a FASTA read.
+		if(! _leon->_isFasta)
+			smoothQuals();
+		endRead();
+		return;
+	}
+
+	//cout << _leon->_readCount << endl;
+	//kmer_type anchorKmer = 0;
+	u_int32_t anchorAddress;
+
+	buildKmers(); // take the opportunity to compress the quals here?
+
+	if(! _leon->_isFasta)
+	{
+		storeSolidCoverageInfo();
+		smoothQuals();
+	}
+
+	//_isPrevReadAnchorable = false;
+	// Reuse an anchor that is already known; otherwise ask Leon to find and insert one.
+	int anchorPos = findExistingAnchor(&anchorAddress); //unsynch
+
+	if(anchorPos == -1)
+		anchorPos = _leon->findAndInsertAnchor(_kmers, &anchorAddress); //unsynch
+
+	//cout << anchorPos << endl;
+
+	if(anchorPos == -1)
+		encodeNoAnchorRead();
+	else{
+		encodeAnchorRead(anchorPos, anchorAddress);
+	}
+
+	endRead();
+
+}
+
+
+
+
+double DnaEncoder::char2proba(char c)
+{
+	// Quality character (ASCII offset 33) -> error probability 10^(-Q/10):
+	//   Q 10 : 10% err
+	//   Q 20 : 1% err
+	//   Q 30 : 0.1% err ..
+	const int phredScore = c - 33;
+	// same expression as before, so the floating-point result is unchanged
+	return exp(-phredScore * log(10) / 10);
+}
+
+
+char DnaEncoder::char2phred(char c)
+{
+	return static_cast<char>(c - 33); // removes the ASCII offset 33 from the quality character
+}
+
+
+void DnaEncoder::smoothQuals()
+{
+	// Appends the (optionally smoothed) quality string of the current read, followed by
+	// '\n', to the per-block quality buffer _bufferQuals.
+	strcpy (_qualseq, _sequence->getQuality().c_str()); // copy the qual sequence of this read in _qualseq
+	// Smoothing is skipped in lossless mode and for reads shorter than a k-mer.
+	if(! _leon->_lossless && _readSize >= _kmerSize)
+	{
+		for (unsigned int ii=0; ii< _readSize; ii++)
+		{
+			if ((_nb_solids[ii]>= _smoothing_threshold) || (((int) _qualseq[ii] > (int) '@') && _trunc_mode ))
+			{
+				apply_smoothing_at_pos (ii);
+			}
+		}
+	}
+
+	_qualseq[_readSize]='\n';
+	_qualseq[_readSize+1]='\0';
+
+	// The strcpy below writes _readSize+2 bytes ('\n' and '\0' included). Doubling once is
+	// not always enough for a very long read, so grow at least to what is needed.
+	const size_t needed = (size_t) _bufferQuals_idx + _readSize + 2;
+	if( needed > (size_t) _bufferQuals_size)
+	{
+		size_t grown = (size_t) _bufferQuals_size * 2;
+		_bufferQuals_size = (grown < needed) ? needed : grown;
+		_bufferQuals = (char *) realloc(_bufferQuals,_bufferQuals_size * sizeof(char) );
+	}
+	strcpy(_bufferQuals + _bufferQuals_idx, _qualseq);
+	_bufferQuals_idx += _readSize+1 ; // with last '\n'
+}
+
+
+
+bool DnaEncoder::apply_smoothing_at_pos(int pos)
+{
+	// Tries to replace the quality at pos by '@'.
+	// Returns true when the value was rewritten, false when it was left as is.
+
+	// Phred scores 0 and 2 are never smoothed.
+	const char phredScore = char2phred(_qualseq[pos]);
+	if(phredScore == 0 || phredScore == 2)
+	{
+		return false;
+	}
+
+	// A quality more than 10 below '@' is smoothed only when the solid count at this
+	// position is strictly greater than (gap - 5).
+	// (a quality above '@' gives a negative gap and is therefore always set to '@')
+	const int gap = ('@' - _qualseq[pos]);
+	if(gap > 10 && _nb_solids[pos] <= (gap - 5))
+	{
+		return false;
+	}
+
+	_qualseq[pos] = '@'; //smooth qual
+	return true;
+}
+
+
+
+void DnaEncoder::storeSolidCoverageInfo()
+{
+	// Counts, for each read position, the k-mers found in the bloom filter that cover it.
+	kmer_type kmer, kmerMin;
+	// smoothQuals() writes '\n' at _qualseq[_readSize] and '\0' at _qualseq[_readSize+1],
+	// so the buffers must hold _readSize+2 entries (">= _max_read_size" was one short).
+	if(_readSize + 2 > _max_read_size)
+	{
+		_max_read_size = _readSize + 1000;
+		_nb_solids = (int *) realloc(_nb_solids,_max_read_size * sizeof(int) );
+		_qualseq = (char *) realloc(_qualseq,_max_read_size*sizeof(char ));
+	}
+	memset(_nb_solids,0,_max_read_size * sizeof(int) );
+
+	for(unsigned int ii=0; ii<_kmers.size(); ii++){
+		kmer = _kmers[ii];
+		kmerMin = min(kmer, revcomp(kmer, _kmerSize));
+		if(_bloom->contains(kmerMin))
+		{
+			//increments all pos covered by the solid kmer
+			for (unsigned int jj=0; jj< _kmerSize ; jj++)
+			{
+				_nb_solids[ii+jj] ++ ;
+			}
+		}
+	}
+}
+
+
+void DnaEncoder::buildKmers(){
+ // Fills _kmers with the values produced by _itKmer over the current sequence.
+ // Each 'N' of the read is first recorded in _Npos and overwritten with 'A' in _readseq.
+
+ for(unsigned int i=0; i<_readSize; i++){
+ if(_readseq[i] == 'N'){
+ _Npos.push_back(i);
+ _readseq[i] = 'A';
+ }
+ }
+ _itKmer.setData(_sequence->getData());
+
+ _kmers.clear();
+ for (_itKmer.first(); !_itKmer.isDone(); _itKmer.next()){
+ //cout << (*_itKmer).toString(_kmerSize) << endl;
+ _kmers.push_back(_itKmer->value());
+ }
+
+ // Debug statistics only: index of the first kept sequence sharing a k-mer with this read.
+#ifdef PRINT_DISTRIB
+ unordered_set<u_int64_t> H;
+ for(kmer_type kmer : _kmers){
+ kmer_type kmerMin = min(kmer, revcomp(kmer, _kmerSize));
+ H.insert(kmerMin.getVal());
+ }
+
+ for(int i=0; i<_sequences.size(); i++){
+ Sequence* sequence = _sequences[i];
+
+ _itKmer.setData(sequence->getData());
+ for (_itKmer.first(); !_itKmer.isDone(); _itKmer.next()){
+ kmer_type kmerMin2 = min(_itKmer->value(), revcomp(_itKmer->value(), _kmerSize));
+ if(H.find(kmerMin2.getVal()) != H.end()){
+ _distrib[i] += 1;
+ return;
+ }
+ }
+ }
+
+ _outDistrib += 1;
+#endif
+
+}
+
+int DnaEncoder::findExistingAnchor(u_int32_t* anchorAddress){
+	// Returns the position of the first k-mer of the read for which _leon->anchorExist()
+	// succeeds (anchorAddress is handed to that call), or -1 when there is none.
+	unsigned int pos = 0;
+	while(pos < _kmers.size()){
+		kmer_type forward = _kmers[pos];
+		kmer_type canonical = min(forward, revcomp(forward, _kmerSize));
+		if(_leon->anchorExist(canonical, anchorAddress)){
+			return pos;
+		}
+		pos++;
+	}
+	return -1;
+}
+
+bool DnaEncoder::isReadAnchorable(){
+	// True as soon as two k-mers of the read are found in the bloom filter; after a hit
+	// the scan jumps _kmerSize+1 positions ahead, so the two hits never overlap.
+	int solidHits = 0;
+	unsigned int pos = 0;
+
+	while(pos < _kmers.size()){
+		kmer_type forward = _kmers[pos];
+		kmer_type canonical = min(forward, revcomp(forward, _kmerSize));
+
+		if(_bloom->contains(canonical)){
+			solidHits++;
+			if(solidHits >= 2) return true;
+			pos += _kmerSize;
+		}
+		pos++;
+	}
+
+	return false; // the loop returns on the second hit, so fewer than two were found
+}
+
+void DnaEncoder::encodeAnchorRead(int anchorPos, u_int32_t anchorAddress){
+ #ifdef PRINT_DEBUG_ENCODER
+ cout << "\t\tEncode anchor read" << endl;
+ #endif
+ //printf("encode anchor read \n");
+ // Output order: read type, read size, anchor pos, anchor address, revcomp bit, N positions, error positions, bifurcations.
+ //encode read type (0: read with anchor, 1: read without anchor)
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder6.encode(_readTypeModel, 0);
+ #endif
+ _rangeEncoder.encode(_readTypeModel, 0);
+ // Under LEON_PRINT_STAT each field is also encoded with one of _rangeEncoder1.._rangeEncoder6 (sizes are read in the destructor).
+ //u_int64_t deltaValue;
+ //u_int8_t deltaType;
+
+ //Encode read size
+ //deltaType = CompressionUtils::getDeltaValue(_readSize, _prevReadSize, &deltaValue);
+ #ifdef LEON_PRINT_STAT
+ //_rangeEncoder1.encode(_readSizeDeltaTypeModel, deltaType);
+ CompressionUtils::encodeNumeric(_rangeEncoder1, _readSizeValueModel, _readSize);
+ #endif
+ //_rangeEncoder.encode(_readSizeDeltaTypeModel, deltaType);
+ CompressionUtils::encodeNumeric(_rangeEncoder, _readSizeValueModel, _readSize);
+ //_prevReadSize = _readSize;
+ //printf("read size %i deltaValue %i\n",_readSize,deltaValue);
+
+ //Encode anchor pos
+ //deltaType = CompressionUtils::getDeltaValue(anchorPos, _prevAnchorPos, &deltaValue);
+ #ifdef LEON_PRINT_STAT
+ //_rangeEncoder2.encode(_anchorPosDeltaTypeModel, deltaType);
+ CompressionUtils::encodeNumeric(_rangeEncoder2, _anchorPosModel, anchorPos);
+ #endif
+ //_rangeEncoder.encode(_anchorPosDeltaTypeModel, deltaType);
+ CompressionUtils::encodeNumeric(_rangeEncoder, _anchorPosModel, anchorPos);
+ //_prevAnchorPos = anchorPos;
+ //printf("anchor pos %i \n",anchorPos);
+
+ //Encode anchor address
+ //deltaType = CompressionUtils::getDeltaValue(anchorAddress, _prevAnchorAddress, &deltaValue);
+ #ifdef LEON_PRINT_STAT
+ //_rangeEncoder3.encode(_anchorAddressDeltaTypeModel, deltaType);
+ CompressionUtils::encodeNumeric(_rangeEncoder3, _anchorAddressModel, anchorAddress);
+ #endif
+ //_rangeEncoder.encode(_anchorAddressDeltaTypeModel, deltaType);
+ //if(_isPrevReadAnchorable){
+ // _rangeEncoder.encode(_isPrevReadAnchorableModel, 0);
+ // CompressionUtils::encodeNumeric(_rangeEncoder, _isPrevReadAnchorablePosModel, _isPrevReadAnchorablePos);
+ //}
+ //else{
+ //_rangeEncoder.encode(_isPrevReadAnchorableModel, 1);
+ CompressionUtils::encodeNumeric(_rangeEncoder, _anchorAddressModel, anchorAddress);
+ //CompressionUtils::encodeNumeric(_rangeEncoder, _isPrevReadAnchorablePosModel, _isPrevReadAnchorablePos);
+ //}
+ //_prevAnchorAddress = anchorAddress;
+ //printf("anchor adress %i \n",anchorAddress);
+
+ // The anchor is the k-mer stored at anchorPos in _kmers (filled by buildKmers).
+ kmer_type anchor = _kmers[anchorPos];
+
+ //Encode a bit that says if the anchor is normal or revcomp
+ if(anchor == min(anchor, revcomp(anchor, _kmerSize))){
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder6.encode(_readAnchorRevcompModel, 0);
+ #endif
+ _rangeEncoder.encode(_readAnchorRevcompModel, 0);
+ }
+ else{
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder6.encode(_readAnchorRevcompModel, 1);
+ #endif
+ _rangeEncoder.encode(_readAnchorRevcompModel, 1);
+ }
+
+ #ifdef PRINT_DEBUG_ENCODER
+ cout << "\t\t\tAnchor pos: " << anchorPos << endl;
+ cout << "\t\t\tAnchor: " << _kmers[anchorPos].toString(_kmerSize) << endl;
+ #endif
+
+ _bifurcations.clear();
+ _binaryBifurcations.clear();
+ _bifurcationTypes.clear();
+ _leftErrorPos.clear();
+ //_rightErrorPos.clear();
+
+ // Walk the read from the anchor to position 0, then from the end of the anchor to the end of the read.
+ kmer_type kmer = anchor;
+ for(int i=anchorPos-1; i>=0; i--){
+ kmer = buildBifurcationList(i, kmer, false);
+ //i = buildBifurcationList(i, false);
+ //cout << kmer.toString(_kmerSize) << endl;
+ }
+
+ kmer = anchor;
+ for(unsigned int i=anchorPos+_kmerSize; i<_readSize; i++){
+ //cout << "Pos: " << i << endl;
+ kmer = buildBifurcationList(i, kmer, true);
+ //i = buildBifurcationList(i, true);
+ //for(int i=anchorPos; i<_kmers.size()-1; i++)
+ //cout << kmer.toString(_kmerSize) << endl;
+ }
+
+ // N positions: their count first, then each one as the difference with the previous one.
+ //Encode N positions
+ _prevNpos = 0;
+ #ifdef LEON_PRINT_STAT
+ CompressionUtils::encodeNumeric(_rangeEncoder6, _numericModel, _Npos.size());
+ #endif
+ CompressionUtils::encodeNumeric(_rangeEncoder, _numericModel, _Npos.size());
+ for(unsigned int i=0; i<_Npos.size(); i++){
+ //deltaType = CompressionUtils::getDeltaValue(_Npos[i], _prevNpos, &deltaValue);
+ //_rangeEncoder.encode(_NposDeltaTypeModel, deltaType);
+ #ifdef LEON_PRINT_STAT
+ CompressionUtils::encodeNumeric(_rangeEncoder6, _NposModel, _Npos[i]-_prevNpos);
+ #endif
+ CompressionUtils::encodeNumeric(_rangeEncoder, _NposModel, _Npos[i]-_prevNpos);
+ _prevNpos = _Npos[i];
+ }
+ // Error positions: same layout (count, then differences), after sorting them.
+ #ifdef LEON_PRINT_STAT
+ CompressionUtils::encodeNumeric(_rangeEncoder6, _leftErrorModel, _leftErrorPos.size());
+ #endif
+ CompressionUtils::encodeNumeric(_rangeEncoder, _leftErrorModel, _leftErrorPos.size());
+ sort(_leftErrorPos.begin(), _leftErrorPos.end());
+ _prevErrorPos = 0;
+ for(unsigned int i=0; i<_leftErrorPos.size(); i++){
+ #ifdef LEON_PRINT_STAT
+ CompressionUtils::encodeNumeric(_rangeEncoder6, _leftErrorPosModel, _leftErrorPos[i]-_prevErrorPos);
+ #endif
+ CompressionUtils::encodeNumeric(_rangeEncoder, _leftErrorPosModel, _leftErrorPos[i]-_prevErrorPos);
+ _prevErrorPos = _leftErrorPos[i];
+ }
+
+ // Bifurcations in recording order: type 0 reads _bifurcations, any other type reads _binaryBifurcations.
+ u_int64_t bifType0 = 0;
+ u_int64_t bifType1 = 0;
+ //cout << _bifurcationTypes.size() << " " << _bifurcations.size() << " " << _binaryBifurcations.size() << endl;
+ for(unsigned int i=0; i<_bifurcationTypes.size(); i++){
+ u_int8_t type = _bifurcationTypes[i];
+ if(type == 0){
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder4.encode(_bifurcationModel, _bifurcations[bifType0]);
+ #endif
+ //cout << Leon::nt2bin(_bifurcations[i]) << " ";
+ _rangeEncoder.encode(_bifurcationModel, _bifurcations[bifType0]);
+ bifType0 += 1;
+ }
+ else{
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder4.encode(_bifurcationBinaryModel, _binaryBifurcations[bifType1]);
+ #endif
+ //cout << Leon::nt2bin(_bifurcations[i]) << " ";
+ _rangeEncoder.encode(_bifurcationBinaryModel, _binaryBifurcations[bifType1]);
+ bifType1 += 1;
+ }
+ }
+
+
+
+}
+
+kmer_type DnaEncoder::buildBifurcationList(int pos, kmer_type kmer, bool rightExtend){
+ // Encoder side of one extension step: advances 'kmer' past read position 'pos', records in _bifurcations / _binaryBifurcations / _bifurcationTypes what must be stored for that step, and returns the k-mer to continue from.
+ char nextNt = _readseq[pos];
+ int nextNtBin = Leon::nt2bin(nextNt);
+ // Positions listed in _Npos record nothing: the k-mer is simply advanced with the read nucleotide.
+ if(std::find(_Npos.begin(), _Npos.end(), pos) != _Npos.end()){
+ codeSeedNT(&_kmerModel, &kmer, nextNt, rightExtend);
+ return kmer;
+ //return pos;
+ }
+
+ //kmer_type kmerMin;
+ kmer_type uniqKmer;
+ bool firstSolidKmer = false;
+ //int uniqNt;
+ //u_int8_t binNt2;
+ bool isKmerSolid = false;
+
+ int indexedKmerCount = 0;
+ // res4[nt] is set for each of the four one-nucleotide extensions of kmer that the bloom filter reports.
+
+
+
+ std::bitset<4> res4 = _bloom->contains4(kmer,rightExtend);
+ for(int nt=0; nt<4; nt++){
+
+ //mutatedKmer.printASCII(_kmerSize);
+
+ if(res4[nt]){
+
+ indexedKmerCount += 1;
+ // uniqKmer keeps the k-mer extended by the first reported nucleotide.
+ if(!firstSolidKmer){
+ firstSolidKmer = true;
+ uniqKmer = kmer;
+ codeSeedBin(&_kmerModel, &uniqKmer, nt, rightExtend);
+ }
+ /*
+
+ //uniqNt = nt;
+ uniqKmer = mutatedKmer;
+ */
+
+ if(nt == nextNtBin){
+ isKmerSolid = true;
+ }
+ }
+
+ }
+
+
+ _MCtotal +=1;
+
+ // isKmerSolid: the read's own nucleotide is among the reported extensions.
+ if(isKmerSolid){
+ // Exactly one candidate and it is the read nucleotide: nothing is recorded for this position.
+ if(indexedKmerCount == 1){
+ _MCuniqSolid += 1;
+ return uniqKmer;
+ }
+ else if(indexedKmerCount == 2){
+ // int rather than char: plain char may be unsigned, which would make the "== -1" tests below always false.
+ int nt1 = -1;
+ int nt2 = -1;
+
+ for(int nt=0; nt<4; nt++){
+ if(res4[nt]){
+ //cout << "\t" << nt << endl;
+ if(nt1 == -1)
+ nt1 = nt;
+ else if(nt2 == -1)
+ nt2 = nt;
+ else break;
+ }
+ }
+ // Record which of the two candidates the read follows: 0 = first set bit of res4, 1 = second.
+
+ if(nt1 == nextNtBin){
+ //cout << "\t0" << endl;
+ _binaryBifurcations.push_back(0);
+ _bifurcationTypes.push_back(1);
+ _MCmultipleSolid += 1;
+ }
+ else if(nt2 == nextNtBin){
+ //cout << "\t1" << endl;
+ _binaryBifurcations.push_back(1);
+ _bifurcationTypes.push_back(1);
+ _MCmultipleSolid += 1;
+ }
+ else{
+ // Not expected here: isKmerSolid with exactly two candidates means nt1 or nt2 equals nextNtBin.
+ //if(_sequence->getIndex() < 20)
+ // cout << "\tallo" << endl;
+ //_MCuniqNoSolid += 1;
+ //nextNt = Leon::bin2nt(nt1);
+ //_bifurcations.push_back(nextNtBin);
+ //_errorPos.push_back(pos);
+ //_bifurcationTypes.push_back(0);
+ //return uniqKmer;
+
+ /*
+ _MCuniqNoSolid += 1;
+ //nextNt = Leon::bin2nt(nt1);
+
+ _bifurcationTypes.push_back(0);
+ _bifurcations.push_back(nextNtBin);
+ _errorPos.push_back(pos);
+
+ nextNtBin = getBestPath(pos, kmer, res4, rightExtend);
+ //cout << (int)nextNtBin << endl;
+ if(nextNtBin == -1){
+ nextNtBin = nt1;
+ }
+ nextNt = Leon::bin2nt(nextNtBin);
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);*/
+
+ }
+ //cout << "PROBLEME IN BUILD BINARY BIFURCATION (DnaEncoder - buildBifurcationList)" << endl;
+
+ //if(_sequence->getIndex() < 10)
+ // cout << (char) Leon::bin2nt(nextNtBin) << endl;;
+
+ codeSeedNT(&_kmerModel, &kmer, nextNt, rightExtend);
+ return kmer;
+
+
+
+ }
+ else{
+ // Three or four candidates: record the read nucleotide itself (type 0).
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+ codeSeedNT(&_kmerModel, &kmer, nextNt, rightExtend);
+ return kmer;
+ }
+
+ }
+ else{
+ // The read nucleotide is not among the reported extensions.
+
+ if(indexedKmerCount == 0){
+ //cout << "PAF_BREAK " << pos << endl;
+ _MCnoAternative += 1;
+ }
+ else if(indexedKmerCount == 1){
+ //cout << "PAF_UNIQ " << pos << endl;
+ _MCuniqNoSolid += 1;
+ // Record the read nucleotide and an error position, but continue from the only reported extension.
+ //_leon->_readWithAnchorMutationChoicesSize += 0.25;
+
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+ addErrorPos(pos, rightExtend);
+ //_errorPos.push_back(pos);
+ return uniqKmer;
+ }
+ else if(indexedKmerCount == 2){
+ //cout << "PAF_MULTIPLE " << pos << endl;
+ _MCuniqNoSolid += 1;
+
+ // Record the read nucleotide and an error position, then continue from the first reported extension.
+ //encode error
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+ addErrorPos(pos, rightExtend);
+
+ //get the first path in bifurcation
+ for(int nt=0; nt<4; nt++){
+ if(res4[nt]){
+ nextNtBin = nt;
+ nextNt = Leon::bin2nt(nt);
+ break;
+ }
+ }
+
+ codeSeedNT(&_kmerModel, &kmer, nextNt, rightExtend);
+ return kmer;
+ //return uniqKmer;
+ //_bifurcationTypes.push_back(0);
+
+ /*_bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+
+ return uniqKmer;*/
+ }
+ else{
+ _MCuniqNoSolid += 1;
+ //cout << "PAF_MULTIPLE " << pos << endl;
+ /*
+ _errorPos.push_back(pos);
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+ _errorPos.push_back(pos);
+ return uniqKmer;*/
+
+ }
+
+ // Reached with 0, 3 or 4 reported extensions: record the read nucleotide and extend with it.
+
+ //_leon->_readWithAnchorMutationChoicesSize += 0.25;
+ _bifurcations.push_back(nextNtBin);
+ _bifurcationTypes.push_back(0);
+ codeSeedNT(&_kmerModel, &kmer, nextNt, rightExtend);
+ return kmer;
+
+ }
+
+
+
+}
+
+
+int DnaEncoder::getBestPath(int pos, kmer_type& kmer, bitset<4>& initRes4, bool rightExtend){
+ // Picks one of the candidate extensions flagged in initRes4 by looking one nucleotide further along the read.
+ // Returns a nucleotide code in 0..3, or -1 when no candidate qualifies.
+ char ntInRead = 0;
+ if(rightExtend){
+ if((unsigned)(pos+1) < _readSize){
+ ntInRead = _readseq[pos+1];
+ }
+ }
+ else{
+ if(pos-1 >= 0){
+ ntInRead = _readseq[pos-1];
+ }
+ }
+ // ntInRead stays 0 when pos is the last read position in the extension direction.
+ int ntInReadBin = Leon::nt2bin(ntInRead);
+
+ //int depth = 2;
+ int bestNt = -1;
+ bool isValid[4];
+ for(int i=0; i<4; i++){
+ isValid[i] = true;
+ }
+ // A candidate stays valid only if exactly one further extension is reported for it.
+ //for(int j=0; j<depth; j++){
+
+ for(int nt=0; nt<4; nt++){
+
+ if(initRes4[nt]){
+
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+
+ bitset<4> res4 = _bloom->contains4(mutatedKmer, rightExtend);
+ int nbSolidKmer = 0;
+
+ for(int nt2=0; nt2<4; nt2++){
+
+ if(res4[nt2]){
+ nbSolidKmer += 1;
+ // Remember the candidate whose continuation matches the read; a later candidate overwrites an earlier one.
+ if(nt2 == ntInReadBin){
+ bestNt = nt;
+ }
+ }
+
+ }
+
+ if(nbSolidKmer != 1){
+ isValid[nt] = false;
+ }
+ }
+ else{
+ isValid[nt] = false;
+ }
+ }
+
+ //}
+
+ int nbAlternative = 0;
+ int uniqAlternative = -1;
+
+ for(int nt=0; nt<4; nt++){
+ if(isValid[nt]){
+ nbAlternative += 1;
+ uniqAlternative = nt;
+ }
+ }
+ // Prefer the single valid candidate; otherwise fall back to the read-matching one (possibly -1).
+ if(nbAlternative == 1){
+ return uniqAlternative;
+ }
+
+ return bestNt;
+}
+
+int DnaEncoder::voteMutations(int pos, int depth, bool rightExtend){
+ // Stub: the whole former implementation is commented out below, so pos, depth and rightExtend are ignored.
+ //int maxScore = 0;
+ //int votes[4];
+ // bestNt is never modified by live code, so the function currently always returns 0.
+ int bestNt = 0;
+ //bool isValid[4];
+
+ //kmer_type mutatedKmers[4];
+ //vector<int> mutations;
+ //bool isMutateChoice[4];
+ //kmer = _kmers[pos];
+ /*
+ for(int nt=0; nt<4; nt++){
+
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+ kmer_type mutatedKmerMin = min(mutatedKmer, revcomp(mutatedKmer, _kmerSize));
+
+ //mutated_kmer.printASCII(_kmerSize);
+
+ if(_bloom->contains(mutatedKmerMin)){
+ mutations.push_back(nt);
+ mutatedKmers[nt] = mutatedKmer;
+ votes[nt] = 0;
+ //isMutateChoice[nt] = true;
+ }
+ }
+ */
+ //kmer_type currentKmer;
+ //cout << _readseq << endl;
+ //cout << pos << ": " << kmer.toString(_kmerSize) << endl;
+
+ /*
+ for(int nt=0; nt<4; nt++){
+
+ //mutatedKmer.printASCII(_kmerSize);
+
+ if(res4[nt]){
+
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+
+ indexedKmerCount += 1;
+ uniqNt = nt;
+ uniqKmer = mutatedKmer;
+
+
+ if(Leon::bin2nt(nt) == nextNt){
+ isKmerSolid = true;
+ }
+ }
+
+ }
+
+
+ //for(int nt=0; nt<4; nt++){
+
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+ kmer_type mutatedKmerMin = min(mutatedKmer, revcomp(mutatedKmer, _kmerSize));
+
+ for(int j=0; j<depth; j++){
+ //char nextNt;
+ //int kmerPos;
+ if(rightExtend){
+ if(pos+1+_kmerSize+j >= _readSize) break;
+ nextNt = _readseq[pos+1+_kmerSize+j];
+ }
+ else{
+ if(pos-2-j < 0) break;
+ nextNt = _readseq[pos-2-j];
+ }
+
+ std::bitset<4> res4 = _bloom->contains4(kmer,rightExtend);
+
+ for(int nt=0; nt<4; nt++){
+
+ if(res4[nt]){
+
+ }
+ }
+
+ //cout << j << ": " << nextNt << endl;
+ codeSeedNT(&_kmerModel, &mutatedKmer, nextNt, rightExtend);
+ kmer_type mutatedKmerMin = min(mutatedKmer, revcomp(mutatedKmer, _kmerSize));
+
+ if(_bloom->contains(mutatedKmerMin)){
+ votes[nt] += 1;
+ if(votes[nt] > maxScore){
+ maxScore = votes[nt];
+ bestNt = nt;
+ }
+ }
+ }
+ //}
+
+
+ if(maxScore == 0){
+ //cout << "No best NT" << endl;
+ bestNt = -1;
+ }
+ //else
+ //cout << "Best nt: " << bin2NT[bestNt] << endl;
+ */
+ return bestNt;
+
+
+}
+
+void DnaEncoder::encodeNoAnchorRead(){
+ #ifdef PRINT_DEBUG_ENCODER
+ cout << "\t\tEncode no anchor read" << endl;
+ #endif
+ // Encodes the current read without an anchor k-mer: read type 1, then the read size, then every nucleotide.
+ //printf("encode no anchor read \n");
+ //Reinsert N because they can be encoded by the coder
+ for(unsigned int i=0; i<_Npos.size(); i++){
+ _readseq[_Npos[i]] = 'N';
+ }
+
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder6.encode(_readTypeModel, 1);
+ #endif
+ _rangeEncoder.encode(_readTypeModel, 1);
+ // With LEON_PRINT_STAT defined, each value is additionally fed to a numbered statistics encoder.
+ //_leon->_readWithoutAnchorSize += _readSize*0.375;
+ _readWithoutAnchorCount +=1;
+
+ /*
+ for(int i=0; i<_readSize; i++){
+ if(_readseq[i] == 'N'){
+ _leon->_noAnchor_with_N_kmer_count += 1;
+ break;
+ }
+ }
+
+ bool full_N = true;
+ for(int i=0; i<_readSize; i++){
+ if(_readseq[i] != 'N'){
+ full_N = false;
+ break;
+ }
+ }
+ if(full_N){
+ _leon->_noAnchor_full_N_kmer_count += 1;
+ }*/
+ // The read size is written first; DnaDecoder::decodeNoAnchorRead() reads it back before the nucleotides.
+ #ifdef LEON_PRINT_STAT
+ CompressionUtils::encodeNumeric(_rangeEncoder5, _noAnchorReadSizeValueModel, _readSize);
+ #endif
+ CompressionUtils::encodeNumeric(_rangeEncoder, _noAnchorReadSizeValueModel, _readSize);
+
+ // Every character of the read, reinserted N included, goes through _noAnchorReadModel.
+ for(unsigned int i=0; i<_readSize; i++){
+
+ #ifdef LEON_PRINT_STAT
+ _rangeEncoder5.encode(_noAnchorReadModel, Leon::nt2bin(_readseq[i]));
+ #endif
+ _rangeEncoder.encode(_noAnchorReadModel, Leon::nt2bin(_readseq[i]));
+
+ }
+
+}
+
+
+
+
+QualDecoder::QualDecoder(Leon* leon, const string& inputFilename,tools::storage::impl::Group * group)
+// The decoder starts idle: no stream is open and no input buffer exists until a setup() call provides them.
+{
+ _leon = leon;
+ _group = group;
+ _inbuffer = NULL;
+ _inputStream = 0;
+ _finished = false;
+}
+
+
+QualDecoder::~QualDecoder(){
+ // Neither release needs a guard: delete and free both accept a null pointer.
+ delete _inputStream;
+ // _inbuffer comes from realloc(), hence free() rather than delete.
+ free(_inbuffer);
+}
+
+
+
+void QualDecoder::setup( int blockID){
+ // Prepares decoding of quality block 'blockID': reopens its dataset and sizes the input buffer.
+ _processedSequenceCount = 0;
+
+ // Drop the stream of the previously decoded block, if any (deleting a null pointer is a no-op).
+ delete _inputStream;
+
+ const std::string blockDataset = Stringify::format ("qual_%i",blockID);
+
+ _inputStream = new tools::storage::impl::Storage::istream (*_group, blockDataset);
+
+ // The block size is read from the "size" property attached to the block's collection.
+ const std::string sizeProperty = _group->getCollection<math::NativeInt8> (blockDataset).getProperty ("size");
+
+ _blockSize = std::stoi(sizeProperty);
+
+ // A single input buffer is kept and resized from block to block.
+ _inbuffer = (char * ) realloc(_inbuffer, _blockSize* sizeof(char));
+}
+
+
+
+void QualDecoder::setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount){
+ // Position-based variant: seeks the current stream to the block start and sizes the input buffer.
+ // NOTE(review): _inputStream is dereferenced unchecked; in the code visible here only setup(int blockID) creates it.
+
+ _processedSequenceCount = 0;
+ _sequenceCount = sequenceCount;
+
+ _inputStream->seekg(blockStartPos, _inputStream->beg);
+
+ // Remember where the block lives in the stream.
+ _blockStartPos = blockStartPos;
+ _blockSize = blockSize;
+
+
+ // A single input buffer is kept and resized to the size of the current block.
+ _inbuffer = (char * ) realloc(_inbuffer, _blockSize* sizeof(char));
+
+
+
+}
+
+
+
+void QualDecoder::execute(){
+ // Reads _blockSize bytes of the current block from _inputStream and inflates them with zlib into _buffer.
+
+// printf("execute qual decoder _blockStartPos %llu _blockSize %llu \n",_blockStartPos,_blockSize);
+
+ _inputStream->read(_inbuffer,_blockSize );
+ // A failed read is only reported; decompression still runs on whatever _inbuffer holds.
+ if(!_inputStream->good()) printf("inputstream E bad \n");
+
+ //_inputFile->read(_inbuffer,_blockSize );
+
+ //printf("----Begin decomp of Block ----\n");
+
+ z_stream zs;
+ memset(&zs, 0, sizeof(zs));
+
+ //inflateInit2 with windowBits 15+32 (disabled below) would also accept gzip-wrapped data
+
+ //if (inflateInit2(&zs, (15+32)) != Z_OK)
+ if (inflateInit (&zs) != Z_OK)
+ throw Exception ("inflate Init failed while decompressing.");
+
+ zs.next_in = (Bytef*) _inbuffer ;
+ zs.avail_in = _blockSize ; // set the z_stream's input
+
+ int ret;
+ char outbuffer[32768];
+ // Inflate into the fixed scratch buffer and copy each produced chunk to _buffer.
+ // retrieve the compressed bytes blockwise
+ do {
+ zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
+ zs.avail_out = sizeof(outbuffer);
+
+ ret = inflate (&zs, Z_SYNC_FLUSH); //or Z_FINISH ? Z_SYNC_FLUSH
+ // zs.total_out counts every byte produced so far. NOTE(review): the length below assumes _buffer was empty on entry - confirm with callers.
+ if (_buffer.size() < zs.total_out) {
+ // append the block to the output string
+ _buffer.append(outbuffer,
+ zs.total_out - _buffer.size());
+ }
+ if (ret != Z_OK)
+ {
+ //printf("ret val %i _blockStartPos %llu \n",ret,_blockStartPos);
+ break;
+ }
+ else
+ {
+ //printf("-----block ret ok _blockStartPos %llu ----\n",_blockStartPos);
+ }
+ } while (ret == Z_OK);
+ // Any return other than Z_OK ends the loop, so Z_STREAM_END and zlib error codes are not told apart.
+ inflateEnd(&zs);
+
+ _finished = true;
+
+ //printf("Should be done decompressing block, in size %llu out size %lu \n",_blockSize,_buffer.size() );
+
+}
+
+
+//====================================================================================
+// ** DnaDecoder
+//====================================================================================
+DnaDecoder::DnaDecoder(Leon* leon, const string& inputFilename,tools::storage::impl::Group * group) :
+AbstractDnaCoder(leon)
+{
+ _group = group;
+ _inputStream =0;
+ // inputFilename is unused: block data is read from datasets of 'group' (see setup()).
+ //_inputFile = new ifstream(inputFilename.c_str(), ios::in|ios::binary);
+ _finished = false;
+ // The anchor dictionary file is no longer opened here. Keep the pointer null rather than
+ // indeterminate, because decodeAnchorRead() still passes it to Leon::getAnchor().
+ _anchorDictFile = NULL;
+}
+
+DnaDecoder::~DnaDecoder(){
+ // Only the dataset stream created in setup() is released here;
+ // deleting a null pointer is a no-op, so no guard is needed.
+ delete _inputStream;
+ //delete _rangeDecoder;
+ //delete _outputFile;
+// delete _inputFile;
+// delete _anchorDictFile;
+}
+
+void DnaDecoder::setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount,int blockID){
+ startBlock();
+ _rangeDecoder.clear();
+ // Prepares decoding of DNA block 'blockID': reopens dataset "dna_<blockID>" and attaches it to the range decoder.
+ //_inputFile->seekg(blockStartPos, _inputFile->beg);
+ //_rangeDecoder.setInputFile(_inputFile);
+
+ if(_inputStream !=0) delete _inputStream;
+ std::string datasetname = Stringify::format ("dna_%i",blockID);
+
+ _inputStream = new tools::storage::impl::Storage::istream (*_group, datasetname);
+
+ auto _tempcollec = & _group->getCollection<math::NativeInt8> (datasetname);
+ std::string dsize = _tempcollec->getProperty ("size");
+
+ _blockSize = std::stoi(dsize); // blockSize;
+ _rangeDecoder.setInputFile(_inputStream);
+ // NOTE(review): the _blockSize read from the "size" property above is overwritten by the blockSize argument just below;
+ // which of the two values is intended cannot be told from this function - confirm.
+
+ _blockStartPos = blockStartPos;
+ _blockSize = blockSize;
+
+ #ifdef PRINT_DEBUG_DECODER
+ cout << "\t-----------------------" << endl;
+ cout << "\tDecoding Dna block " << _blockStartPos << " - " << _blockStartPos+_blockSize << endl;
+ #else
+ ;//_leon->_progress_decode->inc(1);
+ #endif
+
+ _sequenceCount = sequenceCount;
+}
+
+void DnaDecoder::execute(){
+ // Decodes reads until _processedSequenceCount reaches _sequenceCount, then sets _finished.
+ //decodeFirstHeader();
+
+ while(_processedSequenceCount < _sequenceCount){
+
+ //cout << "lala" << endl;
+ //int i=0;
+ //while(i < Leon::READ_PER_BLOCK){
+ //while(_inputFile->tellg() <= _blockStartPos+_blockSize){
+ //if(_leon->_readCount > 1) return;
+
+ // Each read starts with its type: 0 = read with anchor, 1 = read without anchor.
+ u_int8_t readType = _rangeDecoder.nextByte(_readTypeModel);
+ //cout << "Read type: " << (int)readType << endl;
+
+ if(readType == 0)
+ decodeAnchorRead(); //here
+ else if(readType == 1)
+ decodeNoAnchorRead();
+ // NOTE(review): any other read type decodes nothing before endRead(); the loop counter is presumably advanced by AbstractDnaCoder::endRead() - not visible here.
+ endRead();
+ //cout << _inputFile->tellg() << " " << _blockStartPos+_blockSize << endl;
+ /*
+ string trueSeq = string((*_leon->_testBankIt)->getDataBuffer());
+ trueSeq = trueSeq.substr(0, _readSize);
+ //cout << trueSeq << endl;
+ //cout << _currentSeq << endl;
+ if(trueSeq != _currentSeq){
+ cout << (*_leon->_testBankIt)->getIndex() << "\t\tseq different !!" << endl;
+ cout << "\t\t" << trueSeq << endl;
+ cout << "\t\t" << _currentSeq << endl;
+ _leon->_readCount += 1;
+ return;
+ }
+ _leon->_testBankIt->next();
+ */
+ #ifdef PRINT_DEBUG_DECODER
+ _readCount += 1;
+ cout << _leon->_readCount << ": " << _currentSeq << endl;
+ #endif
+
+ //i++;
+ //_leon->_readCount += 1;
+ //if(i == 1) return;
+ //_currentSeq.clear();
+
+ //cout << (int)(_inputFile->tellg() < _blockStartPos+_blockSize) << endl;
+
+ }
+
+ //cout << "endooo" << endl;
+ _finished = true;
+
+}
+
+void DnaDecoder::decodeAnchorRead(){
+ #ifdef PRINT_DEBUG_DECODER
+ cout << "\t\tDecode anchor read" << endl;
+ #endif
+ // Decodes a read stored relative to an anchor k-mer: header values first, then one-nucleotide extensions on both sides of the anchor.
+ //u_int8_t deltaType;
+ //u_int64_t deltaValue;
+
+ //printf("Decode anchor read \n");
+
+ //Decode read size
+ //deltaType = _rangeDecoder.nextByte(_readSizeDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _readSizeValueModel);
+// printf("read deltaValue %llu \n",deltaValue);
+ //_readSize = CompressionUtils::getValueFromDelta(deltaType, _prevReadSize, deltaValue);
+ //_prevReadSize = _readSize;
+ _readSize = CompressionUtils::decodeNumeric(_rangeDecoder, _readSizeValueModel);
+
+// printf("read size %i \n",_readSize);
+
+ //Decode anchor pos
+ //deltaType = _rangeDecoder.nextByte(_anchorPosDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _anchorPosModel);
+ //int anchorPos = CompressionUtils::getValueFromDelta(deltaType, _prevAnchorPos, deltaValue);
+ //_prevAnchorPos = anchorPos;
+// printf("anchor pos %i \n",anchorPos);
+ int anchorPos = CompressionUtils::decodeNumeric(_rangeDecoder, _anchorPosModel);
+
+
+ //Decode anchor address
+ //deltaType = _rangeDecoder.nextByte(_anchorAddressDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _anchorAddressModel);
+ //u_int64_t anchorAddress = CompressionUtils::getValueFromDelta(deltaType, _prevAnchorAddress, deltaValue);
+ //_prevAnchorAddress = anchorAddress;
+ u_int64_t anchorAddress = CompressionUtils::decodeNumeric(_rangeDecoder, _anchorAddressModel);
+
+ kmer_type anchor = _leon->getAnchor(_anchorDictFile, anchorAddress); //laa
+
+ #ifdef PRINT_DEBUG_DECODER
+ cout << "\t\t\tRead size: " << _readSize << endl;
+ cout << "\t\t\tAnchor pos: " << anchorPos << endl;
+ cout << "\t\t\tAnchor adress: " << anchorAddress << endl;
+ cout << "\t\t\tAnchor: " << anchor.toString(_kmerSize) << endl;
+ #endif
+
+ //Decode the bit that says if the anchor is revcomp or not
+ if(_rangeDecoder.nextByte(_readAnchorRevcompModel) == 1)
+ anchor = revcomp(anchor, _kmerSize);
+ // The decoded sequence starts as the anchor itself.
+ _currentSeq = anchor.toString(_kmerSize);
+ _leftErrorPos.clear();
+ //_rightErrorPos.clear();
+ _Npos.clear();
+ // N positions are stored as increments from the previous N position (the first one from 0).
+ //cout << _readSize << " " << anchorPos << " " << anchorAddress << endl;
+ //Decode N pos
+ _prevNpos = 0;
+ u_int64_t NposCount = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModel);
+ for(unsigned int i=0; i<NposCount; i++){
+ //deltaType = _rangeDecoder.nextByte(_NposDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _NposModel);
+ //u_int64_t nPos = CompressionUtils::getValueFromDelta(deltaType, _prevNpos, deltaValue);
+ u_int64_t nPos = CompressionUtils::decodeNumeric(_rangeDecoder, _NposModel) + _prevNpos;
+ _Npos.push_back(nPos);
+ //cout << nPos << endl;
+ _prevNpos = nPos;
+ //_Npos.push_back(CompressionUtils::decodeNumeric(_rangeDecoder, _anchorPosSizeModel, _anchorPosModel));
+ }
+
+ //Decode error positions (increments as well); all are registered through addErrorPos(pos, true)
+ u_int64_t nbLeftError = CompressionUtils::decodeNumeric(_rangeDecoder, _leftErrorModel);
+ _prevErrorPos = 0;
+ for(unsigned int i=0; i<nbLeftError; i++){
+ u_int64_t errorPos = CompressionUtils::decodeNumeric(_rangeDecoder, _leftErrorPosModel) + _prevErrorPos;
+ addErrorPos(errorPos, true);
+ _prevErrorPos = errorPos;
+ }
+
+ /*
+ u_int64_t nbRightError = CompressionUtils::decodeNumeric(_rangeDecoder, _rightErrorModel);
+ _prevErrorPos = anchorPos-1;
+ for(int i=0; i<nbLeftError; i++){
+ int deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _leftErrorPosModel);
+ int errorPos = _prevErrorPos-deltaValue;
+ //deltaType = _rangeDecoder.nextByte(_errorPosDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _errorPosModel); //reprise
+ //u_int64_t errorPos = CompressionUtils::getValueFromDelta(deltaType, _prevErrorPos, deltaValue);
+ addErrorPos(errorPos, false);
+ //_errorPos.push_back(errorPos);
+ _prevErrorPos = errorPos;
+ //_errorPos.push_back(CompressionUtils::decodeNumeric(_rangeDecoder, _anchorPosSizeModel, _anchorPosModel));
+ }
+ _prevErrorPos = anchorPos+_kmerSize;
+ for(int i=0; i<nbRightError; i++){
+ int deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _rightErrorPosModel);
+ int errorPos = _prevErrorPos+deltaValue;
+ //deltaType = _rangeDecoder.nextByte(_errorPosDeltaTypeModel);
+ //deltaValue = CompressionUtils::decodeNumeric(_rangeDecoder, _errorPosModel); //reprise
+ //u_int64_t errorPos = CompressionUtils::getValueFromDelta(deltaType, _prevErrorPos, deltaValue);
+ addErrorPos(errorPos, true);
+ //_errorPos.push_back(errorPos);
+ _prevErrorPos = errorPos;
+ //_errorPos.push_back(CompressionUtils::decodeNumeric(_rangeDecoder, _anchorPosSizeModel, _anchorPosModel));
+ }*/
+
+ // extendAnchor() prepends to _currentSeq on the left side and appends on the right side.
+ //Extend anchor to the left
+ kmer_type kmer = anchor;
+ for(int i=anchorPos-1; i>=0; i--){
+ kmer = extendAnchor(kmer, i, false);
+ }
+
+ //Extend anchor to the right
+ kmer = anchor;
+ for(unsigned int i=anchorPos+_kmerSize; i<_readSize; i++){
+ kmer = extendAnchor(kmer, i, true);
+ //cout << "\t" << kmer.toString(_kmerSize) << endl;
+ }
+ // NOTE(review): the _Npos values index _currentSeq unchecked; they come straight from the decoded stream.
+ //Inject N in the decoded read sequence
+ //printf("npos s %i currseq %s \n",_Npos.size(),_currentSeq.c_str());
+ for(unsigned int i=0; i<_Npos.size(); i++){
+ _currentSeq[_Npos[i]] = 'N';
+ }
+}
+
+kmer_type DnaDecoder::extendAnchor(kmer_type kmer, int pos, bool rightExtend){
+ // Decoder side of one extension step: determines the nucleotide at read position 'pos', adds it to _currentSeq (appended when rightExtend, prepended otherwise) and returns the k-mer to continue from.
+ u_int8_t nextNt;
+ //int nextNtBin;
+ kmer_type resultKmer;
+ // Positions listed in _Npos get the placeholder 'A' (NOTE(review): presumably what the encoder substituted for N - confirm); decodeAnchorRead() writes the 'N' back afterwards.
+ if(std::find(_Npos.begin(), _Npos.end(), pos) != _Npos.end()){
+ nextNt = 'A';
+ if(rightExtend){
+ _currentSeq += nextNt;
+ }
+ else{
+ _currentSeq.insert(_currentSeq.begin(), nextNt);
+ }
+ //cout << _currentSeq << endl;
+ //if(nextNt == 'N') nextN
+
+ resultKmer = kmer;
+ codeSeedNT(&_kmerModel, &resultKmer, nextNt, rightExtend);
+ //cout << kmer.toString(_kmerSize) << endl;
+ //cout << resultKmer.toString(_kmerSize) << endl;
+ return resultKmer;
+ }
+ // Recorded error position: the read nucleotide is decoded explicitly, but the k-mer continues from the first extension reported by the bloom filter.
+ /*
+ if(rightExtend){
+ if(std::find(_rightErrorPos.begin(), _rightErrorPos.end(), pos) != _rightErrorPos.end()){
+ nextNt = Leon::bin2nt(_rangeDecoder.nextByte(_bifurcationModel));
+
+ if(rightExtend)
+ _currentSeq += nextNt;
+ else
+ _currentSeq.insert(_currentSeq.begin(), nextNt);
+
+ std::bitset<4> res4 = _bloom->contains4(kmer,rightExtend);
+
+ for(int nt=0; nt<4; nt++){
+
+ if(res4[nt]){
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+ return mutatedKmer;
+ }
+ }
+ }
+ }
+ else{*/
+ if(std::find(_leftErrorPos.begin(), _leftErrorPos.end(), pos) != _leftErrorPos.end()){
+ nextNt = Leon::bin2nt(_rangeDecoder.nextByte(_bifurcationModel));
+
+ if(rightExtend)
+ _currentSeq += nextNt;
+ else
+ _currentSeq.insert(_currentSeq.begin(), nextNt);
+
+ std::bitset<4> res4 = _bloom->contains4(kmer,rightExtend);
+
+ for(int nt=0; nt<4; nt++){
+
+ if(res4[nt]){
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+ return mutatedKmer;
+ }
+ }
+ }
+ //}
+ // NOTE(review): if no extension is reported for an error position, control falls through and a second nucleotide is decoded below.
+
+
+ //cout << kmer.toString(_kmerSize) << endl;
+ kmer_type uniqKmer;
+ //, mutatedSolidKmer;
+ int uniqNt;
+ //bool isKmerSolid = false;
+
+
+ //kmer = _kmers[pos];
+
+ int indexedKmerCount = 0;
+
+ //cout << kmer.toString(_kmerSize) << endl;
+
+ // Count the reported extensions; uniqNt / uniqKmer keep the last one and are only used when there is exactly one.
+
+ std::bitset<4> res4 = _bloom->contains4(kmer,rightExtend);
+ for(int nt=0; nt<4; nt++){
+ if(res4[nt]){
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+
+ indexedKmerCount += 1;
+ uniqNt = nt;
+ uniqKmer = mutatedKmer;
+ }
+ }
+
+
+
+ /*
+ for(int nt=0; nt<4; nt++){
+ //if(nt == original_nt){
+ // continue;
+ //}
+
+ kmer_type mutatedKmer = kmer;
+ codeSeedBin(&_kmerModel, &mutatedKmer, nt, rightExtend);
+ kmer_type mutatedKmerMin = min(mutatedKmer, revcomp(mutatedKmer, _kmerSize));
+
+ //mutatedKmer.printASCII(_kmerSize);
+
+ if(_bloom->contains(mutatedKmerMin)){
+ indexedKmerCount += 1;
+ uniqNt = nt;
+ uniqKmer = mutatedKmer;
+ }
+
+ }
+ */
+
+ if(indexedKmerCount == 1){
+ nextNt = Leon::bin2nt(uniqNt);
+ //cout << "case 1 " << nextNt << endl;
+ resultKmer = uniqKmer;
+ }
+ else if(indexedKmerCount == 2){
+ // int rather than char: plain char may be unsigned, which would make the "== -1" tests below always false and pass 255 to Leon::bin2nt.
+ int nt1 = -1;
+ int nt2 = -1;
+
+ for(int nt=0; nt<4; nt++){
+ if(res4[nt]){
+ if(nt1 == -1)
+ nt1 = nt;
+ else if(nt2 == -1)
+ nt2 = nt;
+ else break;
+ }
+ }
+ // One stored value selects between the two candidates: 0 = first set bit of res4, anything else = second.
+ u_int8_t nextBinaryNt = _rangeDecoder.nextByte(_bifurcationBinaryModel);
+
+ //cout << (int)nextBinaryNt << endl;
+ if(nextBinaryNt == 0)
+ nextNt = Leon::bin2nt(nt1);
+ else
+ nextNt = Leon::bin2nt(nt2);
+
+ //cout << nextNt << endl;
+ resultKmer = kmer;
+ codeSeedNT(&_kmerModel, &resultKmer, nextNt, rightExtend);
+
+
+ }
+ else{
+ nextNt = Leon::bin2nt(_rangeDecoder.nextByte(_bifurcationModel));
+ //cout << "case 2 "<< nextNt << endl;
+ resultKmer = kmer;
+ codeSeedNT(&_kmerModel, &resultKmer, nextNt, rightExtend);
+ }
+
+ //cout << nextNt << endl;
+
+ //if(nextNt == 'N') cout << "lala" << endl;
+
+ if(rightExtend){
+ _currentSeq += nextNt;
+ }
+ else{
+ _currentSeq.insert(_currentSeq.begin(), nextNt);
+ }
+
+ //cout << _currentSeq << endl;
+ //cout << resultKmer.toString(_kmerSize) << endl;
+ return resultKmer;
+
+}
+
+void DnaDecoder::decodeNoAnchorRead(){
+ #ifdef PRINT_DEBUG_DECODER
+ cout << "\t\tDecode no anchor read" << endl;
+ #endif
+ // Mirror of DnaEncoder::encodeNoAnchorRead(): the read size comes first,
+ // followed by that many nucleotide codes, each appended to _currentSeq.
+ _readSize = CompressionUtils::decodeNumeric(_rangeDecoder, _noAnchorReadSizeValueModel);
+ unsigned int decodedCount = 0;
+ while(decodedCount < _readSize){
+ _currentSeq += Leon::bin2nt(_rangeDecoder.nextByte(_noAnchorReadModel));
+ ++decodedCount;
+ }
+}
+
+void DnaDecoder::endRead(){
+ // The base-class end-of-read handling runs first.
+ AbstractDnaCoder::endRead();
+ #ifdef PRINT_DEBUG_DECODER
+ cout << "\t\t\tRead: " << _currentSeq << endl;
+ #endif
+ // Decoded reads are accumulated in _buffer, one per line; _currentSeq is then reset for the next read.
+ _buffer.append(_currentSeq).push_back('\n');
+ _currentSeq.clear();
+}
+
+
+
+
diff --git a/gatb-core/src/gatb/tools/compression/DnaCoder.hpp b/gatb-core/src/gatb/tools/compression/DnaCoder.hpp
new file mode 100644
index 0000000..6fcdc7c
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/DnaCoder.hpp
@@ -0,0 +1,307 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+#ifndef _DNACODER_HPP_
+#define _DNACODER_HPP_
+
+
+#include <gatb/gatb_core.hpp>
+//#include "RangeCoder.hpp"
+#include "Leon.hpp"
+//#include "CompressionUtils.hpp"
+
+//#define PRINT_DISTRIB
+
+#ifdef PRINT_DISTRIB
+#include <unordered_set>
+#include <unordered_map>
+#endif
+
+
+using namespace std;
+class Leon;
+
+//====================================================================================
+// ** AbstractDnaCoder
+//====================================================================================
+class AbstractDnaCoder
+{
+ public:
+ AbstractDnaCoder(Leon* leon);
+ // Base class of DnaEncoder and DnaDecoder: declares the range-coder models and per-read state both sides use.
+ protected:
+ KmerModel _kmerModel;
+
+ Order0Model _readTypeModel; //only 2 values in this model: 0 = read with anchor, 1 = read without anchor
+
+ Order0Model _noAnchorReadModel; // nucleotides of reads stored without anchor
+
+ Order0Model _bifurcationModel; // explicit nucleotide at a bifurcation (type 0)
+ Order0Model _bifurcationBinaryModel; // choice between exactly two candidates (type 1)
+ Order0Model _readAnchorRevcompModel; // 1 when the anchor must be reverse-complemented
+
+ Leon* _leon;
+ collections::impl::IBloom<kmer_type>* _bloom; // the bloom containing the solid kmers
+
+ Order0Model _readSizeDeltaTypeModel;
+ Order0Model _anchorPosDeltaTypeModel;
+ Order0Model _anchorAddressDeltaTypeModel;
+ Order0Model _NposDeltaTypeModel;
+ Order0Model _errorPosDeltaTypeModel;
+
+ // Multi-model vectors below are passed to CompressionUtils::encodeNumeric / decodeNumeric.
+ //Sequence* _prevSequences;
+ //Order0Model _isPrevReadAnchorableModel;
+ //vector<Order0Model> _isPrevReadAnchorablePosModel;
+
+ vector<Order0Model> _anchorAddressModel;
+
+ vector<Order0Model> _anchorPosModel;
+
+ vector<Order0Model> _numericModel;
+ vector<Order0Model> _leftErrorModel;
+ vector<Order0Model> _rightErrorModel;
+
+ vector<Order0Model> _NposModel;
+ vector<Order0Model> _leftErrorPosModel;
+ vector<Order0Model> _rightErrorPosModel;
+
+ vector<Order0Model> _readSizeValueModel;
+
+
+ vector<Order0Model> _noAnchorReadSizeValueModel;
+
+ size_t _kmerSize;
+ unsigned int _readSize;
+ // Per-read position lists (read coordinates).
+ vector<int> _leftErrorPos;
+ vector<int> _rightErrorPos;
+ vector<int> _Npos;
+
+ void startBlock();
+ void endRead();
+ void codeSeedBin(KmerModel* model, kmer_type* kmer, int nt, bool right);
+ void codeSeedNT(KmerModel* model, kmer_type* kmer, char nt, bool right);
+
+ void addErrorPos(int pos, bool rightExtend);
+
+ u_int64_t _seqId;
+
+
+ // Previous values, used where positions are stored as increments.
+ u_int64_t _prevReadSize;
+ u_int64_t _prevAnchorPos;
+ u_int64_t _prevAnchorAddress;
+ u_int64_t _prevNpos;
+ u_int64_t _prevErrorPos;
+ u_int64_t _prevNbLeftError;
+
+ int _processedSequenceCount;
+};
+
+//====================================================================================
+// ** DnaEncoder
+//====================================================================================
+class DnaEncoder : AbstractDnaCoder
+{
+ // Encoder side of the DNA coder; inherits privately from AbstractDnaCoder and is invoked once per Sequence through operator().
+ public:
+
+ DnaEncoder(Leon* leon);
+ DnaEncoder(const DnaEncoder& copy);
+ ~DnaEncoder();
+
+ void operator()(Sequence& sequence);
+
+ private:
+
+
+ //for qualities
+ char * _qualseq;
+ int * _nb_solids;
+ int _smoothing_threshold;
+ unsigned int _max_read_size;
+ bool _trunc_mode;
+
+ void storeSolidCoverageInfo();
+ void smoothQuals();
+ bool apply_smoothing_at_pos(int pos);
+
+ double char2proba(char c);
+ char char2phred(char c);
+
+ char * _bufferQuals;
+ unsigned int _bufferQuals_idx;
+ unsigned int _bufferQuals_size;
+
+#ifdef PRINT_DISTRIB
+ vector<Sequence*> _sequences;
+ const int maxSequences = 100;
+ vector<u_int32_t> _distrib;
+ u_int64_t _outDistrib;
+#endif
+
+
+
+ // Main encoder; the numbered ones below only exist for statistics (LEON_PRINT_STAT).
+ RangeEncoder _rangeEncoder;
+
+ #ifdef LEON_PRINT_STAT
+ RangeEncoder _rangeEncoder1;
+ RangeEncoder _rangeEncoder2;
+ RangeEncoder _rangeEncoder3;
+ RangeEncoder _rangeEncoder4;
+ RangeEncoder _rangeEncoder5;
+ RangeEncoder _rangeEncoder6;
+ #endif
+
+ //static void encodeFirstHeader();
+ void writeBlock();
+ void execute();
+
+ void buildKmers();
+ bool isReadAnchorable();
+ int findExistingAnchor(u_int32_t* anchorAddress);
+
+ void encodeAnchorRead(int anchorPos, u_int32_t anchorAddress);
+ kmer_type buildBifurcationList(int pos, kmer_type kmer, bool rightExtend);
+ //int buildBifurcationList(int pos, bool rightExtend);
+ int voteMutations(int pos, int depth, bool rightExtend);
+
+ void encodeNoAnchorRead();
+
+ int getBestPath(int pos, kmer_type& kmer, bitset<4>& initRes4, bool rightExtend);
+
+ Sequence* _sequence;
+ char* _readseq;
+ vector<kmer_type> _kmers;
+ KmerModel::Iterator _itKmer;
+ vector<u_int8_t> _bifurcations; // explicit nucleotide codes (type 0 entries)
+ vector<u_int8_t> _binaryBifurcations; // 0/1 choices between two candidates (type 1 entries)
+ vector<u_int8_t> _bifurcationTypes; // per recorded bifurcation: 0 or 1, selecting which list to read
+
+ //bool _isPrevReadAnchorable;
+ //u_int64_t _isPrevReadAnchorablePos;
+
+ vector<int> _solidMutaChain;
+ //int _solidMutaChainPos;
+ u_int64_t _totalDnaSize;
+ u_int64_t _readCount;
+ u_int64_t _MCtotal; // statistics counters updated by buildBifurcationList()
+ u_int64_t _readWithoutAnchorCount;
+ u_int64_t _MCuniqSolid;
+ u_int64_t _MCuniqNoSolid;
+ u_int64_t _MCnoAternative;
+ u_int64_t _MCmultipleSolid;
+
+ int _minSequenceSize;
+ int _maxSequenceSize;
+ //u_int64_t _MCmultipleNoSolid;
+
+ int _thread_id;
+
+ // int _solidMutaChainStartPos;
+ // int _solidMutaChainSize;
+ // int _solidMutaChainLockTime;
+};
+
+//====================================================================================
+// ** DnaDecoder
+//====================================================================================
+class DnaDecoder : AbstractDnaCoder
+{
+ // Decoder side of the DNA coder (private inheritance): setup() selects a block, execute() decodes its reads into _buffer.
+ public:
+
+ DnaDecoder(Leon* leon, const string& inputFilename,tools::storage::impl::Group * group);
+ ~DnaDecoder();
+
+ void setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount, int blockID);
+ void execute();
+
+ string _buffer; // decoded reads, one per line
+ bool _finished; // set at the end of execute()
+
+ private:
+
+
+
+
+ RangeDecoder _rangeDecoder;
+ // ifstream* _inputFile;
+ //ofstream* _outputFile;
+ u_int64_t _blockStartPos;
+ u_int64_t _blockSize;
+ // int _decodedSequenceCount;
+ string _currentSeq; // read being decoded
+ ifstream* _anchorDictFile; // only forwarded to Leon::getAnchor()
+
+ void decodeAnchorRead();
+ kmer_type extendAnchor(kmer_type kmer, int pos, bool rightExtend);
+
+ void decodeNoAnchorRead();
+ void endRead();
+
+ int _sequenceCount;
+
+ tools::storage::impl::Group * _group;
+ tools::storage::impl::Storage::istream *_inputStream; // stream on dataset "dna_<blockID>", recreated by setup()
+
+};
+
+class QualDecoder
+{
+public:
+ //QualDecoder(Leon* leon, const string& inputFilename);
+ QualDecoder(Leon* leon, const string& inputFilename,tools::storage::impl::Group * group);
+ // Decoder for quality blocks: setup() selects a block, execute() inflates it with zlib into _buffer.
+ ~QualDecoder();
+
+ void setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount);
+ void setup( int blockID);
+
+ void execute();
+
+ string _buffer; // inflated output of the block
+ bool _finished; // set at the end of execute()
+
+
+private:
+ Leon* _leon;
+
+ tools::storage::impl::Group * _group;
+
+ char * _inbuffer; // compressed input, (re)allocated with realloc() in setup()
+ //ifstream* _inputFile;
+
+ tools::storage::impl::Storage::istream *_inputStream; // stream on dataset "qual_<blockID>", recreated by setup(int)
+
+
+ //ofstream* _outputFile;
+ u_int64_t _blockStartPos;
+ u_int64_t _blockSize;
+ //int _decodedSequenceCount;
+ string _currentSeq;
+ int _sequenceCount;
+ int _processedSequenceCount;
+
+};
+#endif /* _DNACODER_HPP_ */
+
diff --git a/gatb-core/src/gatb/tools/compression/HeaderCoder.cpp b/gatb-core/src/gatb/tools/compression/HeaderCoder.cpp
new file mode 100644
index 0000000..2bc38ad
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/HeaderCoder.cpp
@@ -0,0 +1,789 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+#include "HeaderCoder.hpp"
+
+#include <bitset> //////////!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! delete
+/*
+#define PRINT_DEBUG_ENCODER
+#define PRINT_DEBUG_DECODER
+*/
+
+
+//====================================================================================
+// ** AbstractHeaderCoder
+//====================================================================================
+// Builds the state shared by the header encoder and decoder.
+//
+// leon: owning Leon instance, stored as-is (may be NULL; HeaderEncoder's copy
+//       constructor passes NULL and assigns _leon afterwards).
+//
+// Every scalar member is given a defined starting value so that nothing reads an
+// indeterminate value if a coder is used or destroyed before startBlock() has run.
+// _prevHeader and _currentHeader are default-constructed, i.e. empty.
+AbstractHeaderCoder::AbstractHeaderCoder(Leon* leon) :
+_headerSizeModel(256),
+_currentPos(0),
+_fieldStartPos(0),
+_prevFieldCount(0),
+_fieldIndex(0),
+_misIndex(0),
+_isCurrentFieldNumeric(true),
+_currentFieldCount(0),
+_leon(leon),
+_processedSequenceCount(0)
+{
+}
+
+// Appends one "column" to every per-field container: one probability model of each
+// kind, one set of numeric models, and one slot in each prev/current bookkeeping
+// vector. All containers therefore always have the same length.
+void AbstractHeaderCoder::addFieldColumn(){
+
+    // Probability models for the new column.
+    _typeModel.push_back(Order0Model(HEADER_TYPE_COUNT+1));
+    _fieldIndexModel.push_back(Order0Model(256));
+    _fieldColumnModel.push_back(Order0Model(256));
+    _misSizeModel.push_back(Order0Model(256));
+    _asciiModel.push_back(Order0Model(128));
+    _zeroModel.push_back(Order0Model(256));
+
+    // Numeric values use NB_MODELS_PER_NUMERIC models per column.
+    vector<Order0Model> numericColumn;
+    for(int k=0; k<CompressionUtils::NB_MODELS_PER_NUMERIC; ++k){
+        numericColumn.push_back(Order0Model(256));
+    }
+    _numericModels.push_back(numericColumn);
+
+    // Field boundaries, values, leading-zero counts and types, for both the
+    // previous and the current header.
+    _prevFieldPos.push_back(0);
+    _currentFieldPos.push_back(0);
+    _prevFieldValues.push_back(0);
+    _currentFieldValues.push_back(0);
+    _prevFieldZeroValues.push_back(0);
+    _currentFieldZeroValues.push_back(0);
+    _prevFieldTypes.push_back(FIELD_ASCII);
+    _currentFieldTypes.push_back(FIELD_ASCII);
+}
+
+// Classifies one header character.
+//
+// c:       character to classify.
+// isDigit: output, set to true only when c is a decimal digit.
+// Returns 1 for digits and letters (they share a class), 2 for anything else.
+int AbstractHeaderCoder::typeOfChar(u_int8_t c, bool* isDigit){
+    const bool digit = (isdigit(c) != 0);
+    *isDigit = digit;
+    return (digit || isalpha(c)) ? 1 : 2;
+}
+
+// Cuts _currentHeader into fields. A field boundary is placed wherever the character
+// class returned by typeOfChar() changes; a field is numeric only if all of its
+// characters are digits. On return _currentFieldCount holds the number of fields.
+void AbstractHeaderCoder::splitHeader(){
+    _fieldIndex = 0;
+    _fieldStartPos = 0;
+    _isCurrentFieldNumeric = true;
+
+    bool isDigit;
+    int previousClass = typeOfChar(_currentHeader[0], &isDigit);
+
+    _currentPos = 0;
+    while((unsigned)_currentPos < _currentHeader.size()){
+        const u_int8_t symbol = _currentHeader[_currentPos];
+        const int symbolClass = typeOfChar(symbol, &isDigit);
+
+        // Class change: close the field that ends just before this character.
+        if(symbolClass != previousClass){
+            previousClass = symbolClass;
+            makeField();
+        }
+
+        // One non-digit is enough to make the running field non-numeric.
+        _isCurrentFieldNumeric = _isCurrentFieldNumeric && isDigit;
+
+        _currentPos++;
+    }
+
+    // Close the trailing field.
+    makeField();
+
+    _currentFieldCount = _fieldIndex;
+}
+
+// Closes the field spanning [_fieldStartPos, _currentPos) of _currentHeader and records
+// its boundaries, type, numeric value and leading-zero count at index _fieldIndex.
+// Does nothing when the span is empty. Afterwards _fieldIndex is advanced and the
+// "numeric" flag is re-armed for the next field.
+//
+// Field types assigned here:
+//   FIELD_ASCII            - at least one non-digit character
+//   FIELD_NUMERIC          - digits only, no leading zero
+//   FIELD_ZERO_ONLY        - leading zero(s) and a value of 0
+//   FIELD_ZERO_AND_NUMERIC - leading zero(s) followed by a non-zero value
+void AbstractHeaderCoder::makeField(){
+    if(_fieldStartPos == _currentPos) return;
+
+    // Grow the per-column containers until indices _fieldIndex and _fieldIndex+1 exist.
+    while((int)_currentFieldPos.size() <= _fieldIndex+1){
+        addFieldColumn();
+    }
+
+    _currentFieldPos[_fieldIndex] = _fieldStartPos;
+    _currentFieldPos[_fieldIndex+1] = _currentPos;
+
+    if(_isCurrentFieldNumeric){
+        string field = _currentHeader.substr(_currentFieldPos[_fieldIndex], _currentFieldPos[_fieldIndex+1]-_currentFieldPos[_fieldIndex]);
+
+        // Strip the leading zeros in one step and remember how many there were.
+        string::size_type firstNonZero = field.find_first_not_of('0');
+        if(firstNonZero == string::npos){
+            firstNonZero = field.size();
+        }
+        const int zeroCount = (int)firstNonZero;
+        field.erase(0, firstNonZero);
+
+        _currentFieldZeroValues[_fieldIndex] = zeroCount;
+
+        // The remaining text is decimal digits only (possibly empty, which yields 0).
+        // strtoull keeps the full 64-bit range even where unsigned long is 32 bits wide,
+        // and base 10 states the intent explicitly.
+        u_int64_t value = strtoull(field.c_str(), NULL, 10);
+        _currentFieldValues[_fieldIndex] = value;
+
+        if(zeroCount > 0){
+            if(value == 0){
+                _currentFieldTypes[_fieldIndex] = FIELD_ZERO_ONLY;
+            }
+            else{
+                _currentFieldTypes[_fieldIndex] = FIELD_ZERO_AND_NUMERIC;
+            }
+        }
+        else {
+            _currentFieldTypes[_fieldIndex] = FIELD_NUMERIC;
+        }
+    }
+    else{
+        _currentFieldTypes[_fieldIndex] = FIELD_ASCII;
+    }
+
+    #ifdef PRINT_DEBUG_ENCODER
+        cout << "\tField "<< _fieldIndex << ": " << _currentHeader.substr(_currentFieldPos[_fieldIndex], _currentFieldPos[_fieldIndex+1]-_currentFieldPos[_fieldIndex]) << " Digit only? " << _isCurrentFieldNumeric << endl;
+    #endif
+
+    _fieldIndex += 1;
+    _fieldStartPos = _currentPos;
+    _isCurrentFieldNumeric = true;
+}
+
+// Finishes the current header: its field description becomes the "previous" one used
+// as reference for the next header, the per-header cursors (_misIndex, _fieldIndex)
+// are reset, and the processed-sequence counter is incremented.
+void AbstractHeaderCoder::endHeader(){
+    _prevFieldCount = _currentFieldCount;
+
+    #ifdef PRINT_DEBUG_ENCODER
+        cout << "\tField count: " << _prevFieldCount << endl;
+    #endif
+
+    // Field count + 1 entries are copied because the position vector also stores the
+    // end boundary of the last field. The bound is clamped to the container size: when
+    // the header is empty and no column has been created yet (makeField() returned
+    // early), the containers are empty and must not be indexed. All per-column
+    // containers grow together in addFieldColumn(), so one size check covers them all.
+    size_t entryCount = (size_t)(_prevFieldCount + 1);
+    if(entryCount > _currentFieldPos.size()){
+        entryCount = _currentFieldPos.size();
+    }
+
+    for(size_t i=0; i<entryCount; i++){
+        _prevFieldPos[i] = _currentFieldPos[i];
+        _prevFieldValues[i] = _currentFieldValues[i];
+        _prevFieldTypes[i] = _currentFieldTypes[i];
+        _prevFieldZeroValues[i] = _currentFieldZeroValues[i];
+
+        _currentFieldZeroValues[i] = 0;
+    }
+    _prevHeader = _currentHeader;
+    _misIndex = 0;
+    _fieldIndex = 0;
+
+    _processedSequenceCount += 1;
+}
+
+// Prepares the coder for a new block: every probability model is cleared and the
+// reference ("previous") header is re-seeded from _leon->_firstHeader, so each block
+// starts from the same state. The processed-sequence counter restarts at 0.
+void AbstractHeaderCoder::startBlock(){
+
+    _currentHeader = _leon->_firstHeader;
+
+    for(size_t i=0; i<_typeModel.size(); i++){
+        _typeModel[i].clear();
+        _fieldIndexModel[i].clear();
+        _fieldColumnModel[i].clear();
+        _misSizeModel[i].clear();
+        _asciiModel[i].clear();
+        _zeroModel[i].clear();
+
+        // Clear exactly the models that addFieldColumn() created for this column,
+        // instead of assuming there are 8 of them.
+        for(size_t j=0; j<_numericModels[i].size(); j++){
+            _numericModels[i][j].clear();
+        }
+    }
+    _headerSizeModel.clear();
+
+    // Run the first header through the normal split/end path so that it becomes the
+    // previous header.
+    splitHeader();
+    endHeader();
+
+    // endHeader() counted the seed header; it is not part of the block.
+    _processedSequenceCount = 0;
+}
+
+//====================================================================================
+// ** HeaderEncoder
+//====================================================================================
+// Creates an encoder bound to `leon`. The thread id is the value of
+// leon->_nb_thread_living before it is atomically incremented.
+// Unlike the copy constructor, this constructor does not call startBlock().
+HeaderEncoder::HeaderEncoder(Leon* leon)
+: AbstractHeaderCoder(leon)
+, _totalHeaderSize(0)
+, _seqId(0)
+, _thread_id(__sync_fetch_and_add(&_leon->_nb_thread_living, 1))
+{
+}
+
+// Copy constructor. Only the Leon pointer is taken from `copy`; counters restart at 0,
+// a new thread id is drawn from leon->_nb_thread_living (atomic increment), and
+// startBlock() is called so the new encoder begins with freshly reset models.
+HeaderEncoder::HeaderEncoder(const HeaderEncoder& copy)
+: AbstractHeaderCoder(NULL)
+, _totalHeaderSize(0)
+, _seqId(0)
+{
+    // The base class was built with NULL; attach the real Leon instance before it is used.
+    _leon = copy._leon;
+
+    _thread_id = __sync_fetch_and_add(&_leon->_nb_thread_living, 1);
+
+    startBlock();
+}
+
+// Flushes a partially filled block (never for thread id 0), then atomically decrements
+// the living-thread counter and adds this encoder's header byte count to the total
+// kept by Leon.
+HeaderEncoder::~HeaderEncoder(){
+
+    if(_thread_id != 0){
+        // The last sequence seen did not complete a block: write what is pending.
+        if((_seqId+1) % _leon->getReadPerBlock() != 0){
+            writeBlock();
+        }
+    }
+
+    __sync_fetch_and_add(&_leon->_nb_thread_living, -1);
+
+    __sync_fetch_and_add(&_leon->_totalHeaderSize, _totalHeaderSize);
+}
+
+
+//int HeaderEncoder::getId(){
+// return ((_lastSequenceIndex / Leon::READ_PER_BLOCK) % _leon->_nb_cores);
+//}
+
+// Encodes the header (comment) of one sequence. When the number of headers encoded in
+// the current block reaches Leon's reads-per-block value, the block is written and a
+// new one is started.
+void HeaderEncoder::operator()(Sequence& sequence){
+    _lastSequenceIndex = sequence.getIndex();
+    _seqId = sequence.getIndex();
+
+    _currentHeader = sequence.getComment();
+    _totalHeaderSize += _currentHeader.size();
+
+    processNextHeader();
+
+    // Block not full yet: nothing more to do.
+    if(_processedSequenceCount < _leon->getReadPerBlock()){
+        return;
+    }
+
+    writeBlock();
+    startBlock();
+}
+
+// Hands the range-encoded bytes of the current block to Leon and empties the encoder.
+// The block id is derived from the index of the last sequence seen.
+void HeaderEncoder::writeBlock(){
+    // Flush only when something has been encoded.
+    if(_rangeEncoder.getBufferSize() > 0){
+        _rangeEncoder.flush();
+    }
+
+    int blockId = (_seqId / _leon->getReadPerBlock());
+
+    _leon->writeBlock(
+        _rangeEncoder.getBuffer(),
+        _rangeEncoder.getBufferSize(),
+        _processedSequenceCount,
+        blockId,
+        true);
+
+    _rangeEncoder.clear();
+}
+
+// Encodes _currentHeader against _prevHeader in three steps: split into fields,
+// compare/encode field by field, then make it the new previous header.
+void HeaderEncoder::processNextHeader(){
+
+    #ifdef PRINT_DEBUG_ENCODER
+        cout << _prevHeader << endl;
+        cout << _currentHeader << endl;
+    #endif
+
+    splitHeader();
+    compareHeader();
+    endHeader();
+}
+
+// Compares each field of the current header with the same-index field of the previous
+// header and range-encodes only the fields that differ.
+//
+// - Numeric-like fields (numeric, zero-only, zero+numeric) match when the previous field
+//   has the same type and the same value / leading-zero count; otherwise they are
+//   emitted through encodeNumeric().
+// - ASCII fields are compared character by character; the suffix starting at the first
+//   difference is emitted through encodeAscii().
+//
+// The header is terminated with HEADER_END_MATCH (followed by the header size) when its
+// last field matched, which tells the decoder to copy the trailing fields from the
+// previous header, and with HEADER_END otherwise.
+//
+// _lastMatchFieldIndex is reset for every header. Without the reset, a value left over
+// from an earlier header (or an uninitialised one) could equal _fieldIndex-1 although
+// the last field did not match, making the decoder append an extra field taken from
+// the previous header. An empty header always ends with HEADER_END.
+void HeaderEncoder::compareHeader(){
+    _fieldPos = 0;
+    _misCurrentStartPos = -1;
+    _lastMatchFieldIndex = -1;
+
+    for(_fieldIndex=0; _fieldIndex<_currentFieldCount; _fieldIndex++){
+
+        _currentFieldSize = _currentFieldPos[_fieldIndex+1] - _currentFieldPos[_fieldIndex];
+        _prevFieldSize = _prevFieldPos[_fieldIndex+1]-_prevFieldPos[_fieldIndex];
+        _misCurrentStartPos = -1;
+
+        HeaderType prevFieldType = _prevFieldTypes[_fieldIndex];
+        HeaderType currentFieldType = _currentFieldTypes[_fieldIndex];
+
+        //Comparing numeric field
+        if(prevFieldType == FIELD_NUMERIC && currentFieldType == FIELD_NUMERIC){
+            #ifdef PRINT_DEBUG_ENCODER
+                cout << "\t\tComparing numeric fields: " <<_prevFieldValues[_fieldIndex] << " " << _currentFieldValues[_fieldIndex] << endl;
+            #endif
+            if(_prevFieldValues[_fieldIndex] == _currentFieldValues[_fieldIndex]){ //match
+                _lastMatchFieldIndex = _fieldIndex;
+                continue;
+            }
+        }
+        //Comparing field with zero only
+        else if(prevFieldType == FIELD_ZERO_ONLY && currentFieldType == FIELD_ZERO_ONLY){
+            #ifdef PRINT_DEBUG_ENCODER
+                cout << "\t\tComparing fields with zero only: " << endl;
+            #endif
+            if(_prevFieldZeroValues[_fieldIndex] == _currentFieldZeroValues[_fieldIndex]){ //match
+                _lastMatchFieldIndex = _fieldIndex;
+                continue;
+            }
+        }
+        //Comparing field with zero at beginning and numeric
+        else if(prevFieldType == FIELD_ZERO_AND_NUMERIC && currentFieldType == FIELD_ZERO_AND_NUMERIC){
+            #ifdef PRINT_DEBUG_ENCODER
+                cout << "\t\tComparing fields with zero at begining and numeric: " << endl;
+            #endif
+            if(_prevFieldZeroValues[_fieldIndex] == _currentFieldZeroValues[_fieldIndex] && _prevFieldValues[_fieldIndex] == _currentFieldValues[_fieldIndex]){ //match
+                _lastMatchFieldIndex = _fieldIndex;
+                continue;
+            }
+        }
+
+        //Encoding numeric field (no match above, or the field type changed)
+        if(currentFieldType == FIELD_NUMERIC || currentFieldType == FIELD_ZERO_ONLY || currentFieldType == FIELD_ZERO_AND_NUMERIC){
+            encodeNumeric();
+        }
+        //Comparing ascii fields
+        else{
+            for(_fieldPos=0; _fieldPos<_currentFieldSize; _fieldPos++){
+
+                // The previous header has no field at this index: everything is new.
+                if(_fieldIndex >= _prevFieldCount){
+                    _misCurrentStartPos = _fieldPos;
+                    break;
+                }
+
+                // The current field is longer than the previous one.
+                if(_fieldPos >= _prevFieldSize){
+                    _misCurrentStartPos = _fieldPos;
+                    break;
+                }
+
+                u_int8_t c = _currentHeader[_currentFieldPos[_fieldIndex]+_fieldPos];
+
+                #ifdef PRINT_DEBUG_ENCODER
+                    cout << "\t\tComparing: " << _prevHeader[_prevFieldPos[_fieldIndex]+_fieldPos] << " " << c << " ";
+                #endif
+                if(c == _prevHeader[_prevFieldPos[_fieldIndex]+_fieldPos]){
+                    #ifdef PRINT_DEBUG_ENCODER
+                        cout << "match" << endl;
+                    #endif
+                }
+                else{
+                    #ifdef PRINT_DEBUG_ENCODER
+                        cout << "mismatch" << endl;
+                    #endif
+                    _misCurrentStartPos = _fieldPos;
+                    break;
+                }
+            }
+
+            if(_misCurrentStartPos != -1){
+                encodeAscii();
+            }
+            else if(_fieldPos != _prevFieldSize){
+                // Every character of the current field matched, but the previous field
+                // is longer: encode an empty suffix starting at the end of the field.
+                _misCurrentStartPos = _fieldPos;
+                encodeAscii();
+            }
+            else{
+                _lastMatchFieldIndex = _fieldIndex;
+            }
+
+        }
+
+    }
+
+    // If the last field matched, signal the decoder to append the trailing matching
+    // field(s) of the previous header; the header size tells it where to stop.
+    if(_currentFieldCount > 0 && _lastMatchFieldIndex == _fieldIndex-1){
+        _rangeEncoder.encode(_typeModel[_misIndex], HEADER_END_MATCH);
+        _rangeEncoder.encode(_headerSizeModel, _currentHeader.size());
+    }
+    else{
+        _rangeEncoder.encode(_typeModel[_misIndex], HEADER_END);
+    }
+
+}
+
+
+// Encodes the numeric-like field at _fieldIndex.
+//
+// - FIELD_ZERO_ONLY: type, field index and zero count are emitted; nothing else.
+// - FIELD_ZERO_AND_NUMERIC: type, field index and zero count are emitted first (using
+//   one model column), then the value is emitted as below (using the next column).
+// - The value itself is emitted either verbatim (FIELD_NUMERIC) or as a delta against
+//   the previous header's value for the same field (FIELD_DELTA / FIELD_DELTA_2),
+//   depending on what CompressionUtils::getDeltaValue() selects.
+//
+// _misIndex is advanced once per emitted token group.
+// The debug trace no longer prints `valueByteCount`, whose declaration is commented
+// out and which prevented compilation with PRINT_DEBUG_ENCODER defined.
+void HeaderEncoder::encodeNumeric(){
+    u_int64_t zeroCount = _currentFieldZeroValues[_fieldIndex];
+    u_int64_t fieldValue = _currentFieldValues[_fieldIndex];
+
+    HeaderType currentFieldType = _currentFieldTypes[_fieldIndex];
+
+    if(currentFieldType == FIELD_ZERO_ONLY){
+        #ifdef PRINT_DEBUG_ENCODER
+            cout << "\t\t\tField with zero only" << endl;
+            cout << "\t\t\tEnconding zero count: " << zeroCount << endl;
+        #endif
+        _rangeEncoder.encode(_typeModel[_misIndex], FIELD_ZERO_ONLY);
+        _rangeEncoder.encode(_fieldIndexModel[_misIndex], _fieldIndex);
+        _rangeEncoder.encode(_zeroModel[_misIndex], zeroCount);
+        _misIndex += 1;
+        return;
+    }
+    else if(currentFieldType == FIELD_ZERO_AND_NUMERIC){
+        #ifdef PRINT_DEBUG_ENCODER
+            cout << "\t\t\tField with zero and numeric" << endl;
+            cout << "\t\t\tEnconding zero count: " << zeroCount << endl;
+        #endif
+        _rangeEncoder.encode(_typeModel[_misIndex], FIELD_ZERO_AND_NUMERIC);
+        _rangeEncoder.encode(_fieldIndexModel[_misIndex], _fieldIndex);
+        _rangeEncoder.encode(_zeroModel[_misIndex], zeroCount);
+        _misIndex += 1;
+        // Fall through to the value encoding below.
+    }
+
+    u_int64_t value = fieldValue;
+    u_int64_t prevValue = _prevFieldValues[_fieldIndex];
+
+    #ifdef PRINT_DEBUG_ENCODER
+        cout << "\t\t\tPrev value: " << prevValue << endl;
+        cout << "\t\t\tField value: " << value << endl;
+    #endif
+
+    u_int64_t deltaValue;
+    int deltaType = CompressionUtils::getDeltaValue(value, prevValue, &deltaValue);
+
+    if(deltaType == 0){
+        _rangeEncoder.encode(_typeModel[_misIndex], FIELD_NUMERIC);
+    }
+    else if(deltaType == 1){
+        _rangeEncoder.encode(_typeModel[_misIndex], FIELD_DELTA);
+        value = deltaValue;
+    }
+    else if(deltaType == 2){
+        _rangeEncoder.encode(_typeModel[_misIndex], FIELD_DELTA_2);
+        value = deltaValue;
+    }
+
+    _rangeEncoder.encode(_fieldIndexModel[_misIndex], _fieldIndex);
+    CompressionUtils::encodeNumeric(_rangeEncoder, _numericModels[_misIndex], value);
+
+    _misIndex += 1;
+}
+
+// Encodes the tail of the ASCII field at _fieldIndex, from column _misCurrentStartPos
+// to the end of the field: token type, field index, start column, tail length, then
+// the tail characters themselves. Uses (and then advances) model column _misIndex.
+void HeaderEncoder::encodeAscii(){
+    const int tailLength = _currentFieldSize - _misCurrentStartPos;
+
+    _rangeEncoder.encode(_typeModel[_misIndex], FIELD_ASCII);
+    _rangeEncoder.encode(_fieldIndexModel[_misIndex], _fieldIndex);
+    _rangeEncoder.encode(_fieldColumnModel[_misIndex], _misCurrentStartPos);
+    _rangeEncoder.encode(_misSizeModel[_misIndex], tailLength);
+
+    #ifdef PRINT_DEBUG_ENCODER
+        cout << "\t\t\t<Mismatch> " << " Type: " << "ASCII" << " Field: " << _fieldIndex << " Column: " << _misCurrentStartPos << " Size: " << tailLength << endl;
+    #endif
+
+    // Absolute position in the header of the first character to emit.
+    const unsigned int firstChar = _currentFieldPos[_fieldIndex] + _misCurrentStartPos;
+
+    for(int k=0; k<tailLength; k++){
+        #ifdef PRINT_DEBUG_ENCODER
+            cout << "\t\t\tEncoding: " << _currentHeader[firstChar+k] << endl;
+        #endif
+        _rangeEncoder.encode(_asciiModel[_misIndex], _currentHeader[firstChar+k]);
+    }
+
+    _misIndex += 1;
+}
+
+
+
+
+
+
+
+
+
+
+
+//====================================================================================
+// ** HeaderDecoder
+//====================================================================================
+// Creates a decoder reading from storage group `group`. No stream is opened here;
+// setup() does that per block. `inputFilename` is not used by this constructor.
+HeaderDecoder::HeaderDecoder(Leon* leon,std::string & inputFilename, tools::storage::impl::Group * group)
+: AbstractHeaderCoder(leon)
+, _finished(false)
+, _group(group)
+, _inputStream(0)
+{
+}
+
+// Releases the stream opened by the last setup() call, if any.
+HeaderDecoder::~HeaderDecoder(){
+    // Deleting a null pointer is a no-op, so no explicit test is needed.
+    delete _inputStream;
+}
+
+// Prepares the decoder for block `blockID`: models are reset, the dataset
+// "header_<blockID>" of the storage group is opened as input stream of the range
+// decoder, and the per-block counters are initialised.
+//
+// blockStartPos: stored in _blockStartPos (only used by the debug trace here).
+// blockSize:     ignored; the size is read from the dataset's "size" property instead.
+// sequenceCount: number of headers execute() has to decode.
+// blockID:       selects the dataset to read.
+//
+// Changes: the stream pointer is cleared right after the old stream is deleted, so it
+// cannot be deleted a second time by the destructor if opening the new stream throws;
+// the size is parsed with std::stoull, matching the 64-bit unsigned _blockSize.
+void HeaderDecoder::setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount,int blockID){
+    startBlock();
+    _rangeDecoder.clear();
+
+    delete _inputStream;
+    _inputStream = 0;
+
+    std::string datasetname = Stringify::format ("header_%i",blockID);
+
+    _inputStream = new tools::storage::impl::Storage::istream (*_group, datasetname);
+
+    auto _tempcollec = & _group->getCollection<math::NativeInt8> (datasetname);
+    std::string dsize = _tempcollec->getProperty ("size");
+
+    _blockSize = std::stoull(dsize);
+
+    _rangeDecoder.setInputFile(_inputStream);
+
+    _blockStartPos = blockStartPos;
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t-----------------------" << endl;
+        cout << "\tDecoding block " << _blockStartPos << " - " << _blockStartPos+_blockSize << endl;
+    #endif
+
+    _currentHeader.clear();
+    _misIndex = 0;
+
+    _sequenceCount = sequenceCount;
+}
+
+// Decodes _sequenceCount headers from the range decoder. Each header is a sequence of
+// tokens read with the model of column _misIndex:
+//   HEADER_END        - header complete.
+//   HEADER_END_MATCH  - header complete after copying trailing fields from the previous
+//                       header until the decoded size reaches the transmitted size.
+//   FIELD_*           - one differing field; fields skipped since the last token are
+//                       first copied from the previous header (decodeMatch()).
+// FIELD_ZERO_AND_NUMERIC only carries the leading zeros; the value follows in the next
+// token, so the field index is not advanced for it.
+// Sets _finished when done.
+// NOTE(review): _typeModel[_misIndex] is indexed without a bounds check; in the code
+// shown, columns are only added by makeField() - confirm the index cannot exceed them.
+void HeaderDecoder::execute(){
+
+    while(_processedSequenceCount < _sequenceCount){
+
+        const u_int8_t type = _rangeDecoder.nextByte(_typeModel[_misIndex]);
+        #ifdef PRINT_DEBUG_DECODER
+            cout << "\t\tNext type is: " << (int)type << endl;
+        #endif
+
+        if(type == HEADER_END){
+            endHeader();
+            continue;
+        }
+
+        if(type == HEADER_END_MATCH){
+            u_int8_t headerSize = _rangeDecoder.nextByte(_headerSizeModel);
+
+            while(_fieldIndex < _prevFieldCount){
+                #ifdef PRINT_DEBUG_DECODER
+                    cout << "\t\t\tAdding from prev header: " << _prevHeader.substr(_prevFieldPos[_fieldIndex], _prevFieldPos[_fieldIndex+1]-_prevFieldPos[_fieldIndex]) << endl;
+                #endif
+                _currentHeader += _prevHeader.substr(_prevFieldPos[_fieldIndex], _prevFieldPos[_fieldIndex+1]-_prevFieldPos[_fieldIndex]);
+                // The size test comes after the append and leaves _fieldIndex untouched.
+                if(_currentHeader.size() >= headerSize) break;
+                _fieldIndex++;
+            }
+
+            endHeader();
+            continue;
+        }
+
+        // Any other token: first copy the fields that matched the previous header.
+        decodeMatch();
+
+        bool fieldComplete = true;
+        switch(type){
+            case FIELD_ASCII:
+                decodeAscii();
+                break;
+            case FIELD_NUMERIC:
+                decodeNumeric();
+                break;
+            case FIELD_DELTA:
+                decodeDelta();
+                break;
+            case FIELD_DELTA_2:
+                decodeDelta2();
+                break;
+            case FIELD_ZERO_ONLY:
+                decodeZero();
+                break;
+            case FIELD_ZERO_AND_NUMERIC:
+                decodeZero();
+                fieldComplete = false; // the numeric part arrives with the next token
+                break;
+            default:
+                // Unknown token value: nothing more is decoded for it.
+                continue;
+        }
+
+        if(fieldComplete){
+            _fieldIndex += 1;
+        }
+        _misIndex += 1;
+    }
+
+    _finished = true;
+}
+
+
+// Reads the index of the next differing field and copies every field between the
+// current position and that index from the previous header into the current one.
+void HeaderDecoder::decodeMatch(){
+    const u_int8_t targetField = _rangeDecoder.nextByte(_fieldIndexModel[_misIndex]);
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tMatch to field: " << (int)targetField << endl;
+    #endif
+
+    while(_fieldIndex < targetField){
+        const unsigned int fieldBegin = _prevFieldPos[_fieldIndex];
+        const unsigned int fieldLength = _prevFieldPos[_fieldIndex+1] - fieldBegin;
+        #ifdef PRINT_DEBUG_DECODER
+            cout << "\t\t\tAdding from prev header: " << _prevHeader.substr(fieldBegin, fieldLength) << endl;
+        #endif
+        _currentHeader += _prevHeader.substr(fieldBegin, fieldLength);
+        _fieldIndex++;
+    }
+}
+
+// Decodes one ASCII field: the first `column` characters are copied from the same field
+// of the previous header (when that field exists), then `size` characters are read
+// from the range decoder.
+void HeaderDecoder::decodeAscii(){
+    const u_int8_t sharedPrefix = _rangeDecoder.nextByte(_fieldColumnModel[_misIndex]);
+    const u_int8_t tailLength = _rangeDecoder.nextByte(_misSizeModel[_misIndex]);
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tDecoding Type: ASCII     Column: " << (int)sharedPrefix << "    Size: " << (int)tailLength << endl;
+    #endif
+
+    // Shared prefix, only available if the previous header has this field.
+    if(_fieldIndex < _prevFieldCount){
+        const unsigned int prevFieldBegin = _prevFieldPos[_fieldIndex];
+        for(int k=0; k<sharedPrefix; k++){
+            _currentHeader.push_back(_prevHeader[prevFieldBegin+k]);
+        }
+    }
+
+    // Explicitly transmitted tail.
+    for(int k=0; k<tailLength; k++){
+        const u_int8_t symbol = _rangeDecoder.nextByte(_asciiModel[_misIndex]);
+
+        #ifdef PRINT_DEBUG_DECODER
+            cout << "\t\t\tAdding: " << symbol << " (" << (int)symbol << ")"<< endl;
+        #endif
+        _currentHeader.push_back(symbol);
+    }
+
+}
+
+// Decodes a numeric field transmitted verbatim and appends its decimal text to the
+// current header.
+// The value is cast to unsigned long long so that it matches the "%llu" conversion
+// on every platform (u_int64_t is not necessarily unsigned long long).
+void HeaderDecoder::decodeNumeric(){
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tDecoding Type: NUMERIC" << endl;
+    #endif
+
+    u_int64_t value = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModels[_misIndex]);
+
+    char temp[200];
+    snprintf(temp, sizeof(temp), "%llu", (unsigned long long)value);
+    _currentHeader += string(temp);
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\t\tAdding: " << string(temp) << endl;
+    #endif
+}
+
+// Decodes a numeric field transmitted as a delta of kind 1: the decoded number is
+// combined with the previous header's value for the same field through
+// CompressionUtils::getValueFromDelta(1, ...), and the result is appended as decimal text.
+// The value is cast to unsigned long long so that it matches the "%llu" conversion
+// on every platform (u_int64_t is not necessarily unsigned long long).
+void HeaderDecoder::decodeDelta(){
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tDecoding Type: DELTA" << endl;
+    #endif
+
+    u_int64_t value = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModels[_misIndex]);
+
+    value = CompressionUtils::getValueFromDelta(1, _prevFieldValues[_fieldIndex], value);
+
+    char temp[200];
+    snprintf(temp, sizeof(temp), "%llu", (unsigned long long)value);
+    _currentHeader += string(temp);
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\t\tAdding: " << string(temp) << endl;
+    #endif
+}
+
+// Decodes a numeric field transmitted as a delta of kind 2: the decoded number is
+// combined with the previous header's value for the same field through
+// CompressionUtils::getValueFromDelta(2, ...), and the result is appended as decimal text.
+// The value is cast to unsigned long long so that it matches the "%llu" conversion
+// on every platform (u_int64_t is not necessarily unsigned long long).
+void HeaderDecoder::decodeDelta2(){
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tDecoding Type: DELTA 2" << endl;
+    #endif
+
+    u_int64_t value = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModels[_misIndex]);
+
+    value = CompressionUtils::getValueFromDelta(2, _prevFieldValues[_fieldIndex], value);
+
+    char temp[200];
+    snprintf(temp, sizeof(temp), "%llu", (unsigned long long)value);
+    _currentHeader += string(temp);
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\t\tAdding: " << string(temp) << endl;
+    #endif
+}
+
+// Reads a zero count from the range decoder and appends that many '0' characters to
+// the current header.
+void HeaderDecoder::decodeZero(){
+    const u_int8_t zeroCount = _rangeDecoder.nextByte(_zeroModel[_misIndex]);
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tDecoding Type: ZERO     Size: " << (int)zeroCount << endl;
+        for(int k=0; k<zeroCount; k++){
+            cout << "\t\t\tAdding: 0"<< endl;
+        }
+    #endif
+
+    _currentHeader.append((size_t)zeroCount, '0');
+}
+
+// Completes one decoded header: it is appended to _buffer followed by a newline, split
+// into fields so it can serve as reference for the next header, passed to the base
+// class bookkeeping, and then the working header and _misIndex are reset.
+void HeaderDecoder::endHeader(){
+    _buffer += _currentHeader;
+    _buffer += '\n';
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << _currentHeader << endl;
+    #endif
+
+    // The decoder only knows the text of the header; rebuild its field description.
+    splitHeader();
+    AbstractHeaderCoder::endHeader();
+
+    _currentHeader.clear();
+    _misIndex = 0;
+}
+
+
diff --git a/gatb-core/src/gatb/tools/compression/HeaderCoder.hpp b/gatb-core/src/gatb/tools/compression/HeaderCoder.hpp
new file mode 100644
index 0000000..231e91f
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/HeaderCoder.hpp
@@ -0,0 +1,186 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+#ifndef _HEADERCODER_HPP_
+#define _HEADERCODER_HPP_
+
+/*
+#include <string>
+#include <vector>
+#include <ctype.h>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <stdlib.h>
+#include <math.h>*/
+
+//#include "RangeCoder.hpp"
+#include <gatb/gatb_core.hpp>
+#include "Leon.hpp"
+//#include "CompressionUtils.hpp"
+
+using namespace std;
+class Leon;
+//====================================================================================
+// ** AbstractHeaderCoder
+//====================================================================================
+// State and helpers shared by HeaderEncoder and HeaderDecoder: the header is split into
+// fields, and every field is described relative to the same-index field of the
+// previous header. Probability models are kept per "column" (see addFieldColumn()).
+class AbstractHeaderCoder
+{
+ public:
+ AbstractHeaderCoder(Leon* leon);
+
+ protected:
+ // Appends one element to every per-column container below.
+ void addFieldColumn();
+
+ // Token / field types. Values start at 1; HEADER_TYPE_COUNT is the trailing sentinel
+ // used to size the type model (HEADER_TYPE_COUNT+1 symbols).
+ enum HeaderType{HEADER_END=1, HEADER_END_MATCH, FIELD_ASCII, FIELD_NUMERIC, FIELD_DELTA, FIELD_DELTA_2, FIELD_ZERO_ONLY, FIELD_ZERO_AND_NUMERIC, HEADER_TYPE_COUNT};
+ //static const int MAX_FIELD_COUNT = 200;
+
+ // Per-column probability models, indexed by _misIndex in the coders.
+ vector<Order0Model> _typeModel;
+ vector<Order0Model> _fieldIndexModel;
+ vector<Order0Model> _fieldColumnModel;
+ vector<Order0Model> _misSizeModel;
+ vector<Order0Model> _asciiModel;
+ vector< vector<Order0Model> > _numericModels;
+ vector<Order0Model> _zeroModel;
+
+ // Single model used for the header size sent with HEADER_END_MATCH.
+ Order0Model _headerSizeModel;
+
+ // Header splitting: typeOfChar() classifies a character, splitHeader() walks the
+ // header, makeField() closes one field, endHeader() turns current into previous.
+ int typeOfChar(u_int8_t c, bool* isDigit);
+ void splitHeader();
+ void makeField();
+ void endHeader();
+
+ string _prevHeader;
+ string _currentHeader;
+ // Field boundaries: field i spans [pos[i], pos[i+1]).
+ vector<unsigned int> _prevFieldPos;
+ vector<unsigned int> _currentFieldPos;
+ int _currentPos;
+ int _fieldStartPos;
+ int _prevFieldCount;
+ int _fieldIndex;
+ // Index of the model column used for the next token of the current header.
+ int _misIndex;
+
+ // Numeric value and leading-zero count of each field, and its type.
+ vector<u_int64_t> _prevFieldValues;
+ vector<u_int64_t> _currentFieldValues;
+ vector<u_int64_t> _prevFieldZeroValues;
+ vector<u_int64_t> _currentFieldZeroValues;
+ vector<HeaderType> _prevFieldTypes;
+ vector<HeaderType> _currentFieldTypes;
+
+
+ bool _isCurrentFieldNumeric;
+ int _currentFieldCount;
+
+ Leon* _leon;
+
+ // Resets all models and re-seeds the previous header from _leon->_firstHeader.
+ void startBlock();
+
+ // Number of headers handled since the last startBlock().
+ int _processedSequenceCount;
+};
+
+//====================================================================================
+// ** HeaderEncoder
+//====================================================================================
+// Functor that range-encodes sequence headers block by block. Derives privately (the
+// default for `class`) from AbstractHeaderCoder.
+class HeaderEncoder : AbstractHeaderCoder
+{
+
+ public:
+
+ HeaderEncoder(Leon* leon);
+ // Copies only the Leon pointer and starts a fresh block (see the definition).
+ HeaderEncoder(const HeaderEncoder& copy);
+ ~HeaderEncoder();
+
+ // Encodes the header of one sequence.
+ void operator()(Sequence& sequence);
+ //int getId();
+ u_int64_t _lastSequenceIndex;
+
+ private:
+
+ RangeEncoder _rangeEncoder;
+
+ // Sum of the sizes of the headers seen by this encoder; added to Leon's total in
+ // the destructor.
+ u_int64_t _totalHeaderSize;
+
+ int _fieldPos;
+ //int _misPrevStartPos, _misCurrentStartPos;
+ // Column of the first differing character in the current ASCII field, -1 if none.
+ int _misCurrentStartPos;
+ //int _encoderFieldIndex;
+ int _prevFieldSize, _currentFieldSize;
+ // Index of the most recent field found identical to the previous header's field.
+ int _lastMatchFieldIndex;
+ // Index of the last sequence seen; used to derive the block id.
+ u_int64_t _seqId;
+ int _thread_id;
+
+ //static void encodeFirstHeader();
+ void writeBlock();
+
+ void processNextHeader();
+ void compareHeader();
+ //void encode();
+ //void encodeMismatch();
+ void encodeNumeric();
+ void encodeAscii();
+
+};
+
+//====================================================================================
+// ** HeaderDecoder
+//====================================================================================
+// Decodes the headers of one block at a time: setup() selects the block, execute()
+// decodes it into _buffer. Derives privately (the default for `class`) from
+// AbstractHeaderCoder.
+// NOTE(review): the class owns _inputStream (deleted in the destructor) but declares no
+// copy constructor or assignment operator; copying an instance would share the pointer.
+class HeaderDecoder : AbstractHeaderCoder
+{
+
+ public:
+
+ HeaderDecoder(Leon* leon, std::string & inputFilename, tools::storage::impl::Group * group);
+ ~HeaderDecoder();
+
+ //void processNextByte(u_int8_t byte);
+ void setup(u_int64_t blockStartPos, u_int64_t blockSize, int sequenceCount, int blockID);
+ void execute();
+
+ // Decoded headers, one per line; _finished is set at the end of execute().
+ string _buffer;
+ bool _finished;
+
+ private:
+ tools::storage::impl::Group * _group;
+
+ RangeDecoder _rangeDecoder;
+ //ifstream* _inputFile;
+ // Stream on the dataset of the current block; replaced by each setup() call.
+ tools::storage::impl::Storage::istream *_inputStream;
+
+ //ofstream* _outputFile;
+ u_int64_t _blockStartPos;
+ u_int64_t _blockSize;
+
+ //int _prevPos;
+ // Hides AbstractHeaderCoder::endHeader(); also appends the header to _buffer.
+ void endHeader();
+ //void decodeFirstHeader();
+ void decodeMatch();
+ void decodeAscii();
+ void decodeNumeric();
+ void decodeDelta();
+ void decodeDelta2();
+ void decodeZero();
+
+
+ // Number of headers to decode in the current block.
+ int _sequenceCount;
+
+};
+
+#endif /* _HEADERCODER_HPP_ */
+
diff --git a/gatb-core/src/gatb/tools/compression/Leon.cpp b/gatb-core/src/gatb/tools/compression/Leon.cpp
new file mode 100644
index 0000000..5c5dd3c
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/Leon.cpp
@@ -0,0 +1,2434 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+
+
+#include "Leon.hpp"
+
+
+using namespace std;
+
+//#define PRINT_DEBUG
+//#define PRINT_DEBUG_DECODER
+
+
+
+//const u_int64_t ANCHOR_KMERS_HASH_SIZE = 500000000;
+const char* Leon::STR_COMPRESS = "-c"; // command-line switch tested in execute() to select compression
+const char* Leon::STR_DECOMPRESS = "-d"; // command-line switch tested in execute() to select decompression
+const char* Leon::STR_TEST_DECOMPRESSED_FILE = "-test-file";
+const char* Leon::STR_DNA_ONLY = "-seq-only"; // same effect as -noheader plus -noqual (see executeCompression)
+const char* Leon::STR_NOHEADER = "-noheader";
+const char* Leon::STR_NOQUAL = "-noqual";
+ // The two names below are not switches of the same family: a dataset label and the iterator-mode flag.
+const char* Leon::STR_DATA_INFO = "Info";
+const char* Leon::STR_INIT_ITER = "-init-iterator";
+ // nt2binTab is indexed by ASCII code: 'A'(65)->0, 'C'(67)->1, 'T'(84)->2, 'G'(71)->3, 'N'(78)->4, anything else->0.
+const int Leon::nt2binTab[128] = { // only 100 initialisers are listed; entries 100..127 are zero-initialised
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, // indices 60..69 ('C' is 67)
+ 0, 3, 0, 0, 0, 0, 0, 0, 4, 0, // indices 70..79 ('G' is 71, 'N' is 78)
+ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, // indices 80..89 ('T' is 84)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+const int Leon::bin2ntTab[5] = {'A', 'C', 'T', 'G', 'N'}; // inverse of nt2binTab for the codes 0..4
+
+
+/** Stores the raw in-memory bytes of \c data in the dataset \c datasetname of \c group.
+ * Exactly sizeof(T) bytes are written, so the stored width follows the type deduced at the call site.
+ * \param[in] group : group that receives the dataset
+ * \param[in] datasetname : name of the dataset to write
+ * \param[in] data : value whose object representation is written
+ */
+template<typename T>
+void createDataset(tools::storage::impl::Group * group, std::string datasetname, T data)
+{
+    typedef tools::storage::impl::Storage::ostream DatasetStream;
+
+    const char* rawBytes = reinterpret_cast<const char*>(&data);
+
+    DatasetStream output (*group, datasetname);
+    output.write (rawBytes, sizeof(T));
+    output.flush();
+}
+
+
+
+/** Reads sizeof(T) raw bytes from the dataset \c datasetname of \c group into \c data.
+ * Mirror of createDataset: the caller must use the same T that was used when writing.
+ * \param[in] group : group holding the dataset
+ * \param[in] datasetname : name of the dataset to read
+ * \param[out] data : object overwritten with the bytes read
+ */
+template<typename T>
+void readDataset(tools::storage::impl::Group * group, std::string datasetname, T & data)
+{
+    typedef tools::storage::impl::Storage::istream DatasetStream;
+
+    char* rawBytes = reinterpret_cast<char*>(&data);
+
+    DatasetStream input (*group, datasetname);
+    input.read (rawBytes, sizeof(T));
+}
+
+
+
+
+
+// Default constructor: zeroes the statistics, builds the "leon" option parser and creates the three mutexes (a former signature took bool compress, bool decompress).
+Leon::Leon () :
+Tool("leon"),
+_progress_decode(0),_generalModel(256),_inputBank(0),_anchorDictModel(5)
+{
+ _MCnoAternative = _MCuniqSolid = _MCuniqNoSolid = _totalDnaSize = _compressedSize = _readCount = _MCtotal = _nb_thread_living = 0;
+ _compressed_qualSize = _anchorDictSize = _MCmultipleSolid = _anchorAdressSize = _readWithoutAnchorCount = _anchorPosSize = 0;
+ _input_qualSize = _total_nb_quals_smoothed = _otherSize = _readSizeSize = _bifurcationSize = _noAnchorSize = 0;
+ _lossless = false;
+ _storageH5file = 0;
+ _bloom = 0;
+ // Start from "fasta, no sequence seen yet": min starts at INT_MAX so the first update always lowers it.
+ _isFasta = true;
+ _maxSequenceSize = 0;
+ _minSequenceSize = INT_MAX;
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+
+ //_kmerSize(27)
+ //_compress = compress;
+ //_decompress = decompress;
+
+ /** We don't want default options of Tool (or we want to put them at a specific location). */
+ setParser (new OptionsParser ("leon"));
+ // General options. The literals "-c" and "-d" have the same values as Leon::STR_COMPRESS and Leon::STR_DECOMPRESS.
+ getParser()->push_back (new OptionOneParam (STR_URI_FILE, "input file (e.g. FASTA/FASTQ for compress or .leon file for decompress)", true));
+ getParser()->push_back (new OptionNoParam ("-c", "compression", false));
+ getParser()->push_back (new OptionNoParam ("-d", "decompression", false));
+ getParser()->push_back (new OptionOneParam (STR_NB_CORES, "number of cores (default is the available number of cores)", false, "0"));
+ getParser()->push_back (new OptionOneParam (STR_VERBOSE, "verbosity level", false, "1", false),0,false);
+
+ getParser()->push_back (new OptionOneParam ("-reads", "number of reads per block (default is 50000)", false, "50000", false),0,false);
+
+ getParser()->push_back (new OptionNoParam ("-lossless", "switch to lossless compression for qualities (default is lossy. lossy has much higher compression rate, and the loss is in fact a gain. lossy is better!)", false));
+
+ // Options that only apply to compression live in their own sub-parser.
+ IOptionsParser* compressionParser = new OptionsParser ("compression");
+
+ /** We add the sorting count options, hide all of them by default, and then display only some of them. */
+ compressionParser->push_back (SortingCountAlgorithm<>::getOptionsParser(), 1, false);
+ // NOTE(review): the trailing "false" above presumably means "not visible" - confirm against IOptionsParser::push_back.
+ if (IOptionsParser* input = compressionParser->getParser (STR_URI_INPUT)) { input->setName (STR_URI_FILE); }
+ if (IOptionsParser* input = compressionParser->getParser (STR_KMER_SIZE)) { input->setVisible (true); }
+
+ compressionParser->push_back (new OptionOneParam(STR_KMER_ABUNDANCE, "abundance threshold for solid kmers (default inferred)", false));
+ compressionParser->push_back (new OptionNoParam (STR_INIT_ITER, "init iterator for ibank mode", false));
+
+ compressionParser->push_back (new OptionNoParam (Leon::STR_DNA_ONLY, "store dna seq only, header and quals are discarded, will decompress to fasta (same as -noheader -noqual)", false));
+
+ compressionParser->push_back (new OptionNoParam (Leon::STR_NOHEADER, "discard header", false));
+ compressionParser->push_back (new OptionNoParam (Leon::STR_NOQUAL, "discard quality scores", false));
+
+ IOptionsParser* decompressionParser = new OptionsParser ("decompression");
+ decompressionParser->push_back (new OptionNoParam (Leon::STR_TEST_DECOMPRESSED_FILE, "check if decompressed file is the same as original file (both files must be in the same folder)", false));
+
+ // Storage group handles stay null until executeCompression() creates them.
+ _subgroupInfoCollection = NULL;
+ _groupLeon = _subgroupQual = _subgroupInfo = _subgroupDict = _subgroupDNA = _subgroupHeader = NULL;
+
+
+ getParser()->push_back (compressionParser);
+ getParser()->push_back (decompressionParser, 0, false);
+ // One mutex per shared resource: anchor insertion, block writing, min/max read length.
+ pthread_mutex_init(&findAndInsert_mutex, NULL);
+ pthread_mutex_init(&writeblock_mutex, NULL);
+ pthread_mutex_init(&minmax_mutex, NULL);
+
+
+}
+
+/** Releases what the object still holds: the input bank, the HDF5 storage (when endCompression()
+ * has not already released it), the decode progress object, and the three mutexes created by
+ * the constructor. */
+Leon::~Leon ()
+{
+    setInputBank (0);
+
+    // deleting a null pointer is a no-op, so no explicit guard is required
+    delete _storageH5file;
+    _storageH5file = 0;
+
+    delete _progress_decode;
+    _progress_decode = 0;
+
+    // counterpart of the pthread_mutex_init calls made in the constructor
+    pthread_mutex_destroy(&findAndInsert_mutex);
+    pthread_mutex_destroy(&writeblock_mutex);
+    pthread_mutex_destroy(&minmax_mutex);
+}
+
+/** Tool entry point: reads the mode switches, then runs either compression or decompression.
+ * Exactly one of -c / -d must be given; otherwise a message is printed and nothing is done.
+ * After decompression the output file (unless in iterator mode) and the bloom filter are released.
+ */
+void Leon::execute()
+{
+    _time = clock(); //Used to calculate time taken by decompression
+
+    // wall-clock start; endCompression() takes the matching end time
+    gettimeofday(&_tim, NULL);
+    _wdebut_leon = _tim.tv_sec +(_tim.tv_usec/1000000.0);
+
+    _iterator_mode = getParser()->saw (STR_INIT_ITER) ? true : false;
+
+    // only ever switched on here; the constructor provides the "false" default
+    if(getParser()->saw ("-lossless"))
+    {
+        _lossless = true;
+    }
+
+    _compress   = getParser()->saw (Leon::STR_COMPRESS)   ? true : false;
+    _decompress = getParser()->saw (Leon::STR_DECOMPRESS) ? true : false;
+
+    // both set or both unset: the request is ambiguous
+    if(_compress == _decompress){
+        cout << "Choose one option among -c (compress) or -d (decompress)" << endl << endl;
+        return;
+    }
+
+    _nb_cores = getInput()->getInt(STR_NB_CORES);
+
+    setReadPerBlock(getInput()->getInt("-reads"));
+
+    //setup global
+    for(int i=0; i<CompressionUtils::NB_MODELS_PER_NUMERIC; i++){
+        _numericModel.push_back(Order0Model(256));
+    }
+
+    if(_compress){
+        executeCompression();
+    }
+    else{
+        executeDecompression();
+    }
+
+    if(_decompress){
+        if(! _iterator_mode)
+        {
+            delete _outputFile;
+            _outputFile = 0; // do not keep a dangling pointer if the object is reused
+        }
+        delete _bloom;
+        _bloom = 0;
+    }
+}
+
+/** Builds the bloom filter (_bloom) from the solid k-mers stored in the counting output file.
+ *
+ * When no abundance threshold was given on the command line, the cut-off and the number of solid
+ * k-mers are read from the "histogram" group of that file and the cut-off is also stored in _nks.
+ * Otherwise every k-mer of the "solid" partition is counted and no extra cut-off is applied.
+ * The filter is sized at 12 bits per expected k-mer, with a floor of 1000 bits.
+ */
+void Leon::createBloom (){
+    TIME_INFO (getTimeInfo(), "fill bloom filter");
+
+    int _auto_cutoff = 0 ;
+    u_int64_t nbs = 0 ;
+    u_int64_t nb_kmers_infile;
+
+    Storage* storage = StorageFactory(STORAGE_HDF5).load (_dskOutputFilename);
+    LOCAL (storage);
+
+    Partition<kmer_count> & solidCollection = storage->root().getGroup("dsk").getPartition<kmer_count> ("solid");
+
+    /** We get the number of solid kmers. */
+    nb_kmers_infile = solidCollection.getNbItems();
+
+    if( ! getParser()->saw(STR_KMER_ABUNDANCE)){
+
+        //retrieve cutoff (the last item of the collection wins)
+        Collection<NativeInt64>& cutoff = storage->getGroup("histogram").getCollection<NativeInt64> ("cutoff");
+        Iterator<NativeInt64>* iter = cutoff.iterator();
+        LOCAL (iter);
+        for (iter->first(); !iter->isDone(); iter->next()) {
+            _auto_cutoff = iter->item().toInt();
+        }
+
+        //retrieve nb solids
+        Collection<NativeInt64>& storagesolid = storage->getGroup("histogram").getCollection<NativeInt64> ("nbsolidsforcutoff");
+        Iterator<NativeInt64>* iter2 = storagesolid.iterator();
+        LOCAL (iter2);
+        for (iter2->first(); !iter2->isDone(); iter2->next()) {
+            nbs = iter2->item().toInt();
+        }
+
+        _nks = _auto_cutoff;
+    }
+    else
+    {
+        _auto_cutoff =0;
+        nbs = nb_kmers_infile;
+    }
+
+    int NBITS_PER_KMER = 12;
+
+    u_int64_t estimatedBloomSize = (u_int64_t) ((double)nbs * NBITS_PER_KMER);
+    if (estimatedBloomSize ==0 ) { estimatedBloomSize = 1000; }
+
+    /** We create the kmers iterator from the solid file. */
+    Iterator<kmer_count>* itKmers = createIterator<kmer_count> (
+        solidCollection.iterator(),
+        nb_kmers_infile,
+        "fill bloom filter"
+    );
+    LOCAL (itKmers);
+
+    // nbs is 64-bit: it must be printed with a 64-bit conversion, not "%d"
+    getInfo()->add (0, "Abundance threshold");
+    if(_auto_cutoff){
+        getInfo()->add (1, "cut-off (auto)", "%d", _auto_cutoff);
+    }
+    else{
+        getInfo()->add (1, "cut-off", "%d", _nks);
+    }
+    getInfo()->add (1, "nb solid kmers", "%llu", (unsigned long long) nbs);
+
+    // the cut-off is handed to the builder (original note: drops the k-mers below the automatic cut-off)
+    BloomBuilder<> builder (estimatedBloomSize, 7,_kmerSize,tools::misc::BLOOM_NEIGHBOR,getInput()->getInt(STR_NB_CORES),_auto_cutoff);
+    _bloom = builder.build (itKmers); // BLOOM_NEIGHBOR // BLOOM_CACHE
+}
+
+
+
+void Leon::executeCompression(){
+ // Compression pipeline: read options, run k-mer counting, create the .leon HDF5 container,
+ // write the metadata, compress headers (unless discarded) and DNA, then call endCompression().
+
+
+ #ifdef PRINT_DEBUG
+ cout << "Start compression" << endl;
+ #endif
+
+ _kmerSize = getInput()->getInt (STR_KMER_SIZE);
+ _nks = getInput()->get(STR_KMER_ABUNDANCE) ? getInput()->getInt(STR_KMER_ABUNDANCE) : 3; // 3 when the option is absent
+ //_nks = getInput()->getInt (STR_KMER_ABUNDANCE);
+ _inputFilename = getInput()->getStr (STR_URI_FILE);
+
+ #ifdef PRINT_DEBUG
+ cout << "\tInput filename: " << _inputFilename << endl;
+ #endif
+
+ u_int8_t infoByte = 0; // bit 0x01: no qualities (fasta mode), bit 0x02: no header
+
+
+ /** We look for the beginning of the suffix. */
+ int lastindex = _inputFilename.find_last_of (".");
+
+ /** Everything after the last '.' is taken as the extension. */
+ string extension = _inputFilename.substr(lastindex+1);
+
+ _noHeader =false;
+
+
+ if(getParser()->saw (Leon::STR_NOHEADER))
+ {
+ _noHeader = true;
+ infoByte |= 0x02; //no header
+ }
+
+ if(getParser()->saw (Leon::STR_NOQUAL))
+ {
+ _isFasta = true;
+ infoByte |= 0x01; //fasta mode == no quals
+ }
+
+ // -seq-only is the combination of the two switches handled above.
+ if(getParser()->saw (Leon::STR_DNA_ONLY))
+ {
+ _noHeader = true;
+ _isFasta = true;
+
+ infoByte |= 0x02; //no header
+ infoByte |= 0x01; //fasta mode == no quals
+
+ }
+
+
+ //_inputBank = Bank::singleton().createBank(_inputFilename);
+ setInputBank (Bank::open(_inputFilename));
+
+ //cout << Bank::getType(_inputFilename) << endl;
+
+ // The input format is inferred from substrings of the file name, not from the file content.
+ if(_inputFilename.find(".fq") != string::npos || _inputFilename.find(".fastq") != string::npos)
+ {
+ getInfo()->add (0, "Input format: FastQ");
+
+ if(! getParser()->saw (Leon::STR_DNA_ONLY) && ! getParser()->saw (Leon::STR_NOQUAL))
+ {
+
+ if (_lossless)
+ getInfo()->add (0, "Quality compression: LOSSLESS mode");
+ else
+ getInfo()->add (0, "Quality compression: lossy mode (use '-lossless' for lossless compression)");
+
+ _isFasta = false;
+
+ }
+
+
+ }
+ // mind the order: ".fa" is also a substring of ".fastq", so the fastq test must come first
+ else if (_inputFilename.find(".fa") != string::npos || _inputFilename.find(".fasta") != string::npos) {
+ getInfo()->add (0, "Input format: FastA");
+ infoByte |= 0x01;
+ _isFasta = true;
+
+ }
+ else
+ {
+ getInfo()->add (0, "Input format: unknown. Input extension must be one among fasta (.fa, .fasta) or fastq (.fq, .fastq)");
+ return;
+ }
+
+
+
+ std::string leonversion = Stringify::format ("%i.%i.%i", LEON_VERSION_MAJOR, LEON_VERSION_MINOR,LEON_VERSION_PATCH);
+
+
+ //Redundant from dsk solid file !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ _dskOutputFilename = getInput()->get(STR_URI_OUTPUT) ?
+ getInput()->getStr(STR_URI_OUTPUT) + ".h5" :
+ System::file().getBaseName (_inputFilename) + ".h5"; //_inputFilename instead of prefix GR
+
+#if 1
+
+ /*************************************************/
+ // Sorting count part
+ /*************************************************/
+
+ {
+ /** We create a DSK instance and execute it. */
+ SortingCountAlgorithm<> sortingCount (_inputBank, getInput());
+
+ sortingCount.getInput()->add (0, STR_VERBOSE, getInput()->getStr(STR_VERBOSE));
+ // NOTE(review): the threshold below is set after sortingCount was built from getInput() - confirm the algorithm still sees it.
+ // best compression ratio if min-abundance never below 4 (ie. each kmer of the graph is used by at least 4 reads)
+ getInput()->setInt (STR_KMER_ABUNDANCE_MIN_THRESHOLD, 4);
+
+ sortingCount.execute();
+ }
+
+#endif
+
+ /*************************************************/
+ // We create the modified file
+ /*************************************************/
+
+ string dir = System::file().getDirectory(_inputFilename);
+ string prefix = System::file().getBaseName(_inputFilename);
+ //_outputFilename = dir + "/" + System::file().getBaseName(prefix) + ".leon";
+ string baseOutputname;
+ if(extension.find("gz") !=string::npos) // extension containing "gz": name the output after the base name instead of the full file name
+ {
+ baseOutputname = dir + "/" + System::file().getBaseName(_inputFilename) ;
+ }
+ else
+ {
+ baseOutputname = _inputFilename;
+ }
+ _outputFilename = baseOutputname + ".leon";
+
+// _outputFile = System::file().newFile(_outputFilename, "wb");
+
+ _storageH5file = StorageFactory(STORAGE_HDF5).create (_outputFilename, true, false,true);
+ // Heap-allocated copies of the group handles; endCompression() deletes them.
+ _groupLeon = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon"));
+ _subgroupInfo = new tools::storage::impl::Group((*_storageH5file)().getGroup ("metadata"));
+ _subgroupDict = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/anchors"));
+ _subgroupDNA = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/dna"));
+
+ if(! _isFasta)
+ _subgroupQual = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/qual"));
+
+
+ _subgroupHeader = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/header"));
+
+
+
+
+ // The "infobyte" collection also carries the file type, header flag and version as properties.
+ _subgroupInfoCollection = & _subgroupInfo->getCollection<math::NativeInt8> ("infobyte");
+
+
+
+ if(_isFasta)
+ _subgroupInfoCollection->addProperty ("type","fasta");
+ else
+ _subgroupInfoCollection->addProperty ("type","fastq");
+
+
+ if(_noHeader)
+ _subgroupInfoCollection->addProperty ("header","false");
+ else
+ _subgroupInfoCollection->addProperty ("header","true");
+
+
+ _subgroupInfoCollection->addProperty ("version",leonversion);
+
+
+ //making a block here so that ostream is immediately destroyed
+ //otherwise bug since the referred to _subgroupInfo is destroyed in endCompression
+ //(I do not want to destroy _subgroupInfo in leon destructor, otherwise compression will not be flushed until leon object is destroyed, bad behavior when compression used within code)
+ {
+ tools::storage::impl::Storage::ostream osInfo (*_subgroupInfo, "infobyte");
+ osInfo.write (reinterpret_cast<char const*>(&infoByte), sizeof(infoByte));
+ osInfo.flush();
+
+ tools::storage::impl::Storage::ostream osk (*_subgroupInfo, "kmerSize");
+ osk.write (reinterpret_cast<char const*>(&_kmerSize), sizeof(_kmerSize));
+ osk.flush();
+ }
+
+ // The branch below is intentionally empty: only commented-out legacy code remains in it.
+ if(! _isFasta)
+ {
+ //_FileQualname = baseOutputname + ".qual";
+ //_FileQual = System::file().newFile(_FileQualname, "wb");
+ //_Qual_outstream = new tools::storage::impl::Storage::ostream (*_groupLeon, "qualities");
+ }
+
+
+
+
+
+#ifdef PRINT_DEBUG
+ cout << "\tOutput filename: " << _outputFilename << endl;
+ cout << "prefix " << prefix << endl;
+ cout << "dir " << dir << endl;
+ cout << "dskout " << _dskOutputFilename << endl;
+#endif
+
+
+
+ //Compression
+ if(! _noHeader)
+ {
+ startHeaderCompression();
+ }
+
+
+
+
+ // startDnaCompression() also finalises the quality statistics for fastq input.
+ startDnaCompression();
+
+
+
+
+ endCompression();
+}
+
+
+/** Computes the compression ratio of the quality stream (compressed bytes / input bytes)
+ * from the counters accumulated by writeBlockLena() and stores it in _qualCompRate. */
+void Leon::endQualCompression()
+{
+    const double compressedBytes = (double)_compressed_qualSize;
+    _qualCompRate = (compressedBytes / _input_qualSize);
+}
+
+
+/** Compresses one block of quality data with zlib and stores it as dataset "qual_<blockID>".
+ *
+ * The deflate step runs outside the lock; only the storage write and the update of the
+ * quality counters are serialised with writeblock_mutex.
+ *
+ * \param[in] data : bytes to compress
+ * \param[in] size : number of bytes in \c data
+ * \param[in] encodedSequenceCount : not used by this function (kept for interface compatibility)
+ * \param[in] blockID : index of the block, used to build the dataset name
+ * \throw Exception when zlib cannot be initialised or does not finish the stream
+ */
+void Leon::writeBlockLena(u_int8_t* data, u_int64_t size, int encodedSequenceCount,u_int64_t blockID){
+
+    z_stream zs;
+    memset(&zs, 0, sizeof(zs));
+
+    // plain deflateInit (zlib framing); the deflateInit2/gzip variant was considered to allow gunzip from a terminal
+    if (deflateInit(&zs, Z_BEST_COMPRESSION) != Z_OK)
+        throw Exception ("deflateInit failed while compressing.");
+
+    zs.next_in = (Bytef*) data ;
+    zs.avail_in = size ; // set the z_stream's input
+
+    int ret;
+    char outbuffer[32768];
+    std::string outstring;
+
+    // retrieve the compressed bytes blockwise
+    do {
+        zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
+        zs.avail_out = sizeof(outbuffer);
+
+        ret = deflate(&zs, Z_FINISH);
+
+        if (outstring.size() < zs.total_out) {
+            // append the block to the output string
+            outstring.append(outbuffer,
+                             zs.total_out - outstring.size());
+        }
+    } while (ret == Z_OK);
+
+    deflateEnd(&zs);
+
+    // with Z_FINISH the loop must end on Z_STREAM_END; anything else means the block is incomplete
+    if (ret != Z_STREAM_END)
+        throw Exception ("deflate failed while compressing.");
+
+    pthread_mutex_lock(&writeblock_mutex);
+
+    // blockID and the sizes are 64-bit values: print them with a 64-bit conversion
+    std::string datasetname = Stringify::format ("qual_%llu", (unsigned long long) blockID);
+
+    tools::storage::impl::Storage::ostream os (*_subgroupQual, datasetname);
+    os.write (reinterpret_cast<char const*>( outstring.data()), outstring.size());
+    os.flush();
+
+    std::string dsize = Stringify::format ("%llu", (unsigned long long) outstring.size());
+    _subgroupQual->getCollection<math::NativeInt8> (datasetname).addProperty ("size",dsize);
+
+    _input_qualSize += size;
+    _compressed_qualSize += outstring.size();
+
+    pthread_mutex_unlock(&writeblock_mutex);
+}
+
+/** Stores one encoded block as dataset "header_<blockID>" or "dna_<blockID>" and records its size.
+ *
+ * The whole body runs under writeblock_mutex. Besides writing the bytes, it adds a "size"
+ * property to the dataset, increases _compressedSize and fills the two _blockSizes slots of
+ * the block: [2*blockID] = size in bytes, [2*blockID+1] = number of encoded sequences.
+ *
+ * \param[in] data : encoded bytes
+ * \param[in] size : number of bytes in \c data; nothing is done when it is 0
+ * \param[in] encodedSequenceCount : number of sequences encoded in the block
+ * \param[in] blockID : index of the block
+ * \param[in] Header : true for a header block, false for a DNA block
+ */
+void Leon::writeBlock(u_int8_t* data, u_int64_t size, int encodedSequenceCount,u_int64_t blockID, bool Header){
+    if(size == 0) return;
+
+    pthread_mutex_lock(&writeblock_mutex);
+
+    // header and DNA blocks only differ by their destination group and dataset prefix
+    tools::storage::impl::Group* targetGroup = Header ? _subgroupHeader : _subgroupDNA;
+
+    // blockID and size are 64-bit values: print them with a 64-bit conversion
+    std::string datasetname = Stringify::format ("%s_%llu", Header ? "header" : "dna", (unsigned long long) blockID);
+
+    tools::storage::impl::Storage::ostream os (*targetGroup, datasetname);
+    os.write (reinterpret_cast<char const*>( data), size);
+    os.flush();
+
+    std::string dsize = Stringify::format ("%llu", (unsigned long long) size);
+    targetGroup->getCollection<math::NativeInt8> (datasetname).addProperty ("size",dsize);
+
+    _compressedSize += size;
+
+    // blocks may arrive in any order: grow the table so that both slots of this block exist
+    if ((2*(blockID+1)) > _blockSizes.size() )
+    {
+        _blockSizes.resize(2*(blockID+1));
+    }
+
+    _blockSizes[2*blockID] = size ;
+    _blockSizes[2*blockID+1] = encodedSequenceCount;
+
+    pthread_mutex_unlock(&writeblock_mutex);
+}
+
+/** Reports the compression summary (file sizes, time, speed, rates) and releases the storage.
+ *
+ * The group handles created by executeCompression() and the HDF5 storage are deleted here
+ * rather than in the destructor, so that the output is released as soon as compression ends.
+ * All released pointers are reset so that they cannot be used or deleted again.
+ */
+void Leon::endCompression(){
+    typedef unsigned long long ull; // 64-bit sizes are printed with "%llu"
+
+    u_int64_t inputFileSize = System::file().getSize(_inputFilename.c_str());
+    getInfo()->add(0, "End compression");
+    getInfo()->add(1, "Input file");
+    getInfo()->add(2, "name", "%s", _inputFilename.c_str());
+    getInfo()->add(2, "size", "%llu bytes (%llu Mb)", (ull) inputFileSize, (ull) (inputFileSize/1024LL/1024LL));
+
+    u_int64_t outputFileSize = System::file().getSize(_outputFilename.c_str());
+
+    getInfo()->add(1, "Output file");
+    getInfo()->add(2, "name", "%s", _outputFilename.c_str());
+    getInfo()->add(2, "size", "%llu bytes (%llu Mb)", (ull) outputFileSize, (ull) (outputFileSize/1024LL/1024LL));
+
+    getInfo()->add(1, "Compression");
+    gettimeofday(&_tim, NULL);
+    _wfin_leon = _tim.tv_sec +(_tim.tv_usec/1000000.0);
+
+    getInfo()->add(2, "Time:", "%.2f seconds", ( _wfin_leon - _wdebut_leon) );
+    getInfo()->add(2, "Speed:", "%.2f Mb/seconds", (System::file().getSize(_inputFilename)/1000000.0) / ( _wfin_leon - _wdebut_leon) );
+    getInfo()->add(2, "Rates");
+    getInfo()->add(3, "overall", "%.4f (%.4f)",
+        (float)((double)outputFileSize / (double)inputFileSize),
+        (float)((double)inputFileSize / (double)outputFileSize ));
+    if(! _noHeader)
+    {
+        getInfo()->add(3, "header only", "%.4f (%.4f)",
+            (float)_headerCompRate,
+            (float) ((double)1/_headerCompRate));
+    }
+    else
+    {
+        getInfo()->add(3, "header completely discarded in '-seq-only' mode");
+    }
+    getInfo()->add(3, "Sequence only", "%.4f (%.4f)",
+        (float)_dnaCompRate,
+        (float)((double)1/_dnaCompRate));
+    if( ! _isFasta)
+    {
+        getInfo()->add(3, "Quality only", "%.4f (%.4f) [%s mode]",
+            (float)_qualCompRate,
+            (float)((double)1/_qualCompRate),
+            _lossless?"lossless":"lossy");
+    }
+
+    // _subgroupInfoCollection was obtained from _subgroupInfo: drop it before the group goes away
+    _subgroupInfoCollection = NULL;
+
+    delete _groupLeon;      _groupLeon      = NULL;
+    delete _subgroupInfo;   _subgroupInfo   = NULL;
+    delete _subgroupDict;   _subgroupDict   = NULL;
+    delete _subgroupDNA;    _subgroupDNA    = NULL;
+    if(! _isFasta)
+    {
+        // the quality group only exists for fastq input
+        delete _subgroupQual;
+        _subgroupQual = NULL;
+    }
+    delete _subgroupHeader; _subgroupHeader = NULL;
+
+    if(_storageH5file !=0)
+    {
+        delete _storageH5file;
+        _storageH5file =0;
+    }
+}
+
+
+
+
+
+
+/** Compresses the header (comment) of every sequence of the input bank.
+ *
+ * The first header, minus its first character, is stored uncompressed in the datasets
+ * "firstheadersize" / "firstheader" of the header group. The remaining work is dispatched
+ * to HeaderEncoder instances in batches of getReadPerBlock() sequences, and
+ * endHeaderCompression() writes the block table and the statistics.
+ */
+void Leon::startHeaderCompression(){
+    Iterator<Sequence>* itSeq = createIterator<Sequence> (
+        _inputBank->iterator(),
+        _inputBank->estimateNbItems(),
+        "Compressing headers"
+    );
+    LOCAL(itSeq);
+
+    _totalHeaderSize = 0;
+    _compressedSize = 0;
+
+    #ifdef PRINT_DEBUG
+    cout << endl << "Start header compression" << endl;
+    #endif
+
+    //write first header to file and store it in _firstHeader variable
+    itSeq->first();
+    _firstHeader = itSeq->item().getComment();
+    // drop the leading character; guarded because erase(begin()) on an empty string is undefined behaviour
+    if (!_firstHeader.empty())
+    {
+        _firstHeader.erase(_firstHeader.begin());
+    }
+    itSeq->reset();
+
+    #ifdef PRINT_DEBUG
+    cout << "\tFirst Header: " << _firstHeader << endl;
+    cout << "\tSize: " << _firstHeader.size() << endl;
+    #endif
+
+    _totalHeaderSize += _firstHeader.size();
+
+    // the size is stored with the type returned by size(); the reader must use the same type
+    createDataset(_subgroupHeader,"firstheadersize",_firstHeader.size());
+
+    tools::storage::impl::Storage::ostream osH (*_subgroupHeader, "firstheader");
+    osH.write (reinterpret_cast<char const*>(_firstHeader.data()), _firstHeader.size());
+    osH.flush();
+
+    //iterate on read sequences and compress headers
+    TIME_INFO (getTimeInfo(), "header compression");
+
+    #ifdef SERIAL
+    setDispatcher (new SerialDispatcher());
+    #else
+    setDispatcher ( new Dispatcher (_nb_cores) );
+    #endif
+
+    getDispatcher()->iterate (itSeq, HeaderEncoder(this), this->getReadPerBlock());
+    endHeaderCompression();
+}
+
+
+
+
+/** Finishes header compression: stores the block table of the header group, computes
+ * _headerCompRate and reports the header statistics.
+ *
+ * _blockSizes (two entries per block, filled by writeBlock()) is written to the datasets
+ * "nb_blocks" / "blocksizes" and then cleared so that DNA compression starts from an empty table.
+ */
+void Leon::endHeaderCompression(){
+    typedef unsigned long long ull; // 64-bit counters are printed with "%llu"
+
+    // the entry count is stored with the type returned by size(); the reader must use the same type
+    createDataset(_subgroupHeader,"nb_blocks",_blockSizes.size());
+
+    getInfo()->add(0, "End Header compression");
+    getInfo()->add(1, "# blocks", "%llu", (ull) _blockSizes.size());
+    tools::storage::impl::Storage::ostream os (*_subgroupHeader, "blocksizes");
+    os.write (reinterpret_cast<char const*>(_blockSizes.data()), _blockSizes.size()*sizeof(u_int64_t));
+    os.flush();
+
+    _headerCompRate = ((double)_compressedSize / _totalHeaderSize);
+
+    getInfo()->add(1, "headers size", "%llu", (ull) _totalHeaderSize);
+    getInfo()->add(1, "headers compressed size", "%llu", (ull) _compressedSize);
+    getInfo()->add(1, "compression rate", "%.4f", (float)(_headerCompRate));
+
+    _blockSizes.clear();
+}
+
+
+/** Compresses the nucleotide sequences of the input bank.
+ *
+ * Steps: build the bloom filter, allocate the anchor hash table (one entry for every ten
+ * estimated reads), open the temporary anchor dictionary file "<output>.adtemp", reset the
+ * DNA counters, dispatch DnaEncoder over the bank in batches of getReadPerBlock() sequences,
+ * then finalise with endDnaCompression() and, for fastq input, endQualCompression().
+ */
+void Leon::startDnaCompression(){
+    #ifdef PRINT_DEBUG
+    cout << endl << "Start reads compression" << endl;
+    #endif
+
+    // the bloom filter is kept in _bloom because it is needed again later
+    createBloom ();
+
+    int64_t estimatedReadCount = _inputBank->estimateNbItems();
+
+    // hash table created from a requested number of entries; the number actually created is returned but not used
+    u_int64_t createdEntries ;
+    _anchorKmers = new Hash16<kmer_type, u_int32_t > ( estimatedReadCount/10 , &createdEntries );
+
+    Iterator<Sequence>* readIt = createIterator<Sequence> (
+        _inputBank->iterator(),
+        estimatedReadCount,
+        "Compressing dna"
+    );
+    LOCAL(readIt);
+
+    // temporary output file that stores the anchors dictionary
+    // NOTE(review): no matching delete of _dictAnchorFile is visible in this part of the file - confirm it is released elsewhere
+    const string anchorTempName = _outputFilename + ".adtemp";
+    _dictAnchorFile = new ofstream(anchorTempName.c_str(), ios::out|ios::binary);
+
+    // counters restart from zero for the DNA stream
+    _compressedSize = 0;
+    _totalDnaSize = 0;
+    _anchorAdress = 0;
+    _lastAnchorValue = 0;
+
+    //iterate on read sequences and compress them
+    TIME_INFO (getTimeInfo(), "DNA compression");
+
+    #ifdef SERIAL
+    setDispatcher (new SerialDispatcher());
+    #else
+    setDispatcher ( new Dispatcher (_nb_cores) );
+    #endif
+
+    getDispatcher()->iterate (readIt, DnaEncoder(this), this->getReadPerBlock());
+
+    endDnaCompression();
+
+    if(_isFasta)
+    {
+        return;
+    }
+    endQualCompression();
+}
+
+
+/** Finishes DNA compression: stores the block table of the DNA group, writes the bloom filter
+ * and the anchor dictionary, stores the global read statistics in the metadata group and
+ * reports the detailed compression rates.
+ *
+ * The anchor hash table and the k-mer counting file are released at the end.
+ */
+void Leon::endDnaCompression(){
+    typedef unsigned long long ull; // 64-bit counters are printed with "%llu"
+
+    // the entry count is stored with the type returned by size(); the reader must use the same type
+    createDataset(_subgroupDNA,"nb_blocks",_blockSizes.size());
+
+    getInfo()->add(0, "End Sequence compression");
+    getInfo()->add(1, "# blocks", "%llu", (ull) _blockSizes.size());
+    tools::storage::impl::Storage::ostream os (*_subgroupDNA, "blocksizes");
+    os.write (reinterpret_cast<char const*>(_blockSizes.data()), _blockSizes.size()*sizeof(u_int64_t));
+    os.flush();
+
+    _blockSizes.clear();
+
+    // both calls add their size to _compressedSize, so they must come before the rate is computed
+    writeBloom();
+    writeAnchorDict();
+
+    _dnaCompRate = ((double)_compressedSize / _totalDnaSize);
+
+    getInfo()->add(1, "# sequences", "%llu", (ull) _readCount);
+    getInfo()->add(1, "# nucleotides", "%llu", (ull) _totalDnaSize);
+
+    createDataset(_subgroupInfo,"readcount",_readCount);
+    createDataset(_subgroupInfo,"totalDnaSize",_totalDnaSize);
+    createDataset(_subgroupInfo,"minSequenceSize",_minSequenceSize);
+    createDataset(_subgroupInfo,"maxSequenceSize",_maxSequenceSize);
+
+    getInfo()->add(1, "Compression rates");
+    getInfo()->add(2, "overall", "%.4f (%llu)", (float)_dnaCompRate, (ull) _compressedSize);
+
+    // each line below: share of the compressed size in percent, then the absolute size
+    getInfo()->add(2, "Bloom", "%.2f (%llu)", ((_bloom->getSize()*100) / (double)_compressedSize), (ull) _bloom->getSize());
+    getInfo()->add(2, "Anchors dict", "%.2f (%llu) (%llu entries)", ((_anchorDictSize*100) / (double)_compressedSize), (ull) _anchorDictSize, (ull) _anchorAdress);
+
+    u_int64_t readsSize = _anchorAdressSize+_anchorPosSize+_readSizeSize+_bifurcationSize+_otherSize;
+    getInfo()->add(2, "Reads", "%.2f (%llu)", ((readsSize*100) / (double)_compressedSize), (ull) readsSize);
+    getInfo()->add(3, "Anchor adress", "%.2f (%llu)", ((_anchorAdressSize*100) / (double)_compressedSize), (ull) _anchorAdressSize);
+    getInfo()->add(3, "Anchor pos", "%.2f (%llu)", ((_anchorPosSize*100) / (double)_compressedSize), (ull) _anchorPosSize);
+    getInfo()->add(3, "Read size", "%.2f (%llu)", ((_readSizeSize*100) / (double)_compressedSize), (ull) _readSizeSize);
+    getInfo()->add(3, "Bifurcation", "%.2f (%llu)", ((_bifurcationSize*100) / (double)_compressedSize), (ull) _bifurcationSize);
+    getInfo()->add(3, "Other (N, error, infoBits)", "%.2f (%llu)", ((_otherSize*100) / (double)_compressedSize), (ull) _otherSize);
+    getInfo()->add(2, "Read without anchor", "%.2f (%llu)", ((_noAnchorSize*100) / (double)_compressedSize), (ull) _noAnchorSize);
+
+    if(_anchorAdress!=0){
+        getInfo()->add(1, "Reads per anchor", "%llu", (ull) (_readCount / _anchorAdress));
+    }
+    getInfo()->add(1, "Read without anchor", "%.2f", ((double)_readWithoutAnchorCount*100) / _readCount);
+    getInfo()->add(1, "De Bruijn graph");
+
+    getInfo()->add(2, "Simple path", "%.2f", ((_MCuniqSolid*100)/ (double)_MCtotal));
+    getInfo()->add(2, "Bifurcation", "%.2f", ((_MCmultipleSolid*100)/(double)_MCtotal));
+    getInfo()->add(2, "Break", "%.2f", ((_MCnoAternative*100)/(double)_MCtotal));
+    getInfo()->add(2, "Error", "%.2f", ((_MCuniqNoSolid*100)/(double)_MCtotal));
+
+    delete _anchorKmers;
+    _anchorKmers = 0; // do not keep a dangling pointer if the object is reused
+    System::file().remove(_dskOutputFilename);
+}
+
+/** Adds the bloom filter size to _compressedSize and saves the filter as "bloom"
+ * in the storage group named after the tool. */
+void Leon::writeBloom()
+{
+    _compressedSize = _compressedSize + _bloom->getSize();
+
+    StorageTools::singleton().saveBloom<kmer_type> (
+        _storageH5file->getGroup(getName()),
+        "bloom",
+        _bloom,
+        _kmerSize
+    );
+}
+
+
+
+// Finalises the anchor dictionary:
+//  1. flushes the anchor range encoder and writes its buffer to the temporary
+//     dictionary file (_dictAnchorFile), which is then closed;
+//  2. records the size of "<output>.adtemp" in _anchorDictSize, adds it to
+//     _compressedSize and stores it, with the anchor count, as datasets of
+//     _subgroupDict;
+//  3. copies the temporary file into the "anchorsDict" stream of _subgroupDict
+//     and removes the temporary file.
+void Leon::writeAnchorDict(){
+
+    // The temporary dictionary file name is needed three times below.
+    const string tempFilename = _outputFilename + ".adtemp";
+
+    _anchorRangeEncoder.flush();
+
+    //todo check if the tempfile _dictAnchorFile may be avoided (with the use of hdf5 ?)
+    _dictAnchorFile->write( (const char*) _anchorRangeEncoder.getBuffer(), _anchorRangeEncoder.getBufferSize());
+    _dictAnchorFile->flush();
+    _dictAnchorFile->close();
+    _anchorRangeEncoder.clear();
+
+    u_int64_t size = System::file().getSize(tempFilename);
+    _anchorDictSize = size;
+    _compressedSize += size;
+
+    createDataset(_subgroupDict,"size",size);
+    createDataset(_subgroupDict,"anchorAdress",_anchorAdress);
+
+    ifstream tempFile(tempFilename.c_str(), ios::in|ios::binary);
+
+    // Copy buffer owned by a vector so it is released even if a write throws
+    // (the previous new[]/delete[] pair leaked in that case).
+    vector<char> buffer(4096*8);
+
+    tools::storage::impl::Storage::ostream osD (*_subgroupDict, "anchorsDict");
+
+    while (tempFile.good()) {
+        tempFile.read(buffer.data(), buffer.size());
+        // gcount() is the number of bytes actually read (short on the last chunk)
+        osD.write (reinterpret_cast<char const*>(buffer.data()), tempFile.gcount());
+    }
+
+    osD.flush();
+
+    tempFile.close();
+    remove(tempFilename.c_str());
+}
+
+// Queries the anchor table (_anchorKmers) for kmer and returns whether the
+// lookup succeeded. anchorAdress is forwarded to the table's get(); presumably
+// it receives the anchor's address on a hit (confirm against the table type).
+bool Leon::anchorExist(const kmer_type& kmer, u_int32_t* anchorAdress){
+    const bool found = _anchorKmers->get(kmer,anchorAdress) ? true : false; //with Hash16
+    return found;
+}
+
+
+
+// Merges one [newMin, newMax] observation into _minSequenceSize and
+// _maxSequenceSize. The update is performed while holding minmax_mutex.
+void Leon::updateMinMaxSequenceSize(int newMin, int newMax)
+{
+    pthread_mutex_lock(&minmax_mutex);
+
+    if(newMin < _minSequenceSize) { _minSequenceSize = newMin; }
+    if(_maxSequenceSize < newMax) { _maxSequenceSize = newMax; }
+
+    pthread_mutex_unlock(&minmax_mutex);
+}
+
+// Chooses an anchor k-mer among kmers and registers it in the anchor dictionary.
+//
+// A k-mer qualifies when its canonical form (the minimum of the k-mer and its
+// reverse complement) is present in the Bloom filter. Positions are probed in
+// this order, and the first hit wins:
+//   1. a window of at most 10 positions starting at the middle of the vector,
+//   2. the positions before that window,
+//   3. the positions after that window.
+//
+// On success the canonical k-mer is encoded into the dictionary stream and
+// inserted in _anchorKmers with the current _anchorAdress, which is written to
+// *anchorAdress and then incremented; the position of the chosen k-mer is
+// returned. Returns -1, leaving *anchorAdress untouched, when no k-mer qualifies.
+// The whole operation runs while holding findAndInsert_mutex.
+int Leon::findAndInsertAnchor(const vector<kmer_type>& kmers, u_int32_t* anchorAdress){
+
+    pthread_mutex_lock(&findAndInsert_mutex);
+
+    int bestPos = -1;
+    kmer_type bestKmer;
+
+    // Central window [windowBegin, windowEnd), clamped to the vector bounds.
+    const int kmerCount = (int) kmers.size();
+    int windowBegin = kmers.size()/2;
+    int windowEnd = kmers.size()/2 + 10;
+    windowBegin = std::max(windowBegin, 0);
+    windowEnd = std::min(windowEnd, kmerCount);
+
+    // Probes positions [from, to) in increasing order; records the first
+    // position whose canonical k-mer is in the Bloom filter.
+    auto probe = [&](int from, int to) -> bool {
+        for(int pos = from; pos < to; pos++){
+            kmer_type kmer = kmers[pos];
+            kmer_type kmerMin = min(kmer, revcomp(kmer, _kmerSize));
+
+            if(_bloom->contains(kmerMin)){
+                bestPos = pos;
+                bestKmer = kmerMin;
+                return true;
+            }
+        }
+        return false;
+    };
+
+    const bool found = probe(windowBegin, windowEnd)
+                    || probe(0, windowBegin)
+                    || probe(windowEnd, kmerCount);
+
+    if(!found)
+    {
+        pthread_mutex_unlock(&findAndInsert_mutex);
+        return -1;
+    }
+
+    encodeInsertedAnchor(bestKmer);
+
+    _anchorKmers->insert(bestKmer,_anchorAdress); //with Hash16
+
+    *anchorAdress = _anchorAdress;
+    _anchorAdress += 1;
+
+    pthread_mutex_unlock(&findAndInsert_mutex);
+    return bestPos;
+}
+
+// Range-encodes every nucleotide of kmer (rendered as a string of _kmerSize
+// characters) with the anchor dictionary model. Once the encoder buffer holds
+// at least 4096 bytes it is written to the temporary dictionary file and the
+// buffer is cleared.
+void Leon::encodeInsertedAnchor(const kmer_type& kmer){
+
+    string nucleotides = kmer.toString(_kmerSize);
+
+    for(string::size_type pos = 0; pos < nucleotides.size(); ++pos){
+        _anchorRangeEncoder.encode(_anchorDictModel, Leon::nt2bin(nucleotides[pos]));
+    }
+
+    if(_anchorRangeEncoder.getBufferSize() < 4096){
+        return;
+    }
+
+    _dictAnchorFile->write((const char*) _anchorRangeEncoder.getBuffer(), _anchorRangeEncoder.getBufferSize());
+    _anchorRangeEncoder.clearBuffer();
+}
+
+
+
+
+
+
+
+
+
+
+
+// pthread entry point decoding one block of every stream.
+// args points to a thread_arg_decoder. The quality and header decoders are
+// optional (NULL when absent); the DNA decoder is always run, and runs last.
+void * decoder_all_thread(void * args)
+{
+    thread_arg_decoder * targ = static_cast<thread_arg_decoder*>(args);
+
+    if(targ->qual_decoder != NULL) {
+        targ->qual_decoder->execute();
+    }
+
+    if(targ->header_decoder != NULL) {
+        targ->header_decoder->execute();
+    }
+
+    targ->dna_decoder->execute();
+
+    pthread_exit(0);
+}
+
+
+
+// pthread entry point: args is a DnaDecoder whose execute() is run.
+void * decoder_dna_thread(void * args)
+{
+    static_cast<DnaDecoder*>(args)->execute();
+    pthread_exit(0);
+}
+
+// pthread entry point: args is a QualDecoder whose execute() is run.
+void * decoder_qual_thread(void * args)
+{
+    static_cast<QualDecoder*>(args)->execute();
+    pthread_exit(0);
+}
+
+// Opens a Leon archive and prepares (or, outside iterator mode, performs) its
+// decompression.
+//
+// Steps:
+//  - checks that the input file exists (throws Exception otherwise) and opens
+//    it as HDF5 storage without appending an extension;
+//  - binds the "leon", "metadata", "leon/anchors", "leon/dna", "leon/header"
+//    and (FASTQ only) "leon/qual" groups;
+//  - reads the "type" and "header" properties to set _isFasta / _noHeader;
+//  - derives the output name: input directory + input base name with two
+//    extensions stripped + ".fasta.d" or ".fastq.d";
+//  - reads the k-mer size and the Leon version stored in the archive.
+//
+// In iterator mode nothing is reported through getInfo(), no output file is
+// created and the streams are not decoded here.
+void Leon::executeDecompression(){
+
+    _filePos = 0;
+
+    #ifdef PRINT_DEBUG
+    if(!_iterator_mode){
+        cout << "Start decompression" << endl;
+    }
+    #endif
+
+    _inputFilename = getInput()->getStr(STR_URI_FILE);
+
+    #ifdef PRINT_DEBUG
+    if(!_iterator_mode){
+        cout << "\tInput filename: " << _inputFilename << endl;
+    }
+    #endif
+
+    if (!System::file().doesExist(_inputFilename)){
+        std::stringstream ss;
+        ss << "File not found: " << _inputFilename;
+        throw Exception (ss.str().c_str());
+    }
+
+    //open without adding extension h5
+    _storageH5file = StorageFactory(STORAGE_HDF5).create (_inputFilename,false,false,true);
+
+    _groupLeon    = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon"));
+    _subgroupInfo = new tools::storage::impl::Group((*_storageH5file)().getGroup ("metadata"));
+    _subgroupDict = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/anchors"));
+    _subgroupDNA  = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/dna"));
+
+    _subgroupInfoCollection = & _subgroupInfo->getCollection<math::NativeInt8> ("infobyte");
+
+    tools::storage::impl::Storage::istream isInfo (*_subgroupInfo, "infobyte");
+
+    // Output file name: the base name is taken twice so that exactly two
+    // extensions (e.g. ".fastq.leon") are removed, keeping any other dot that
+    // is part of the file name.
+    string outputDir = System::file().getDirectory(_inputFilename);
+    string withoutLeonExtension = System::file().getBaseName(_inputFilename);
+    string outputPrefix = System::file().getBaseName(withoutLeonExtension);
+
+    _outputFilename = outputDir + "/" + outputPrefix;
+
+    // The info byte is still read from its stream, but format and header flags
+    // now come from the collection properties below.
+    u_int8_t infoByte;
+    isInfo.read (reinterpret_cast<char*> (&infoByte),sizeof(infoByte));
+
+    std::string filetype = _subgroupInfoCollection->getProperty("type");
+    _isFasta = (filetype == "fasta");
+
+    std::string headerinfo = _subgroupInfoCollection->getProperty("header");
+    _noHeader = (headerinfo != "true");
+
+    if(! _isFasta){
+        _subgroupQual = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/qual"));
+    }
+
+    _subgroupHeader = new tools::storage::impl::Group((*_storageH5file)().getGroup ("leon/header"));
+
+    const bool reportInfo = !_iterator_mode;
+
+    if(reportInfo){
+        getInfo()->add(0, "Decompression");
+        getInfo()->add(1, "Input filename", "%s", _inputFilename.c_str());
+
+        if(_noHeader){
+            getInfo()->add(1, "Headers were not stored, will number reads.");
+        }
+
+        if(_isFasta){
+            getInfo()->add(1, "Output format", "FastA");
+        }
+        else{
+            getInfo()->add(1, "Output format", "FastQ");
+        }
+    }
+
+    if(_isFasta){
+        _outputFilename += ".fasta.d";
+    }
+    else{
+        _outputFilename += ".fastq.d";
+    }
+
+    if(reportInfo){
+        _outputFile = System::file().newFile(_outputFilename, "wb");
+    }
+
+    //Get kmer size
+    tools::storage::impl::Storage::istream isk (*_subgroupInfo, "kmerSize");
+    isk.read (reinterpret_cast<char*> (&_kmerSize),sizeof(_kmerSize));
+
+    if(reportInfo){
+        getInfo()->add(1, "Kmer size", "%i", _kmerSize);
+    }
+
+    std::string leonversion = _subgroupInfoCollection->getProperty ("version");
+
+    if(reportInfo){
+        getInfo()->add(1, "Input File compressed with Leon", "%s", leonversion.c_str());
+    }
+
+    if(reportInfo){
+        startDecompressionAllStreams();
+
+        endDecompression();
+    }
+}
+
+// Debug helper: walks a LeonIterator over this archive and prints, for each
+// sequence, its comment, its data as a string and its quality string to
+// standard output.
+void Leon::testing_iter(){
+
+    tools::dp::Iterator<Sequence>* iterl = new LeonIterator(*this);
+
+    for(iterl->first();!iterl->isDone();iterl->next())
+    {
+        std::cout << (*iterl)->getComment() << std::endl;
+        std::cout << (*iterl)->toString() << std::endl;
+        std::cout << (*iterl)->getQuality() << std::endl;
+    }
+
+    // The iterator was previously never released. Deleting it also runs
+    // ~LeonIterator, which calls decoders_cleanup() on this Leon instance.
+    delete iterl;
+}
+
+// Loads everything the block decoders need before decoding starts:
+//  - (when headers are stored) the first header and the header block sizes;
+//  - the initial DNA file position, i.e. the sum of the header block sizes
+//    (even entries of _headerBlockSizes);
+//  - the DNA block sizes and the block count;
+//  - the k-mer model, the Bloom filter and the anchor dictionary.
+// File positions for headers and (FASTQ only) qualities are reset to 0.
+void Leon::startDecompression_setup(){
+
+    _filePosHeader = 0;
+    _filePosDna = 0;
+
+    if(! _noHeader)
+    {
+        ///////// header setup /////////
+
+        size_t firstHeaderSize;
+        readDataset(_subgroupHeader,"firstheadersize",firstHeaderSize);
+
+        // Zero-filled, vector-owned buffer with one extra byte for the
+        // terminator. This replaces an unchecked malloc/free pair: nothing is
+        // leaked if the read throws, and a short read cannot expose
+        // uninitialised bytes.
+        vector<char> tempS(firstHeaderSize+1, '\0');
+        tools::storage::impl::Storage::istream isH (*_subgroupHeader, "firstheader");
+        isH.read (tempS.data(), firstHeaderSize);
+
+        // As before, the header stops at the first NUL character.
+        _firstHeader = std::string(tempS.data());
+
+        //setup header block sizes
+        size_t nb_blocks;
+        readDataset(_subgroupHeader,"nb_blocks",nb_blocks);
+        _headerBlockSizes.resize(nb_blocks,0);
+
+        tools::storage::impl::Storage::istream is (*_subgroupHeader, "blocksizes");
+        is.read (reinterpret_cast<char *>(_headerBlockSizes.data()), _headerBlockSizes.size()*sizeof(u_int64_t));
+    }
+
+    /////// dna setup ////////////
+
+    // _filePosDna starts after all header data: even entries of
+    // _headerBlockSizes are block sizes (odd entries are read as sequence
+    // counts by decompressionDecodeBlocks).
+    for(unsigned int ii=0; ii<_headerBlockSizes.size(); ii+=2 )
+    {
+        _filePosDna += _headerBlockSizes[ii];
+    }
+
+    //setup dna block sizes
+    size_t nb_blocks;
+    readDataset(_subgroupDNA,"nb_blocks",nb_blocks);
+    _blockCount = nb_blocks;
+    _dnaBlockSizes.resize(nb_blocks,0);
+
+    tools::storage::impl::Storage::istream is (*_subgroupDNA, "blocksizes");
+    is.read (reinterpret_cast<char *>(_dnaBlockSizes.data()), _dnaBlockSizes.size()*sizeof(u_int64_t));
+
+    _kmerModel = new KmerModel(_kmerSize);
+
+    decodeBloom();
+    decodeAnchorDict();
+
+    /////////// qualities setup //////////
+    if(! _isFasta)
+    {
+        _filePosQual =0;
+    }
+}
+
+
+// Allocates one set of decoders per core: always a DnaDecoder, a QualDecoder
+// when the archive is FASTQ, and a HeaderDecoder when headers were stored.
+// Also allocates the per-core thread handles and thread argument structs.
+void Leon::decoders_setup(){
+
+    for(int core = 0; core < _nb_cores; ++core){
+
+        if(! _isFasta)
+        {
+            _qualdecoders.push_back(new QualDecoder(this, "qualities",_subgroupQual));
+        }
+
+        _dnadecoders.push_back(new DnaDecoder(this, _inputFilename, _subgroupDNA));
+
+        if(! _noHeader)
+        {
+            _headerdecoders.push_back(new HeaderDecoder(this, _inputFilename, _subgroupHeader));
+        }
+    }
+
+    _tab_threads = new pthread_t [_nb_cores];
+    _targ = new thread_arg_decoder [_nb_cores];
+}
+
+// Deletes every DNA, header and quality decoder and empties the three decoder
+// vectors. The thread handle and thread argument arrays are not touched here.
+void Leon::decoders_cleanup(){
+
+    for(auto* decoder : _dnadecoders){
+        delete decoder;
+    }
+    _dnadecoders.clear();
+
+    for(auto* decoder : _headerdecoders){
+        delete decoder;
+    }
+    _headerdecoders.clear();
+
+    for(auto* decoder : _qualdecoders){
+        delete decoder;
+    }
+    _qualdecoders.clear();
+}
+
+// Launches up to _nb_cores decoding threads, one per pending block.
+//
+// idx indexes the block-size tables, where each block uses two consecutive
+// entries (size, then sequence count); it is advanced by 2 per launched block.
+// livingThreadCount is set to the number of threads started by this call (it
+// is left unchanged when no block is pending). Thread j uses the j-th decoder
+// of each kind; the caller is expected to join _tab_threads[0..livingThreadCount).
+void Leon::decompressionDecodeBlocks(unsigned int & idx, int & livingThreadCount){
+
+    for(int slot = 0; slot < _nb_cores && idx < _dnaBlockSizes.size(); ++slot){
+
+        int blockId = idx/2 ;
+
+        livingThreadCount = slot+1;
+
+        //header decoder (none when headers were not stored)
+        HeaderDecoder* hdecoder = NULL;
+        if(! _noHeader)
+        {
+            u_int64_t headerBlockSize = _headerBlockSizes[idx];
+            int headerSequenceCount = _headerBlockSizes[idx+1];
+            hdecoder = _headerdecoders[slot];
+            hdecoder->setup(_filePosHeader, headerBlockSize, headerSequenceCount,blockId);
+            _filePosHeader += headerBlockSize;
+        }
+
+        //dna decoder
+        u_int64_t dnaBlockSize = _dnaBlockSizes[idx];
+        int dnaSequenceCount = _dnaBlockSizes[idx+1];
+        DnaDecoder* ddecoder = _dnadecoders[slot];
+        ddecoder->setup(_filePosDna, dnaBlockSize, dnaSequenceCount,blockId);
+        _filePosDna += dnaBlockSize;
+
+        //qual decoder (none in fasta mode)
+        QualDecoder* qdecoder = NULL;
+        if(! _isFasta)
+        {
+            qdecoder = _qualdecoders[slot];
+            qdecoder->setup( blockId);
+        }
+
+        _targ[slot].qual_decoder = qdecoder;
+        _targ[slot].dna_decoder = ddecoder;
+        _targ[slot].header_decoder = hdecoder;
+
+        pthread_create(&_tab_threads[slot], NULL, decoder_all_thread, _targ + slot);
+
+        idx += 2;
+
+        if(! _iterator_mode){
+            this->_progress_decode->inc(1);
+        }
+    }
+}
+// Decodes the whole archive to _outputFile.
+//
+// After the common setup, blocks are decoded in batches of up to _nb_cores
+// threads. Each thread is then joined in order and its decoded header, DNA and
+// quality text is interleaved into FASTA/FASTQ records and written out. When
+// headers were not stored, reads are numbered from 0 within each block.
+//
+// A record is only emitted when every stream it needs still has a line.
+// Previously, in no-header mode, the "> N" / "@ N" line was appended before
+// checking that a DNA line was available, which left one stray header line at
+// the end of every block.
+void Leon::startDecompressionAllStreams(){
+
+    startDecompression_setup();
+
+    switch (getInput()->getInt(STR_VERBOSE))
+    {
+        case 0: default: _progress_decode = new IteratorListener ();break;
+        case 1: _progress_decode = new ProgressSynchro ( new ProgressTimer ( _blockCount/2, "Decompressing all streams"), System::thread().newSynchronizer() );break;
+
+        case 2: _progress_decode = new ProgressSynchro ( new Progress ( _blockCount/2, "Decompressing all streams"), System::thread().newSynchronizer() );break;
+    }
+
+    getInfo()->add(1, "Block count", "%u", _blockCount/2);
+
+    _progress_decode->init();
+
+    decoders_setup();
+
+    unsigned int i = 0;
+    int livingThreadCount = 0;
+
+    while(i < _dnaBlockSizes.size()){
+
+        //decode blocks
+        decompressionDecodeBlocks(i,livingThreadCount); //this will increment i
+
+        for(int j=0; j < livingThreadCount; j++){
+
+            pthread_join(_tab_threads[j], NULL);
+
+            // Wrap each decoder's text in a local stream (destroyed automatically
+            // at the end of the iteration) and empty the decoder buffer.
+            std::istringstream stream_qual;
+            std::istringstream stream_header;
+
+            if(! _isFasta)
+            {
+                QualDecoder* qdecoder = _qualdecoders[j];
+                stream_qual.str(qdecoder->_buffer);
+                qdecoder->_buffer.clear();
+            }
+
+            if(! _noHeader)
+            {
+                HeaderDecoder* hdecoder = _headerdecoders[j];
+                stream_header.str(hdecoder->_buffer);
+                hdecoder->_buffer.clear();
+            }
+
+            DnaDecoder* ddecoder = _dnadecoders[j];
+            std::istringstream stream_dna (ddecoder->_buffer);
+            ddecoder->_buffer.clear();
+
+            std::string headerLine;
+            std::string dnaLine;
+            std::string qualLine;
+            std::string output_buff;
+
+            output_buff.reserve(this->getReadPerBlock() * 500);
+
+            u_int64_t readid=0;
+            while(true){
+
+                // Fetch one line from every stream this record needs, and only
+                // emit the record when all of them were available.
+                const bool gotHeader = _noHeader || !getline(stream_header, headerLine).fail();
+                const bool gotDna    = !getline(stream_dna, dnaLine).fail();
+                const bool gotQual   = _isFasta  || !getline(stream_qual, qualLine).fail();
+
+                if(!(gotHeader && gotDna && gotQual))
+                    break;
+
+                output_buff += (_isFasta ? '>' : '@');
+
+                if(_noHeader)
+                {
+                    // no stored header: number the read within its block
+                    stringstream sint;
+                    sint << readid;
+                    output_buff += ' ';
+                    output_buff += sint.str();
+                    readid++;
+                }
+                else
+                {
+                    output_buff += headerLine;
+                }
+                output_buff += '\n';
+
+                output_buff += dnaLine;
+                output_buff += '\n';
+
+                if( ! _isFasta)
+                {
+                    output_buff += "+\n";
+                    output_buff += qualLine;
+                    output_buff += '\n';
+                }
+            }
+
+            _outputFile->fwrite(output_buff.c_str(), output_buff.size(), 1);
+        }
+
+        livingThreadCount = 0;
+    }
+
+    _outputFile->flush();
+
+    // same effect as the three delete loops that used to be duplicated here
+    decoders_cleanup();
+
+    delete [] _tab_threads;
+    delete [] _targ;
+    cout << endl;
+
+    _progress_decode->finish();
+
+    delete _kmerModel;
+}
+
+
+
+
+// Seeks _inputFile to _filePos, then decodes a block count followed by that
+// many numeric values from the range decoder. The count is stored in
+// _blockCount and the values replace the previous content of blockSizes.
+void Leon::setupNextComponent( vector<u_int64_t> & blockSizes ){
+
+    //Go to the data block position (position 0 for headers, position |headers data| for reads)
+    _inputFile->seekg(_filePos, _inputFile->beg);
+
+    blockSizes.clear();
+
+    _blockCount = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModel);
+
+    unsigned int decoded = 0;
+    while(decoded < _blockCount){
+        u_int64_t blockSize = CompressionUtils::decodeNumeric(_rangeDecoder, _numericModel);
+        blockSizes.push_back(blockSize);
+        ++decoded;
+    }
+}
+
+
+
+// Loads the Bloom filter stored under the name "bloom" in the "leon" group
+// into _bloom.
+void Leon::decodeBloom(){
+
+    _bloom = StorageTools::singleton().loadBloom<kmer_type> (*_groupLeon, "bloom");
+}
+
+// Rebuilds the anchor table from the "anchorsDict" stream.
+//
+// The number of anchors is read from the "anchorAdress" dataset. Nucleotides
+// are decoded one at a time; every _kmerSize nucleotides form one anchor,
+// which is converted to its binary k-mer and appended to _vecAnchorKmers.
+void Leon::decodeAnchorDict(){
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\tDecode anchor dict" << endl;
+    #endif
+
+    u_int64_t anchorDictSize;
+    u_int32_t anchorCount;
+    readDataset(_subgroupDict,"size",anchorDictSize);
+    readDataset(_subgroupDict,"anchorAdress",anchorCount);
+
+    tools::storage::impl::Storage::istream isD (*_subgroupDict, "anchorsDict");
+
+    // NOTE(review): isD is a local object and its address is handed to the
+    // decoder; if the decoder keeps that pointer it must not be used after this
+    // function returns without a new setInputFile() - confirm.
+    _anchorRangeDecoder.setInputFile(&isD); //seems to be working ok
+
+    string pendingAnchor = "";
+
+    for(u_int64_t decodedAnchors = 0; decodedAnchors < anchorCount; ){
+
+        u_int8_t symbol = _anchorRangeDecoder.nextByte(_anchorDictModel);
+        pendingAnchor += Leon::bin2nt(symbol); //convert to char
+
+        if(pendingAnchor.size() != _kmerSize){
+            continue; // anchor not complete yet
+        }
+
+        //then convert to bin (could be optimized if needed)
+        kmer_type kmer = _kmerModel->codeSeed(pendingAnchor.c_str(), Data::ASCII).value() ;
+        _vecAnchorKmers.push_back(kmer);
+
+        pendingAnchor.clear();
+        decodedAnchors += 1;
+    }
+
+    #ifdef PRINT_DEBUG_DECODER
+        cout << "\t\tAnchor count: " << _vecAnchorKmers.size() << endl;
+    #endif
+}
+
+
+
+// Returns the anchor k-mer stored at index adress of _vecAnchorKmers.
+// anchorDictFile is not used: anchors are served from the in-memory table.
+// No bounds check is performed on adress.
+kmer_type Leon::getAnchor(ifstream* anchorDictFile, u_int32_t adress){
+
+    const kmer_type& anchor = _vecAnchorKmers[adress];
+    return anchor;
+}
+
+// Reports the output file name, elapsed time and throughput of the
+// decompression. When the STR_TEST_DECOMPRESSED_FILE option was given, it also
+// compares the decompressed file with the original reads file, sequence by
+// sequence (header and DNA), and reports the first difference or "OK".
+//
+// The original file is looked up in the directory of the input file, as the
+// input base name plus ".fasta" or ".fastq".
+void Leon::endDecompression(){
+
+    getInfo()->add(1, "Output filename", "%s", _outputFile->getPath().c_str());
+
+    gettimeofday(&_tim, NULL);
+    _wfin_leon = _tim.tv_sec +(_tim.tv_usec/1000000.0);
+
+    getInfo()->add(1, "Time", "%.2f seconds", ( _wfin_leon - _wdebut_leon) );
+    getInfo()->add(1, "Speed", "%.2f Mo/seconds", (System::file().getSize(_outputFilename)/1000000.0) / ( _wfin_leon - _wdebut_leon) );
+
+    //Test decompressed file against original reads file (decompressed and original read file must be in the same dir)
+    if(!getParser()->saw (Leon::STR_TEST_DECOMPRESSED_FILE)){
+        return;
+    }
+
+    getInfo()->add(1, "Checking decompressed file");
+
+    string dir = System::file().getDirectory(_inputFilename);
+    string prefix = System::file().getBaseName(_inputFilename);
+
+    string originalFilename;
+    if(_isFasta)
+        originalFilename = dir + "/" + prefix + ".fasta";
+    else
+        originalFilename = dir + "/" + prefix + ".fastq";
+
+    getInfo()->add(2, "Original file", "%s", originalFilename.c_str());
+    getInfo()->add(2, "New file", "%s", _outputFile->getPath().c_str());
+
+    // NOTE(review): the two banks and their iterators are not released in this
+    // function; the ownership convention of these types is not visible here,
+    // so confirm how they should be released.
+    IBank* originalBank = Bank::open(originalFilename);
+    Iterator<Sequence>* originalBankIt = originalBank->iterator();
+    originalBankIt->first();
+    IBank* newBank = Bank::open(_outputFile->getPath());
+    Iterator<Sequence>* newBankIt = newBank->iterator();
+    newBankIt->first();
+
+    while(true){
+        if(newBankIt->isDone()){
+            if(originalBankIt->isDone())
+                getInfo()->add(1, "OK");
+            else
+                getInfo()->add(1, "Decompressed file end but not the original file");
+            break;
+        }
+        if(originalBankIt->isDone()){
+            // The decompressed file is known not to be finished at this point,
+            // so the former inner "OK" branch could never be taken.
+            getInfo()->add(1, "Original file end but not the decomrpessed file");
+            break;
+        }
+
+        // Build the DNA strings from (pointer, length). Constructing from the
+        // bare pointer first scanned for a NUL terminator, which can read past
+        // the end of a data buffer that is not NUL-terminated.
+        string originalHeader = (*originalBankIt)->getComment();
+        string originalDna ((*originalBankIt)->getDataBuffer(), (*originalBankIt)->getDataSize());
+
+        string newHeader = (*newBankIt)->getComment();
+        string newDna ((*newBankIt)->getDataBuffer(), (*newBankIt)->getDataSize());
+
+        if(originalHeader != newHeader){
+            getInfo()->add(1, "Sequence with a different header", "%i", (*newBankIt)->getIndex());
+            getInfo()->add(2, "original", "%s", originalHeader.c_str());
+            getInfo()->add(2, "new", "%s", newHeader.c_str());
+            break;
+        }
+
+        if(originalDna != newDna){
+            getInfo()->add(1, "Sequence with a different DNA", "%i", (*newBankIt)->getIndex());
+            getInfo()->add(2, "original", "%s", originalDna.c_str());
+            getInfo()->add(2, "new", "%s", newDna.c_str());
+            break;
+        }
+
+        originalBankIt->next();
+        newBankIt->next();
+    }
+}
+
+
+
+// Binds the iterator to a Leon instance. The iterator starts in the "done",
+// not-yet-initialised state with no stream allocated; first() does the setup.
+Leon::LeonIterator::LeonIterator( Leon& refl)
+: _leon(refl), _isDone(true) , _isInitialized(false)
+{
+    _stream_qual = NULL;
+    _stream_header = NULL;
+    _stream_dna = NULL;
+}
+
+// Starts the iteration: runs the one-time initialisation, resets the
+// iteration state, releases any stream left from a previous pass and moves to
+// the first sequence.
+//
+// NOTE(review): init() only runs the Leon setup once, so the Leon file
+// positions advanced by a previous pass are not reset here - confirm whether
+// iterating a second time is meant to be supported.
+void Leon::LeonIterator::first()
+{
+    init ();
+
+    _idxB=0;
+    _livingThreadCount=0;
+    _currentTID=0;
+    _isDone = false;
+    _readingThreadBlock = false;
+    _readid=0;
+
+    // Reset the pointers after deleting: readNextThreadBock() deletes any
+    // non-NULL stream, so leaving them dangling caused a double delete when
+    // first() was called again.
+    delete _stream_qual;
+    _stream_qual = NULL;
+    delete _stream_header;
+    _stream_header = NULL;
+    delete _stream_dna;
+    _stream_dna = NULL;
+
+    next();
+}
+
+// Advances to the next sequence.
+//
+// Decoded text is consumed one thread block at a time. When the current thread
+// block is exhausted the next one is opened, and when every thread of the
+// current batch has been consumed a new batch of blocks is decoded. The
+// iterator becomes done when no block is left. On success the current item
+// receives the comment (the stored header, or the running read number when
+// headers were not stored), the quality string and the DNA data.
+void Leon::LeonIterator::next()
+{
+    // Loop until a sequence is produced or the iteration is done.
+    for(;;)
+    {
+        const bool batchConsumed =
+            (_livingThreadCount==0) ||
+            (( _currentTID>= _livingThreadCount) && !_readingThreadBlock );
+
+        if(batchConsumed)
+        {
+            readNextBlocks();
+            if(_isDone) return;
+        }
+
+        if(!_readingThreadBlock)
+        {
+            readNextThreadBock();
+        }
+
+        assert(_readingThreadBlock);
+
+        //try to get next seq from this thread block
+        std::string line;
+        std::string current_comment;
+        std::string current_dna;
+        std::string current_qual;
+
+        if(_leon._noHeader)
+        {
+            stringstream sint;
+            sint << _readid;
+            current_comment += sint.str() ;
+
+            _readid++;
+        }
+        else if(getline(*_stream_header, line))
+        {
+            current_comment += line ;
+        }
+        else
+        {
+            _readingThreadBlock = false;
+        }
+
+        if(getline(*_stream_dna, line)){
+            current_dna += line ;
+        }
+        else{
+            _readingThreadBlock = false;
+        }
+
+        if( ! _leon._isFasta)
+        {
+            if(getline(*_stream_qual, line)){
+                current_qual += line ;
+            }
+            else{
+                _readingThreadBlock = false;
+            }
+        }
+
+        if(_readingThreadBlock)
+        {
+            Sequence *currentSeq = _item;
+            currentSeq->setComment(current_comment);
+            currentSeq->setQuality(current_qual);
+
+            // The cast drops const because set() takes a non-const pointer.
+            // NOTE(review): current_dna is a local string; if set() stores the
+            // pointer instead of copying, the data dangles once this function
+            // returns - confirm the semantics of set().
+            currentSeq->getData().set((char *)current_dna.c_str(), current_dna.size() );
+            return;
+        }
+
+        //reached end of current thread block, try to advance to next block
+    }
+}
+
+
+// Starts decoding the next batch of blocks, or marks the iterator as done when
+// every block has been consumed. Nothing is decoded if the iterator is
+// already done.
+void Leon::LeonIterator::readNextBlocks()
+{
+    if(_idxB >= _leon._dnaBlockSizes.size()){
+        _isDone= true;
+    }
+
+    if(_isDone){
+        return;
+    }
+
+    _leon.decompressionDecodeBlocks(_idxB,_livingThreadCount); //this will update _idxB and _livingThreadCount
+    _currentTID =0;
+}
+
+
+//put next chunk in stream_qual,stream_header and _stream_dna
+// Waits for decoding thread _currentTID, then replaces the header, DNA and
+// quality streams with streams over that thread's decoded buffers (which are
+// emptied afterwards). Streams for absent data (qualities in fasta mode,
+// headers when not stored) stay NULL. Finally marks a thread block as being
+// read and advances _currentTID.
+void Leon::LeonIterator::readNextThreadBock()
+{
+    pthread_join(_leon._tab_threads[_currentTID], NULL);
+
+    // release the streams of the previous thread block (delete on NULL is a no-op)
+    delete _stream_qual;
+    delete _stream_header;
+    delete _stream_dna;
+    _stream_qual = NULL;
+    _stream_header = NULL;
+    _stream_dna = NULL;
+
+    _hdecoder = NULL;
+    _qdecoder = NULL;
+    _ddecoder = _leon._dnadecoders[_currentTID];
+
+    if(! _leon._isFasta)
+    {
+        _qdecoder = _leon._qualdecoders[_currentTID];
+        _stream_qual = new std::istringstream (_qdecoder->_buffer);
+        _qdecoder->_buffer.clear();
+    }
+
+    if(! _leon._noHeader)
+    {
+        _hdecoder = _leon._headerdecoders[_currentTID];
+        _stream_header = new std::istringstream (_hdecoder->_buffer);
+        _hdecoder->_buffer.clear();
+    }
+
+    _stream_dna = new std::istringstream (_ddecoder->_buffer);
+    _ddecoder->_buffer.clear();
+
+    _readingThreadBlock = true;
+    _currentTID++;
+}
+
+// Releases the streams owned by the iterator (previously leaked) and asks the
+// Leon instance to delete its decoders.
+Leon::LeonIterator::~LeonIterator ()
+{
+    // The constructor sets these to NULL, and delete on NULL is a no-op.
+    delete _stream_qual;
+    delete _stream_header;
+    delete _stream_dna;
+
+    _leon.decoders_cleanup();
+}
+
+
+// Fills number, totalSize and maxSize from the "readcount", "totalDnaSize" and
+// "maxSequenceSize" datasets of the metadata group.
+void Leon::LeonIterator::estimate(u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize)
+{
+    readDataset(_leon._subgroupInfo,"readcount",number);
+    readDataset(_leon._subgroupInfo,"totalDnaSize",totalSize);
+
+    // "maxSequenceSize" is read through an int, then widened
+    int storedMaxSize ;
+    readDataset(_leon._subgroupInfo,"maxSequenceSize",storedMaxSize);
+    maxSize = static_cast<u_int64_t>(storedMaxSize);
+}
+
+// One-time setup: runs the Leon decompression setup and allocates the
+// decoders. Later calls do nothing.
+void Leon::LeonIterator::init()
+{
+    if (!_isInitialized)
+    {
+        _leon.startDecompression_setup();
+        _leon.decoders_setup();
+
+        _isInitialized = true;
+    }
+}
+
+
+// Intentionally empty: this function releases nothing. The only cleanup
+// visible in this file is the decoders_cleanup() call made by the destructor.
+void Leon::LeonIterator::finalize()
+{
+
+
+}
+
+//////////////////////////////////////////////////
+//////////////////// BankLeon ////////////////////
+//////////////////////////////////////////////////
+
+// Opens filename as a Leon bank: creates a Leon instance and runs it with the
+// command line "leon -file <filename> -d <STR_INIT_ITER>".
+BankLeon::BankLeon (const std::string& filename)
+{
+    _fname = filename;
+    _leon = new Leon();
+
+    std::vector<std::string> arguments = { "leon", "-file", _fname, "-d", Leon::STR_INIT_ITER };
+
+    // argv-style view over the argument strings, terminated by a null pointer
+    // that is not counted in argc
+    std::vector<char*> argv;
+    argv.reserve(arguments.size() + 1);
+    for (std::size_t k = 0; k < arguments.size(); ++k)
+        argv.push_back(const_cast<char*>(arguments[k].data()));
+    argv.push_back(nullptr);
+
+    int argc = argv.size() - 1;
+
+    _leon->run (argc, argv.data());
+}
+
+// Destroys the Leon instance owned by this bank, if any.
+BankLeon::~BankLeon ()
+{
+    delete _leon; // no-op when _leon is NULL
+}
+
+// Returns the size reported by the file system for the archive file (_fname),
+// not the size of the decompressed data.
+u_int64_t BankLeon::getSize ()
+{
+    const u_int64_t archiveSize = System::file().getSize (_fname);
+    return archiveSize;
+}
+
+// Returns the value of the "readcount" dataset of the metadata group.
+int64_t BankLeon::getNbItems () {
+    u_int64_t readCount;
+    readDataset(_leon->_subgroupInfo,"readcount",readCount);
+    return static_cast<int64_t>(readCount);
+}
+
+
+
+// Fills number, totalSize and maxSize from the "readcount", "totalDnaSize" and
+// "maxSequenceSize" datasets of the metadata group.
+void BankLeon::estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize)
+{
+    readDataset(_leon->_subgroupInfo,"readcount",number);
+    readDataset(_leon->_subgroupInfo,"totalDnaSize",totalSize);
+
+    // "maxSequenceSize" is read through an int, then widened
+    int storedMaxSize ;
+    readDataset(_leon->_subgroupInfo,"maxSequenceSize",storedMaxSize);
+    maxSize = static_cast<u_int64_t>(storedMaxSize);
+}
+
+
+
+
+/// BankLeonFactory : test hdf5 storage opening, if leon version can be found, this is a leon file
+
+// Returns a BankLeon for uri when the file looks like a Leon archive, and NULL
+// otherwise.
+//
+// The probe opens uri as HDF5 storage (without adding an extension) and reads
+// the "version" property of the "infobyte" collection in the "metadata" group.
+// If any of these steps throws a system::Exception, the file is not a Leon file.
+//
+// NOTE(review): the storage and the Group created for the probe are not
+// released in this function - confirm how these types should be released.
+IBank* BankLeonFactory::createBank (const std::string& uri)
+{
+    try {
+        auto probeStorage = StorageFactory(STORAGE_HDF5).create (uri,false,false,true); //open without adding extension h5
+
+        auto infoGroup = new tools::storage::impl::Group((*probeStorage)().getGroup ("metadata"));
+
+        auto infoCollection = & infoGroup->getCollection<math::NativeInt8> ("infobyte");
+
+        std::string leonversion = infoCollection->getProperty ("version");
+
+    } catch (system::Exception& e) {
+        // not a Leon archive
+        return NULL;
+    }
+
+    return new BankLeon (uri);
+}
+
diff --git a/gatb-core/src/gatb/tools/compression/Leon.hpp b/gatb-core/src/gatb/tools/compression/Leon.hpp
new file mode 100644
index 0000000..887e8c5
--- /dev/null
+++ b/gatb-core/src/gatb/tools/compression/Leon.hpp
@@ -0,0 +1,490 @@
+/*****************************************************************************
+ * Leon: reference free compression for NGS reads
+ * A tool from the GATB (Genome Assembly Tool Box)
+ * Copyright (C) 2014 INRIA
+ * Authors: G.Benoit, G.Rizk, C.Lemaitre
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************/
+
+#ifndef __leon__Leon__
+#define __leon__Leon__
+
+
+#define LEON_VERSION_MAJOR 1
+#define LEON_VERSION_MINOR 1
+#define LEON_VERSION_PATCH 0
+
+
+//#define LEON_PRINT_STAT
+
+#include <iostream>
+#include <gatb/gatb_core.hpp>
+#include <sys/time.h>
+
+/** NOTE: we should not include namespaces here => only to make user life easier... */
+using namespace gatb::core;
+using namespace gatb::core::tools;
+using namespace gatb::core::bank;
+using namespace gatb::core::kmer::impl;
+
+using namespace gatb::core::system;
+using namespace gatb::core::system::impl;
+
+
+typedef kmer::impl::Kmer<>::ModelDirect KmerModel;
+typedef kmer::impl::Kmer<>::Type kmer_type;
+typedef kmer::impl::Kmer<>::Count kmer_count;
+
+
+
+
+
+
+
+#include <string>
+#include <sstream>
+#include "HeaderCoder.hpp"
+#include "DnaCoder.hpp"
+
+//#include "RangeCoder.hpp"
+
+#include <time.h> //Used to calculate time taken by decompression
+#include <zlib.h> //Test bloom compression
+//char char2phred(char c);
+//double char2proba(char c);
+
+
+#include <pthread.h>
+
+
+class HeaderEncoder;
+class HeaderDecoder;
+class DnaEncoder;
+class QualDecoder;
+class DnaDecoder;
+/** Groups one QualDecoder, one HeaderDecoder and one DnaDecoder pointer into a single argument. */
+struct thread_arg_decoder
+{
+    QualDecoder*   qual_decoder;
+    HeaderDecoder* header_decoder;
+    DnaDecoder*    dna_decoder;
+};
+
+
+class Leon : public misc::impl::Tool
+{
+ friend class BankLeon;
+ public:
+
+ //Leon( bool compress, bool decompress);
+ Leon();
+ ~Leon();
+
+ static const char* STR_COMPRESS;
+ static const char* STR_DECOMPRESS;
+ static const char* STR_TEST_DECOMPRESSED_FILE;
+ static const char* STR_DNA_ONLY;
+ static const char* STR_NOHEADER;
+ static const char* STR_NOQUAL;
+ static const char* STR_INIT_ITER;
+
+ static const char* STR_DATA_INFO;
+
+
+ size_t _kmerSize;
+ string _dskOutputFilename;
+ //static const int READ_PER_BLOCK = 50000;
+ int _nb_cores;
+
+ bool _compress, _decompress;
+ bool _iterator_mode;
+
+ clock_t _time; //Used to calculate time taken by decompression
+
+ //Global compression
+ void writeBlock(u_int8_t* data, u_int64_t size, int encodedSequenceCount,u_int64_t blockID, bool Header);
+
+ void writeBlockLena(u_int8_t* data, u_int64_t size, int encodedSequenceCount,u_int64_t blockID);
+
+ //Header compression
+ string _firstHeader;
+ u_int64_t _totalHeaderSize;
+ //u_int64_t _totalHeaderCompressedSize;
+
+ //Dna compression
+ u_int32_t _anchorAdress;
+
+ bool anchorExist(const kmer_type& kmer, u_int32_t* anchorAdress);
+ int findAndInsertAnchor(const vector<kmer_type>& kmers, u_int32_t* anchorAdress);
+ void updateMinMaxSequenceSize(int newMin, int newMax);
+
+ int _minSequenceSize;
+ int _maxSequenceSize;
+
+ u_int64_t _totalDnaSize;
+ u_int64_t _anchorDictSize;
+ u_int64_t _anchorAdressSize;
+ u_int64_t _anchorPosSize;
+ u_int64_t _otherSize;
+ u_int64_t _readSizeSize;
+ u_int64_t _bifurcationSize;
+ u_int64_t _noAnchorSize;
+
+ //u_int64_t _totalDnaCompressedSize;
+ //u_int64_t _realDnaCompressedSize;
+ u_int64_t _compressedSize;
+ IBloom<kmer_type>* _bloom;
+
+ bool _isFasta;
+ bool _noHeader;
+
+ bool _lossless;
+ //for qual compression
+ u_int64_t _total_nb_quals_smoothed ;
+ u_int64_t _input_qualSize;
+ u_int64_t _compressed_qualSize;
+
+
+ //test dna compression
+ u_int64_t _MCtotal;
+ u_int64_t _MCnoAternative;
+ u_int64_t _MCuniqSolid;
+ u_int64_t _MCuniqNoSolid;
+ u_int64_t _MCmultipleSolid;
+ //u_int64_t _MCmultipleNoSolid;
+
+ u_int64_t _blockCount;
+ //u_int64_t _noAnchor_full_N_kmer_count;
+ //u_int64_t _noAnchor_with_N_kmer_count;
+ //double _anchorKmerCount;
+ u_int64_t _readCount;
+ u_int64_t _readWithoutAnchorCount;
+ //double _total_kmer_indexed;
+ //double _uniq_mutated_kmer;
+ //u_int64_t _total_kmer;
+ //u_int64_t _readWithoutAnchorSize;
+ //u_int64_t _readWithAnchorSize;
+ //double _readWithAnchorMutationChoicesSize;
+ //Hash16<kmer_type>* _kmerAbundance;
+
+ int _nb_thread_living;
+
+ // ProgressSynchro *
+ dp::IteratorListener * _progress_decode;
+
+
+ //DNA decompression
+ kmer_type getAnchor(ifstream* anchorDictFile, u_int32_t adress);
+ string _anchorDictFilename;
+
+
+ static const int nt2binTab[128];
+ static const int bin2ntTab[5];
+ //static const vector<int> bin2ntTab(5;
+
+
+
+ //Utils
+ /** Looks up character 'nt' in nt2binTab. NOTE(review): nt2binTab has 128 entries but an unsigned char can reach 255 -- confirm callers only pass 7-bit characters. */
+ static int nt2bin(char nt){
+ return nt2binTab[(unsigned char)nt];
+ }
+ /** Returns entry 'nt' of bin2ntTab. NOTE(review): the index is not range-checked and bin2ntTab has 5 entries -- confirm callers pass 0..4. */
+ static int bin2nt(int nt){
+ return bin2ntTab[nt];
+ }
+ /** Stores 'value' in _read_per_block. */
+ void setReadPerBlock(int value){
+     this->_read_per_block = value;
+ }
+ /** \return the current value of _read_per_block. */
+ int getReadPerBlock(){
+     return this->_read_per_block;
+ }
+ private:
+
+ int _read_per_block;
+
+ //hdf5 stuff
+ Storage* _storageH5file;
+
+
+ tools::storage::impl::Group * _groupLeon;
+ tools::storage::impl::Group * _subgroupInfo;
+ tools::storage::impl::Group * _subgroupDict;
+ tools::storage::impl::Group * _subgroupDNA;
+ tools::storage::impl::Group * _subgroupQual;
+ tools::storage::impl::Group * _subgroupHeader;
+
+ collections::Collection<math::NativeInt8>* _subgroupInfoCollection;
+
+ u_int64_t _lastAnchorValue;
+
+ struct timeval _tim;
+ double _wdebut_leon, _wfin_leon;
+ //static const char* STR_GZ;
+ IFile* _outputFile;
+
+
+
+ ofstream* _dictAnchorFile;
+ int _nks;
+
+ void execute ();
+ void createBloom ();
+ //void createKmerAbundanceHash();
+
+ //Global compression
+ string _inputFilename;
+ string _outputFilename;
+
+
+ //quals
+ //string _FileQualname;
+ //IFile* _FileQual;
+ //tools::storage::impl::Storage::ostream * _Qual_outstream ;
+
+ string _qualOutputFilename; //temp file
+
+ Order0Model _generalModel;
+ vector<Order0Model> _numericModel;
+ RangeEncoder _rangeEncoder;
+ vector<u_int64_t> _blockSizes;
+
+ // vector<u_int64_t> _qualBlockSizes;
+ vector<u_int64_t> _headerBlockSizes;
+ vector<u_int64_t> _dnaBlockSizes;
+
+ IBank* _inputBank;
+ /** Sets the input bank. NOTE(review): SP_SETATTR is defined outside this file; presumably it assigns _inputBank with smart-pointer reference counting -- confirm. */
+ void setInputBank (IBank* inputBank) { SP_SETATTR(inputBank); }
+ //u_int64_t _bloomSize;
+
+ void executeCompression();
+ void executeDecompression();
+ void endCompression();
+ void endQualCompression();
+
+ //Global decompression
+ void setupNextComponent(vector<u_int64_t> & blockSizes );
+
+ RangeDecoder _rangeDecoder;
+ //ifstream* _inputFile;
+ ifstream* _inputFile;
+ //ifstream* _inputFileQual;
+
+ u_int64_t _filePos;
+
+ u_int64_t _filePosHeader;
+ u_int64_t _filePosDna;
+
+ double _headerCompRate, _dnaCompRate, _qualCompRate;
+
+ //Quals
+ u_int64_t _filePosQual;
+
+
+ void startDecompressionAllStreams();
+
+
+
+ //Header compression
+ void startHeaderCompression();
+ void endHeaderCompression();
+
+
+ //DNA Compression
+ void startDnaCompression();
+ void endDnaCompression();
+ void writeBloom();
+ void writeAnchorDict();
+ void encodeInsertedAnchor(const kmer_type& kmer);
+
+ RangeEncoder _anchorRangeEncoder;
+ Order0Model _anchorDictModel;
+
+ //map<kmer_type, u_int32_t> _anchorKmers; //uses 46 B per elem inserted
+ //OAHash<kmer_type> _anchorKmers;
+ Hash16<kmer_type, u_int32_t > * _anchorKmers ; //will use approx 20B per elem inserted
+
+ //Header decompression
+
+ string _headerOutputFilename;
+
+ // int _auto_cutoff;
+ pthread_mutex_t findAndInsert_mutex;
+ pthread_mutex_t writeblock_mutex;
+ pthread_mutex_t minmax_mutex;
+
+ //DNA Decompression
+ void decodeBloom();
+ void decodeAnchorDict();
+
+ KmerModel* _kmerModel;
+ string _dnaOutputFilename;
+ RangeDecoder _anchorRangeDecoder;
+ vector<kmer_type> _vecAnchorKmers;
+
+ //Global decompression
+ void endDecompression();
+
+ //IFile* _outputFile;
+
+ void startDecompression_setup();
+ void decoders_setup();
+ void decoders_cleanup();
+
+ vector<QualDecoder*> _qualdecoders;
+ vector<DnaDecoder*> _dnadecoders;
+ vector<HeaderDecoder*> _headerdecoders;
+
+ pthread_t * _tab_threads;
+
+ thread_arg_decoder * _targ;
+ void decompressionDecodeBlocks(unsigned int & idx, int & livingThreadCount);
+
+ void testing_iter();
+
+ /** Sequence iterator bound to a Leon instance; BankLeon::iterator() returns one of these. */
+ class LeonIterator : public tools::dp::Iterator<Sequence>
+ {
+ public:
+
+ /** Constructor. \param[in] ref : Leon instance the iterator works on (presumably kept in _leon -- confirm in the definition). */
+ LeonIterator (Leon& ref);
+
+ /** Destructor */
+ ~LeonIterator ();
+
+ /** \copydoc tools::dp::Iterator::first */
+ void first();
+
+ /** \copydoc tools::dp::Iterator::next */
+ void next();
+
+ /** \copydoc tools::dp::Iterator::isDone */
+ bool isDone () { return _isDone; }
+
+ /** \copydoc tools::dp::Iterator::item */
+ Sequence& item () { return *_item; }
+
+ /** Estimation of the sequences information */
+ void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize);
+
+ private:
+
+ /** Reference to the underlying Leon instance. */
+ Leon& _leon;
+
+ /** Tells whether the iteration is finished or not. */
+ bool _isDone;
+
+ /** Tells whether the instance is initialized. */
+ bool _isInitialized;
+
+
+ /** Initialization method. */
+ void init ();
+
+ /** Finish method. */
+ void finalize ();
+
+ void readNextBlocks();
+
+ void readNextThreadBock();
+
+ unsigned int _idxB;
+ int _livingThreadCount;
+ int _currentTID;
+ /** Decoder pointers, one per data kind (header, quality, DNA). */
+ HeaderDecoder* _hdecoder ;
+ QualDecoder* _qdecoder;
+ DnaDecoder* _ddecoder ;
+
+ /** Input string streams, one per data kind (quality, header, DNA). */
+ std::istringstream * _stream_qual ;
+ std::istringstream * _stream_header ;
+ std::istringstream * _stream_dna;
+
+ bool _readingThreadBlock;
+ u_int64_t _readid;
+
+ };
+};
+
+/** \brief Bank reading its sequences through a Leon instance; insert() and flush() do nothing. */
+class BankLeon : public AbstractBank
+{
+public:
+
+ /** Returns the name of the bank format. */
+ static const char* name() { return "Leon"; }
+
+ /** Constructor.
+ * \param[in] filename : file name of the bank (presumably the Leon-compressed file to
+ * read -- TODO confirm against the constructor definition). */
+ BankLeon (const std::string& filename);
+
+ /** Destructor. */
+ ~BankLeon ();
+
+ /** \copydoc IBank::getId. */
+ std::string getId () { return _fname; }
+
+ /** \copydoc IBank::iterator */
+ tools::dp::Iterator<Sequence>* iterator () { return new Leon::LeonIterator (*_leon); }
+
+ /** \return the value of the "readcount" dataset read through the Leon instance. */
+ int64_t getNbItems () ;
+
+ /** \copydoc IBank::insert */
+ void insert (const Sequence& item) {}
+
+ /** \copydoc IBank::flush */
+ void flush () {}
+
+ /** \copydoc IBank::getSize */
+ u_int64_t getSize ();
+
+ /** \copydoc IBank::estimate */
+ void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize);
+
+ /** \return maximum number of files. */
+ static const size_t getMaxNbFiles () { return 0; }
+
+ /************************************************************/
+
+protected:
+ /** File name of the bank; returned by getId() and used by getSize(). */
+ std::string _fname;
+ /** Leon instance used for iteration and metadata; deleted by the destructor. */
+ Leon *_leon;
+
+};
+
+/** \brief Factory for the BankLeon class. */
+class BankLeonFactory : public IBankFactory
+{
+public:
+ /** \copydoc IBankFactory::createBank
+ * Returns NULL when 'uri' cannot be probed as a Leon file. */
+ IBank* createBank (const std::string& uri);
+};
+
+
+
+
+
+#endif /* defined(__leon__Leon__) */
diff --git a/gatb-core/src/gatb/tools/compression/RangeCoder.cpp b/gatb-core/src/gatb/tools/compression/RangeCoder.cpp
index 4323799..f9d08a1 100644
--- a/gatb-core/src/gatb/tools/compression/RangeCoder.cpp
+++ b/gatb-core/src/gatb/tools/compression/RangeCoder.cpp
@@ -184,7 +184,7 @@ RangeDecoder::~RangeDecoder(){
}
-void RangeDecoder::setInputFile(ifstream* inputFile, bool reversed){
+void RangeDecoder::setInputFile(istream* inputFile, bool reversed){
_reversed = reversed;
clear();
_inputFile = inputFile;
diff --git a/gatb-core/src/gatb/tools/compression/RangeCoder.hpp b/gatb-core/src/gatb/tools/compression/RangeCoder.hpp
index 405d5ad..465e30c 100644
--- a/gatb-core/src/gatb/tools/compression/RangeCoder.hpp
+++ b/gatb-core/src/gatb/tools/compression/RangeCoder.hpp
@@ -114,13 +114,13 @@ class RangeDecoder : AbstractRangeCoder
RangeDecoder();
~RangeDecoder();
- void setInputFile(ifstream* inputFile, bool reversed=false);
+ void setInputFile(istream* inputFile, bool reversed=false);
u_int8_t nextByte(Order0Model& model);
void clear();
private:
- ifstream* _inputFile;
+ istream* _inputFile;
u_int64_t _code;
bool _reversed;
diff --git a/gatb-core/src/gatb/tools/designpattern/.DS_Store b/gatb-core/src/gatb/tools/designpattern/.DS_Store
deleted file mode 100644
index 1224d38..0000000
Binary files a/gatb-core/src/gatb/tools/designpattern/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/designpattern/api/.DS_Store b/gatb-core/src/gatb/tools/designpattern/api/.DS_Store
deleted file mode 100644
index 5fb483b..0000000
Binary files a/gatb-core/src/gatb/tools/designpattern/api/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/designpattern/api/Iterator.hpp b/gatb-core/src/gatb/tools/designpattern/api/Iterator.hpp
index 76d2579..67ab22d 100644
--- a/gatb-core/src/gatb/tools/designpattern/api/Iterator.hpp
+++ b/gatb-core/src/gatb/tools/designpattern/api/Iterator.hpp
@@ -103,6 +103,16 @@ namespace dp {
* actual containers) and may be easier to use.
*
* Moreover, we can use our iterator as a basis for other ways for iteration.
+ *
+ *
+ * note (GR): this iterator system looks like the range of C++11, now that we use C++11, we could switch to range:
+ *
+ * for ( MyType elem : range )
+ * {
+ * // use elem
+ * }
+ *
+ * it has the same benefits as the listed benefits of the Iterator class, IMHO
*/
template <class Item> class Iterator : public system::SmartPointer
{
@@ -158,7 +168,7 @@ public:
size_t i=0;
for (i=0; i<current.size(); i++)
{
- setItem (current[i]);
+ setItem (current[i]); /* Rayan's comment: this is a weird mechanism where actually, first() and next() will be responsible for populating the vector. setItem merely points the iterator item's to a vector element and does NOT set current[i] to the current item. There must be a good reason why Erwan went for this. */
if (_isRunning == IDDLE) { first (); _isRunning=STARTED; }
else { next (); }
diff --git a/gatb-core/src/gatb/tools/designpattern/impl/.DS_Store b/gatb-core/src/gatb/tools/designpattern/impl/.DS_Store
deleted file mode 100644
index 7d05fcd..0000000
Binary files a/gatb-core/src/gatb/tools/designpattern/impl/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/designpattern/impl/IteratorHelpers.hpp b/gatb-core/src/gatb/tools/designpattern/impl/IteratorHelpers.hpp
index 6083c0a..50e2a90 100644
--- a/gatb-core/src/gatb/tools/designpattern/impl/IteratorHelpers.hpp
+++ b/gatb-core/src/gatb/tools/designpattern/impl/IteratorHelpers.hpp
@@ -75,6 +75,8 @@ public:
* may still be more efficient to have two loops. The CartesianIterator is just here
* for easing the product iteration on small sets.
*
+ * NOTE: most likely it doesn't work in combination with Dispatcher, see PairedIterator for how to fix
+ *
* Example:
* \snippet iterators2.cpp snippet1
*/
@@ -102,6 +104,16 @@ public:
* This is merely an optimization in order not to call too often the "isDone" method
* on the two iterators. */
_isDone = false;
+
+ /* to make it work with dispatcher the fix should be something like:
+ if (!isDone)
+ {
+ std::pair<T1,T2> t;
+ t.first = _it1->item();
+ t.second = _it2->item();
+ *(this->_item) = t;
+ }
+ */
}
/** \copydoc Iterator::next */
@@ -127,6 +139,15 @@ public:
_isDone = true;
}
}
+ /* to make it work with dispatcher the fix should be something like:
+ if (!isDone)
+ {
+ std::pair<T1,T2> t;
+ t.first = _it1->item();
+ t.second = _it2->item();
+ *(this->_item) = t;
+ }
+ */
}
/** \copydoc Iterator::isDone */
@@ -191,12 +212,15 @@ public:
_it2->first();
_isDone = _it1->isDone() || _it2->isDone();
+
+ // due to the way Dispatcher works, we need to update _item, not just _current
+ if (_isDone == false) {
+ std::pair<T1,T2> t;
+ t.first = _it1->item();
+ t.second = _it2->item();
+ *(this->_item) = t;
+ }
- if (_isDone==false)
- {
- _current.first = _it1->item();
- _current.second = _it2->item();
- }
}
/** \copydoc Iterator::next */
@@ -205,18 +229,27 @@ public:
_it1->next (); _it2->next ();
_isDone = _it1->isDone() || _it2->isDone();
- if (_isDone==false)
- {
- _current.first = _it1->item();
- _current.second = _it2->item();
- }
+ if (_isDone == false) {
+ std::pair<T1,T2> t;
+ t.first = _it1->item();
+ t.second = _it2->item();
+ *(this->_item) = t;
+ }
+
}
/** \copydoc Iterator::isDone */
bool isDone() { return _isDone; }
/** \copydoc Iterator::item */
- std::pair<T1,T2>& item () { return _current; }
+ std::pair<T1,T2>& item () {
+ //std::pair<T1,T2> t; // can't do a temporary object because apparently it'll be destructed, eh
+ _current.first = _it1->item();
+ _current.second = _it2->item();
+ *(this->_item) = _current; // not sure if this is essential but i'm keeping it
+
+ return _current;
+ }
private:
@@ -231,8 +264,8 @@ private:
/** Finish status. */
bool _isDone;
- /** Current item in the iteration. */
std::pair<T1,T2> _current;
+
};
/********************************************************************************/
@@ -587,7 +620,7 @@ public:
* pointer (in case of usage of Dispatcher for instance where a buffer of items is kept
* and could be wrong if the referred items doesn't exist any more when accessing the
* buffer).
- * TODO doc: I get it, but why is it done only in this iterator and not other iterators like FilterIterator?*/
+ */
if (!_isDone) { *(this->_item) = _ref.item(); }
}
@@ -978,6 +1011,7 @@ private:
}
};
+
/********************************************************************************/
/** \brief Iterator adaptation from one type to another one
*/
diff --git a/gatb-core/src/gatb/tools/misc/.DS_Store b/gatb-core/src/gatb/tools/misc/.DS_Store
deleted file mode 100644
index c4e24c1..0000000
Binary files a/gatb-core/src/gatb/tools/misc/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/misc/api/.DS_Store b/gatb-core/src/gatb/tools/misc/api/.DS_Store
deleted file mode 100644
index 9c612d3..0000000
Binary files a/gatb-core/src/gatb/tools/misc/api/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/misc/api/IHistogram.hpp b/gatb-core/src/gatb/tools/misc/api/IHistogram.hpp
index 5cd10e9..60b4718 100644
--- a/gatb-core/src/gatb/tools/misc/api/IHistogram.hpp
+++ b/gatb-core/src/gatb/tools/misc/api/IHistogram.hpp
@@ -112,6 +112,11 @@ public:
* \return number of kmers. */
virtual u_int64_t get_nbsolids_auto () = 0;
+
+ /** Get the ratio of weak kmers in total volume
+ * \return ratio */
+ virtual float get_ratio_weak () = 0;
+
/** Get the x1 value at the first maximum after x0. */
virtual u_int16_t get_first_peak () = 0;
diff --git a/gatb-core/src/gatb/tools/misc/impl/.DS_Store b/gatb-core/src/gatb/tools/misc/impl/.DS_Store
deleted file mode 100644
index c5e46a7..0000000
Binary files a/gatb-core/src/gatb/tools/misc/impl/.DS_Store and /dev/null differ
diff --git a/gatb-core/src/gatb/tools/misc/impl/Histogram.cpp b/gatb-core/src/gatb/tools/misc/impl/Histogram.cpp
index 15729cc..6de8536 100644
--- a/gatb-core/src/gatb/tools/misc/impl/Histogram.cpp
+++ b/gatb-core/src/gatb/tools/misc/impl/Histogram.cpp
@@ -172,6 +172,21 @@ void Histogram::compute_threshold (int min_auto_threshold)
_nbsolids += _histogram[i].abundance;
}
+
+ u_int64_t vol_weak=0;
+ u_int64_t volume_total=0;
+
+ for (size_t i=0; i<_cutoff ; i++)
+ {
+ vol_weak += _histogram[i].abundance *i;
+ }
+
+ for (size_t i=0; i<_length+1 ; i++)
+ {
+ volume_total += _histogram[i].abundance *i;
+ }
+ _ratio_weak_volume = (float)vol_weak / (float)volume_total;
+
}
/********************************************************************************/
diff --git a/gatb-core/src/gatb/tools/misc/impl/Histogram.hpp b/gatb-core/src/gatb/tools/misc/impl/Histogram.hpp
index ac3a458..9f5ff17 100644
--- a/gatb-core/src/gatb/tools/misc/impl/Histogram.hpp
+++ b/gatb-core/src/gatb/tools/misc/impl/Histogram.hpp
@@ -56,7 +56,7 @@ public:
* \param[in] length : maximum value for the X axis
* \param[in] bag : bag where the values can be saved. */
Histogram (size_t length)
- : _length(length), _cutoff(0), _nbsolids(0), _firstPeak(0),
+ : _length(length), _cutoff(0), _nbsolids(0), _ratio_weak_volume(0), _firstPeak(0),
_histogram(0), _histogram_smoothed(0)
{
_histogram = (Entry*) CALLOC (_length + 1, sizeof (Entry));
@@ -99,6 +99,12 @@ public:
/** \copydoc IHistogram::get_first_peak */
u_int16_t get_first_peak () { return _firstPeak; }
+
+ /** \copydoc IHistogram::get_ratio_weak */
+ float get_ratio_weak () { return _ratio_weak_volume; }
+
+
+
/** \copydoc IHistogram::getLength */
size_t getLength() { return _length; }
@@ -110,6 +116,7 @@ private:
size_t _length;
u_int16_t _cutoff;
u_int64_t _nbsolids;
+ float _ratio_weak_volume;
u_int16_t _firstPeak;
Entry* _histogram;
@@ -136,6 +143,9 @@ public:
/** \copydoc IHistogram::get_nbsolids_auto */
u_int64_t get_nbsolids_auto () { return 0; }
+ /** \copydoc IHistogram::get_ratio_weak */
+ float get_ratio_weak () { return 0; }
+
/** \copydoc IHistogram::get_first_peak */
u_int16_t get_first_peak () { return 0; }
@@ -190,6 +200,10 @@ public:
/** \copydoc IHistogram::get_nbsolids_auto */
u_int64_t get_nbsolids_auto () {return _ref->get_nbsolids_auto();}
+ /** \copydoc IHistogram::get_ratio_weak */
+ float get_ratio_weak() { return _ref->get_ratio_weak(); }
+
+
/** \copydoc IHistogram::get_first_peak */
u_int16_t get_first_peak () { return _ref->get_first_peak(); }
diff --git a/gatb-core/src/gatb/tools/misc/impl/Pool.hpp b/gatb-core/src/gatb/tools/misc/impl/Pool.hpp
index a9a1a0e..2c671f0 100644
--- a/gatb-core/src/gatb/tools/misc/impl/Pool.hpp
+++ b/gatb-core/src/gatb/tools/misc/impl/Pool.hpp
@@ -54,10 +54,10 @@ template <typename cell> class Pool
public:
/** Default constructor.
- * \param[in] tai : 2^22 16 M cells *16 o blocs de 256 Mo
- * \param[in] N : 2^10 soit 4 G cells max
+ * \param[in] tai : 2^20 1 M cells *16 o blocs de 16 Mo
+ * \param[in] N : 2^12 soit 4 G cells max
* */
- Pool (size_t tai=4194304, size_t N=1024) : TAI_POOL(tai), N_POOL(N)
+ Pool (size_t tai=1048576, size_t N=4096) : TAI_POOL(tai), N_POOL(N)
{
n_pools = 0; n_cells=0;
//allocation table de pool :
@@ -80,6 +80,13 @@ public:
FREE (tab_pool);
}
+ /** \return byte size of (n_pools-2) pools of TAI_POOL cells each; n_pools is 2 while only tab_pool[1] is in use. */
+ u_int64_t getByteSize()
+ {
+ return ((n_pools-2) * TAI_POOL*sizeof(cell)); // I do not want to count the first default allocated pool
+ }
+
+
/** allocate cell, return internal pointer type ( 32bits) */
cell_ptr_t allocate_cell()
{
@@ -87,8 +94,8 @@ public:
// ncells = nb de cells deja utilisees
if (n_cells <TAI_POOL)
{
- internal_adress = n_pools -1; // low 10 bits : pool number
- internal_adress |= n_cells << 10; // 22 high bits : cell number in pool
+ internal_adress = n_pools -1; // low 12 bits : pool number
+ internal_adress |= n_cells << 12; // 20 high bits : cell number in pool
n_cells ++;
return internal_adress;
@@ -105,8 +112,8 @@ public:
n_pools++;
n_cells = 1;
- internal_adress = n_pools -1; // low 8 bits : pool number
- // 22 high bits are 0
+ internal_adress = n_pools -1;
+ // 20 high bits are 0
return internal_adress;
}
@@ -115,8 +122,8 @@ public:
/** */
cell* internal_ptr_to_cell_pointer(cell_ptr_t internal_ptr)
{
- unsigned int numpool = internal_ptr & 1023;
- unsigned int numcell = internal_ptr >> 10;
+ unsigned int numpool = internal_ptr & 4095;
+ unsigned int numcell = internal_ptr >> 12;
return (tab_pool[numpool] + numcell);
}
@@ -128,13 +135,15 @@ public:
{
FREE ( tab_pool[i] );
}
-
+ memset(tab_pool[1],0,TAI_POOL*sizeof(cell));
+
//on repasse sur premiere pool
pool_courante = tab_pool[1];
n_cells=0;
n_pools=2;
}
+
//sort the pools according to some comparator
//warning this will reorder cells and thus making existing pointers to cells irrelevant
@@ -150,6 +159,7 @@ public:
// la pool en cours de remplissage
std::sort( tab_pool[n_pools-1], tab_pool[n_pools-1] + n_cells, comparator);
+
}
@@ -172,7 +182,7 @@ public:
public:
typedef std::pair<int, cell *> cellpair_t; //id pointer of pool , cell *
- struct sortcellpair { bool operator() (cellpair_t &l,cellpair_t &r) { return ( (* l.second).val < (* r.second).val ); } } ;
+ struct sortcellpair { bool operator() (cellpair_t &l,cellpair_t &r) { return !( (* l.second).graine <= (* r.second).graine ); } } ;
IteratorSorted (Pool<cell>& aRef) : ref(aRef), done(true) {}
@@ -198,6 +208,8 @@ public:
cellpair_t current_pair = pq.top() ; pq.pop();
*this->_item = * (current_pair.second);
+
+
//push the next cell of this list if any
unsigned int cell_number = current_pair.second - ref.tab_pool[current_pair.first] ;
unsigned int current_pool = current_pair.first;
diff --git a/gatb-core/src/gatb/tools/misc/impl/Stringify.hpp b/gatb-core/src/gatb/tools/misc/impl/Stringify.hpp
index 29d1eef..9e00264 100644
--- a/gatb-core/src/gatb/tools/misc/impl/Stringify.hpp
+++ b/gatb-core/src/gatb/tools/misc/impl/Stringify.hpp
@@ -28,6 +28,10 @@
/********************************************************************************/
+#if !defined(__CYGWIN__) && !defined(_GNU_SOURCE) // following https://github.com/cliffordwolf/icestorm/issues/50
+#define _GNU_SOURCE // for vasprintf
+#endif
+
#include <string>
#include <iostream>
#include <stdarg.h>
diff --git a/gatb-core/src/gatb/tools/misc/impl/Tool.cpp b/gatb-core/src/gatb/tools/misc/impl/Tool.cpp
index bea289f..b9b8a74 100644
--- a/gatb-core/src/gatb/tools/misc/impl/Tool.cpp
+++ b/gatb-core/src/gatb/tools/misc/impl/Tool.cpp
@@ -50,7 +50,7 @@ Tool::Tool (const std::string& name) : userDisplayHelp(0), _helpTarget(0),userDi
setOutput (new Properties());
setInfo (new Properties());
- _info->add (0, _name);
+ // _info->add (0, _name);
/** We create an options parser. */
setParser (new OptionsParser(name));
diff --git a/gatb-core/src/gatb/tools/storage/impl/Storage.cpp b/gatb-core/src/gatb/tools/storage/impl/Storage.cpp
index 04342e1..3d800e9 100644
--- a/gatb-core/src/gatb/tools/storage/impl/Storage.cpp
+++ b/gatb-core/src/gatb/tools/storage/impl/Storage.cpp
@@ -301,7 +301,292 @@ Storage::istream::~istream ()
{
delete rdbuf();
}
+
+
+///////////////////////////////////////
+////////// SuperKmerBinFiles //////////
+///////////////////////////////////////
+
+/** Sets up 'nb_files' files named '<name>.<i>' under 'path' and opens them all for writing. */
+SuperKmerBinFiles::SuperKmerBinFiles(const std::string& path,const std::string& name, size_t nb_files)
+    : _basefilename(name), _path(path), _nb_files(nb_files)
+{
+    // Per-file counters (byte size, kmer count) start at zero.
+    _FileSize.resize(_nb_files,0);
+    _nbKmerperFile.resize(_nb_files,0);
+    openFiles("wb"); // write mode first; use closeFiles() then openFiles() again to read back
+}
+
+/** Opens file 'fileId' ('<_basefilename>.<fileId>' in _path) with 'mode' and creates its synchronizer. */
+void SuperKmerBinFiles::openFile( const char* mode, int fileId)
+{
+    std::stringstream fileName;
+    fileName << _basefilename << "." << fileId;
+    _files[fileId] = system::impl::System::file().newFile (_path, fileName.str(), mode);
+    _synchros[fileId] = system::impl::System::thread().newSynchronizer();
+    _synchros[fileId]->use();
+}
+
+/** Creates the directory _path (mode 0755) and opens every file of the set with 'mode',
+ * creating one synchronizer per file. */
+void SuperKmerBinFiles::openFiles( const char* mode)
+{
+    // One file slot and one synchronizer slot per file.
+    _files.resize(_nb_files,0);
+    _synchros.resize(_nb_files,0);
+
+    // The files live in their own directory.
+    system::impl::System::file().mkdir(_path, 0755);
+
+    // openFile() performs the per-file steps: build the name, open the file, create the synchronizer.
+    const unsigned int fileCount = _files.size();
+    for(unsigned int fileId=0; fileId<fileCount; ++fileId)
+    {
+        openFile(mode, fileId);
+    }
+}
+
+
+/** \return the path of file 'fileId', built as '<_path>/<_basefilename>.<fileId>'. */
+std::string SuperKmerBinFiles::getFileName(int fileId)
+{
+    std::stringstream fullPath;
+    fullPath << _path << "/";
+    fullPath << _basefilename << "." << fileId;
+    return fullPath.str();
+}
+
+
+/** Reads the next block of file 'file_id' into '*block', under the file's synchronizer.
+ * A block is a header holding its byte size followed by that many bytes; '*block' is grown
+ * with realloc (and '*max_block_size' updated) when the block does not fit.
+ * \return the block size in bytes, or 0 when no further block header could be read. */
+int SuperKmerBinFiles::readBlock(unsigned char ** block, unsigned int* max_block_size, unsigned int* nb_bytes_read, int file_id)
+{
+    _synchros[file_id]->lock();
+    // Block header: the size lands in *nb_bytes_read, so the read is sized on that object.
+    const int headerItems = _files[file_id]->fread(nb_bytes_read, sizeof(*nb_bytes_read),1);
+    if(headerItems == 0)
+    {
+        _synchros[file_id]->unlock();   // end of file
+        return 0;
+    }
+    if(*nb_bytes_read > *max_block_size)
+    {
+        // Go through a temporary: a failed realloc must not overwrite (and leak) the caller's buffer.
+        unsigned char* grown = (unsigned char *) realloc(*block, *nb_bytes_read);
+        if(grown == 0) { _synchros[file_id]->unlock(); throw system::Exception ("SuperKmerBinFiles::readBlock: unable to allocate %u bytes", *nb_bytes_read); }
+        *block = grown;
+        *max_block_size = *nb_bytes_read;
+    }
+
+    _files[file_id]->fread(*block, sizeof(unsigned char),*nb_bytes_read);   // block payload
+    _synchros[file_id]->unlock();
+    return *nb_bytes_read;
+}
+/** \return the kmer count accumulated for file 'fileId' (sum of the 'nbkmers' passed to writeBlock). */
+int SuperKmerBinFiles::getNbItems(int fileId)
+{
+    return this->_nbKmerperFile[fileId];
+}
+
+
+/** \return the number of bytes accounted for file 'fileId' by writeBlock, block headers included. */
+u_int64_t SuperKmerBinFiles::getFileSize(int fileId)
+{
+    return this->_FileSize[fileId];
+}
+
+/** Computes size statistics (in bytes) over all files: total, biggest, smallest and mean.
+ * With no file at all, total, biggest and mean are 0 and smallest keeps its all-ones start value. */
+void SuperKmerBinFiles::getFilesStats(u_int64_t & total, u_int64_t & biggest, u_int64_t & smallest, float & mean)
+{
+    total = 0;
+    biggest = 0;
+    smallest = ~0;   // all bits set, i.e. the largest u_int64_t
+    mean = 0;
+    for(size_t ii=0; ii<_FileSize.size(); ii++)
+    {
+        total += _FileSize[ii];
+        biggest = std::max (biggest, _FileSize[ii]);
+        smallest = std::min (smallest, _FileSize[ii]);
+    }
+    if(!_FileSize.empty()) { mean = (float) total / (float) _FileSize.size(); }  // floating-point division keeps the fractional part
+}
+
+
+/** Appends one block to file 'file_id' while holding that file's synchronizer.
+ * On disk the block is its size (an unsigned int header) followed by 'block_size' raw bytes;
+ * the per-file kmer count and byte size are updated with 'nbkmers' and the bytes written. */
+void SuperKmerBinFiles::writeBlock(unsigned char * block, unsigned int block_size, int file_id, int nbkmers)
+{
+    _synchros[file_id]->lock();
+
+    // bookkeeping: the header bytes are counted in the file size as well
+    _FileSize[file_id] += block_size+sizeof(block_size);
+    _nbKmerperFile[file_id] += nbkmers;
+
+    _files[file_id]->fwrite(&block_size, sizeof(block_size),1);          // block header
+    _files[file_id]->fwrite(block, sizeof(unsigned char),block_size);    // block payload
+
+    _synchros[file_id]->unlock();
+}
+
+/** Flushes every file that is currently open, each under its own synchronizer.
+ * Files that have been closed are skipped without touching their synchronizer. */
+void SuperKmerBinFiles::flushFiles()
+{
+    for(unsigned int ii=0;ii<_files.size();ii++)
+    {
+        // closeFile()/closeFiles() forget the synchronizer of a closed file but keep the
+        // pointer, so the open check has to come before the synchronizer is used.
+        if(_files[ii]==0)  { continue; }
+
+        _synchros[ii]->lock();
+        _files[ii]->flush();
+        _synchros[ii]->unlock();
+    }
+}
+/** Removes every file of the set from disk, then the directory _path itself. */
+void SuperKmerBinFiles::eraseFiles()
+{
+    const unsigned int fileCount = _files.size();
+    for(unsigned int fileId=0; fileId<fileCount; ++fileId)
+    {
+        // getFileName() builds '<_path>/<_basefilename>.<fileId>'
+        system::impl::System::file().remove(getFileName(fileId));
+    }
+    system::impl::System::file().rmdir(_path);
+}
+
+/** Closes file 'fileId' if it is open: deletes the file object and forgets its synchronizer. */
+void SuperKmerBinFiles::closeFile( int fileId)
+{
+    if(_files[fileId]==0)  { return; }   // already closed: nothing to release
+
+    delete _files[fileId];
+    _files[fileId] = 0;
+    _synchros[fileId]->forget();
+}
+
+
+/** Closes every file that is still open: deletes the file object and forgets its synchronizer. */
+void SuperKmerBinFiles::closeFiles()
+{
+    const unsigned int fileCount = _files.size();
+    for(unsigned int fileId=0; fileId<fileCount; ++fileId)
+    {
+        if(_files[fileId]==0)  { continue; }   // this one is already closed
+        delete _files[fileId];
+        _files[fileId] = 0;
+        _synchros[fileId]->forget();
+    }
+}
+/** Destructor: closes the files that are still open, then removes them and their directory from disk. */
+SuperKmerBinFiles::~SuperKmerBinFiles()
+{
+    closeFiles();
+    eraseFiles();
+}
+/** \return the number of file slots (size of _files). */
+int SuperKmerBinFiles::nbFiles()
+{
+    return static_cast<int>(_files.size());
+}
+
+////////////////////////////////////////////
+////////// CacheSuperKmerBinFiles /////////
+////////////////////////////////////////////
+
+
+
+
+/** Builds a write cache in front of 'ref': one buffer of 'buffsize' bytes per file of 'ref'. */
+CacheSuperKmerBinFiles::CacheSuperKmerBinFiles(SuperKmerBinFiles * ref, int buffsize )
+{
+    _ref      = ref;
+    _nb_files = _ref->nbFiles();
+
+    _buffer_max_capacity = buffsize; // this is per file, per thread
+    _max_superksize      = 255;      // this is extra size from regular kmer; ie total max superksize is kmersize + _max_superksize
+
+    // per-file bookkeeping: pending kmer count, write offset in the buffer, and the buffer itself
+    _nbKmerperFile.resize(_nb_files,0);
+    _buffers_idx.resize(_nb_files,0);
+    _buffers.resize(_nb_files);
+
+    // every file gets its own buffer of _buffer_max_capacity bytes
+    const size_t bufferBytes = sizeof(u_int8_t) * _buffer_max_capacity;
+    for(size_t fileId=0; fileId<_buffers.size(); ++fileId)
+    {
+        _buffers[fileId] = (u_int8_t*) MALLOC (bufferBytes);
+    }
+}
+
+/** Copy constructor: shares the SuperKmerBinFiles target and the settings of 'p' but allocates
+ * its own, empty buffers (the pending content of 'p' is not copied). */
+CacheSuperKmerBinFiles::CacheSuperKmerBinFiles (const CacheSuperKmerBinFiles& p)
+{
+    _ref                 = p._ref;
+    _nb_files            = p._nb_files;
+    _max_superksize      = p._max_superksize;
+    _buffer_max_capacity = p._buffer_max_capacity;
+    _nbKmerperFile.resize(_nb_files,0);
+    _buffers_idx.resize(_nb_files,0);
+    _buffers.resize(_nb_files);
+    const size_t bufferBytes = sizeof(u_int8_t) * _buffer_max_capacity;
+    for(size_t fileId=0; fileId<_buffers.size(); ++fileId)
+    {
+        _buffers[fileId] = (u_int8_t*) MALLOC (bufferBytes);
+    }
+}
+/** Flushes the buffer of every file, in file order. */
+void CacheSuperKmerBinFiles::flushAll()
+{
+    const unsigned int bufferCount = _buffers.size();
+    for(unsigned int fileId=0; fileId<bufferCount; ++fileId)
+    {
+        flush(fileId);
+    }
+}
+
+
+/** Writes the pending content of buffer 'file_id' (if any) to _ref as one block, then resets the buffer. */
+void CacheSuperKmerBinFiles::flush(int file_id)
+{
+    if(_buffers_idx[file_id]==0)  { return; }   // nothing buffered for this file
+
+    _ref->writeBlock(_buffers[file_id],_buffers_idx[file_id],file_id,_nbKmerperFile[file_id]);
+    // the buffer is empty again, and so is its pending kmer count
+    _nbKmerperFile[file_id] = 0;
+    _buffers_idx[file_id]   = 0;
+}
+
+
+/** Appends one record (the byte 'nbk' followed by 'nb_bytes' bytes of 'superk') to the buffer of
+ * 'file_id', flushing that buffer first when the record would not fit in the remaining space.
+ * NOTE(review): a record larger than _buffer_max_capacity would still overrun the buffer -- confirm callers cannot produce one. */
+void CacheSuperKmerBinFiles::insertSuperkmer(u_int8_t* superk, int nb_bytes, u_int8_t nbk, int file_id)
+{
+    if( (_buffers_idx[file_id]+nb_bytes+1) > _buffer_max_capacity)  { flush(file_id); }
+
+    u_int8_t* const buffer = _buffers[file_id];
+    buffer[_buffers_idx[file_id]++] = nbk;                        // 1-byte record header
+    memcpy(buffer + _buffers_idx[file_id] , superk,nb_bytes);     // record payload
+    _buffers_idx[file_id] += nb_bytes;
+
+    _nbKmerperFile[file_id]+=nbk;
+}
+/** Destructor: flushes what is still buffered, then frees every buffer. */
+CacheSuperKmerBinFiles::~CacheSuperKmerBinFiles()
+{
+    flushAll();
+    for(size_t fileId=0; fileId<_buffers.size(); ++fileId)
+    {
+        FREE (_buffers[fileId]);
+    }
+}
/********************************************************************************/
} } } } } /* end of namespaces. */
/********************************************************************************/
diff --git a/gatb-core/src/gatb/tools/storage/impl/Storage.hpp b/gatb-core/src/gatb/tools/storage/impl/Storage.hpp
index daf5081..39e5c2a 100644
--- a/gatb-core/src/gatb/tools/storage/impl/Storage.hpp
+++ b/gatb-core/src/gatb/tools/storage/impl/Storage.hpp
@@ -220,6 +220,102 @@ protected:
std::vector<Group*> _groups;
};
+
+
+ ////////////////////////////////////////////////////////////
+ ////////////////// superkmer storage ///////////////////////
+ ////////////////////////////////////////////////////////////
+
+//this class manages the set of temporary files needed to store superkmers
+//to be used in conjunction with the CacheSuperKmerBinFiles below for buffered IO
+
+
+// Note (guillaume) : not very GATB-friendly since it completely ignores GATB bag, bagcache, collection, partition, iterableFile, etc ..
+// but I do not know how to use gatb classes with a variable size type (the superkmer)
+// and anyway the gatb storage complex hierarchy is error-prone (personal opinion)
+// so, hell, just recreate an adhoc buffered storage here for superkmers
+// it does not need to be templated for kmer size, superkmers are inserted as u_int8_t*
+
+
+
+//block header = 4B = block size
+//then block = list of pairs < superk length = 1B , superkmer = nB >
+//the block structure makes it easier for buffered read,
+//otherwise we would not know how to read a big chunk without stopping in the middle of superkmer
+
+class SuperKmerBinFiles
+{
+
+public:
+
+ //constructor will open the files for writing
+ //use closeFiles to close them all then openFiles to open in different mode
+ SuperKmerBinFiles(const std::string& path,const std::string& name, size_t nb_files);
+
+ ~SuperKmerBinFiles();
+
+ void closeFiles();
+ void flushFiles();
+ void eraseFiles();
+ void openFiles(const char* mode);
+ void openFile( const char* mode, int fileId);
+ void closeFile( int fileId);
+
+ //read/write block of superkmers to file file_id
+ //readBlock will re-allocate the block buffer if needed (current size passed by max_block_size)
+ int readBlock(unsigned char ** block, unsigned int* max_block_size, unsigned int* nb_bytes_read, int file_id);
+ void writeBlock(unsigned char * block, unsigned int block_size, int file_id, int nbkmers); // CacheSuperKmerBinFiles::flush passes (buffer, bytes used, file id, kmer count)
+
+ int nbFiles();
+ int getNbItems(int fileId);
+
+ void getFilesStats(u_int64_t & total, u_int64_t & biggest, u_int64_t & smallest, float & mean);
+ u_int64_t getFileSize(int fileId);
+
+
+ std::string getFileName(int fileId);
+private:
+
+ std::string _basefilename;
+ std::string _path;
+
+ std::vector<int> _nbKmerperFile; // presumably one entry per file, indexed by file id - TODO confirm
+ std::vector<u_int64_t> _FileSize;
+
+ std::vector<system::IFile* > _files;
+ std::vector <system::ISynchronizer*> _synchros; // presumably one synchronizer per file to serialize concurrent block I/O - TODO confirm
+ int _nb_files;
+};
+
+
+
+//encapsulate SuperKmerBinFiles I/O with a buffer
+class CacheSuperKmerBinFiles
+{
+ public:
+ CacheSuperKmerBinFiles(SuperKmerBinFiles * ref, int buffsize);
+
+ CacheSuperKmerBinFiles (const CacheSuperKmerBinFiles& p); // NOTE(review): no copy-assignment operator is declared although the class owns MALLOC'ed buffers; the implicit one would copy the raw pointers - confirm it is never used
+
+ void insertSuperkmer(u_int8_t* superk, int nb_bytes, u_int8_t nbk, int file_id); // buffers one record <nbk, superk>, flushing first if it would not fit
+ void flushAll(); // flush every buffer
+ void flush(int file_id); // write buffer file_id through _ref->writeBlock and empty it
+ ~CacheSuperKmerBinFiles(); // flushes all buffers, then frees them
+
+private:
+ SuperKmerBinFiles * _ref; // target file set; not deleted by this class's destructor
+ int _max_superksize;
+ int _buffer_max_capacity; // size in bytes of each buffer
+ int _nb_files;
+
+ std::vector< u_int8_t* > _buffers; // one buffer per file, indexed by file id
+ std::vector<int> _buffers_idx; // bytes currently used in each buffer
+ std::vector<int> _nbKmerperFile; // kmers buffered per file since the last flush
+
+};
+
+
+
/**********************************************************************
###### # ###### ####### ### ####### ### ####### # #
# # # # # # # # # # # # ## #
@@ -517,7 +613,7 @@ public:
* \param[in] autoRemove : auto delete the storage from file system during Storage destructor.
* \return the created Storage instance
*/
- Storage* create (const std::string& name, bool deleteIfExist, bool autoRemove);
+ Storage* create (const std::string& name, bool deleteIfExist, bool autoRemove, bool dont_add_extension = false, bool append = false);
/** Tells whether or not a Storage exists in file system given a name
* \param[in] name : name of the storage to be checked
diff --git a/gatb-core/src/gatb/tools/storage/impl/Storage.tpp b/gatb-core/src/gatb/tools/storage/impl/Storage.tpp
index 2e12404..b011425 100644
--- a/gatb-core/src/gatb/tools/storage/impl/Storage.tpp
+++ b/gatb-core/src/gatb/tools/storage/impl/Storage.tpp
@@ -525,11 +525,11 @@ namespace gatb { namespace core { namespace tools { namespace storage {
/*********************************************************************
*********************************************************************/
-inline Storage* StorageFactory::create (const std::string& name, bool deleteIfExist, bool autoRemove)
+inline Storage* StorageFactory::create (const std::string& name, bool deleteIfExist, bool autoRemove, bool dont_add_extension, bool append)
{
switch (_mode)
{
- case STORAGE_HDF5: return StorageHDF5Factory::createStorage (name, deleteIfExist, autoRemove);
+ case STORAGE_HDF5: return StorageHDF5Factory::createStorage (name, deleteIfExist, autoRemove, dont_add_extension, append);
case STORAGE_FILE: return StorageFileFactory::createStorage (name, deleteIfExist, autoRemove);
case STORAGE_GZFILE: return StorageGzFileFactory::createStorage (name, deleteIfExist, autoRemove);
case STORAGE_COMPRESSED_FILE: return StorageSortedFactory::createStorage (name, deleteIfExist, autoRemove);
diff --git a/gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp b/gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp
index df41fa3..29e0949 100644
--- a/gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp
+++ b/gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp
@@ -56,9 +56,9 @@ public:
* \param[in] autoRemove : auto delete the storage from file system during Storage destructor.
* \return the created Storage instance
*/
- static Storage* createStorage (const std::string& name, bool deleteIfExist, bool autoRemove)
+ static Storage* createStorage (const std::string& name, bool deleteIfExist, bool autoRemove, bool dont_add_extension = false, bool append = false)
{
- return new StorageHDF5 (STORAGE_HDF5, name, deleteIfExist, autoRemove);
+ return new StorageHDF5 (STORAGE_HDF5, name, deleteIfExist, autoRemove, dont_add_extension, append);
}
/** Tells whether or not a Storage exists in file system given a name
@@ -193,18 +193,21 @@ private:
class StorageHDF5 : public Storage
{
public:
- StorageHDF5 (StorageMode_e mode, const std::string& name, bool deleteIfExist, bool autoRemove)
- : Storage (mode, name, autoRemove), _fileId(0), _name(name)
+ StorageHDF5 (StorageMode_e mode, const std::string& name, bool deleteIfExist, bool autoRemove, bool dont_add_extension = false, bool append = false)
+ : Storage (mode, name, autoRemove), _fileId(0), _name(name), _dont_add_extension(dont_add_extension)
{
if (deleteIfExist) { system::impl::System::file().remove (getActualName()); }
/** We test the actual name exists in filesystem. */
bool exists = system::impl::System::file().doesExist(getActualName());
-
+
if (exists==true)
{
/** We open the existing file. */
- _fileId = H5Fopen (getActualName().c_str(), H5F_ACC_RDWR, H5P_DEFAULT); /* FIXME: now all files are opened as read/write because I need that in Graph.cpp for opening exiting h5 files this needs better interface */
+ if (append)
+ _fileId = H5Fopen (getActualName().c_str(), H5F_ACC_RDWR, H5P_DEFAULT); /* opened as read/write. Used in Graph.cpp, for opening exiting h5 files (e.g. created by DSK) and adding graph stuff to them (eg bloom, mphf) */
+ else
+ _fileId = H5Fopen (getActualName().c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
}
else
{
@@ -231,7 +234,8 @@ private:
hid_t _fileId;
std::string _name;
std::string _actualName;
-
+ bool _dont_add_extension;
+
/** */
std::string getActualName ()
{
@@ -240,7 +244,7 @@ private:
{
_actualName = _name;
/** We check whether the given name has a ".h5" suffix. */
- if (_name.rfind(".h5") == std::string::npos) { _actualName += ".h5"; }
+ if ((_name.rfind(".h5") == std::string::npos) && !_dont_add_extension ) { _actualName += ".h5"; }
}
return _actualName;
diff --git a/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.gz b/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.gz
new file mode 100644
index 0000000..34dc1b7
Binary files /dev/null and b/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.gz differ
diff --git a/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.leon-ref b/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.leon-ref
new file mode 100644
index 0000000..04232d4
Binary files /dev/null and b/gatb-core/test/db/NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.leon-ref differ
diff --git a/gatb-core/test/db/README.md b/gatb-core/test/db/README.md
new file mode 100644
index 0000000..9b380f0
--- /dev/null
+++ b/gatb-core/test/db/README.md
@@ -0,0 +1,22 @@
+This folder contains data set used by GATB-Core CPPUnit test codes.
+
+## Leon files
+
+* Leon*.fastq: DO NOT modify
+
+* leon1.fastq.leon-ref
+ and leon2.fastq.leon-ref created as follows
+
+ leon -c -file ./gatb-core/gatb-core/test/db/leon1.fastq -lossless -verbose 0 -kmer-size 31 -abundance 1
+
+* NIST7035* files from:
+
+ ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/Garvan_NA12878_HG001_HiSeq_Exome/NIST7035_TAAGGCGA_L001_R1_001.fastq.gz
+
+* giab.hg002* files from:
+
+ ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/CORNELL_Oxford_Nanopore/giab.hg002.2D.fastq
+
+* Note: loading files from ftp server can be done as follows:
+
+ curl --user anonymous:YOUR-EMAIL ftp://ftp-trace.../.../NIST7035.fastq.gz -o NIST7035.fastq.gz
diff --git a/gatb-core/test/db/giab.hg002.2D_6K.fastq.gz b/gatb-core/test/db/giab.hg002.2D_6K.fastq.gz
new file mode 100644
index 0000000..2f5f5ff
Binary files /dev/null and b/gatb-core/test/db/giab.hg002.2D_6K.fastq.gz differ
diff --git a/gatb-core/test/db/leon1.fastq b/gatb-core/test/db/leon1.fastq
new file mode 100644
index 0000000..9aaf53b
--- /dev/null
+++ b/gatb-core/test/db/leon1.fastq
@@ -0,0 +1,28 @@
+@SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+TGAANACCTCGAAACTTTTTCAGCGGNNTCNTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+0000!<:<;:@AAA=@:@@@A at AA@#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+NNNNNNNNNNNNTGAATAAATACTTTTTGCAGATGCTAAAACAATTTCCAAGTAAAAAAATTATNNNNNNNNTNGGCNAGCAGNNGTGAANNNGGNNNAT
++SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!#!###!#####!!#####!!!##!!!##
+@SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+CAGTNAATTTTCGTCGATTTTTCCAANNTTNCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+0000!8;9;;BBBB@<95?;BABAA#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+NNNNNNNNNNNNAATGAGCTGAAAAATGTCAAAATTTCGAAAAATTGGCCGGAAAATGACCGAANNNNNNNNNNNNTNGNCGANNATTGANNNNGNNNGN
++SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!!!!!#!#!###!!#####!!!!#!!!#!
+@SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+ATATNATATCTATTAAACGTCTAGCTNNACNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+00//!;9<<:CCCCCCCCCCCCCBC#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+NNNNNNNNNNNNTTTTATTACTGTAGAAAATTATCTCGTTTTTTCTTTAGTTCAAACAATTTATNNNNNNNNANAATNANGAGNNACATGNNNNANNNTA
++SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!#!###!#!###!!#####!!!!#!!!##
+@SRR065390.4 HWUSI-EAS687_61DAJ:8:1:1056:9989 length=100
+TTTGNCAATCGACTTTAGTGTTAACCGNATNTTTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.4 HWUSI-EAS687_61DAJ:8:1:1056:9989 length=100
+00//!00000BAB<AB>BBABAB<B##!##!####!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/gatb-core/test/db/leon1.fastq.leon-ref b/gatb-core/test/db/leon1.fastq.leon-ref
new file mode 100644
index 0000000..742c7b0
Binary files /dev/null and b/gatb-core/test/db/leon1.fastq.leon-ref differ
diff --git a/gatb-core/test/db/leon2.fastq b/gatb-core/test/db/leon2.fastq
new file mode 100644
index 0000000..9aaf53b
--- /dev/null
+++ b/gatb-core/test/db/leon2.fastq
@@ -0,0 +1,28 @@
+@SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+TGAANACCTCGAAACTTTTTCAGCGGNNTCNTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+0000!<:<;:@AAA=@:@@@A at AA@#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+NNNNNNNNNNNNTGAATAAATACTTTTTGCAGATGCTAAAACAATTTCCAAGTAAAAAAATTATNNNNNNNNTNGGCNAGCAGNNGTGAANNNGGNNNAT
++SRR065390.1 HWUSI-EAS687_61DAJ:8:1:1055:3384 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!#!###!#####!!#####!!!##!!!##
+@SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+CAGTNAATTTTCGTCGATTTTTCCAANNTTNCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+0000!8;9;;BBBB@<95?;BABAA#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+NNNNNNNNNNNNAATGAGCTGAAAAATGTCAAAATTTCGAAAAATTGGCCGGAAAATGACCGAANNNNNNNNNNNNTNGNCGANNATTGANNNNGNNNGN
++SRR065390.2 HWUSI-EAS687_61DAJ:8:1:1055:17846 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!!!!!#!#!###!!#####!!!!#!!!#!
+@SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+ATATNATATCTATTAAACGTCTAGCTNNACNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+00//!;9<<:CCCCCCCCCCCCCBC#!!##!##!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+@SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+NNNNNNNNNNNNTTTTATTACTGTAGAAAATTATCTCGTTTTTTCTTTAGTTCAAACAATTTATNNNNNNNNANAATNANGAGNNACATGNNNNANNNTA
++SRR065390.3 HWUSI-EAS687_61DAJ:8:1:1056:16949 length=100
+!!!!!!!!!!!!####################################################!!!!!!!!#!###!#!###!!#####!!!!#!!!##
+@SRR065390.4 HWUSI-EAS687_61DAJ:8:1:1056:9989 length=100
+TTTGNCAATCGACTTTAGTGTTAACCGNATNTTTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++SRR065390.4 HWUSI-EAS687_61DAJ:8:1:1056:9989 length=100
+00//!00000BAB<AB>BBABAB<B##!##!####!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/gatb-core/test/db/leon2.fastq.leon-ref b/gatb-core/test/db/leon2.fastq.leon-ref
new file mode 100644
index 0000000..ccd3d02
Binary files /dev/null and b/gatb-core/test/db/leon2.fastq.leon-ref differ
diff --git a/gatb-core/test/jenkins/leon/README.md b/gatb-core/test/jenkins/leon/README.md
new file mode 100644
index 0000000..583a28f
--- /dev/null
+++ b/gatb-core/test/jenkins/leon/README.md
@@ -0,0 +1,62 @@
+| **Functional tests** |
+|----------------------|
+| [![Build Status](https://ci.inria.fr/gatb-core/view/Leon/job/tool-leon-functional-tests/badge/icon)](https://ci.inria.fr/gatb-core/view/Leon/job/tool-leon-functional-tests/) |
+
+# Introduction
+
+This directory contains reference material used to test Leon compressor
+by means of Jenkins tasks.
+
+It is intended to be used by Genscale team.
+
+# Scripts
+
+Tests are actually run by 'tool-leon-functional-tests' Jenkins task
+from GATB project on INRIA CI platform; please refer to:
+[https://ci.inria.fr/gatb-core/view/Leon/job/tool-leon-functional-tests/](https://ci.inria.fr/gatb-core/view/Leon/job/tool-leon-functional-tests/)
+
+this task runs the following scripts in sequence:
+
+* first: ```tool-leon-functional-tests-compile.sh``` to compile GATB-Core binaries
+* then: ```tool-leon-functional-tests-test.sh``` to run Leon on 15 SRA files
+ totaling more than 50 Gb of gzipped reads data.
+
+# Retrieving data files
+
+NCBI SRA files have to be retrieved from Genocluster computing nodes
+using the following script: ```download.sh```. That script can be used from
+a terminal session or, even better, using GoDocker.
+
+Then, files integrity can be checked using ```test_integrity.sh``` script.
+
+# Data files
+
+Files used to test Leon are as follows:
+
+```Leon original publication, G. Benoit et al., 2015```:
+[https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0709-7](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0709-7)
+cf. Table ST1, supplementary file:
+
+* SRR065390: C. elegans WGS Illumina
+* SRR959239: E. coli WGS Illumina
+* SRR1519083: Metagenome Illumina
+* SRR1870605: E. coli WGS Illumina Miseq
+* SRR445718: Human RNA-seq Illumina
+* SRR857303: E. coli WGS Ion Torrent
+
+```Evaluation of Leon by Y. Zhang et al, 2017```:
+[http://csse.szu.edu.cn/staff/zhuzx/LWFQZip2/SupplementaryFile.pdf](http://csse.szu.edu.cn/staff/zhuzx/LWFQZip2/SupplementaryFile.pdf)
+Cf. table S10:
+
+* SRR2916693: 454 GS
+* SRR2994368: Illumina Miseq
+* SRR3211986: Pacbio RS
+* ERR739513: MiniION
+* SRR3190692: Illumina Miseq
+* ERR385912: Illumina Hiseq 2000
+* ERR386131: Ion Torrent PGM
+* SRR034509: Illumina Analyzer II
+* ERR174310: Illumina Hiseq 2000
+
+
+
diff --git a/gatb-core/test/jenkins/leon/download.sh b/gatb-core/test/jenkins/leon/download.sh
new file mode 100755
index 0000000..38d6f61
--- /dev/null
+++ b/gatb-core/test/jenkins/leon/download.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# This script relies on the use of NCBI SRA Tools to fetch FastQ files from
+# SRA entry IDs.
+#
+# To install SRA Tools on Genocluster, simply use conda as follows:
+# . /local/env/envconda.sh
+# conda config --add channels bioconda
+# conda create -p ~/sra sra-tools=2.8.1
+#
+# The above procedure has to be done only once.
+# Then, to activate the conda 'sra' tools, simply use:
+# source activate ~/sra
+#
+# Patrick G. Durand, May 2017
+#
+
+# === CONDA environment: direct use on Genocluster ===
+#. /local/env/envconda.sh
+#source activate ~/sra
+# === CONDA environment: use from GoDocker/Genocluster ===
+. /softs/local/env/envconda.sh
+source activate $GODOCKER_HOME/sra
+
+# === DATA directory ===
+cd /omaha-beach/pdurand/sra-for-leon
+
+SRA_TOOL_PATH=`which fastq-dump`
+if [ -z "$SRA_TOOL_PATH" ] ; then
+ echo "NCBI SRA Tools not found. Please activate Conda sratools..."
+ echo " use: source activate ~/sra"
+ exit 1
+fi
+
+# This list taken from http://csse.szu.edu.cn/staff/zhuzx/LWFQZip2/SupplementaryFile.pdf
+# Table S10.
+# array_ok: SRA files on which Leon is ok
+# array_nok: SRA files on which Leon is not ok
+# (according to article authors: lose fidelity after decompression)
+array_ok=(SRR2916693 SRR2994368 SRR3190692 ERR385912 SRR034509 ERR174310)
+# ok files should include: ERR194147, but failed to download from NCBI after several trials!
+array_nok=(SRR3211986 ERR739513 ERR386131)
+
+# data set from Leon publication
+array_leon=(SRR065390 SRR959239 SRR857303 SRR1870605 SRR445718 SRR1519083)
+
+array=("${array_ok[@]}" "${array_nok[@]}" "${array_leon[@]}")
+
+echo "Nb. SRA files to download: ${#array[*]}"
+
+echo "SRA are:"
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ if [ -e $srafile ]; then
+ echo " ${srafile}: exists"
+ else
+ echo " ${srafile}: will be downloaded"
+ fi
+done
+
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ if [ ! -e $srafile ]; then
+ echo "> downloading ${item} to ${srafile} ..."
+ echo " time fastq-dump --gzip $item"
+ time fastq-dump --gzip $item
+ fi
+done
+
+
diff --git a/gatb-core/test/jenkins/leon/test_integrity.sh b/gatb-core/test/jenkins/leon/test_integrity.sh
new file mode 100755
index 0000000..ae9758a
--- /dev/null
+++ b/gatb-core/test/jenkins/leon/test_integrity.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# This script tests integrity of gzipped files.
+# It can be used to test SRA files retrieved from NCBI
+# using download.sh script.
+#
+# Patrick G. Durand, May 2017
+#
+
+i=0
+while read line
+do
+ array[ $i ]="$line"
+ (( i++ ))
+done < <(ls -1 *.fastq.gz)
+
+echo "Nb. SRA files to test: ${#array[*]}"
+
+echo "SRA are:"
+for item in ${array[*]}
+do
+ echo "> Checking ${item} ..."
+ FILESIZE=$(stat -c%s "${item}")
+ FILESIZE=$(( FILESIZE/(1024*1024) ))
+ echo " size= $FILESIZE Mb."
+ gunzip -t ${item}
+ if [ $? -ne 0 ]; then
+ echo " ${item}: invalid"
+ fi
+done
+
diff --git a/gatb-core/test/jenkins/publish-doc-api.sh b/gatb-core/test/jenkins/leon/tool-leon-functional-tests-compile.sh
similarity index 66%
copy from gatb-core/test/jenkins/publish-doc-api.sh
copy to gatb-core/test/jenkins/leon/tool-leon-functional-tests-compile.sh
index a505480..64ef92d 100755
--- a/gatb-core/test/jenkins/publish-doc-api.sh
+++ b/gatb-core/test/jenkins/leon/tool-leon-functional-tests-compile.sh
@@ -3,6 +3,8 @@
# Continuous integration script for Jenkins #
#--------------------------------------------------------------#
#
+# !!! This script has only been tested using Koriscale CI slave.
+#
# Default mode :
# This script will exit with error (exit code 1) if any of its steps fails.
# To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below).
@@ -23,6 +25,7 @@ pwd : `pwd`
BRANCH_TO_BUILD : ${BRANCH_TO_BUILD}
RELEASE_TO_BUILD : ${RELEASE_TO_BUILD}
INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN}
+TEST_VARIABLE : ${TEST_VARIABLE}
DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR}
-----------------------------------------
@@ -31,45 +34,47 @@ DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR}
BUILD_NUMBER : ${BUILD_NUMBER}
JOB_NAME : ${JOB_NAME}
-
"
-error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } }
-
-
-[ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." ; echo; }
-set -xv
-
-# quick look at resources
#---------------------------------------------------------------
+# quick look at resources
free -h
-#---------------------------------------------------------------
-lstopo
-#---------------------------------------------------------------
-#df -kh
-#---------------------------------------------------------------
-
################################################################
# COMPILATION #
################################################################
+if $DO_NOT_COMPILE; then
+ echo "SKIP COMPILE PHASE"
+ exit 0
+fi
+
+# Make sure, we use the appropriate cmake on Koriscale
+export PATH=/home/ci-gatb/cmake-3.7.2-Linux-x86_64/bin:$PATH
+
+# dump compiler information
+gcc --version
+g++ --version
+gcc -dumpversion
+cmake --version
+
JENKINS_TASK=${JOB_NAME}
GIT_DIR=/home/ci-gatb/workspace/$JENKINS_TASK/gatb-core
BUILD_DIR=/home/ci-gatb/scratchdir/$JENKINS_TASK/gatb-core/build
-rm -rf $BUILD_DIR
mkdir -p $BUILD_DIR
-
cd $BUILD_DIR
#---------------------------------------------------------------
-cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} -DJENKINS_GFORGE_USER=${INRIA_FORGE_LOGIN} $GIT_DIR
+# compile in default mode: Release (otherwise Leon compressor will
+# be very, very slow using Debug mode)
+cmake -Wno-dev \
+ -DCPPUNIT_INCLUDE_DIR=/usr/include/cppunit/ \
+ -DCPPUNIT_LIBRARY=/usr/lib64/libcppunit.so \
+ $GIT_DIR
+
+# we compile all GATB-Core: library, tools (including leon) and snippets
+# (snippets bank26 to bank28 are used to test leon)
+make -j8
+make -j8 examples
-#---------------------------------------------------------------
-make -j 2 doc || error_code
-make deploy-doc || error_code
-
-################################################################
-# END #
-################################################################
diff --git a/gatb-core/test/jenkins/leon/tool-leon-functional-tests-test.sh b/gatb-core/test/jenkins/leon/tool-leon-functional-tests-test.sh
new file mode 100755
index 0000000..e88f9d3
--- /dev/null
+++ b/gatb-core/test/jenkins/leon/tool-leon-functional-tests-test.sh
@@ -0,0 +1,266 @@
+#!/bin/bash
+
+# This script compares original SRA FastQ files with Leon compressed ones.
+#
+# Patrick G. Durand, May 2017
+#
+
+# !!! This script only works with bash --version 4+ on Linux systems.
+# (it uses hash tables, stat -c, among others)
+# !!! This script has only been tested using Koriscale CI slave.
+
+# ==================================================================
+# Where are the reference SRA files?
+DATA_DIR=/mnt/windows/ci-gatb/data/leon
+# Where are the binaries: leon and bank snippets
+JENKINS_TASK=${JOB_NAME}
+BIN_DIR=/home/ci-gatb/scratchdir/$JENKINS_TASK/gatb-core/build/bin
+# How many cores to use?
+CORES=8
+# Do we have to use leon in verbose mode?
+VERBOSE=0
+# default k-mer size
+KMER_SIZE=31
+# Bank snippets used to test Leon generated data files
+BANK_SNIPPET_1=bank25
+BANK_SNIPPET_2=bank26
+BANK_SNIPPET_3=bank28
+
+
+# ==================================================================
+# == Control values
+# 5 data fields: nb. letters, nb. sequences, max seq size,
+# min seq size, nb. sequences < k-mer size
+ERR385912_valids=(139175685 2728935 51 51 0)
+SRR2916693_valids=(204077299 405343 1201 67 0)
+SRR959239_valids=(526537536 5372832 98 98 0)
+ERR386131_valids=(544138147 3601856 371 8 14165)
+ERR739513_valids=(450281548 122240 246140 5 1235)
+SRR1870605_valids=(543194861 1119218 502 70 0)
+SRR857303_valids=(505750634 2581532 368 6 32095)
+SRR3211986_valids=(917247967 163477 62746 2 1762)
+SRR2994368_valids=(2258185851 5054526 502 70 0)
+SRR034509_valids=(2091430836 10353618 202 202 0)
+SRR445718_valids=(3294366500 32943665 100 100 0)
+SRR3190692_valids=(1300585440 9314994 602 70 0)
+SRR065390_valids=(2466741904 67617092 100 100 0)
+SRR1519083_valids=(1734577366 59698462 101 101 0)
+ERR174310_valids=(3276346670 207579467 202 202 0)
+
+declare -A control_map
+control_map[ERR385912]=${ERR385912_valids[*]}
+control_map[SRR2916693]=${SRR2916693_valids[*]}
+control_map[SRR959239]=${SRR959239_valids[*]}
+control_map[ERR386131]=${ERR386131_valids[*]}
+control_map[ERR739513]=${ERR739513_valids[*]}
+control_map[SRR1870605]=${SRR1870605_valids[*]}
+control_map[SRR857303]=${SRR857303_valids[*]}
+control_map[SRR3211986]=${SRR3211986_valids[*]}
+control_map[SRR2994368]=${SRR2994368_valids[*]}
+control_map[SRR034509]=${SRR034509_valids[*]}
+control_map[SRR445718]=${SRR445718_valids[*]}
+control_map[SRR3190692]=${SRR3190692_valids[*]}
+control_map[SRR065390]=${SRR065390_valids[*]}
+control_map[SRR1519083]=${SRR1519083_valids[*]}
+control_map[ERR174310]=${ERR174310_valids[*]}
+
+# ==================================================================
+# == Useful variables
+# store size of SRA reference files (.fastq.gz)
+declare -A ref_file_size_map
+# use to check whether or not some tests failed
+# (0: ok ; !=0: not ok)
+TEST_RESULT=0
+# set a dedicated time format
+TIMEFORMAT=' Time - real:%3lR | user:%3lU | sys:%3lS'
+
+# ==================================================================
+# == Useful methods
+checkDataFile(){
+ # takes two arguments:
+ # $1 : the file base name without its extension
+ # $2 : the file name to test
+ item=$1
+ srafile=$2
+ echo " $BIN_DIR/$BANK_SNIPPET_3 -in ${srafile} -kmer-size $KMER_SIZE"
+ RESULT=`time $BIN_DIR/$BANK_SNIPPET_3 -in ${srafile} -kmer-size $KMER_SIZE` # only the snippet's stdout is captured
+ echo " answer : $RESULT"
+ echo " control: ${control_map[${item}]}"
+ if [ -z "${control_map[${item}]}" ]; then
+ echo " ERROR: no control value"
+ (( TEST_RESULT++ )) # global error counter, tested by the caller after each batch of checks
+ else
+ i=0 # not declared local : overwrites the caller's i
+ RESULT_ARRAY=($RESULT) # split the answer on whitespace, one field per control value
+ for VALUE in ${control_map[${item}]}
+ do
+ if (( VALUE != RESULT_ARRAY[i] )); then # numeric comparison, field by field
+ echo " ERROR on value $i: $VALUE != ${RESULT_ARRAY[$i]}"
+ (( TEST_RESULT++ ))
+ fi
+ (( i++ ))
+ done
+ fi
+}
+
+# ==================================================================
+# == Start processing
+
+echo "
+-----------------------------------------
+ Data information
+-----------------------------------------
+slave : $NODE_NAME
+dir : $DATA_DIR"
+
+cd $DATA_DIR
+
+# dump the full list of '*.fastq.gz' files available
+# in working directory
+echo "> SRA FastQ files available in ${DATA_DIR}:"
+i=0
+j=0
+while read line
+do
+ FILESIZE=$(stat -c%s "${line}")
+ FILESIZE=$(( FILESIZE/(1024*1024) ))
+ echo " ${line}: $FILESIZE Mb"
+ fname=`echo $line | cut -d'.' -f 1`
+ ref_file_size_map[$fname]=$FILESIZE
+ if (( FILESIZE <= MAX_FILE_SIZE )); then
+ array[ $i ]="${fname}"
+ (( i++ ))
+ fi
+ allfiles[ $j ]="$fname"
+ (( j++ ))
+done < <(ls -Sr -1 *.fastq.gz)
+
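+# List the files retained by the size filter above (size <= MAX_FILE_SIZE, in Mb; MAX_FILE_SIZE is not set in this script, presumably a Jenkins job parameter)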
+echo "> Total SRA files: ${#allfiles[*]}"
+echo "> SRA files to handle now: ${#array[*]} (size <= $MAX_FILE_SIZE Mb)"
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ echo " ${srafile}: ${ref_file_size_map[$item]} Mb"
+done
+
+echo "-----------------------"
+echo "Running content test..."
+echo "-----------------------"
+date
+
+# Check file content with GATB Bank API
+echo "> SRA file content (using $BANK_SNIPPET_3 snippet):"
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ echo " ${srafile}: ${ref_file_size_map[$item]} Mb"
+ checkDataFile $item $srafile
+done
+
+if (( TEST_RESULT != 0 )); then
+ echo "FAILURE: check $TEST_RESULT error(s), above"
+ exit 1
+fi
+
+echo "----------------------"
+echo "Running compression..."
+echo "----------------------"
+date
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ echo "> compress ${srafile}: ${ref_file_size_map[$item]} Mb"
+ echo " leon -file ${srafile} -c -lossless -nb-cores $CORES -verbose $VERBOSE -kmer-size $KMER_SIZE"
+ time $BIN_DIR/leon -file ${srafile} -c -lossless -nb-cores $CORES -verbose $VERBOSE -kmer-size $KMER_SIZE
+ leonfile="${item}.fastq.leon"
+ FILESIZE=$(stat -c%s "${leonfile}")
+ FILESIZE=$(( FILESIZE/(1024*1024) ))
+ echo " ${leonfile}: $FILESIZE Mb"
+done
+
+echo "-----------------------------------"
+echo "Running compression content test..."
+echo "-----------------------------------"
+date
+TEST_RESULT=0
+echo "> LEON file content (using $BANK_SNIPPET_3 snippet):"
+for item in ${array[*]}
+do
+ leonfile="${item}.fastq.leon"
+ echo " ${leonfile}:"
+ checkDataFile $item $leonfile
+done
+
+if (( TEST_RESULT != 0 )); then
+ echo "FAILURE: check $TEST_RESULT error(s), above"
+ exit 1
+fi
+
+echo "------------------------"
+echo "Running decompression..."
+echo "------------------------"
+date
+for item in ${array[*]}
+do
+ leonfile="${item}.fastq.leon"
+ echo "> decompress ${leonfile}"
+ echo " leon -file ${leonfile} -d -nb-cores $CORES -verbose $VERBOSE"
+ time $BIN_DIR/leon -file ${leonfile} -d -nb-cores $CORES -verbose $VERBOSE
+done
+
+echo "------------------------------------"
+echo "Running decompressed content test..."
+echo "------------------------------------"
+date
+TEST_RESULT=0
+echo "> FastQ/Leon file content (using $BANK_SNIPPET_3 snippet):"
+for item in ${array[*]}
+do
+ leonfile="${item}.fastq.d"
+ echo " ${leonfile}:"
+ checkDataFile $item $leonfile
+done
+
+if (( TEST_RESULT != 0 )); then
+ echo "FAILURE: check $TEST_RESULT error(s), above"
+ exit 1
+fi
+
+echo "----------------------------------------"
+echo "Running comparison '.gz' vs. '.leon'..."
+echo "----------------------------------------"
+date
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ leonfile="${item}.fastq.leon"
+ echo "> compare ${srafile} vs ${leonfile} ..."
+ echo " $BANK_SNIPPET_1 ${srafile} ${leonfile}"
+ time $BIN_DIR/$BANK_SNIPPET_1 ${srafile} ${leonfile}
+done
+
+echo "------------------------------------"
+echo "Running comparison '.gz' vs. '.d'..."
+echo "------------------------------------"
+date
+for item in ${array[*]}
+do
+ srafile="${item}.fastq.gz"
+ leonfile="${item}.fastq.d"
+ echo "> compare ${srafile} vs ${leonfile} ..."
+ echo " $BANK_SNIPPET_1 ${srafile} ${leonfile}"
+ time $BIN_DIR/$BANK_SNIPPET_1 ${srafile} ${leonfile}
+done
+
+echo "-----------"
+echo "Cleaning..."
+echo "-----------"
+for item in ${array[*]}
+do
+ rm -f ${item}.fastq.d
+ rm -f ${item}.fastq.h5
+ rm -f ${item}.fastq.leon
+done
+rm -rf trashme*
+
diff --git a/gatb-core/test/jenkins/publish-doc-api.sh b/gatb-core/test/jenkins/publish-doc-api.sh
index a505480..490468a 100755
--- a/gatb-core/test/jenkins/publish-doc-api.sh
+++ b/gatb-core/test/jenkins/publish-doc-api.sh
@@ -55,8 +55,24 @@ lstopo
################################################################
JENKINS_TASK=${JOB_NAME}
-GIT_DIR=/home/ci-gatb/workspace/$JENKINS_TASK/gatb-core
-BUILD_DIR=/home/ci-gatb/scratchdir/$JENKINS_TASK/gatb-core/build
+
+MACHINE="$(hostname)"   # GIT_DIR / BUILD_DIR depend on the CI host running this script
+case "$MACHINE" in
+koriscale*)
+    echo "$MACHINE"
+    GIT_DIR=/home/ci-gatb/workspace/$JENKINS_TASK/gatb-core
+    BUILD_DIR=/home/ci-gatb/scratchdir/$JENKINS_TASK/gatb-core/build
+    ;;
+gatb-core-ubuntu16-docker)
+    echo "$MACHINE"
+    GIT_DIR=/builds/workspace/$JENKINS_TASK/gatb-core
+    BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-core/build
+    ;;
+*)
+    echo "Error: unsupported build host '$MACHINE': no GIT_DIR/BUILD_DIR defined for it" >&2
+    exit 1
+    ;;
+esac
rm -rf $BUILD_DIR
mkdir -p $BUILD_DIR
diff --git a/gatb-core/test/jenkins/test-bin-debian7-64bits-gcc-4.7.sh b/gatb-core/test/jenkins/test-bin-debian7-64bits-gcc-4.7.sh
index 270908b..33a6298 100755
--- a/gatb-core/test/jenkins/test-bin-debian7-64bits-gcc-4.7.sh
+++ b/gatb-core/test/jenkins/test-bin-debian7-64bits-gcc-4.7.sh
@@ -20,7 +20,7 @@ pwd : `pwd`
-----------------------------------------
Jenkins build parameters (user defined)
-----------------------------------------
-BRANCH_TO_BUILD : ${BRANCH_TO_BUILD}
+VERSION_TO_TEST : ${VERSION_TO_TEST}
INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN}
DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR}
@@ -66,7 +66,7 @@ cd $BUILD_DIR
# Upload bin bundle to the forge; source bundle is made by OSX Jenkins task
if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then
echo "Getting a binary archive... "
- scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-core/htdocs/ci-inria/gatb-core-${BRANCH_TO_BUILD}-bin-Linux.tar.gz .
+ scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-core/htdocs/versions/bin/gatb-core-${VERSION_TO_TEST}-bin-Linux.tar.gz .
fi
################################################################
@@ -77,9 +77,9 @@ g++ --version
[ `gcc -dumpversion` = 4.7 ] && { echo "GCC 4.7"; } || { echo "GCC version is not 4.7, we exit"; exit 1; }
-gunzip gatb-core-${BRANCH_TO_BUILD}-bin-Linux.tar.gz
-tar -xf gatb-core-${BRANCH_TO_BUILD}-bin-Linux.tar
-cd gatb-core-${BRANCH_TO_BUILD}-bin-Linux
+gunzip gatb-core-${VERSION_TO_TEST}-bin-Linux.tar.gz
+tar -xf gatb-core-${VERSION_TO_TEST}-bin-Linux.tar
+cd gatb-core-${VERSION_TO_TEST}-bin-Linux
code_snippets=($(find ./examples -name "*1.cpp"))
for code_snippet in ${code_snippets[*]}
@@ -90,5 +90,5 @@ done
# do some cleanup to save disk space
cd ..
-rm -rf gatb-core-${BRANCH_TO_BUILD}-bin-Linux*
+rm -rf gatb-core-${VERSION_TO_TEST}-bin-Linux*
diff --git a/gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1 b/gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1.sh
similarity index 89%
rename from gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1
rename to gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1.sh
index d906d43..a7d2b49 100755
--- a/gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1
+++ b/gatb-core/test/jenkins/test-bin-macos-10.9.5-gcc-4.2.1.sh
@@ -16,7 +16,7 @@ pwd : `pwd`
--------------------------
Jenkins build parameters
--------------------------
-BRANCH_TO_BUILD : ${BRANCH_TO_BUILD}
+VERSION_TO_TEST : ${VERSION_TO_TEST}
INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN}
DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR}
"
@@ -56,7 +56,7 @@ cd $BUILD_DIR
# Upload bin bundle to the forge; source bundle is made by OSX Jenkins task
if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then
echo "Getting a binary archive... "
- scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-core/htdocs/ci-inria/gatb-core-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz .
+ scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-core/htdocs/versions/bin/gatb-core-${VERSION_TO_TEST}-bin-Darwin.tar.gz .
fi
################################################################
@@ -67,9 +67,9 @@ g++ --version
[ `gcc -dumpversion` = 4.2.1 ] && { echo "GCC 4.2.1"; } || { echo "GCC version is not 4.2.1, we exit"; exit 1; }
-gunzip gatb-core-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz
-tar -xf gatb-core-${BRANCH_TO_BUILD}-bin-Darwin.tar
-cd gatb-core-${BRANCH_TO_BUILD}-bin-Darwin
+gunzip gatb-core-${VERSION_TO_TEST}-bin-Darwin.tar.gz
+tar -xf gatb-core-${VERSION_TO_TEST}-bin-Darwin.tar
+cd gatb-core-${VERSION_TO_TEST}-bin-Darwin
code_snippets=($(find ./examples -name "*1.cpp"))
for code_snippet in ${code_snippets[*]}
@@ -80,5 +80,5 @@ done
# do some cleanup to save disk space
cd ..
-rm -rf gatb-core-${BRANCH_TO_BUILD}-bin-Darwin*
+rm -rf gatb-core-${VERSION_TO_TEST}-bin-Darwin*
diff --git a/gatb-core/test/jenkins/test-docker-gatb-core-compile-clang36.sh b/gatb-core/test/jenkins/test-docker-gatb-core-compile-clang36.sh
new file mode 100755
index 0000000..a032011
--- /dev/null
+++ b/gatb-core/test/jenkins/test-docker-gatb-core-compile-clang36.sh
@@ -0,0 +1,42 @@
+# == GATB Compiler machine
+# must exist as a docker container on the VM!
+COMP_MACHINE=gatb_compiler_clang36
+
+# == enter working directory (abort if it is missing: nothing below makes sense elsewhere)
+DK_WORK_DIR=/builds/workspace/${JOB_NAME}
+cd "${DK_WORK_DIR}" || exit 1
+
+# == we have a dedicated directory per BRANCH_TO_BUILD
+[ ! -d "${BRANCH_TO_BUILD}" ] && { mkdir "${BRANCH_TO_BUILD}" || exit 1; }
+cd "${BRANCH_TO_BUILD}" || exit 1
+
+# == we do the git clone (docker container cannot do that, for now)
+[ ! -d gatb-core ] && { git clone git+ssh://gatb-ci@scm.gforge.inria.fr/gitroot/gatb-core/gatb-core.git || exit 1; }
+
+# == we get the appropriate branch to build (abort if it cannot be checked out)
+cd gatb-core || exit 1
+git checkout "${BRANCH_TO_BUILD}" || exit 1
+git pull
+cd ..
+
+# == important notice
+# do not delete build directory: it is done by
+# gatb-compile.sh script, below. In addition
+# user 'ci' won't have permission to do that:
+# build dir being created from the container
+# perspective, it is owned by root.
+
+# == we set some variables to prepare volume mount points between container and host
+# on host, gatb is here:
+DK_MOUNT=${DK_WORK_DIR}/${BRANCH_TO_BUILD}
+# from the container, we access source code here:
+G_CODE=/tmp/gatb-core-code
+# from the container, we prepare build here:
+G_BUILD=/tmp/gatb-core-build
+
+# == we run docker to *COMPILE* GATB-Core (no point in testing if this step fails)
+docker run --rm --name ${COMP_MACHINE} -e "GIT_PROVIDER=ci" -v "${DK_MOUNT}:${G_CODE}" -v "${DK_MOUNT}:${G_BUILD}" ${COMP_MACHINE} gatb-compile.sh || exit 1
+
+# == we run docker to *TEST* GATB-Core
+docker run --rm --name ${COMP_MACHINE} -e "GIT_PROVIDER=ci" -v "${DK_MOUNT}:${G_CODE}" -v "${DK_MOUNT}:${G_BUILD}" ${COMP_MACHINE} gatb-test.sh
+
diff --git a/gatb-core/test/jenkins/test-snippets-debian7-64bits-gcc-4.7.sh b/gatb-core/test/jenkins/test-snippets-debian7-64bits-gcc-4.7.sh
index 574323d..55d578e 100755
--- a/gatb-core/test/jenkins/test-snippets-debian7-64bits-gcc-4.7.sh
+++ b/gatb-core/test/jenkins/test-snippets-debian7-64bits-gcc-4.7.sh
@@ -62,10 +62,10 @@ mkdir -p $BUILD_DIR
cd $BUILD_DIR
#---------------------------------------------------------------
-cmake -Wno-dev $GIT_DIR
+cmake -DGATB_CORE_INCLUDE_EXAMPLES=True -Wno-dev $GIT_DIR
#---------------------------------------------------------------
-make -j 2 examples
+make -j2 examples
#>>>>>>>>>>>>>>>>>>>>>
#fi
diff --git a/gatb-core/test/jenkins/test-suite-fedora20-gcc-4.8.sh b/gatb-core/test/jenkins/test-suite-fedora20-gcc-4.8.sh
index 300abaa..6f6243a 100755
--- a/gatb-core/test/jenkins/test-suite-fedora20-gcc-4.8.sh
+++ b/gatb-core/test/jenkins/test-suite-fedora20-gcc-4.8.sh
@@ -35,6 +35,10 @@ JOB_NAME : ${JOB_NAME}
"
+# Make sure, we use the appropriate cmake
+
+export PATH=/home/ci-gatb/cmake-3.7.2-Linux-x86_64/bin:$PATH
+
error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } }
@@ -130,4 +134,4 @@ ls ../test/db/ # default directory for test db
################################################################
# END #
-################################################################
\ No newline at end of file
+################################################################
diff --git a/gatb-core/test/slaves/ubuntu16-shell-provisioner.sh b/gatb-core/test/slaves/ubuntu16-shell-provisioner.sh
new file mode 100755
index 0000000..37d0710
--- /dev/null
+++ b/gatb-core/test/slaves/ubuntu16-shell-provisioner.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Shell script for provisioning an Ubuntu 16.04 LTS slave on Inria cloudstack to compile GATB-CORE
+# (for use with Jenkins, ci.inria.fr). Usage: <this script> HOST_NAME ; runs unattended, presumably as root (system commands are called directly)
+
+set -xv
+set -e
+
+# Configure hostname
+# ------------------
+
+#HOST_NAME=gatb-core-ubuntu16-docker
+HOST_NAME=$1
+
+[ -z "$HOST_NAME" ] && { echo "Please give a HOST_NAME argument to this script..."; exit 1; }
+
+hostnamectl set-hostname "$HOST_NAME"
+
+# Install necessary packages
+# --------------------------
+
+apt-get -y update
+
+apt-get install -y --no-install-recommends \
+ vim git wget make zlib1g-dev hwloc \
+ doxygen graphviz \
+ valgrind libxml2-utils cmake
+
+# Install gcc-4.7 instead of gcc-5
+# --------------------------------
+# Note : gcc 5.4.0 is already installed
+
+apt-get install -y gcc-4.7 g++-4.7 gcc-4.7-base
+
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.7 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.7
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 40 --slave /usr/bin/g++ g++ /usr/bin/g++-5
+update-alternatives --set gcc /usr/bin/gcc-4.7
+
+
+# Install cppunit-1.12
+# --------------------
+# Note: the libcppunit-dev ubuntu package corresponds to version 1.13, which is not compatible with gatb-core
+
+cd ~/ # now in /builds
+git clone git://anongit.freedesktop.org/git/libreoffice/cppunit/ cppunit_gcc47
+cd cppunit_gcc47
+git checkout cppunit-1.12.1
+
+./autogen.sh
+
+./configure LDFLAGS=-Wl,--no-as-needed
+
+make
+
+make check
+
+make install
+
+# mount point for external hard drive
+[ -d /scratchdir ] || { mkdir /scratchdir; chown -R ci:ci /scratchdir; }
+
+
+# Install Docker
+# --------------
+# Note: see https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/
+
+# ... Install Docker, part 1 (every apt-get call gets -y: nobody is there to answer prompts)
+
+apt-get remove -y docker docker-engine docker.io
+
+apt-get install -y \
+ apt-transport-https \
+ ca-certificates \
+ curl \
+ software-properties-common
+
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+apt-key fingerprint 0EBFCD88
+
+add-apt-repository \
+ "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+ $(lsb_release -cs) \
+ stable"
+
+apt-get -y update
+apt-get -y install docker-ce
+
+# just to check
+docker run hello-world
+
+# ... Install Docker, part 2
+
+getent group docker || groupadd docker
+usermod -aG docker ci
+
+# Some cleaning
+# -------------
+
+apt-get clean
diff --git a/gatb-core/test/unit/src/bank/TestBank.cpp b/gatb-core/test/unit/src/bank/TestBank.cpp
index 13da91b..8b59c90 100644
--- a/gatb-core/test/unit/src/bank/TestBank.cpp
+++ b/gatb-core/test/unit/src/bank/TestBank.cpp
@@ -1065,7 +1065,7 @@ public:
CPPUNIT_ASSERT (Bank::getType(DBPATH("album.txt")) == "album");
CPPUNIT_ASSERT (Bank::getType(DBPATH("sample1.fa")) == "fasta");
- CPPUNIT_ASSERT (Bank::getType(DBPATH("sample.fastq")) == "fasta");
+ CPPUNIT_ASSERT (Bank::getType(DBPATH("sample.fastq")) == "fastq");
}
/********************************************************************************/
diff --git a/gatb-core/test/unit/src/bank/TestLeon.cpp b/gatb-core/test/unit/src/bank/TestLeon.cpp
new file mode 100644
index 0000000..f923851
--- /dev/null
+++ b/gatb-core/test/unit/src/bank/TestLeon.cpp
@@ -0,0 +1,435 @@
+/*****************************************************************************
+ * GATB : Genome Assembly Tool Box
+ * Copyright (C) 2014 R.Chikhi, G.Rizk, E.Drezen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*****************************************************************************/
+
+#include <CppunitCommon.hpp>
+
+#include <gatb/system/impl/System.hpp>
+
+#include <gatb/bank/impl/Banks.hpp>
+
+#include <gatb/bank/impl/Bank.hpp>
+#include <gatb/bank/impl/BankHelpers.hpp>
+
+#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
+
+#include <gatb/tools/misc/api/Macros.hpp>
+
+#include <gatb/tools/compression/Leon.hpp>
+
+#include <list>
+#include <stdlib.h> /* srand, rand */
+#include <time.h> /* time */
+
+using namespace std;
+
+using namespace gatb::core::system;
+using namespace gatb::core::system::impl;
+
+using namespace gatb::core::bank;
+using namespace gatb::core::bank::impl;
+
+using namespace gatb::core::tools::dp;
+using namespace gatb::core::tools::dp::impl;
+
+using namespace gatb::core::tools::misc;
+
+extern std::string DBPATH (const string& a);
+
+/********************************************************************************/
+namespace gatb { namespace tests {
+/********************************************************************************/
+
+/** \brief Test class for Leon compress/decompressor
+ */
+class TestLeon : public Test
+{
+ /********************************************************************************/
+ CPPUNIT_TEST_SUITE_GATB (TestLeon);
+
+ CPPUNIT_TEST_GATB(bank_checkLeon1);
+ /*
+ CPPUNIT_TEST_GATB(bank_checkLeon2);
+ CPPUNIT_TEST_GATB(bank_checkLeon3);
+ CPPUNIT_TEST_GATB(bank_checkLeon4);
+ CPPUNIT_TEST_GATB(bank_checkLeon5);
+ CPPUNIT_TEST_GATB(bank_checkLeon6);
+ CPPUNIT_TEST_GATB(bank_checkLeon7);
+ CPPUNIT_TEST_GATB(bank_checkLeon8);*/
+
+ CPPUNIT_TEST_SUITE_GATB_END();
+
+public:
+
+ /********************************************************************************/
+ void setUp () { srand (time(NULL)); } // seeds the C random generator with the current time
+ void tearDown () {} // nothing to release
+
+ /********************************************************************************/
+
+ /**
+ * Assert (CppUnit) that two banks hold the same sequences: same count and,
+ * pairwise in iteration order, same comment, same letters and same quality.
+ * \param[in] bank1, bank2 : the banks to compare; each one is iterated twice.
+ * */
+ void bank_compare_banks_equality(IBank* bank1, IBank* bank2){
+ // First pass: one iterator per bank, only used to count the sequences.
+ Iterator<Sequence>* itFas = bank1->iterator();
+ Iterator<Sequence>* itLeon = bank2->iterator();
+ u_int64_t nbSeqFas = 0;
+ u_int64_t nbSeqLeon = 0;
+ {
+ LOCAL(itFas);
+ LOCAL(itLeon);
+ // We do not use estimate() methods. Instead, we count
+ // exact number of sequences in both banks
+ for (itFas->first(); !itFas->isDone(); itFas->next()){nbSeqFas++;}
+ for (itLeon->first(); !itLeon->isDone(); itLeon->next()){nbSeqLeon++;}
+ CPPUNIT_ASSERT(nbSeqFas==nbSeqLeon);
+ }
+ itFas = bank1->iterator();
+ itLeon = bank2->iterator();
+ LOCAL(itFas);
+ LOCAL(itLeon);
+ // We create a PairedIterator to go through both banks simultaneously
+ PairedIterator<Sequence,Sequence> it (itFas, itLeon);
+ // Second pass: nbSeqFas is reset and now counts the pairs actually compared
+ nbSeqFas =0;
+ for (it.first(); !it.isDone(); it.next())
+ {
+ nbSeqFas++;
+ // check sequence comment for equality
+ CPPUNIT_ASSERT(it->first.getComment().compare(it->second.getComment())==0);
+ // check sequence letters for equality
+ CPPUNIT_ASSERT(it->first.toString().compare(it->second.toString())==0);
+ // check sequence quality for equality
+ CPPUNIT_ASSERT(it->first.getQuality().compare(it->second.getQuality())==0);
+ }
+ CPPUNIT_ASSERT(nbSeqFas==nbSeqLeon);
+ }
+
+    // http://stackoverflow.com/a/7026414
+    /*******************************************************************************
+     * From a fastq, generate leon compressed file, then compare that file to
+     * the Fastq reference. Allow to check that Leon compression is still ok
+     * over releases of GATB-Core. LOSSLESS version (Leon is run with -lossless).
+     * \param[in] fastqFile : path of the Fastq file handed to Leon through '-file'
+     * \param[in] leonFile  : path of the Leon file then opened for the comparison
+     * */
+    void bank_leon_compress_and_compare (const std::string& fastqFile, const std::string& leonFile)
+    {
+        // STEP 1: compress the Fastq reference file
+
+        // we prepare the Leon command-line
+        std::vector<char*> leon_args;
+        std::vector<std::string> data = {
+            "-",
+            "-c",
+            "-file", fastqFile,
+            "-lossless", // <-- LOSSLESS
+            "-verbose","0",
+            "-kmer-size", "31",
+            "-abundance", "1"
+        };
+        for(std::vector<std::string>::iterator loop = data.begin(); loop != data.end(); ++loop){
+            leon_args.push_back(&(*loop)[0]);
+        }
+
+        // we start Leon compressor
+        Leon().run(leon_args.size(), &leon_args[0]);
+
+        // STEP 2: compare reference and compressed version
+        // we open the files in read mode; banks get the same LOCAL guard as the iterators of this file, so they are not leaked
+        IBank* fasBank = Bank::open (fastqFile); //BankFasta
+        LOCAL(fasBank);
+        IBank* leonBank = Bank::open (leonFile); //BankLeon
+        LOCAL(leonBank);
+        bank_compare_banks_equality(fasBank, leonBank);
+    }
+
+ /**
+ * Run Leon compress on a FastQ file.
+ *
+ * Parameter 'mode' is one of: 0, 1, 2 (for -noqual, -noheader
+ * or -seq-only, respectively) or 3 (-lossless -reads 1000; used to
+ * test parallel compression). Any other value adds no extra argument.
+ */
+ void run_leon_compressor(std::string& fastqFile, int mode){
+ // we prepare the Leon command-line
+ std::vector<char*> leon_args;
+ std::vector<std::string> data = {
+ "-",
+ "-c",
+ "-file", fastqFile,
+ "-verbose","0",
+ "-kmer-size", "31",
+ "-abundance", "1",
+ }; // do NOT anymore modify this list (unless you also update
+ // reference files). If you need to add other Leon args, see
+ // switch() below.
+ switch (mode){
+ case 0:
+ data.push_back("-noqual");
+ break;
+ case 1:
+ data.push_back("-noheader");
+ break;
+ case 2:
+ data.push_back("-seq-only");
+ break;
+ case 3:
+ data.push_back("-lossless");
+ data.push_back("-reads");
+ data.push_back("1000");
+ break;
+ }
+ // argv-like view: one char* per argument, pointing into the strings of 'data'
+ for(std::vector<std::string>::iterator loop = data.begin(); loop != data.end(); ++loop){
+ leon_args.push_back(&(*loop)[0]);
+ }
+
+ // we start Leon compressor
+ Leon().run(leon_args.size(), &leon_args[0]);
+
+ }
+
+    /**
+     * Check the content of a Leon compressed file according to the use
+     * of Leon's argument -noqual, -noheader or -seq-only (mode= 0, 1 or 2,
+     * respectively). Any other mode iterates the file without checking anything.
+     */
+    void check_leon_content(std::string& leonFile, int mode){
+        // we open the leon file in read mode
+        IBank* leonBank = Bank::open (leonFile); //BankLeon
+        LOCAL(leonBank); // declared first, hence released after the iterator below
+        // We create ONE iterator over this bank (a second iterator() call
+        // would allocate another object that nobody releases).
+        Iterator<Sequence>* itLeon = leonBank->iterator();
+        LOCAL(itLeon);
+        for (itLeon->first(); !itLeon->isDone(); itLeon->next())
+        {
+            Sequence& seq = itLeon->item();
+            switch (mode){
+            case 0://"-noqual"
+                CPPUNIT_ASSERT(seq.getComment().size()!=0);
+                CPPUNIT_ASSERT(seq.toString().size()!=0);
+                CPPUNIT_ASSERT(seq.getQuality().size()==0);
+                break;
+            case 1://"-noheader"
+                // noheader: comment contains read rank number starting from 0
+                CPPUNIT_ASSERT(std::stoi(seq.getComment())>=0);
+                CPPUNIT_ASSERT(seq.toString().size()!=0);
+                CPPUNIT_ASSERT(seq.getQuality().size()!=0);
+                break;
+            case 2://"-seq-only"
+                // seq-only: comment contains read rank number starting from 0
+                CPPUNIT_ASSERT(std::stoi(seq.getComment())>=0);
+                CPPUNIT_ASSERT(seq.toString().size()!=0);
+                CPPUNIT_ASSERT(seq.getQuality().size()==0);
+                break;
+            }
+        }
+    }
+
+    /********************************************************************************
+     * We compare a lossless-compressed Leon file against the original Fastq file.
+     * Allow to ensure that Leon decompression is still working over GATB-Core
+     * releases.
+     */
+    void bank_checkLeon1 ()
+    {
+        // We open and test the reference file
+        string fasPath=DBPATH("leon1.fastq");
+        string btype = Bank::getType(fasPath);
+        CPPUNIT_ASSERT(
+            btype.compare("fasta")==0 ||
+            btype.compare("fastq")==0
+        );
+        IBank* fasBank = Bank::open (fasPath);
+        LOCAL(fasBank); // same guard as used on iterators in this file: the bank is not leaked
+        // We open and test the leon file
+        // Caution: this must be a LOSSLESS-compressed file
+        // (because bank_compare_banks_equality() also compares quality)
+        string leonPath=DBPATH("leon1.fastq.leon-ref");
+        btype = Bank::getType(leonPath);
+        CPPUNIT_ASSERT(btype.compare("leon")==0);
+        IBank* leonBank = Bank::open (leonPath);
+        LOCAL(leonBank);
+        bank_compare_banks_equality(fasBank, leonBank);
+    }
+
+ /*******************************************************************************
+ * From a fastq, generate leon compressed file, then compare that file to
+ * the Fastq reference. Allow to check that Leon compression is still ok
+ * over releases of GATB-Core.
+ *
+ * LOSSLESS version: delegates to bank_leon_compress_and_compare() on leon2.fastq
+ * */
+ void bank_checkLeon2(){
+ bank_leon_compress_and_compare(DBPATH("leon2.fastq"), DBPATH("leon2.fastq.leon"));
+ }
+
+    /*******************************************************************************
+     * From a fastq, generate leon compressed file, then compare that file to
+     * the leon compressed reference. Allow to check that Leon format is still ok
+     * over releases of GATB-Core.
+     *
+     * LOSSLESS version
+     * */
+    void bank_checkLeon3 ()
+    {
+        // The existing reference file
+        std::string fastqFile = DBPATH("leon2.fastq");
+        // The Leon file to create
+        string leonFile=fastqFile+".leon";
+        // The Leon file to use as a reference
+        string leonFileRef=fastqFile+".leon-ref";
+
+        // STEP 1: compress the Fastq file
+
+        // we prepare the Leon command-line
+        std::vector<char*> leon_args;
+        std::vector<std::string> data = {
+            "-",
+            "-c",
+            "-file", fastqFile,
+            "-lossless", // <-- LOSSLESS
+            "-verbose","0",
+            "-kmer-size", "31",
+            "-abundance", "1"
+        };
+        for(std::vector<std::string>::iterator loop = data.begin(); loop != data.end(); ++loop){
+            leon_args.push_back(&(*loop)[0]);
+        }
+
+        // we start Leon compressor
+        Leon().run(leon_args.size(), &leon_args[0]);
+
+        // STEP 2: compare reference and compressed version
+        // we open the files in read mode; banks get the same LOCAL guard as the iterators of this file, so they are not leaked
+        IBank* leonBank = Bank::open (leonFile); //BankLeon freshly created
+        LOCAL(leonBank);
+        IBank* leonBankRef = Bank::open (leonFileRef); //BankLeon reference
+        LOCAL(leonBankRef);
+        bank_compare_banks_equality(leonBank, leonBankRef);
+    }
+
+ /*******************************************************************************
+ * Test other args of Leon:
+ * -noqual
+ * (mode 0: comment and letters expected non-empty, quality expected empty)
+ * */
+ void bank_checkLeon4 ()
+ {
+ // The existing reference file
+ std::string fastqFile = DBPATH("leon2.fastq");
+
+ // The Leon file to create
+ string leonFile=fastqFile+".leon";
+
+ // STEP 1: compress the Fastq file
+ run_leon_compressor(fastqFile, 0);
+
+ // STEP 2: check leon content
+ check_leon_content(leonFile, 0);
+
+ }
+
+ /*******************************************************************************
+ * Test other args of Leon:
+ * -noheader
+ * (mode 1: comment must parse as an integer >= 0, letters and quality non-empty)
+ * */
+ void bank_checkLeon5 ()
+ {
+ // The existing reference file
+ std::string fastqFile = DBPATH("leon2.fastq");
+
+ // The Leon file to create
+ string leonFile=fastqFile+".leon";
+
+ // STEP 1: compress the Fastq file
+ run_leon_compressor(fastqFile, 1);
+
+ // STEP 2: check leon content
+ check_leon_content(leonFile, 1);
+
+ }
+ /*******************************************************************************
+ * Test other args of Leon:
+ * -seq-only
+ * (mode 2: comment must parse as an integer >= 0, letters non-empty, quality empty)
+ * */
+ void bank_checkLeon6 ()
+ {
+ // The existing reference file
+ std::string fastqFile = DBPATH("leon2.fastq");
+
+ // The Leon file to create
+ string leonFile=fastqFile+".leon";
+
+ // STEP 1: compress the Fastq file
+ run_leon_compressor(fastqFile, 2);
+
+ // STEP 2: check leon content
+ check_leon_content(leonFile, 2);
+ }
+
+ /**
+ * Same as bank_checkLeon2() but with a bigger file (given here as a '.fastq.gz').
+ * */
+ void bank_checkLeon7 ()
+ {
+ bank_leon_compress_and_compare(
+ DBPATH("NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.gz"),
+ DBPATH("NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq.leon")
+ );
+ }
+
+    /*******************************************************************************
+     * Test Leon parallel compression/decompression.
+     *
+     * */
+    void bank_checkLeon8 ()
+    {
+        // The existing FastQ reference file (contains 50K reads)
+        std::string fileName = DBPATH("NIST7035_TAAGGCGA_L001_R1_001_5OK.fastq");
+        std::string fastqFile = fileName + ".gz";
+        // The Leon file to create (WILL BE compressed NOW using -reads 1000)
+        string leonFile = fileName + ".leon";
+        // The Leon reference file (WAS compressed using -reads 50000)
+        string leonRefFile = leonFile+"-ref";
+        // STEP 1: compress the Fastq file ('mode=3' means use '-reads 1000')
+        run_leon_compressor(fastqFile, 3);
+        // STEP 2: compare reference and compressed version (banks guarded with LOCAL so they are not leaked)
+        IBank* leonRefBank = Bank::open (leonRefFile);
+        LOCAL(leonRefBank);
+        IBank* leonBank = Bank::open (leonFile);
+        LOCAL(leonBank);
+        bank_compare_banks_equality(leonRefBank, leonBank);
+    }
+};
+
+/********************************************************************************/
+
+CPPUNIT_TEST_SUITE_REGISTRATION (TestLeon);
+CPPUNIT_TEST_SUITE_REGISTRATION_GATB (TestLeon);
+
+/********************************************************************************/
+} } /* end of namespaces. */
+/********************************************************************************/
+
diff --git a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
index 281517a..69f634a 100644
--- a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
+++ b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
@@ -91,7 +91,7 @@ class TestDebruijn : public Test
CPPUNIT_TEST_GATB (debruijn_deletenode);
//CPPUNIT_TEST_GATB (debruijn_checksum); // FIXME removed it because it's a damn long test
CPPUNIT_TEST_GATB (debruijn_test2);
- CPPUNIT_TEST_GATB (debruijn_test3);
+ CPPUNIT_TEST_GATB (debruijn_test3); // that one is long when compiled in debug, fast in release
CPPUNIT_TEST_GATB (debruijn_test4);
CPPUNIT_TEST_GATB (debruijn_test5);
CPPUNIT_TEST_GATB (debruijn_test6);
diff --git a/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp b/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp
index c748fa9..0b1f43f 100644
--- a/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp
+++ b/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp
@@ -77,7 +77,7 @@ using namespace gatb::core::tools::storage::impl;
extern std::string DBPATH (const string& a);
-#define DEBUGprint(a) a
+#define DEBUGprint(a) //a
/********************************************************************************/
namespace gatb { namespace tests {
@@ -95,6 +95,7 @@ class TestSimplificationsUnitigs : public Test
CPPUNIT_TEST_GATB (debruijn_simplunitigs_X);
CPPUNIT_TEST_GATB (debruijn_simplunitigs_tip);
CPPUNIT_TEST_GATB (debruijn_simplunitigs_bubble);
+ CPPUNIT_TEST_GATB (debruijn_simplunitigs_bubble_snp);
CPPUNIT_TEST_SUITE_GATB_END();
public:
@@ -168,7 +169,7 @@ public:
return rc;
}
- void debruijn_traversal (GraphUnitigs& graph, string startSeq ,const char* checkStr)
+ void debruijn_traversal (GraphUnitigs& graph, string startSeq ,const char* checkStr, const char *checkStr2 = nullptr)
{
string startKmer = startSeq.substr(0, graph._kmerSize);
@@ -179,13 +180,15 @@ public:
string sequence = graph.simplePathBothDirections(node, isolatedLeft, isolatedRight, true, coverage);
string rev_seq = revcomp(sequence);
- if (sequence.compare(checkStr) != 0 && rev_seq.compare(checkStr) != 0)
+ if ((sequence.compare(checkStr) != 0 && rev_seq.compare(checkStr) != 0 && checkStr2 == nullptr) ||
+ (checkStr2 != nullptr && (sequence.compare(checkStr2) != 0 && rev_seq.compare(checkStr2) != 0)))
{
std::cout << "anticipation of checkStr failing, sequence: " << sequence << " checkStr: " << checkStr << std::endl;
graph.debugPrintAllUnitigs();
}
- CPPUNIT_ASSERT (sequence.compare(checkStr)==0 || rev_seq.compare(checkStr) == 0 );
+ CPPUNIT_ASSERT ((sequence.compare(checkStr)==0 || rev_seq.compare(checkStr) == 0) \
+ || (checkStr2 != nullptr && (sequence.compare(checkStr2) == 0 || rev_seq.compare(checkStr2) == 0)) );
}
/********************************************************************************/
@@ -271,24 +274,14 @@ public:
/********************************************************************************/
- void debruijn_simplunitigs_bubble ()
+ void debruijn_simplunitigs_bubble_aux (const char* sequences[], int nb_seqs, const char* sol1, const char* sol2=nullptr)
{
size_t kmerSize = 21;
- const char* sequences[] =
- {
- "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACC", //>works well for k=21; part of genome10K.fasta
- "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble (highly covered)
- "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
- "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
- "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2, low covered
- "CGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCACCCGTCTAGGAGGGCGTGCAGGGTCTCCCGCTAGCTAATGGTCACGGCCTCTCTGGGAAAGCTGAACAACGGATGATACCCATACTGCCACTCCAGTACCTGGGCCGCGTGTTGTACGCTGTGTATCTTGAGAGCGTTTCCAGCAGATAGAACAGGATCACATGTACAAA" // >remaining part
- };
-
// We create the graph.
GraphUnitigs graph;
debruijn_build_entry r;
- debruijn_build(sequences, ARRAY_SIZE(sequences), kmerSize, graph, r);
+ debruijn_build(sequences, nb_seqs, kmerSize, graph, r);
DEBUGprint(std::cout << "nb nodes:" << r.nbNodes << "nb nodes non deleted " << r.nbNonDeletedNodes << std::endl;)
CPPUNIT_ASSERT (r.nbNodes == 8);
@@ -319,9 +312,47 @@ public:
CPPUNIT_ASSERT (r.nbNonDeletedNodes == 6);
}
- debruijn_traversal (graph, sequences[0], "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCT [...]
+ debruijn_traversal (graph, sequences[0], sol1, sol2);
}
+ void debruijn_simplunitigs_bubble ()
+ {
+
+ const char* sequences[] =
+ {
+ "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACC", //>works well for k=21; part of genome10K.fasta
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble (highly covered)
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
+ "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2, low covered
+ "CGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCACCCGTCTAGGAGGGCGTGCAGGGTCTCCCGCTAGCTAATGGTCACGGCCTCTCTGGGAAAGCTGAACAACGGATGATACCCATACTGCCACTCCAGTACCTGGGCCGCGTGTTGTACGCTGTGTATCTTGAGAGCGTTTCCAGCAGATAGAACAGGATCACATGTACAAA" // >remaining part
+ };
+ const char* sol = "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCACC [...]
+
+ debruijn_simplunitigs_bubble_aux(sequences, ARRAY_SIZE(sequences),sol);
+ }
+
+ void debruijn_simplunitigs_bubble_snp()
+ {
+ const char* sequences[] =
+ {
+ "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACC", //>works well for k=21; part of genome10K.fasta
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
+ "TGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGT", //>that's the bubble
+ "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2, same coverage
+ "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2,
+ "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2,
+ "TGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGT", //>that's the bubble path 2,
+ "CGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCACCCGTCTAGGAGGGCGTGCAGGGTCTCCCGCTAGCTAATGGTCACGGCCTCTCTGGGAAAGCTGAACAACGGATGATACCCATACTGCCACTCCAGTACCTGGGCCGCGTGTTGTACGCTGTGTATCTTGAGAGCGTTTCCAGCAGATAGAACAGGATCACATGTACAAA" // >remaining part
+ };
+ const char* sol1 = "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACCAAAATAACGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCAC [...]
+ const char* sol2 = "CATCGATGCGAGACGCCTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGCGTTGCATCGAGCGGTGTCGCTAGCGCAATGCGCAAGATTTTGACATTTACAAGGCAACATTGCAGCGTCCGATGGTCCGGTGGCCTCCAGATAGTGTCCAGTCGCTCTAACTGTATGGAGACCATAGGCATTTACCTTATTCTCATCGCCACGCCCCAAGATCTTTAGGACCCAGCATTCCTTTAACCACTAACATAACGCGTGTCATCTAGTTCAACAACCAAAAAAACGACTCTTGCGCTCGGATGTCCGCAATGGGTTATCCCTATGTTCCGGTAATCTCTCATCTACTAAGCGCCCTAAAGGTCGTATGGTTGGAGGGCGGTTACACACCCTTAAGTACCGAACGATAGAGCAC [...]
+
+ debruijn_simplunitigs_bubble_aux(sequences, ARRAY_SIZE(sequences), sol1, sol2);
+
+ }
+
void debruijn_simplunitigs_ec ()
{
diff --git a/gatb-core/test/unit/src/kmer/TestDSK.cpp b/gatb-core/test/unit/src/kmer/TestDSK.cpp
index 0afee21..cca7029 100644
--- a/gatb-core/test/unit/src/kmer/TestDSK.cpp
+++ b/gatb-core/test/unit/src/kmer/TestDSK.cpp
@@ -98,13 +98,11 @@ class TestDSK : public Test
CPPUNIT_TEST_GATB (DSK_check1);
CPPUNIT_TEST_GATB (DSK_check2);
CPPUNIT_TEST_GATB (DSK_check3);
-
CPPUNIT_TEST_GATB (DSK_perBank1);
CPPUNIT_TEST_GATB (DSK_perBank2);
-
CPPUNIT_TEST_GATB (DSK_perBankKmer);
-
CPPUNIT_TEST_GATB (DSK_multibank);
+
CPPUNIT_TEST_SUITE_GATB_END();
diff --git a/gatb-core/test/unit/src/kmer/TestMPHF.cpp b/gatb-core/test/unit/src/kmer/TestMPHF.cpp
index 50bbefb..ac9dcbb 100644
--- a/gatb-core/test/unit/src/kmer/TestMPHF.cpp
+++ b/gatb-core/test/unit/src/kmer/TestMPHF.cpp
@@ -152,11 +152,12 @@ public:
Kmer kmer = model.codeSeed ("ACCATGTATAA", Data::ASCII);
- theMap[kmer.value()] = 4;
+
+ theMap.at(kmer.value()) = 4;
- if (theMap[kmer.value()] != 4)
+ if (theMap.at(kmer.value()) != 4)
std::cout << "bad map value " << theMap[kmer.value()] << " != 4" << std::endl;
- CPPUNIT_ASSERT (theMap[kmer.value()] == 4);
+ CPPUNIT_ASSERT (theMap.at(kmer.value()) == 4);
}
/********************************************************************************/
diff --git a/gatb-core/test/unit/src/main.cpp b/gatb-core/test/unit/src/main.cpp
index 66208ba..9a2faa5 100644
--- a/gatb-core/test/unit/src/main.cpp
+++ b/gatb-core/test/unit/src/main.cpp
@@ -43,9 +43,22 @@ std::string DBPATH (const string& a)
/********************************************************************************/
int main (int argc, char **argv)
{
- /** We may launch only one test. */
- char* testname = strdup (argc >=2 ? argv[1] : "All Tests");
-
+ if (argc==2 && strcmp(argv[1], "-h")==0){
+ std::cout << "Use: gatb-core-cppunit [<test-name>] [<path-to-test/db>]\n" << std::endl;
+ std::cout << " where: <test-name>: comma separated list of unit test names." << std::endl;
+ std::cout << " e.g.: 'TestLeon,TestBank'. Default: 'all'." << std::endl;
+ std::cout << " Test names are case sensitivie." << std::endl;
+ std::cout << " <path-to-test/db>: path to directory containing GATB-Core test files." << std::endl;
+ std::cout << " Default: ../test/db \n" << std::endl;
+ std::cout << "By default, tests are executed in silent mode. Use CPPUNIT_VERBOSE=1 to switch to verbose mode."<< std::endl;
+ return 0;
+ }
+ /** We may launch only selected test(s). */
+ char* testname = strdup (argc >=2 ? argv[1] : "all");
+ if (strcmp(testname, "all")==0){//shortcut to run All tests
+ testname = strdup("All Tests");
+ }
+
/** We set the directory where the db are. */
dbprefix = (argc >=3 ? argv[2] : "../test/db");
@@ -59,10 +72,13 @@ int main (int argc, char **argv)
TestResultCollector collectedresults;
testresult.addListener (&collectedresults);
-#if 1
BriefTestProgressListener progress;
- if (getenv ("CPPUNIT_VERBOSE")) { testresult.addListener (&progress); }
-#endif
+ if (getenv ("CPPUNIT_VERBOSE")) {
+ testresult.addListener (&progress);
+ }
+ else {
+ std::cout << "Tests executed in silent mode.\n -> Use CPPUNIT_VERBOSE=1 to switch to verbose mode.\n"<< std::endl;
+ }
TextTestRunner runner;
diff --git a/gatb-core/test/unit/src/tools/collections/TestMap.cpp b/gatb-core/test/unit/src/tools/collections/TestMap.cpp
index bd3dd72..9530f42 100644
--- a/gatb-core/test/unit/src/tools/collections/TestMap.cpp
+++ b/gatb-core/test/unit/src/tools/collections/TestMap.cpp
@@ -167,7 +167,7 @@ public:
for (itKeys->first(); !itKeys->isDone(); itKeys->next(), val++)
{
/** We change the value for the current key. */
- map1[itKeys->item()] = val;
+ map1.at(itKeys->item()) = val;
}
/** We check the values. */
@@ -175,7 +175,7 @@ public:
for (itKeys->first(); !itKeys->isDone(); itKeys->next(), val++)
{
/** We check the value for the current key. */
- CPPUNIT_ASSERT (map1[itKeys->item()] == val);
+ CPPUNIT_ASSERT (map1.at(itKeys->item()) == val);
}
/** We create a storage object. */
@@ -198,7 +198,7 @@ public:
for (itKeys->first(); !itKeys->isDone(); itKeys->next(), val++)
{
/** We change the value for the current key. */
- map2[itKeys->item()] = val;
+ map2.at(itKeys->item()) = val;
}
/** We check the values. */
@@ -206,14 +206,14 @@ public:
for (itKeys->first(); !itKeys->isDone(); itKeys->next(), val++)
{
/** We check the value for the current key. */
- CPPUNIT_ASSERT (map2[itKeys->item()] == val);
+ CPPUNIT_ASSERT (map2.at(itKeys->item()) == val);
}
/** We compare the values of the two maps. */
for (itKeys->first(); !itKeys->isDone(); itKeys->next(), val++)
{
/** We check the value for the current key. */
- CPPUNIT_ASSERT (map1[itKeys->item()] == map2[itKeys->item()]);
+ CPPUNIT_ASSERT (map1.at(itKeys->item()) == map2.at(itKeys->item()));
}
/** We compare the values of the two maps (index iteration) */
diff --git a/gatb-core/thirdparty/json/json.hpp b/gatb-core/thirdparty/json/json.hpp
new file mode 100644
index 0000000..5884083
--- /dev/null
+++ b/gatb-core/thirdparty/json/json.hpp
@@ -0,0 +1,659 @@
+// https://github.com/nbsdx/SimpleJSON/blob/master/json.hpp
+// license: DWTFYWPL
+
+#pragma once
+
+#ifndef THIRDPARTY_JSON
+#define THIRDPARTY_JSON
+
+
+#include <cstdint>
+#include <cmath>
+#include <cctype>
+#include <string>
+#include <deque>
+#include <map>
+#include <type_traits>
+#include <initializer_list>
+#include <ostream>
+#include <iostream>
+
+namespace json {
+
+using std::map;
+using std::deque;
+using std::string;
+using std::enable_if;
+using std::initializer_list;
+using std::is_same;
+using std::is_convertible;
+using std::is_integral;
+using std::is_floating_point;
+
+namespace {
+    /// Escapes the characters of `str` that may not appear verbatim inside a
+    /// JSON string literal: the double quote, the backslash and every control
+    /// character below 0x20.
+    ///
+    /// \b, \f, \n, \r and \t use their two-character short form; any other
+    /// control character is written as \u00XX. Every other byte (including
+    /// the bytes of multi-byte UTF-8 sequences) is copied unchanged.
+    ///
+    /// @param str raw text
+    /// @return a copy of `str` with the characters above escaped
+    string json_escape( const string &str ) {
+        static const char hexDigits[] = "0123456789abcdef";
+        string output;
+        output.reserve( str.length() );
+        for( string::size_type i = 0; i < str.length(); ++i ) {
+            // Classify on the unsigned value: where char is signed, bytes >= 0x80
+            // are negative and must not be mistaken for control characters.
+            const unsigned char c = static_cast<unsigned char>( str[i] );
+            switch( c ) {
+                case '\"': output += "\\\""; break;
+                case '\\': output += "\\\\"; break;
+                case '\b': output += "\\b";  break;
+                case '\f': output += "\\f";  break;
+                case '\n': output += "\\n";  break;
+                case '\r': output += "\\r";  break;
+                case '\t': output += "\\t";  break;
+                default  :
+                    if( c < 0x20 ) {
+                        // Remaining control characters have no short form.
+                        output += "\\u00";
+                        output += hexDigits[c >> 4];
+                        output += hexDigits[c & 0x0F];
+                    }
+                    else
+                        output += str[i];
+                    break;
+            }
+        }
+        return output;
+    }
+}
+
+/// Minimal JSON value: null, object, array, string, floating point number,
+/// integer or boolean.
+///
+/// The payload lives in the untagged union `Internal` and `Type` tells which
+/// member is the live one. Objects, arrays and strings are heap allocated and
+/// owned by the JSON instance: the special member functions below deep-copy,
+/// transfer or release them according to `Type`.
+class JSON
+{
+    /// Storage shared by all value kinds; only the member selected by `Type`
+    /// may be read.
+    union BackingData {
+        BackingData( double d ) : Float( d ){}
+        BackingData( long l ) : Int( l ){}
+        BackingData( bool b ) : Bool( b ){}
+        BackingData( string s ) : String( new string( s ) ){}
+        BackingData() : Int( 0 ){}
+
+        deque<JSON> *List;
+        map<string,JSON> *Map;
+        string *String;
+        double Float;
+        long Int;
+        bool Bool;
+    } Internal;
+
+    public:
+        /// Kind of value currently held.
+        enum class Class {
+            Null,
+            Object,
+            Array,
+            String,
+            Floating,
+            Integral,
+            Boolean
+        };
+
+        /// Range adaptor over a mutable container pointer; a null pointer
+        /// yields a pair of default-constructed iterators.
+        template <typename Container>
+        class JSONWrapper {
+            Container *object;
+
+            public:
+                JSONWrapper( Container *val ) : object( val ) {}
+                JSONWrapper( std::nullptr_t ) : object( nullptr ) {}
+
+                typename Container::iterator begin() { return object ? object->begin() : typename Container::iterator(); }
+                typename Container::iterator end() { return object ? object->end() : typename Container::iterator(); }
+                typename Container::const_iterator begin() const { return object ? object->begin() : typename Container::iterator(); }
+                typename Container::const_iterator end() const { return object ? object->end() : typename Container::iterator(); }
+        };
+
+        /// Read-only counterpart of JSONWrapper.
+        template <typename Container>
+        class JSONConstWrapper {
+            const Container *object;
+
+            public:
+                JSONConstWrapper( const Container *val ) : object( val ) {}
+                JSONConstWrapper( std::nullptr_t ) : object( nullptr ) {}
+
+                typename Container::const_iterator begin() const { return object ? object->begin() : typename Container::const_iterator(); }
+                typename Container::const_iterator end() const { return object ? object->end() : typename Container::const_iterator(); }
+        };
+
+        /// Builds a null value.
+        JSON() : Internal(), Type( Class::Null ){}
+
+        /// Builds an object from a flat list `{ key1, value1, key2, value2, ... }`.
+        /// Keys are taken from `ToString()` of the even-positioned items.
+        /// A trailing key that has no value is ignored.
+        JSON( initializer_list<JSON> list )
+            : JSON()
+        {
+            SetType( Class::Object );
+            // Only step while a complete (key, value) pair remains, so that an
+            // odd-sized list never dereferences or steps past end().
+            for( auto i = list.begin(), e = list.end(); i != e && std::next( i ) != e; i += 2 )
+                operator[]( i->ToString() ) = *std::next( i );
+        }
+
+        /// Move constructor: takes over the payload and leaves `other` null.
+        JSON( JSON&& other )
+            : Internal( other.Internal )
+            , Type( other.Type )
+        { other.Type = Class::Null; other.Internal.Map = nullptr; }
+
+        /// Move assignment: releases the current payload, takes over the one of
+        /// `other` and leaves `other` null. Self-assignment is a no-op.
+        JSON& operator=( JSON&& other ) {
+            if( this == &other )
+                return *this;
+
+            // Detach the payload from `other` BEFORE releasing ours: `other` may
+            // be an element stored inside *this, in which case ClearInternal()
+            // destroys it.
+            BackingData stolen = other.Internal;
+            Class stolenType = other.Type;
+            other.Internal.Map = nullptr;
+            other.Type = Class::Null;
+
+            ClearInternal();
+            Internal = stolen;
+            Type = stolenType;
+            return *this;
+        }
+
+        /// Copy constructor: deep-copies objects, arrays and strings.
+        JSON( const JSON &other ) {
+            switch( other.Type ) {
+                case Class::Object:
+                    Internal.Map =
+                        new map<string,JSON>( other.Internal.Map->begin(),
+                                              other.Internal.Map->end() );
+                    break;
+                case Class::Array:
+                    Internal.List =
+                        new deque<JSON>( other.Internal.List->begin(),
+                                         other.Internal.List->end() );
+                    break;
+                case Class::String:
+                    Internal.String =
+                        new string( *other.Internal.String );
+                    break;
+                default:
+                    Internal = other.Internal;
+            }
+            Type = other.Type;
+        }
+
+        /// Copy assignment: replaces the current payload by a deep copy of
+        /// `other`. Self-assignment is a no-op.
+        JSON& operator=( const JSON &other ) {
+            if( this == &other )
+                return *this;
+
+            // Copy first, release afterwards: `other` may be an element stored
+            // inside *this and would be destroyed by ClearInternal().
+            JSON copy( other );
+
+            ClearInternal();
+            Internal = copy.Internal;
+            Type = copy.Type;
+
+            // Ownership has moved to *this; make sure the destructor of `copy`
+            // releases nothing.
+            copy.Internal.Map = nullptr;
+            copy.Type = Class::Null;
+            return *this;
+        }
+
+        /// Releases the heap payload, if any.
+        ~JSON() {
+            ClearInternal();
+        }
+
+        /// Builds a boolean value (exactly `bool`, no other integral type).
+        template <typename T>
+        JSON( T b, typename enable_if<is_same<T,bool>::value>::type* = 0 ) : Internal( b ), Type( Class::Boolean ){}
+
+        /// Builds an integer value; the argument is converted to `long`.
+        template <typename T>
+        JSON( T i, typename enable_if<is_integral<T>::value && !is_same<T,bool>::value>::type* = 0 ) : Internal( (long)i ), Type( Class::Integral ){}
+
+        /// Builds a floating point value; the argument is converted to `double`.
+        template <typename T>
+        JSON( T f, typename enable_if<is_floating_point<T>::value>::type* = 0 ) : Internal( (double)f ), Type( Class::Floating ){}
+
+        /// Builds a string value from anything convertible to `string`.
+        template <typename T>
+        JSON( T s, typename enable_if<is_convertible<T,string>::value>::type* = 0 ) : Internal( string( s ) ), Type( Class::String ){}
+
+        /// Builds a null value.
+        JSON( std::nullptr_t ) : Internal(), Type( Class::Null ){}
+
+        /// Returns an empty/zero value of the requested kind.
+        static JSON Make( Class type ) {
+            JSON ret; ret.SetType( type );
+            return ret;
+        }
+
+        /// Turns this value into an array (discarding any payload of another
+        /// kind) and appends `arg` at the end.
+        template <typename T>
+        void append( T arg ) {
+            SetType( Class::Array ); Internal.List->emplace_back( arg );
+        }
+
+        /// Appends several values, in order.
+        template <typename T, typename... U>
+        void append( T arg, U... args ) {
+            append( arg ); append( args... );
+        }
+
+        /// Assigns a boolean.
+        template <typename T>
+            typename enable_if<is_same<T,bool>::value, JSON&>::type operator=( T b ) {
+                SetType( Class::Boolean ); Internal.Bool = b; return *this;
+            }
+
+        /// Assigns an integer.
+        template <typename T>
+            typename enable_if<is_integral<T>::value && !is_same<T,bool>::value, JSON&>::type operator=( T i ) {
+                SetType( Class::Integral ); Internal.Int = i; return *this;
+            }
+
+        /// Assigns a floating point number.
+        template <typename T>
+            typename enable_if<is_floating_point<T>::value, JSON&>::type operator=( T f ) {
+                SetType( Class::Floating ); Internal.Float = f; return *this;
+            }
+
+        /// Assigns a string.
+        template <typename T>
+            typename enable_if<is_convertible<T,string>::value, JSON&>::type operator=( T s ) {
+                SetType( Class::String ); *Internal.String = string( s ); return *this;
+            }
+
+        /// Object access. Turns this value into an object if it is not one
+        /// already (discarding its payload) and inserts a null value under
+        /// `key` when the key is missing.
+        JSON& operator[]( const string &key ) {
+            SetType( Class::Object ); return Internal.Map->operator[]( key );
+        }
+
+        /// Array access. Turns this value into an array if it is not one
+        /// already (discarding its payload) and grows it so that `index` is
+        /// valid.
+        JSON& operator[]( unsigned index ) {
+            SetType( Class::Array );
+            if( index >= Internal.List->size() ) Internal.List->resize( index + 1 );
+            return Internal.List->operator[]( index );
+        }
+
+        /// Same as operator[]( key ).
+        JSON &at( const string &key ) {
+            return operator[]( key );
+        }
+
+        /// Read-only object access through map::at. No check is made that
+        /// this value is an object: the caller must ensure it.
+        const JSON &at( const string &key ) const {
+            return Internal.Map->at( key );
+        }
+
+        /// Same as operator[]( index ).
+        JSON &at( unsigned index ) {
+            return operator[]( index );
+        }
+
+        /// Read-only array access through deque::at. No check is made that
+        /// this value is an array: the caller must ensure it.
+        const JSON &at( unsigned index ) const {
+            return Internal.List->at( index );
+        }
+
+        /// Number of elements of an array, -1 for any other kind.
+        int length() const {
+            if( Type == Class::Array )
+                return Internal.List->size();
+            else
+                return -1;
+        }
+
+        /// True when this value is an object holding `key`.
+        bool hasKey( const string &key ) const {
+            if( Type == Class::Object )
+                return Internal.Map->find( key ) != Internal.Map->end();
+            return false;
+        }
+
+        /// Number of entries of an object or array, -1 for any other kind.
+        int size() const {
+            if( Type == Class::Object )
+                return Internal.Map->size();
+            else if( Type == Class::Array )
+                return Internal.List->size();
+            else
+                return -1;
+        }
+
+        /// Kind of value currently held.
+        Class JSONType() const { return Type; }
+
+        /// Functions for getting primitives from the JSON object.
+        /// Each accessor has an overload reporting through `ok` whether the
+        /// value is of the requested kind; when it is not, a neutral value
+        /// ("", 0.0, 0 or false) is returned.
+        bool IsNull() const { return Type == Class::Null; }
+
+        /// String content, passed through json_escape().
+        string ToString() const { bool b; return ToString( b ); }
+        string ToString( bool &ok ) const {
+            ok = (Type == Class::String);
+            return ok ? json_escape( *Internal.String ) : string("");
+        }
+
+        double ToFloat() const { bool b; return ToFloat( b ); }
+        double ToFloat( bool &ok ) const {
+            ok = (Type == Class::Floating);
+            return ok ? Internal.Float : 0.0;
+        }
+
+        long ToInt() const { bool b; return ToInt( b ); }
+        long ToInt( bool &ok ) const {
+            ok = (Type == Class::Integral);
+            return ok ? Internal.Int : 0;
+        }
+
+        bool ToBool() const { bool b; return ToBool( b ); }
+        bool ToBool( bool &ok ) const {
+            ok = (Type == Class::Boolean);
+            return ok ? Internal.Bool : false;
+        }
+
+        /// Iterable view over the (key, value) pairs; empty when this value is
+        /// not an object.
+        JSONWrapper<map<string,JSON>> ObjectRange() {
+            if( Type == Class::Object )
+                return JSONWrapper<map<string,JSON>>( Internal.Map );
+            return JSONWrapper<map<string,JSON>>( nullptr );
+        }
+
+        /// Iterable view over the elements; empty when this value is not an
+        /// array.
+        JSONWrapper<deque<JSON>> ArrayRange() {
+            if( Type == Class::Array )
+                return JSONWrapper<deque<JSON>>( Internal.List );
+            return JSONWrapper<deque<JSON>>( nullptr );
+        }
+
+        JSONConstWrapper<map<string,JSON>> ObjectRange() const {
+            if( Type == Class::Object )
+                return JSONConstWrapper<map<string,JSON>>( Internal.Map );
+            return JSONConstWrapper<map<string,JSON>>( nullptr );
+        }
+
+
+        JSONConstWrapper<deque<JSON>> ArrayRange() const {
+            if( Type == Class::Array )
+                return JSONConstWrapper<deque<JSON>>( Internal.List );
+            return JSONConstWrapper<deque<JSON>>( nullptr );
+        }
+
+        /// Serializes the value as JSON text.
+        ///
+        /// @param depth nesting level, used to indent the members of objects
+        /// @param tab   indentation unit, repeated `depth` times
+        ///
+        /// Object keys are written verbatim (they are not escaped here);
+        /// string values go through json_escape().
+        string dump( int depth = 1, string tab = " ") const {
+            string pad = "";
+            for( int i = 0; i < depth; ++i, pad += tab );
+
+            switch( Type ) {
+                case Class::Null:
+                    return "null";
+                case Class::Object: {
+                    string s = "{\n";
+                    bool skip = true;
+                    for( auto &p : *Internal.Map ) {
+                        if( !skip ) s += ",\n";
+                        s += ( pad + "\"" + p.first + "\" : " + p.second.dump( depth + 1, tab ) );
+                        skip = false;
+                    }
+                    // The closing brace is indented with `pad` minus its first
+                    // two characters.
+                    s += ( "\n" + pad.erase( 0, 2 ) + "}" ) ;
+                    return s;
+                }
+                case Class::Array: {
+                    string s = "[";
+                    bool skip = true;
+                    for( auto &p : *Internal.List ) {
+                        if( !skip ) s += ", ";
+                        s += p.dump( depth + 1, tab );
+                        skip = false;
+                    }
+                    s += "]";
+                    return s;
+                }
+                case Class::String:
+                    return "\"" + json_escape( *Internal.String ) + "\"";
+                case Class::Floating:
+                    return std::to_string( Internal.Float );
+                case Class::Integral:
+                    return std::to_string( Internal.Int );
+                case Class::Boolean:
+                    return Internal.Bool ? "true" : "false";
+                default:
+                    return "";
+            }
+            return "";
+        }
+
+        friend std::ostream& operator<<( std::ostream&, const JSON & );
+
+    private:
+        /// Switches to kind `type`. Nothing happens when the value already has
+        /// that kind; otherwise the current payload is released and replaced
+        /// by an empty/zero payload of the new kind.
+        void SetType( Class type ) {
+            if( type == Type )
+                return;
+
+            ClearInternal();
+
+            switch( type ) {
+                case Class::Null:      Internal.Map    = nullptr;                break;
+                case Class::Object:    Internal.Map    = new map<string,JSON>(); break;
+                case Class::Array:     Internal.List   = new deque<JSON>();      break;
+                case Class::String:    Internal.String = new string();           break;
+                case Class::Floating:  Internal.Float  = 0.0;                    break;
+                case Class::Integral:  Internal.Int    = 0;                      break;
+                case Class::Boolean:   Internal.Bool   = false;                  break;
+            }
+
+            Type = type;
+        }
+
+    private:
+        /* beware: only call if YOU know that Internal is allocated. No checks performed here.
+           This function should be called in a constructed JSON just before you are going to
+           overwrite Internal...
+           Neither Internal nor Type is reset: the caller must overwrite both.
+        */
+        void ClearInternal() {
+            switch( Type ) {
+                case Class::Object: delete Internal.Map;    break;
+                case Class::Array:  delete Internal.List;   break;
+                case Class::String: delete Internal.String; break;
+                default:;
+            }
+        }
+
+    private:
+
+        /// Discriminator of `Internal`.
+        Class Type = Class::Null;
+};
+
+// not needed in gatb and incompatible with file being included in multiple places anyway
+/*
+JSON Array() {
+ return JSON::Make( JSON::Class::Array );
+}
+
+template <typename... T>
+JSON Array( T... args ) {
+ JSON arr = JSON::Make( JSON::Class::Array );
+ arr.append( args... );
+ return arr;
+}
+
+JSON Object() {
+ return JSON::Make( JSON::Class::Object );
+}
+
+std::ostream& operator<<( std::ostream &os, const JSON &json ) {
+ os << json.dump();
+ return os;
+}
+*/
+
+namespace {
+ JSON parse_next( const string &, size_t & );
+
+    /// Advances `offset` past the whitespace found in `str` from `offset` on.
+    /// Stops at the first non-whitespace character or at the end of `str`;
+    /// an `offset` already at or beyond the end is left untouched.
+    void consume_ws( const string &str, size_t &offset ) {
+        // The bound check keeps the scan inside the string; the cast is needed
+        // because calling isspace() with a negative char value is undefined.
+        while( offset < str.size() && isspace( static_cast<unsigned char>( str[offset] ) ) ) ++offset;
+    }
+
+    /// Parses a JSON object. On entry `offset` designates the opening '{';
+    /// on success it is left just after the matching '}'.
+    ///
+    /// When a ':' or a ',' / '}' is missing, an error is written to std::cerr
+    /// and the members read so far are returned.
+    JSON parse_object( const string &str, size_t &offset ) {
+        JSON result = JSON::Make( JSON::Class::Object );
+
+        ++offset;                               // skip '{'
+        consume_ws( str, offset );
+        if( str[offset] == '}' ) {              // empty object
+            ++offset;
+            return result;
+        }
+
+        for( ;; ) {
+            JSON name = parse_next( str, offset );
+            consume_ws( str, offset );
+            if( str[offset] != ':' ) {
+                std::cerr << "Error: Object: Expected colon, found '" << str[offset] << "'\n";
+                return result;
+            }
+            ++offset;                           // skip ':'
+            consume_ws( str, offset );
+            JSON member = parse_next( str, offset );
+            result[name.ToString()] = member;
+
+            consume_ws( str, offset );
+            const char separator = str[offset];
+            if( separator == ',' ) {            // another member follows
+                ++offset;
+                continue;
+            }
+            if( separator == '}' )
+                ++offset;
+            else
+                std::cerr << "ERROR: Object: Expected comma, found '" << separator << "'\n";
+            return result;
+        }
+    }
+
+    /// Parses a JSON array. On entry `offset` designates the opening '[';
+    /// on success it is left just after the matching ']'.
+    ///
+    /// When an element is followed by neither ',' nor ']', an error is written
+    /// to std::cerr and an empty array is returned.
+    JSON parse_array( const string &str, size_t &offset ) {
+        JSON items = JSON::Make( JSON::Class::Array );
+        unsigned slot = 0;
+
+        ++offset;                               // skip '['
+        consume_ws( str, offset );
+        if( str[offset] == ']' ) {              // empty array
+            ++offset;
+            return items;
+        }
+
+        for( ;; ) {
+            items[slot++] = parse_next( str, offset );
+            consume_ws( str, offset );
+
+            const char separator = str[offset];
+            if( separator == ',' ) {            // another element follows
+                ++offset;
+                continue;
+            }
+            if( separator == ']' ) {
+                ++offset;
+                return items;
+            }
+            std::cerr << "ERROR: Array: Expected ',' or ']', found '" << separator << "'\n";
+            return JSON::Make( JSON::Class::Array );
+        }
+    }
+
+    /// Parses a JSON string. On entry `offset` designates the opening quote;
+    /// on success it is left just after the closing quote.
+    ///
+    /// The escapes \" \\ \/ \b \f \n \r \t are decoded. \uXXXX is validated
+    /// (four hex digits) but kept in its escaped form. Any other escape is
+    /// replaced by a single backslash.
+    ///
+    /// On error (bad \u escape, or input ending before the closing quote) a
+    /// message is written to std::cerr and an empty string value is returned.
+    JSON parse_string( const string &str, size_t &offset ) {
+        JSON String;
+        string val;
+
+        // Every read below is guarded so that malformed input can never make
+        // the scan run past the end of `str`.
+        for( ++offset; ; ++offset ) {
+            if( offset >= str.size() ) {
+                std::cerr << "ERROR: String: Unterminated string\n";
+                return JSON::Make( JSON::Class::String );
+            }
+
+            char c = str[offset];
+            if( c == '\"' )
+                break;
+
+            if( c != '\\' ) {
+                val += c;
+                continue;
+            }
+
+            // Escape sequence: move to the character following the backslash.
+            if( ++offset >= str.size() ) {
+                std::cerr << "ERROR: String: Unterminated string\n";
+                return JSON::Make( JSON::Class::String );
+            }
+            switch( str[offset] ) {
+                case '\"': val += '\"'; break;
+                case '\\': val += '\\'; break;
+                case '/' : val += '/' ; break;
+                case 'b' : val += '\b'; break;
+                case 'f' : val += '\f'; break;
+                case 'n' : val += '\n'; break;
+                case 'r' : val += '\r'; break;
+                case 't' : val += '\t'; break;
+                case 'u' : {
+                    val += "\\u" ;
+                    for( unsigned i = 1; i <= 4; ++i ) {
+                        // A truncated escape reads as '\0', which is not a hex digit.
+                        c = ( offset + i < str.size() ) ? str[offset+i] : '\0';
+                        if( (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') )
+                            val += c;
+                        else {
+                            std::cerr << "ERROR: String: Expected hex character in unicode escape, found '" << c << "'\n";
+                            return JSON::Make( JSON::Class::String );
+                        }
+                    }
+                    offset += 4;
+                } break;
+                default  : val += '\\'; break;
+            }
+        }
+
+        ++offset;                               // skip closing quote
+        String = val;
+        return String;
+    }
+
+    /// Parses a JSON number. On entry `offset` designates its first character
+    /// ('-' or a digit); on success it is left on the character following the
+    /// number.
+    ///
+    /// A number must be followed by whitespace, ',', ']', '}' or the end of the
+    /// input. The result is an integer when the text has neither a fraction
+    /// nor an exponent, a floating point value otherwise.
+    ///
+    /// On malformed input a message is written to std::cerr and a null value
+    /// is returned.
+    JSON parse_number( const string &str, size_t &offset ) {
+        JSON Number;
+        string val, exp_str;
+        char c;
+        bool isDouble = false;
+        long exp = 0;
+
+        // True when the character `ch`, read at index `pos`, may legally follow
+        // a number (the end of the input counts as a terminator).
+        auto endsNumber = [&str]( char ch, size_t pos ) {
+            return pos >= str.size()
+                || isspace( static_cast<unsigned char>( ch ) )
+                || ch == ',' || ch == ']' || ch == '}';
+        };
+
+        // Mantissa: sign, digits and decimal point.
+        while( true ) {
+            c = str[offset++];
+            if( (c == '-') || (c >= '0' && c <= '9') )
+                val += c;
+            else if( c == '.' ) {
+                val += c;
+                isDouble = true;
+            }
+            else
+                break;
+        }
+
+        if( c == 'E' || c == 'e' ) {
+            // Only peek at the next character: it is consumed here when it is a
+            // sign, otherwise it is the first exponent digit and must be left
+            // for the loop below.
+            c = str[ offset ];
+            if( c == '-' || c == '+' ) {
+                if( c == '-' ) exp_str += '-';
+                ++offset;
+            }
+            while( true ) {
+                c = str[ offset++ ];
+                if( c >= '0' && c <= '9' )
+                    exp_str += c;
+                else if( !endsNumber( c, offset - 1 ) ) {
+                    std::cerr << "ERROR: Number: Expected a number for exponent, found '" << c << "'\n";
+                    return JSON::Make( JSON::Class::Null );
+                }
+                else
+                    break;
+            }
+            // std::stol() throws on a string without digits.
+            if( exp_str.empty() || exp_str == "-" ) {
+                std::cerr << "ERROR: Number: Expected a number for exponent, found '" << c << "'\n";
+                return JSON::Make( JSON::Class::Null );
+            }
+            exp = std::stol( exp_str );
+        }
+        else if( !endsNumber( c, offset - 1 ) ) {
+            std::cerr << "ERROR: Number: unexpected character '" << c << "'\n";
+            return JSON::Make( JSON::Class::Null );
+        }
+        --offset;                               // give the terminator back
+
+        // std::stol() / std::stod() throw on a mantissa without digits ("-").
+        if( val.find_first_of( "0123456789" ) == string::npos ) {
+            std::cerr << "ERROR: Number: Expected digits, found '" << val << "'\n";
+            return JSON::Make( JSON::Class::Null );
+        }
+
+        if( isDouble )
+            Number = std::stod( val ) * std::pow( 10, exp );
+        else {
+            if( !exp_str.empty() )
+                Number = std::stol( val ) * std::pow( 10, exp );
+            else
+                Number = std::stol( val );
+        }
+        return Number;
+    }
+
+    /// Parses the literal `true` or `false` found at `offset` and moves
+    /// `offset` past it. Anything else is reported on std::cerr and yields a
+    /// null value, `offset` being left untouched.
+    JSON parse_bool( const string &str, size_t &offset ) {
+        const bool isTrue  = ( str.substr( offset, 4 ) == "true" );
+        const bool isFalse = !isTrue && ( str.substr( offset, 5 ) == "false" );
+
+        if( !isTrue && !isFalse ) {
+            std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" << str.substr( offset, 5 ) << "'\n";
+            return JSON::Make( JSON::Class::Null );
+        }
+
+        JSON flag;
+        flag = isTrue;
+        offset += ( isTrue ? 4 : 5 );           // length of the literal read
+        return flag;
+    }
+
+    /// Parses the literal `null` found at `offset` and moves `offset` past it.
+    /// Anything else is reported on std::cerr, `offset` being left untouched.
+    /// A null value is returned in both cases.
+    JSON parse_null( const string &str, size_t &offset ) {
+        const string literal = str.substr( offset, 4 );
+        if( literal == "null" ) {
+            offset += 4;
+            return JSON();
+        }
+        std::cerr << "ERROR: Null: Expected 'null', found '" << literal << "'\n";
+        return JSON::Make( JSON::Class::Null );
+    }
+
+    /// Skips leading whitespace, then dispatches on the first character to the
+    /// parser of the corresponding JSON value. An unexpected character is
+    /// reported on std::cerr and yields a null value.
+    JSON parse_next( const string &str, size_t &offset ) {
+        consume_ws( str, offset );
+        const char lead = str[offset];
+
+        if( lead == '[' )                  return parse_array( str, offset );
+        if( lead == '{' )                  return parse_object( str, offset );
+        if( lead == '\"' )                 return parse_string( str, offset );
+        if( lead == 't' || lead == 'f' )   return parse_bool( str, offset );
+        if( lead == 'n' )                  return parse_null( str, offset );
+        if( lead == '-' || ( lead >= '0' && lead <= '9' ) )
+            return parse_number( str, offset );
+
+        std::cerr << "ERROR: Parse: Unknown starting character '" << lead << "'\n";
+        return JSON();
+    }
+
+    /// Entry point of the parser: parses the first JSON value found in `str`,
+    /// starting from its first character, and returns it.
+    JSON LoadJson( const string & str){
+        size_t position = 0;
+        JSON document = parse_next( str, position );
+        return document;
+    }
+}
+
+
+} // End Namespace json
+
+#endif
diff --git a/gatb-core/tools/CMakeLists.txt b/gatb-core/tools/CMakeLists.txt
index d58de68..92bc536 100644
--- a/gatb-core/tools/CMakeLists.txt
+++ b/gatb-core/tools/CMakeLists.txt
@@ -14,7 +14,7 @@ include_directories (${gatb-core-includes})
# We add the path for extra libraries
link_directories (${gatb-core-extra-libraries-path})
-list (APPEND PROGRAMS dbgh5 dbginfo)
+list (APPEND PROGRAMS dbgh5 dbginfo leon)
FOREACH (program ${PROGRAMS})
add_executable(${program} ${program}.cpp)
@@ -44,4 +44,4 @@ ENDFOREACH (program)
################################################################################
# INSTALLATION
################################################################################
-install (TARGETS dbgh5 dbginfo DESTINATION bin)
+install (TARGETS dbgh5 dbginfo leon DESTINATION bin)
diff --git a/gatb-core/tools/dbgh5.cpp b/gatb-core/tools/dbgh5.cpp
index 7685950..21a777a 100644
--- a/gatb-core/tools/dbgh5.cpp
+++ b/gatb-core/tools/dbgh5.cpp
@@ -66,7 +66,7 @@ int main (int argc, char* argv[])
Graph graph = Graph::create (props);
/** We may have to check the result. */
- if (props->get (STR_CHECK) > 0) { nbErrors = checkResult (graph, props); }
+ if (props->get (STR_CHECK) != 0) { nbErrors = checkResult (graph, props); }
/** We dump some information about the graph. */
if (props->getInt(STR_VERBOSE) > 0) { std::cout << graph.getInfo() << std::endl; }
diff --git a/gatb-core/tools/leon.cpp b/gatb-core/tools/leon.cpp
new file mode 100644
index 0000000..47a4d88
--- /dev/null
+++ b/gatb-core/tools/leon.cpp
@@ -0,0 +1,61 @@
+/*****************************************************************************
+ * GATB : Genome Assembly Tool Box
+ * Copyright (C) 2014 INRIA
+ * Authors: R.Chikhi, G.Rizk, E.Drezen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*****************************************************************************/
+
+#include <gatb/gatb_core.hpp>
+
+#include <gatb/tools/compression/Leon.hpp>
+
+
+using namespace std;
+
+/********************************************************************************/
+/** Writes a small banner with the Leon and gatb-core versions to `os`.
+ * \param[in] os : stream receiving the banner (four lines, each flushed).
+ */
+void displayVersion(std::ostream& os){
+
+    const char* const border = "* * * * * * * * * * * * * * * * * * * * * *";
+
+    os << border << endl;
+
+    os << "* Leon version ";
+    os << LEON_VERSION_MAJOR << "." << LEON_VERSION_MINOR << "." << LEON_VERSION_PATCH;
+    os << " *" << endl; //<< " AGPL licence" <<endl;
+
+    os << "* Using gatb-core version ";
+    os << System::info().getVersion();
+    os << " *" << endl;
+
+    os << border << endl;
+}
+
+/** Entry point of the leon tool.
+ *
+ * When the first argument is STR_VERSION or "-v", the version banner is
+ * printed and the program stops; otherwise the whole command line is handed
+ * over to Leon::run.
+ *
+ * \return EXIT_SUCCESS after a version request or a completed run,
+ *         EXIT_FAILURE when a gatb-core exception was caught.
+ */
+int main (int argc, char* argv[])
+{
+    // Printing the version on request is a successful run, not an error:
+    // callers testing the exit status must not see a failure here.
+    if(argc > 1 && ( strcmp(argv[1],STR_VERSION)==0 || strcmp(argv[1],"-v")==0 ) ){
+        displayVersion(cout);
+        return EXIT_SUCCESS;
+    }
+
+    // We define a try/catch block in case some method fails
+    try
+    {
+        Leon ().run (argc, argv);
+    }
+    catch (gatb::core::system::Exception& e)
+    {
+        cerr << "EXCEPTION: " << e.getMessage() << endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gatb-core.git
More information about the debian-med-commit
mailing list