[med-svn] [Git][med-team/gatb-core][master] 7 commits: New upstream version

Steffen Möller gitlab at salsa.debian.org
Wed Dec 4 23:52:04 GMT 2019



Steffen Möller pushed to branch master at Debian Med / gatb-core


Commits:
f9b878ed by Steffen Moeller at 2019-12-04T23:22:51Z
New upstream version

- - - - -
d6238780 by Steffen Moeller at 2019-12-04T23:22:52Z
New upstream version 1.4.1+git20191130.664696c+dfsg
- - - - -
79bb050d by Steffen Moeller at 2019-12-04T23:22:56Z
Update upstream source from tag 'upstream/1.4.1+git20191130.664696c+dfsg'

Update to upstream version '1.4.1+git20191130.664696c+dfsg'
with Debian dir df4adf125d7696c65abb84dcc14f05bf87112c14
- - - - -
94a8395a by Steffen Moeller at 2019-12-04T23:22:59Z
Standards-Version: 4.4.1

- - - - -
f2ed00d9 by Steffen Moeller at 2019-12-04T23:23:01Z
Set upstream metadata fields: Repository-Browse.
- - - - -
61534d82 by Steffen Moeller at 2019-12-04T23:23:01Z
Remove obsolete fields Name from debian/upstream/metadata.
- - - - -
c58b23ef by Steffen Moeller at 2019-12-04T23:51:27Z
FTBFS

- - - - -


27 changed files:

- debian/changelog
- debian/control
- debian/upstream/metadata
- gatb-core/CMakeLists.txt
- gatb-core/doc/doxygen/src/dbgh5page.hpp
- gatb-core/src/gatb/bank/impl/BankFasta.cpp
- gatb-core/src/gatb/bcalm2/bglue_algo.cpp
- gatb-core/src/gatb/bcalm2/bglue_algo.hpp
- gatb-core/src/gatb/debruijn/impl/Graph.cpp
- gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
- gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
- gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp
- gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp
- gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
- gatb-core/src/gatb/system/impl/FileSystemCommon.hpp
- gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in
- gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp
- gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp
- gatb-core/src/gatb/tools/misc/impl/Tool.cpp
- gatb-core/src/gatb/tools/storage/impl/CollectionHDF5Patch.hpp
- gatb-core/src/gatb/tools/storage/impl/Storage.hpp
- gatb-core/src/gatb/tools/storage/impl/StorageFile.hpp
- gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp
- gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
- gatb-core/test/unit/src/kmer/TestDSK.cpp
- gatb-core/test/unit/src/tools/storage/TestStorage.cpp
- + gatb-core/thirdparty/update-boost.sh


Changes:

=====================================
debian/changelog
=====================================
@@ -1,3 +1,15 @@
+gatb-core (1.4.1+git20191130.664696c+dfsg-1) UNRELEASED; urgency=medium
+
+  * Team upload.
+  * New upstream version
+  * Standards-Version: 4.4.1
+  * Set upstream metadata fields: Repository-Browse.
+  * Remove obsolete fields Name from debian/upstream/metadata.
+
+  * FTBFS: Problem with symbols files, I presume
+
+ -- Steffen Moeller <moeller at debian.org>  Thu, 05 Dec 2019 00:23:01 +0100
+
 gatb-core (1.4.1+git20190813.a73b6dd+dfsg-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/control
=====================================
@@ -13,7 +13,7 @@ Build-Depends: debhelper-compat (= 12),
                libjsoncpp-dev,
                doxygen,
                graphviz
-Standards-Version: 4.4.0
+Standards-Version: 4.4.1
 Vcs-Browser: https://salsa.debian.org/med-team/gatb-core
 Vcs-Git: https://salsa.debian.org/med-team/gatb-core.git
 Homepage: https://github.com/GATB/gatb-core


=====================================
debian/upstream/metadata
=====================================
@@ -1,4 +1,3 @@
-Name: gatb-core
 Cite-As: >
   E. Drezen, G. Rizk, R. Chikhi, C. Deltel, C. Lemaitre, P. Peterlongo,
   D. Lavenier. (2014)
@@ -6,10 +5,10 @@ Cite-As: >
   Bioinformatics, 30(20):2959-2961. / BioIT 2014 poster
 Reference:
   Author: >
-   Erwan Drezen and Guillaume Rizk and Rayan Chikhi and Charles Deltel
-   and Claire Lemaitre and Pierre Peterlongo and Dominique Lavenier
+    Erwan Drezen and Guillaume Rizk and Rayan Chikhi and Charles Deltel
+    and Claire Lemaitre and Pierre Peterlongo and Dominique Lavenier
   Title: >
-   GATB: Genome Assembly & Analysis Tool Box
+    GATB: Genome Assembly & Analysis Tool Box
   Journal: Bioinformatics
   Year: 2014
   Volume: 30
@@ -19,7 +18,8 @@ Reference:
   URL: http://dx.doi.org/10.1093/bioinformatics/btu406
 Repository: https://github.com/GATB/gatb-core
 Registry:
-  - Name: OMICtools
-    Entry: OMICS_04834
-  - Name: conda:bioconda
-    Entry: gatb
+- Name: OMICtools
+  Entry: OMICS_04834
+- Name: conda:bioconda
+  Entry: gatb
+Repository-Browse: https://github.com/GATB/gatb-core


=====================================
gatb-core/CMakeLists.txt
=====================================
@@ -110,8 +110,6 @@ if (debug)
     set (LIBRARY_COMPILE_DEFINITIONS  "${LIBRARY_COMPILE_DEFINITIONS} -g -p ${LIB_COMPILE_WARNINGS}")
     set (CMAKE_BUILD_TYPE Debug) # else CMake adds DNDEBUG
     message("-- COMPILATION IN DEBUG MODE")
-else()
-    set (LIBRARY_COMPILE_DEFINITIONS  "${LIBRARY_COMPILE_DEFINITIONS} -O3 -DNDEBUG ${LIB_COMPILE_WARNINGS}")
 endif()
 
 if (INT128_FOUND)


=====================================
gatb-core/doc/doxygen/src/dbgh5page.hpp
=====================================
@@ -91,6 +91,8 @@
           -verbose           (1 arg) :    verbosity level  [default '1']
           -email             (1 arg) :    send statistics to the given email address  [default '']
           -email-fmt         (1 arg) :    'raw' or 'xml'  [default 'raw']
+          -edge-km           (1 arg) :    Kececioglu-Myers edge representation  [default '0']
+ 
  * \endcode
  *
  *


=====================================
gatb-core/src/gatb/bank/impl/BankFasta.cpp
=====================================
@@ -651,6 +651,7 @@ void BankFasta::Iterator::init ()
         *bf = (buffered_file_t *)  CALLOC (1, sizeof(buffered_file_t));
         (*bf)->buffer = (unsigned char*)  MALLOC (BUFFER_SIZE);
         (*bf)->stream = gzopen (fname, "r");
+        gzbuffer((*bf)->stream,2*1024*1024);
 		
         /** We check that we can open the file. */
         if ((*bf)->stream == NULL)


=====================================
gatb-core/src/gatb/bcalm2/bglue_algo.cpp
=====================================
@@ -63,12 +63,9 @@ using namespace gatb::core::tools::collections::impl;
 using namespace std;
 
 // let's be clear here:
-// UF hashes will be stored in 32 bits for efficiency (as I don't want to have a 64-bits UF for memory reasons, also, would require to modify unionFind.hpp)
-typedef uint32_t uf_hashes_t;
-// but there can be more than 2^{32} sequences in the glue file
+typedef uint64_t uf_hashes_t; // UF hashes are the hash values of k-mers to be inserted into the UF data structure. Don't try setting to uint32_t, would be a disaster
 typedef uint64_t seq_idx_t;
-// so, potentially, more than 2^{32} UF hashes (but not necessarily, consider that some sequences don't need to be glued)
-// what will happen is that more one UF class won't be linked to a single unitig, but multiple unitigs
+typedef uint32_t uf_class_t; // UF class is the identifier of an element in the UF
 // let's hope that there won't be saturation (only 1 UF class with all unitigs)
 // if this happens, then "Top 10 glue partitions by size:" will show only one entry and BCALM will blow up in memory
 // a fix would be to use a 64 bits UF (to be coded later)
@@ -197,6 +194,24 @@ static string skip_first_abundance(const string& list)
     return res;
 }
 
+static string make_header(const int seq_size, const string& abundances, bool all_abundance_counts)
+{
+    string header;
+    float mean_abundance = get_mean_abundance(abundances);
+    uint64_t sum_abundances = get_sum_abundance(abundances);
+    if (all_abundance_counts)
+    {
+        // in this setting, all kmer wabundances are printed in the order of the kmers in the sequence
+        header = "LN:i:" + to_string(seq_size) + " ab:Z:" + abundances;
+    }
+    else
+    {
+        // km is not a standard GFA field so i'm putting it in lower case as per the spec
+        header = "LN:i:" + to_string(seq_size) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance);
+    }
+    return header;
+}
+
 template<int SPAN>
 struct markedSeq
 {
@@ -699,6 +714,7 @@ void bglue(Storage *storage,
         int kmerSize, 
         int nb_glue_partitions, 
         int nb_threads, 
+        bool all_abundance_counts,
         bool verbose
         )
 {
@@ -804,7 +820,7 @@ void bglue(Storage *storage,
     }
 
     // create a UF data structure
-    // this one stores nb_uf_keys * uint64_t (actually, atomic's). so it's bigger than uf_hashes
+    // this one stores nb_uf_keys * uint64_t (actually, atomic's).
     unionFind ufkmers(nb_uf_keys);
 
 #if 0
@@ -911,13 +927,13 @@ void bglue(Storage *storage,
     if (only_uf) // for debugging
         return;
 
-    /* now we're mirroring the UF to a vector of uint32_t's, it will take less space, and strictly same information
+    /* now we're mirroring the UF to a vector of uint32_t's (uf_class_t), it will take less space, and strictly same information
      * this is to get rid of the rank (one uint32) per element in the current UF implementation. 
      * To do this, we're using the disk to save space of populating one vector from the other in memory. 
      * (saves having to allocate both vectors at the same time) */
     
-    BagFile<uf_hashes_t> *ufkmers_bagf = new BagFile<uf_hashes_t>(prefix+".glue.uf");  LOCAL(ufkmers_bagf);
-	BagCache<uf_hashes_t> *ufkmers_bag = new BagCache<uf_hashes_t>(  ufkmers_bagf, 10000 );   LOCAL(ufkmers_bag);
+    BagFile<uf_class_t> *ufkmers_bagf = new BagFile<uf_class_t>(prefix+".glue.uf");  LOCAL(ufkmers_bagf);
+	BagCache<uf_class_t> *ufkmers_bag = new BagCache<uf_class_t>(  ufkmers_bagf, 10000 );   LOCAL(ufkmers_bag);
 
     for (unsigned long i = 0; i < nb_uf_keys; i++)
         //ufkmers_vector[i] = ufkmers.find(i); // just in-memory without the disk
@@ -930,15 +946,15 @@ void bglue(Storage *storage,
 
     ufkmers_bag->flush();
 
-    std::vector<uf_hashes_t> ufkmers_vector(nb_uf_keys);
-    IteratorFile<uf_hashes_t> ufkmers_file(prefix+".glue.uf");
+    std::vector<uf_class_t> ufkmers_vector(nb_uf_keys);
+    IteratorFile<uf_class_t> ufkmers_file(prefix+".glue.uf");
     unsigned long i = 0;
     for (ufkmers_file.first(); !ufkmers_file.isDone(); ufkmers_file.next())
             ufkmers_vector[i++] = ufkmers_file.item();
 
     System::file().remove (prefix+".glue.uf");
     
-    logging("loaded 32-bit UF (" + to_string(nb_uf_keys*sizeof(uf_hashes_t)/1024/1024) + " MB)");
+    logging("loaded 32-bit UF (" + to_string(nb_uf_keys*sizeof(uf_class_t)/1024/1024) + " MB)");
   
     // setup output file
     string output_prefix = prefix;
@@ -1000,7 +1016,7 @@ void bglue(Storage *storage,
 
     // partition the glue into many files, à la dsk
     auto partitionGlue = [k, &modelCanon /* crashes if copied!*/, \
-        &get_UFclass, &gluePartitions,
+        &get_UFclass, &gluePartitions, all_abundance_counts,
         &out, &outLock, &nb_seqs_in_partition, nbGluePartitions]
             (const Sequence& sequence)
     {
@@ -1024,11 +1040,8 @@ void bglue(Storage *storage,
         if (!found_class) // this one doesn't need to be glued
         {
             const string abundances = comment.substr(3);
-            float mean_abundance = get_mean_abundance(abundances);
-            uint64_t sum_abundances = get_sum_abundance(abundances);
-            
-            // km is not a standard GFA field so i'm putting it in lower case as per the spec
-            output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance)); 
+            string header = make_header(seq.size(),abundances, all_abundance_counts);
+            output(seq, out, header); 
             return;
         }
 
@@ -1082,7 +1095,7 @@ void bglue(Storage *storage,
     for (int partition = 0; partition < nbGluePartitions; partition++)
     {
         auto glue_partition = [&modelCanon, &ufkmers, partition, &gluePartition_prefix, nbGluePartitions, &copy_nb_seqs_in_partition,
-        &get_UFclass, &out, &outLock, kmerSize]( int thread_id)
+        &get_UFclass, &out, &outLock, kmerSize, all_abundance_counts]( int thread_id)
         {
             int k = kmerSize;
 
@@ -1172,10 +1185,9 @@ void bglue(Storage *storage,
                 string seq, abs;
                 glue_sequences(seqs_to_glue[i], seqs_to_glue_is_circular[i], sequences, abundances, kmerSize, seq, abs); // takes as input the indices of ordered sequences, whether that sequence is circular, and the markedSeq's themselves along with their abundances
 
-                float mean_abundance = get_mean_abundance(abs);
-                uint32_t sum_abundances = get_sum_abundance(abs);
                 {
-                    output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
+                    string header = make_header(seq.size(),abs, all_abundance_counts);
+                    output(seq, out, header);
                 }
             }
                 
@@ -1198,7 +1210,7 @@ void bglue(Storage *storage,
 
     logging("end");
 
-    bool debug_keep_glue_files = true; // for debugging // TODO enable it if -redo-bglue param was provided (need some info from UnitigsConstructionAlgorithm). 
+    bool debug_keep_glue_files = false; // for debugging // TODO warning: if debug_keep_glue_files is set to 'false,' then the debug option '-redo-bglue' cannot work because it needs those bglue files
     if (debug_keep_glue_files)
     {
         std::cout << "debug: not deleting glue files" << std::endl;


=====================================
gatb-core/src/gatb/bcalm2/bglue_algo.hpp
=====================================
@@ -150,6 +150,7 @@ void bglue(gatb::core::tools::storage::impl::Storage* storage,
         int kmerSize, 
         int nb_glue_partitions, 
         int nb_threads, 
+        bool all_abundance_counts,
         bool verbose
         );
 


=====================================
gatb-core/src/gatb/debruijn/impl/Graph.cpp
=====================================
@@ -648,6 +648,8 @@ IOptionsParser* GraphTemplate<Node, Edge, GraphDataVariant>::getOptionsParser (b
     IOptionsParser* parserGeneral  = new OptionsParser ("general");
     parserGeneral->push_front (new OptionOneParam (STR_INTEGER_PRECISION, "integers precision (0 for optimized value)", false, "0", false));
     parserGeneral->push_front (new OptionOneParam (STR_VERBOSE,           "verbosity level",      false, "1"  ));
+    parserGeneral->push_front (new OptionOneParam (STR_EDGE_KM_REPRESENTATION,           "edge km representation",      false, "0"  ));
+    parserGeneral->push_front (new OptionNoParam (STR_ALL_ABUNDANCE_COUNTS,           "output all k-mer abundance counts instead of mean" ));
     parserGeneral->push_front (new OptionOneParam (STR_NB_CORES,          "number of cores",      false, "0"  ));
     parserGeneral->push_front (new OptionNoParam  (STR_CONFIG_ONLY,       "dump config only"));
     
@@ -661,7 +663,7 @@ IOptionsParser* GraphTemplate<Node, Edge, GraphDataVariant>::getOptionsParser (b
     parserDebug->push_front (new OptionNoParam  ("-skip-links",       "same, but       skip     links"));
     parserDebug->push_front (new OptionNoParam  ("-redo-links",       "same, but       redo     links"));
     parserDebug->push_front (new OptionNoParam  ("-skip-bglue",       "same, but       skip     bglue"));
-    parserDebug->push_front (new OptionNoParam  ("-redo-bglue",       "same, but       redo     bglue"));
+    parserDebug->push_front (new OptionNoParam  ("-redo-bglue",       "same, but       redo     bglue (needs debug_keep_glue_files=true in source code)"));
     parserDebug->push_front (new OptionNoParam  ("-skip-bcalm",       "same, but       skip     bcalm"));
     parserDebug->push_front (new OptionNoParam  ("-redo-bcalm",       "debug function, redo the bcalm algo"));
 


=====================================
gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
=====================================
@@ -259,7 +259,7 @@ void GraphUnitigsTemplate<span>::build_unitigs_postsolid(std::string unitigs_fil
     }
     
     bool redo_bcalm = props->get("-redo-bcalm");
-    bool redo_bglue = props->get("-redo-bglue");
+    bool redo_bglue = props->get("-redo-bglue"); // note: if that option is to be used, make sure to enable debug_keep_glue_files=true in bglue_algo.cpp
     bool redo_links = props->get("-redo-links");
 
     bool skip_bcalm = props->get("-skip-bcalm");


=====================================
gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
=====================================
@@ -52,7 +52,7 @@ namespace gatb { namespace core { namespace debruijn { namespace impl  {
  *  Normally bcalm outputs consecutive unitig ID's but LinkTigs can also work with non-consecutive, non-sorted IDs
  */
 template<size_t span>
-void link_tigs(string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose, bool renumber_unitigs)
+void link_tigs(string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose,  bool edge_km_representation, bool renumber_unitigs)
 {
     bcalm_logging = verbose;
     BankFasta* out = new BankFasta(unitigs_filename+".linked");
@@ -60,7 +60,7 @@ void link_tigs(string unitigs_filename, int kmerSize, int nb_threads, uint64_t &
     logging("Finding links between unitigs");
 
     for (int pass = 0; pass < nb_passes; pass++)
-        link_unitigs_pass<span>(unitigs_filename, verbose, pass, kmerSize, renumber_unitigs);
+        link_unitigs_pass<span>(unitigs_filename, verbose, pass, kmerSize, edge_km_representation, renumber_unitigs );
 
     write_final_output(unitigs_filename, verbose, out, nb_unitigs, renumber_unitigs);
    
@@ -265,7 +265,7 @@ static void record_links(uint64_t utig_id, int pass, const string &link, std::of
 
 
 template<size_t span>
-void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pass, const int kmerSize, const bool renumber_unitigs)
+void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pass, const int kmerSize, bool edge_km_representation, const bool renumber_unitigs)
 {
     typedef typename kmer::impl::Kmer<span>::ModelCanonical Model;
     typedef typename kmer::impl::Kmer<span>::Type           Type;
@@ -376,7 +376,12 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
                     //bool rc = e_in.rc ^ (!beginInSameOrientation); // "rc" sets the destination strand // i don't think it's the right formula because of k-1-mers that are their self revcomp. see the mikko bug in the test folder, that provides a nice illustration of that 
                     bool rc = e_in.pos == UNITIG_END; // a better way to determine the rc flag is just looking at position of e_in k-1-mer 
 
-                    in_links += "L:-:" + to_string(e_in.unitig) + ":" + (rc?"-":"+") + " "; 
+                    
+                    if(edge_km_representation){
+                        in_links += "J:0:" + to_string(e_in.unitig) + ":" + (rc?"1":"0") + " "; 
+                    }else{
+                        in_links += "L:-:" + to_string(e_in.unitig) + ":" + (rc?"-":"+") + " "; 
+                    }
 
                     /* what to do when kmerBegin is same as forward and reverse?
                      used to have this:
@@ -432,7 +437,13 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
                     
                     bool rc = e_out.pos == UNITIG_END; // a better way to determine the rc flag is just looking at position of e_in k-1-mer 
 
-                    out_links += "L:+:" + to_string(e_out.unitig) + ":" + (rc?"-":"+") + " "; 
+                    if(edge_km_representation){
+                        out_links += "J:1:" + to_string(e_out.unitig) + ":" + (rc?"1":"0") + " "; 
+                    }else{
+                        out_links += "L:+:" + to_string(e_out.unitig) + ":" + (rc?"-":"+") + " "; 
+                    }
+                    
+
 
                     if (debug) std::cout << " [valid] ";
                 }


=====================================
gatb-core/src/gatb/debruijn/impl/LinkTigs.hpp
=====================================
@@ -30,10 +30,10 @@ namespace gatb { namespace core { namespace debruijn { namespace impl  {
 
 
     template<size_t SPAN>
-    void link_tigs( std::string prefix, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose, bool renumber_unitigs = false);
+    void link_tigs( std::string prefix, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose,  bool edge_km_representation, bool renumber_unitigs = false);
 
     template<size_t span>
-    void link_unitigs_pass(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize, const bool renumber_unitigs);
+    void link_unitigs_pass(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize,  bool edge_km_representation, const bool renumber_unitigs );
     
 }}}}
 


=====================================
gatb-core/src/gatb/debruijn/impl/UnitigsConstructionAlgorithm.cpp
=====================================
@@ -91,17 +91,15 @@ UnitigsConstructionAlgorithm<span>::~UnitigsConstructionAlgorithm ()
 template <size_t span>
 void UnitigsConstructionAlgorithm<span>::execute ()
 {
-    kmerSize =
-            getInput()->getInt(STR_KMER_SIZE);
-    int abundance = 
-            getInput()->getInt(STR_KMER_ABUNDANCE_MIN); // note: doesn't work when it's "auto"
-    int minimizerSize =
-        getInput()->getInt(STR_MINIMIZER_SIZE);
-    int nb_threads =
-        getInput()->getInt(STR_NB_CORES);
-    int minimizer_type =
-        getInput()->getInt(STR_MINIMIZER_TYPE);
-    bool verbose = getInput()->getInt(STR_VERBOSE);
+    kmerSize                    = getInput()->getInt(STR_KMER_SIZE);
+    int abundance               = getInput()->getInt(STR_KMER_ABUNDANCE_MIN); // note: doesn't work when it's "auto"
+    int minimizerSize           = getInput()->getInt(STR_MINIMIZER_SIZE);
+    int nb_threads              = getInput()->getInt(STR_NB_CORES);
+    int minimizer_type          = getInput()->getInt(STR_MINIMIZER_TYPE);
+    bool verbose                = getInput()->getInt(STR_VERBOSE);
+    bool edge_km_representation = getInput()->getInt(STR_EDGE_KM_REPRESENTATION);
+    bool all_abundance_counts   = getInput()->get(STR_ALL_ABUNDANCE_COUNTS);
+   
     int nb_glue_partitions = 0;
     if (getInput()->get("-nb-glue-partitions"))
         nb_glue_partitions = getInput()->getInt("-nb-glue-partitions");
@@ -110,9 +108,9 @@ void UnitigsConstructionAlgorithm<span>::execute ()
     if ((unsigned int)nb_threads > nbThreads)
         std::cout << "Uh. Unitigs graph construction called with nb_threads " << nb_threads << " but dispatcher has nbThreads " << nbThreads << std::endl;
 
-    if (do_bcalm) bcalm2<span>(&_storage, unitigs_filename, kmerSize, abundance, minimizerSize, nbThreads, minimizer_type, verbose); 
-    if (do_bglue) bglue<span> (&_storage, unitigs_filename, kmerSize, nb_glue_partitions,       nbThreads,                 verbose);
-    if (do_links) link_tigs<span>(unitigs_filename, kmerSize, nbThreads, nb_unitigs, verbose);
+    if (do_bcalm) bcalm2<span>(&_storage, unitigs_filename, kmerSize, abundance, minimizerSize, nbThreads, minimizer_type,       verbose); 
+    if (do_bglue) bglue<span> (&_storage, unitigs_filename, kmerSize, nb_glue_partitions,       nbThreads, all_abundance_counts, verbose);
+    if (do_links) link_tigs<span>(unitigs_filename, kmerSize, nbThreads, nb_unitigs, verbose, edge_km_representation);
 
     /** We gather some statistics. */
     // nb_unitigs will be used in GraphUnitigs


=====================================
gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
=====================================
@@ -1300,6 +1300,16 @@ void SortingCountAlgorithm<span>::fillPartitions (size_t pass, Iterator<Sequence
 				itBanks[i]->finalize();
 			}
 		}
+
+        // force close partitions and re-open them for reading
+        // may prevent crash in large multi-bank counting instance on Lustre filesystems
+		if(_config._solidityKind != KMER_SOLIDITY_SUM)
+		{
+			string tmpStorageName = getInput()->getStr(STR_URI_OUTPUT_TMP) + "/" + System::file().getTemporaryFilename("dsk_partitions");
+			setPartitions        (0); // close the partitions first, otherwise new files are opened before  closing parti from previous pass
+			setPartitions        ( & (*_tmpPartitionsStorage)().getPartition<Type> ("parts"));
+			
+		}
 	}
 
 /*********************************************************************


=====================================
gatb-core/src/gatb/system/impl/FileSystemCommon.hpp
=====================================
@@ -36,6 +36,7 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <iostream>
 
 /********************************************************************************/
 namespace gatb      {
@@ -60,6 +61,7 @@ public:
     {
         _isStdout = path && strcmp(path,"stdout")==0;
         _handle   = _isStdout ? stdout : fopen (path, mode);
+        //std::cout << "opening file " << _path << " handle " << _handle << std::endl;
 		if(_handle == 0)
 		{
 			throw Exception ("cannot open %s %s",path,strerror(errno));
@@ -67,7 +69,9 @@ public:
     }
 
     /** Destructor. */
-    virtual ~CommonFile ()  {  if (_handle && !_isStdout)  {  fclose (_handle);  }  }
+    virtual ~CommonFile ()  {  if (_handle && !_isStdout)  {  
+        //std::cout << "closing file " << _path << " handle " << _handle << std::endl; 
+        fclose (_handle);  }  }
 
     /** \copydoc IFile::isOpen */
     bool isOpen ()  { return getHandle() != 0; }


=====================================
gatb-core/src/gatb/template/TemplateSpecialization10.cpp.in
=====================================
@@ -25,15 +25,16 @@ template void bglue<${KSIZE}>(Storage* storage,
         int kmerSize, 
         int nb_glue_partitions, 
         int nb_threads, 
+        bool all_abundance_counts,
         bool verbose
         );
 
 template class graph3<${KSIZE}>; // graph3<span> switch  
 
 template void link_tigs<${KSIZE}>
-    (std::string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose, bool renumber_unitigs = false);
+    (std::string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose, bool edge_km_representation, bool renumber_unitigs = false);
 
-template void link_unitigs_pass<${KSIZE}>(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize, const bool renumber_unitigs);
+template void link_unitigs_pass<${KSIZE}>(const std::string unitigs_filename, bool verbose, const int pass, const int kmerSize, bool edge_km_representation, const bool renumber_unitigs);
 
 
 /********************************************************************************/


=====================================
gatb-core/src/gatb/tools/collections/impl/IteratorFile.hpp
=====================================
@@ -239,6 +239,7 @@ public:
     _filename(it._filename), _gzfile(0),  _buffer(0), _cpt_buffer(0), _idx(0), _cacheItemsNb(it._cacheItemsNb), _isDone(true)
     {
         _gzfile =   gzopen(_filename.c_str(),"rb");
+        gzbuffer(_gzfile,2*1024*1024);
         _buffer  = (Item*) MALLOC (sizeof(Item) * _cacheItemsNb);
     }
     
@@ -248,6 +249,7 @@ public:
     
     {
         _gzfile =   gzopen(_filename.c_str(),"rb");
+        gzbuffer(_gzfile,2*1024*1024);
         _buffer  = (Item*) MALLOC (sizeof(Item) * _cacheItemsNb);
     }
     
@@ -273,6 +275,7 @@ public:
             _isDone       = it._isDone;
             
             _gzfile =   gzopen(_filename.c_str(),"r");
+            gzbuffer(_gzfile,2*1024*1024);
             _buffer  = (Item*) MALLOC (sizeof(Item) * it._cacheItemsNb);
         }
         return *this;


=====================================
gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp
=====================================
@@ -83,6 +83,8 @@ public:
     const char* graph          ()  { return "-graph";          }
     const char* kmer_size      ()  { return "-kmer-size";      }
     const char* minimizer_size ()  { return "-minimizer-size"; }
+    const char* edge_km_representation ()  { return "-edge-km"; }
+    const char* all_abundance_counts   ()  { return "-all-abundance-counts"; }
     const char* kmer_abundance ()  { return "-abundance"; }
     const char* kmer_abundance_min ()  { return "-abundance-min"; }
     const char* kmer_abundance_min_threshold ()  { return "-abundance-min-threshold"; }
@@ -138,6 +140,8 @@ public:
 #define STR_URI_GRAPH           gatb::core::tools::misc::StringRepository::singleton().graph ()
 #define STR_KMER_SIZE           gatb::core::tools::misc::StringRepository::singleton().kmer_size ()
 #define STR_MINIMIZER_SIZE      gatb::core::tools::misc::StringRepository::singleton().minimizer_size ()
+#define STR_EDGE_KM_REPRESENTATION      gatb::core::tools::misc::StringRepository::singleton().edge_km_representation ()
+#define STR_ALL_ABUNDANCE_COUNTS        gatb::core::tools::misc::StringRepository::singleton().all_abundance_counts ()
 #define STR_INTEGER_PRECISION   gatb::core::tools::misc::StringRepository::singleton().integer_precision ()
 #define STR_KMER_ABUNDANCE      gatb::core::tools::misc::StringRepository::singleton().kmer_abundance ()
 #define STR_KMER_ABUNDANCE_MIN  gatb::core::tools::misc::StringRepository::singleton().kmer_abundance_min ()


=====================================
gatb-core/src/gatb/tools/misc/impl/Tool.cpp
=====================================
@@ -57,7 +57,6 @@ Tool::Tool (const std::string& name) : userDisplayHelp(0), _helpTarget(0),userDi
 
     getParser()->push_back (new OptionOneParam (STR_NB_CORES,    "number of cores",      false, "0"  ));
     getParser()->push_back (new OptionOneParam (STR_VERBOSE,     "verbosity level",      false, "1"  ));
-	
 	getParser()->push_back (new OptionNoParam (STR_VERSION, "version", false));
 	getParser()->push_back (new OptionNoParam (STR_HELP, "help", false));
 


=====================================
gatb-core/src/gatb/tools/storage/impl/CollectionHDF5Patch.hpp
=====================================
@@ -266,6 +266,7 @@ public:
         herr_t status = 0;
 
         {
+            //std::cout << "begin insert" << std::endl;
             system::LocalSynchronizer localsynchro (_common->_synchro);
 
             /** We get the dataset id. */
@@ -300,6 +301,7 @@ public:
             status = H5Sclose (filespaceId);
             status = H5Sclose (memspaceId);
             if (status != 0)  { std::cout << "err H5Sclose" << std::endl; }
+            //std::cout << "end insert" << std::endl;
         }
 
         /** We periodically clean up some HDF5 resources. */
@@ -373,12 +375,14 @@ private:
          * NOTE !!!  the 'clean' method called after this block is also synchronized,
          * and therefore must not be in the same instruction block. */
         {
+            //std::cout << "begin retrievecache" << std::endl;
             system::LocalSynchronizer localsynchro (_common->_synchro);
 
             hid_t memspaceId = H5Screate_simple (1, &count, NULL);
 
             /** Select hyperslab on file dataset. */
             hid_t filespaceId = H5Dget_space(_common->getDatasetId());
+            //std::cout << "filespaceId "  << filespaceId << std::endl;
             status = H5Sselect_hyperslab (filespaceId, H5S_SELECT_SET, &start, NULL, &count, NULL);
             if (status < 0)  { throw gatb::core::system::Exception ("HDF5 error (H5Sselect_hyperslab), status %d", status);  }
 
@@ -390,6 +394,7 @@ private:
             status = H5Sclose (filespaceId);
             status = H5Sclose (memspaceId);
             if (status < 0)  { throw gatb::core::system::Exception ("HDF5 error (H5Sclose), status %d", status);  }
+            //std::cout << "end retrievecache" << std::endl;
         }
 
         /** We periodically clean up some HDF5 resources. */


=====================================
gatb-core/src/gatb/tools/storage/impl/Storage.hpp
=====================================
@@ -181,7 +181,7 @@ public:
 
     /** Get a child partition from its name. Created if not already exists.
      * \param[in] name : name of the child partition to be retrieved.
-     * \param[in] nb : in case of creation, tells how many collection belong to the partition.
+     * \param[in] nb : in case of creation, tells how many collection belong to the partition. IMPORTANT: if nb != 0, StorageFile will erase the partition before opening it. So if you're opening a partition, just set nb=0 and let it autodetect the size
      * \return the child partition.
      */
     template <class Type>  Partition<Type>& getPartition (const std::string& name, size_t nb=0);


=====================================
gatb-core/src/gatb/tools/storage/impl/StorageFile.hpp
=====================================
@@ -96,7 +96,8 @@ namespace impl      {
             /** */
             ~GroupFile()
             {
-				system::impl::System::file().rmdir(folder); // hack to remove the trashme folers. I'd have liked to make that call in remove() but for some reason remove() isn't called
+                //std::cout << "groupfile destructor called, removing folder " << folder << std::endl;
+                system::impl::System::file().rmdir(folder); // hack to remove the trashme folers. I'd have liked to make that call in remove() but for some reason remove() isn't called
             }
 
             /** */
@@ -219,17 +220,24 @@ public:
         if (!system::impl::System::file().isFolderEndingWith(storage_prefix,"_gatb"))
             file_folder += "_gatb/";
 
-        std::string filename = file_folder + parent->getFullId('.') + std::string(".") + name;
-        std::string folder = system::impl::System::file().getDirectory(filename);
-        std::string prefix = system::impl::System::file().getBaseName(filename) + std::string(".") + name; // because gatb's getBaseName is stupid and cuts after the last dot
+        std::string full_path = file_folder;
+        std::string parent_base = parent->getFullId('.');
+        std::string base_name = parent_base;
+        if (parent_base.size() > 0)
+            base_name += std::string(".");  // because gatb's getBaseName is stupid and cuts after the last dot
+        base_name += name; 
+
+        full_path += base_name; // but then base_name might have a suffix like ".1" for partitions
+
+        //std::cout <<"name: " << name << " filename " << full_path << " prefix " << base_name<< std::endl;
 
         if (nb == 0)
         {   // if nb is 0, it means we're opening partitions and not creating them, thus we need to get the number of partitions.
 
            int nb_partitions=0;
-           for (auto filename : system::impl::System::file().listdir(folder))
+           for (auto filename : system::impl::System::file().listdir(file_folder))
             {
-                if (!filename.compare(0, prefix.size(), prefix)) // startswith
+                if (!filename.compare(0, base_name.size(), base_name)) // startswith
                 {
                     nb_partitions++;
                 }
@@ -240,19 +248,20 @@ public:
                 std::cout << "error: could not get number of partition for " << name << " using StorageFile" << std::endl;
                 exit(1);
             }
+            //std::cout << "got " << nb << " partitions" << std::endl;
 		}
         else
         {
             // else, if nb is set, means we're creating some partitions. let's delete all the previous ones to avoid wrongly counting 
-            for (auto filename : system::impl::System::file().listdir(folder))
+            for (auto filename : system::impl::System::file().listdir(file_folder))
             {
-                //std::cout <<"name: " << name << " comparing " << filename << " with prefix " << prefix << std::endl;
-                if (!filename.compare(0, prefix.size(), prefix)) // startswith
+                //std::cout <<"name: " << name << " comparing " << filename << " with prefix " << base_name << std::endl;
+                if (!filename.compare(0, base_name.size(), base_name)) // startswith
                 {
                     // some additional guard:
                     if (filename == "." ||filename == "..") continue;
-                    system::impl::System::file().remove(folder + "/" + filename);
-                    //std::cout << "deleting" << folder << "/" << filename << std::endl;
+                    system::impl::System::file().remove(file_folder + "/" + filename);
+                    //std::cout << "deleting " << file_folder << "/" << filename << std::endl;
                 }
             }
         }


=====================================
gatb-core/src/gatb/tools/storage/impl/StorageHDF5.hpp
=====================================
@@ -268,6 +268,7 @@ private:
                 std::string actualName = this->getFullId('/');
 
                 /** We create the HDF5 group if needed. */
+//std::cout << "actualname: "<< actualName << " end"<<std::endl;
                 htri_t doesExist = H5Lexists (storage->getFileId(), actualName.c_str(), H5P_DEFAULT);
 
                 if (doesExist <= 0)


=====================================
gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
=====================================
@@ -87,6 +87,7 @@ class TestDebruijn : public Test
     /********************************************************************************/
     CPPUNIT_TEST_SUITE_GATB (TestDebruijn);
 
+        CPPUNIT_TEST_GATB (debruijn_build);
         CPPUNIT_TEST_GATB (debruijn_test_small_kmers);
         CPPUNIT_TEST_GATB (debruijn_large_abundance_query);
         CPPUNIT_TEST_GATB (debruijn_test7); 
@@ -104,7 +105,6 @@ class TestDebruijn : public Test
         CPPUNIT_TEST_GATB (debruijn_test12);
         CPPUNIT_TEST_GATB (debruijn_test13);
 //        CPPUNIT_TEST_GATB (debruijn_mutation); // has been removed due to it crashing clang, and since mutate() isn't really used in apps, i didn't bother.
-        CPPUNIT_TEST_GATB (debruijn_build);
         CPPUNIT_TEST_GATB (debruijn_checkbranching);
         CPPUNIT_TEST_GATB (debruijn_mphf);
         CPPUNIT_TEST_GATB (debruijn_mphf_nodeindex);
@@ -908,13 +908,22 @@ public:
         IBank* inputBank = new BankStrings (sequences, nbSequences);
         LOCAL (inputBank);
 
+
+        //std::cout << "g1 create" << std::endl;
         Graph::create (inputBank,  "-kmer-size 31 -out %s -abundance-min 1  -verbose 0  -max-memory %d",                        "g1", MAX_MEMORY);
+
+        //std::cout << "g2 create" << std::endl;
         Graph::create (inputBank,  "-kmer-size 31 -out %s -abundance-min 1  -verbose 0 -branching-nodes none  -max-memory %d",  "g2", MAX_MEMORY);
-        Graph::create (inputBank,  "-kmer-size 31 -out %s -abundance-min 1  -verbose 0 -solid-kmers-out none  -max-memory %d",  "g3", MAX_MEMORY);
+
+        // This test doesn't work anymore.
+        // It's probably a small fix somewehre
+        // But I'd argue that the gatb feature of 'not outputting solid kmers to disk' is useless
+        // So instead of bothering, I'm just removing the present unit test.
+        //Graph::create (inputBank,  "-kmer-size 31 -out %s -abundance-min 1  -verbose 0 -solid-kmers-out none -debloom none -branching-nodes none -max-memory %d",  "g3", MAX_MEMORY);
 
         debruijn_build_entry r1 = debruijn_build_aux_aux ("g1", true,  true);
         debruijn_build_entry r2 = debruijn_build_aux_aux ("g2", true,  true);
-        debruijn_build_entry r3 = debruijn_build_aux_aux ("g3", false, true);
+        //debruijn_build_entry r3 = debruijn_build_aux_aux ("g3", false, true);
 
         CPPUNIT_ASSERT (r1.nbNodes       == r2.nbNodes);
         CPPUNIT_ASSERT (r1.checksumNodes == r2.checksumNodes);
@@ -925,8 +934,8 @@ public:
 
         CPPUNIT_ASSERT (r1.nbBranchingNodes       == r2.nbBranchingNodes);
         CPPUNIT_ASSERT (r1.checksumBranchingNodes == r2.checksumBranchingNodes);
-        CPPUNIT_ASSERT (r1.nbBranchingNodes       == r3.nbBranchingNodes);
-        CPPUNIT_ASSERT (r1.checksumBranchingNodes == r3.checksumBranchingNodes);
+        //CPPUNIT_ASSERT (r1.nbBranchingNodes       == r3.nbBranchingNodes); // uncomment if we ever fix r3 (see long comment above)
+        //CPPUNIT_ASSERT (r1.checksumBranchingNodes == r3.checksumBranchingNodes);
     }
 
     /********************************************************************************/


=====================================
gatb-core/test/unit/src/kmer/TestDSK.cpp
=====================================
@@ -471,6 +471,9 @@ public:
         // printf ("min=%ld  max=%ld  nb=%ld  check=%ld \n",
         //    nksMin, nksMax, sortingCount.getSolidCounts()->getNbItems(),checkNb
         // );
+        
+        if (sortingCount.getSolidCounts()->getNbItems() != (int)checkNb)
+            std::cout << "counted " <<sortingCount.getSolidCounts()->getNbItems()<< " kmers, expected " << (int)checkNb << std::endl;
 
         CPPUNIT_ASSERT (sortingCount.getSolidCounts()->getNbItems() == (int)checkNb);
     }


=====================================
gatb-core/test/unit/src/tools/storage/TestStorage.cpp
=====================================
@@ -132,7 +132,10 @@ public:
         {
             size_t nbIter = 0;
             Iterator<NativeInt64>* it = partition[i].iterator();    LOCAL(it);
-            for (it->first(); !it->isDone(); it->next(), nbIter++)  {  CPPUNIT_ASSERT (it->item() == 2*i);  }
+            for (it->first(); !it->isDone(); it->next(), nbIter++)  {  
+                 if (it->item() != 2*i)
+                    std::cout << std::endl << "item " << it->item() << " expected: " << 2*i << std::endl;            
+                CPPUNIT_ASSERT (it->item() == 2*i);  }
             CPPUNIT_ASSERT (nbIter == 1);
         }
 
@@ -152,7 +155,9 @@ public:
             Iterator<NativeInt64>* it = partition[i].iterator();    LOCAL(it);
             for (it->first(); !it->isDone(); it->next(), nbIter++)
             {
-                if (nbIter==0)  {  CPPUNIT_ASSERT (it->item() == 2*i  ); }
+                if (nbIter==0)  { if (it->item() != 2*i)
+                                    std::cout << "item " << it->item() << " expected: " << 2*i << std::endl;            
+                      CPPUNIT_ASSERT (it->item() == 2*i  ); }
                 if (nbIter==1)  {  CPPUNIT_ASSERT (it->item() == 2*i+1); }
             }
             CPPUNIT_ASSERT (nbIter == 2);


=====================================
gatb-core/thirdparty/update-boost.sh
=====================================
@@ -0,0 +1,13 @@
+#this is the procedure I use to update to newer versions of boost in gatb-core
+#pretty simple but gets the job done
+#to be run within thirdparty/
+#-Rayan
+
+newdir=boost_1_71_0/boost/
+olddir=boost
+
+for file in `ls $olddir`
+do
+    echo $file
+    cp -R $newdir/$file $olddir/
+done



View it on GitLab: https://salsa.debian.org/med-team/gatb-core/compare/79d0f52f9ef343e1e3980713d2fc11c1f3e51014...c58b23ef69cbe45b0c25effd4d5d8410e7bfb1ad

-- 
View it on GitLab: https://salsa.debian.org/med-team/gatb-core/compare/79d0f52f9ef343e1e3980713d2fc11c1f3e51014...c58b23ef69cbe45b0c25effd4d5d8410e7bfb1ad
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191204/601b59a1/attachment-0001.html>


More information about the debian-med-commit mailing list