[med-svn] [Git][med-team/bifrost][upstream] New upstream version 1.3.1

Andreas Tille (@tille) gitlab at salsa.debian.org
Fri Jan 5 08:21:10 GMT 2024



Andreas Tille pushed to branch upstream at Debian Med / bifrost


Commits:
225683eb by Andreas Tille at 2024-01-05T09:12:15+01:00
New upstream version 1.3.1
- - - - -


8 changed files:

- CMakeLists.txt
- Changelog.md
- src/Bifrost.cpp
- src/ColorSet.cpp
- src/ColoredCDBG.hpp
- src/ColoredCDBG.tcc
- src/CompactedDBG.hpp
- src/strict_fstream.hpp


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 2.8.12)
 
-project(Bifrost)
+project(Bifrost C CXX)
+find_package(Threads REQUIRED)
 
 # To enable a larger default k-mer size, replace MAX_KMER_SIZE with a larger multiple of 32: actual maximum k-mer size will be MAX_KMER_SIZE-1.
 SET(MAX_KMER_SIZE "32" CACHE STRING "MAX_KMER_SIZE")
@@ -14,8 +15,17 @@ SET(ENABLE_AVX2 "ON" CACHE STRING "ENABLE_AVX2")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
+set(CMAKE_MACOSX_RPATH 1)
+
 set_property(SOURCE BlockedBloomFilter.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -funroll-loops")
 
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+       message("Disabling AVX2 instructions on arm64")
+       set(ENABLE_AVX2 "OFF")
+       set(COMPILATION_ARCH "OFF")
+endif(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+
+
 if(COMPILATION_ARCH MATCHES "OFF")
 	message("Disabling native architecture compilation (including AVX2)")
 else(COMPILATION_ARCH MATCHES "OFF")


=====================================
Changelog.md
=====================================
@@ -2,6 +2,8 @@
 
 API only.
 
+* **17-11-2023**
+	* Class `ColoredCDBG` exposes the function `readGraph()`, which loads a graph into a colored de Bruijn graph without associating any colors with the unitigs. This makes it possible to color the graph later with any input sequences.
 * **10-09-2023**
 	* Function `CompactedDBG()::search()` takes additional arguments:
 		* `get_nb_found_km`: boolean indicating whether to report in the output the number of found k-mers per query 


=====================================
src/Bifrost.cpp
=====================================
@@ -723,7 +723,7 @@ int main(int argc, char **argv){
                     else success = ccdbg.read(opt.filename_graph_in, opt.filename_index_in, opt.filename_colors_in, opt.nb_threads, opt.verbose);
 
                     if (success) success = ccdbg.search(opt.filename_query_in, opt.prefixFilenameOut, opt.ratio_kmers, opt.get_nb_found_km, opt.get_ratio_found_km,
-                                                        opt.inexact_search, opt.nb_threads, opt.verbose);
+                                                       opt.inexact_search, opt.nb_threads, opt.verbose);
                 }
                 else {
 


=====================================
src/ColorSet.cpp
=====================================
@@ -1,3 +1,7 @@
+#if defined(__APPLE__)
+#include <unistd.h>
+#endif
+
 #include "ColorSet.hpp"
 
 UnitigColors::UnitigColors() : setBits(localBitVector) {}


=====================================
src/ColoredCDBG.hpp
=====================================
@@ -197,13 +197,13 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         ColoredCDBG& operator=(ColoredCDBG&& o);
 
         /** Equality operator.
-        * @return a boolean indicating if two compacted de Bruijn graphs have the same colored unitigs (does not
+        * @return a boolean indicating whether two compacted de Bruijn graphs have the same colored unitigs (does not
         * compare the data associated with the unitigs).
         */
         bool operator==(const ColoredCDBG& o) const;
 
         /** Inequality operator.
-        * @return a boolean indicating if two compacted de Bruijn graphs have different colored unitigs (does not
+        * @return a boolean indicating whether two compacted de Bruijn graphs have different colored unitigs (does not
         * compare the data associated with the unitigs).
         */
         inline bool operator!=(const ColoredCDBG& o) const;
@@ -229,14 +229,14 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         /** Build the Colored and compacted de Bruijn graph (only the unitigs).
         * A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
         * @param opt is a structure from which the members are parameters of this function. See CCDBG_Build_opt.
-        * @return boolean indicating if the graph has been built successfully.
+        * @return boolean indicating whether the graph has been built successfully.
         */
         bool buildGraph(const CCDBG_Build_opt& opt);
 
         /** Map the colors to the unitigs. This is done by reading the input files and querying the graph.
         * If a color filename is provided in opt.filename_colors_in, colors are loaded from that file instead.
         * @param opt is a structure from which the members are parameters of this function. See CCDBG_Build_opt.
-        * @return boolean indicating if the colors have been mapped successfully.
+        * @return boolean indicating whether the colors have been mapped successfully.
         */
         bool buildColors(const CCDBG_Build_opt& opt);
 
@@ -248,11 +248,11 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * @param write_meta_file indicates if a graph meta file is written to disk. Graph meta files enable faster graph loading.
         * @param compressed_output indicates if the output file is compressed.
         * @param verbose is a boolean indicating if information message are printed during writing (true) or not (false).
-        * @return a boolean indicating if the graph was successfully written.
+        * @return a boolean indicating whether the graph was successfully written.
         */
         bool write(const string& prefix_output_fn, const size_t nb_threads = 1, const bool write_index_file = true, const bool compress_output = false, const bool verbose = false) const;
 
-        /** Read a colored and compacted de Bruijn graph from disk. The graph (in GFA, FASTA or BFG format) must 
+        /** Load a colored and compacted de Bruijn graph from disk. The graph (in GFA, FASTA or BFG format) must 
         * have been produced by Bifrost. By default, the function detects if an index file (BFI format) exists for the
         * input graph and will use it to load the graph. Otherwise, reading the graph will be much slower
         * than function read() with the index filename in input parameter.
@@ -260,21 +260,40 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * @param input_colors_fn is a string which is the prefix of the color filename to read
         * @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
         * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
-        * @return a boolean indicating if the graph was successfully read.
+        * @return a boolean indicating whether the graph was successfully read.
         */
         bool read(const string& input_graph_fn, const string& input_colors_fn, const size_t nb_threads = 1, const bool verbose = false);
 
-        /** Read a colored and compacted de Bruijn graph from disk using an index file. The graph (in GFA, FASTA or BFG format)
-        * must have been produced by Bifrost. 
+        /** Load a colored and compacted de Bruijn graph from disk using an index file.
+        * The graph (in GFA, FASTA or BFG format) must have been produced by Bifrost. 
         * @param input_graph_fn is a string which is the prefix of the graph filename to read
         * @param input_index_fn is a string which is the prefix of the index filename to read
         * @param input_colors_fn is a string which is the prefix of the color filename to read
         * @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
         * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
-        * @return a boolean indicating if the graph was successfully read.
+        * @return a boolean indicating whether the graph was successfully read.
         */
         bool read(const string& input_graph_fn, const string& input_index_fn, const string& input_colors_fn, const size_t nb_threads = 1, const bool verbose = false);
 
+        /** Load a colored and compacted de Bruijn graph without its colors from disk.
+        * A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
+        * @param input_graph_fn is a string which is the prefix of the graph filename to read
+        * @param nb_threads is the number of threads that can be used to read the graph from disk.
+        * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
+        * @return a boolean indicating whether the graph was successfully read.
+        */
+        bool readGraph(const string& input_graph_fn, const size_t nb_threads = 1, const bool verbose = false);
+
+        /** Load a colored and compacted de Bruijn graph without its colors from disk using an index file.
+        * A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
+        * @param input_graph_fn is a string which is the prefix of the graph filename to read
+        * @param input_index_fn is a string which is the prefix of the index filename to read
+        * @param nb_threads is the number of threads that can be used to read the graph from disk.
+        * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
+        * @return a boolean indicating whether the graph was successfully read.
+        */
+        bool readGraph(const string& input_graph_fn, const string& input_index_fn, const size_t nb_threads = 1, const bool verbose = false);
+
         /** Merge a colored and compacted de Bruijn graph.
         * After merging, all unitigs and colors of the input graph have been added to and compacted with the current
         * colored and compacted de Bruijn graph (this). If the unitigs of the input graph had data of type "MyUnitigData"
@@ -285,7 +304,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * @param o is a constant reference to the colored and compacted de Bruijn graph to merge.
         * @param nb_threads is an integer indicating how many threads can be used during the merging.
         * @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
-        * @return a boolean indicating if the graph has been successfully merged.
+        * @return a boolean indicating whether the graph has been successfully merged.
         */
         bool merge(const ColoredCDBG& o, const size_t nb_threads = 1, const bool verbose = false);
 
@@ -301,7 +320,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * std::move(). After merging, the graph pointed by o is cleared.
         * @param nb_threads is an integer indicating how many threads can be used during the merging.
         * @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
-        * @return a boolean indicating if the graph has been successfully merged.
+        * @return a boolean indicating whether the graph has been successfully merged.
         */
         bool merge(ColoredCDBG&& o, const size_t nb_threads = 1, const bool verbose = false);
 
@@ -313,7 +332,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * @param v is a constant reference to a vector of colored and compacted de Bruijn graphs to merge.
         * @param nb_threads is an integer indicating how many threads can be used during the merging.
         * @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
-        * @return a boolean indicating if the graphs have been successfully merged.
+        * @return a boolean indicating whether the graphs have been successfully merged.
         */
         bool merge(const vector<ColoredCDBG>& v, const size_t nb_threads = 1, const bool verbose = false);
 
@@ -327,7 +346,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
         * obtained using std::move(). After merging, the graphs in v are cleared.
         * @param nb_threads is an integer indicating how many threads can be used during the merging.
         * @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
-        * @return a boolean indicating if the graphs have been successfully merged.
+        * @return a boolean indicating whether the graphs have been successfully merged.
         */
         bool merge(vector<ColoredCDBG>&& v, const size_t nb_threads = 1, const bool verbose = false);
 


=====================================
src/ColoredCDBG.tcc
=====================================
@@ -719,6 +719,115 @@ bool ColoredCDBG<U>::read(const string& input_graph_fn, const string& input_inde
     return valid_input_files;
 }
 
+template<typename U>
+bool ColoredCDBG<U>::readGraph(const string& input_graph_fn, const size_t nb_threads, const bool verbose) {
+
+    bool valid_input_files = true;
+
+    if (input_graph_fn.length() != 0){
+
+        if (check_file_exists(input_graph_fn)){
+
+            FILE* fp = fopen(input_graph_fn.c_str(), "r");
+
+            if (fp == NULL) {
+
+                cerr << "ColoredCDBG::readGraph(): Could not open input graph file " << input_graph_fn << endl;
+                valid_input_files = false;
+            }
+            else fclose(fp);
+        }
+        else {
+
+            cerr << "ColoredCDBG::readGraph(): Input graph file " << input_graph_fn << " does not exist." << endl;
+            valid_input_files = false;
+        }
+    }
+    else {
+
+        cerr << "ColoredCDBG::readGraph(): No input graph file provided." << endl;
+        valid_input_files = false;
+    }
+
+    if (valid_input_files){
+
+        if (verbose) cout << "ColoredCDBG::readGraph(): Reading graph." << endl;
+        
+        invalid = !CompactedDBG<DataAccessor<U>, DataStorage<U>>::read(input_graph_fn, nb_threads, verbose);
+
+        if (invalid) return false; // Read graph
+    }
+
+    return valid_input_files;
+}
+
+template<typename U>
+bool ColoredCDBG<U>::readGraph(const string& input_graph_fn, const string& input_index_fn, const size_t nb_threads, const bool verbose) {
+
+    bool valid_input_files = true;
+
+    if (input_graph_fn.length() != 0){
+
+        if (check_file_exists(input_graph_fn)){
+
+            FILE* fp = fopen(input_graph_fn.c_str(), "r");
+
+            if (fp == NULL) {
+
+                cerr << "ColoredCDBG::readGraph(): Could not open input graph file " << input_graph_fn << endl;
+                valid_input_files = false;
+            }
+            else fclose(fp);
+        }
+        else {
+
+            cerr << "ColoredCDBG::readGraph(): Input graph file " << input_graph_fn << " does not exist." << endl;
+            valid_input_files = false;
+        }
+    }
+    else {
+
+        cerr << "ColoredCDBG::readGraph(): No input graph file provided." << endl;
+        valid_input_files = false;
+    }
+
+    if (input_index_fn.length() != 0){
+
+        if (check_file_exists(input_index_fn)){
+
+            FILE* fp = fopen(input_index_fn.c_str(), "rb");
+
+            if (fp == NULL) {
+
+                cerr << "ColoredCDBG::readGraph(): Could not open input index file " << input_index_fn << endl;
+                valid_input_files = false;
+            }
+            else fclose(fp);
+        }
+        else {
+
+            cerr << "ColoredCDBG::readGraph(): Input index file " << input_index_fn << " does not exist." << endl;
+            valid_input_files = false;
+        }
+    }
+    else {
+
+        cerr << "ColoredCDBG::readGraph(): No input index file provided." << endl;
+        valid_input_files = false;
+    }
+
+    if (valid_input_files){
+
+        if (verbose) cout << "ColoredCDBG::readGraph(): Reading graph." << endl;
+        
+        invalid = !CompactedDBG<DataAccessor<U>, DataStorage<U>>::read(input_graph_fn, input_index_fn, nb_threads, verbose);
+
+        if (invalid) return false; // Read graph
+    }
+
+    return valid_input_files;
+}
+
 template<typename U>
 void ColoredCDBG<U>::initUnitigColors(const CCDBG_Build_opt& opt, const size_t max_nb_hash){
 
@@ -1421,7 +1530,7 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
 
     if (get_nb_found_km && get_ratio_found_km){
 
-        cerr << "ColoredCDBG::search(): Cannot output at once the number of found k-mers and the ratio of found k-mers." << endl;
+        cerr << "ColoredCDBG::search(): Cannot output number of found k-mers and ratio of found k-mers together." << endl;
         return false;
     }
 
@@ -1538,7 +1647,6 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
             else {
 
                 const Kmer head = um.getUnitigTail().twin();
-                const size_t max_pos_um = um.dist + um.len - 1;
 
                 it = s_um.find({pos_query, {head, um.dist}});
 
@@ -1569,6 +1677,8 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
 
                     if (inexact_search){
 
+                        const size_t max_pos_um = um.dist + um.len - 1;
+
                         for (; it_uc != it_uc_end; ++it_uc) color_occ_r[it_uc.getColorID()].add(max_pos_um - it_uc.getKmerPosition() + p.first);
                     }
                     else {
@@ -1593,31 +1703,40 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
 
         if (inexact_search){
 
-            size_t nb_color_pres = 0;
+            if (!get_nb_found_km && !get_ratio_found_km) {
 
-            for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+                size_t nb_color_pres = 0;
 
-            if (nb_color_pres == nb_colors) return;
+                for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+
+                if (nb_color_pres == nb_colors) return;
+            }
 
             const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_d = this->searchSequence(query, false, false, true, false, false);
 
             processCounts(v_um_d, color_occ_r, color_occ_u); // Extract k-mer occurrences for each color
 
-            nb_color_pres = 0;
+            if (!get_nb_found_km && !get_ratio_found_km) {
+                
+                size_t nb_color_pres = 0;
 
-            for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+                for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
 
-            if (nb_color_pres == nb_colors) return;
+                if (nb_color_pres == nb_colors) return;
+            }
 
             const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_m = this->searchSequence(query, false, false, false, true, false);
 
             processCounts(v_um_m, color_occ_r, color_occ_u); // Extract k-mer occurrences for each color
 
-            nb_color_pres = 0;
+            if (!get_nb_found_km && !get_ratio_found_km) {
+                
+                size_t nb_color_pres = 0;
 
-            for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+                for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
 
-            if (nb_color_pres == nb_colors) return;
+                if (nb_color_pres == nb_colors) return;
+            }
 
             const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_i = this->searchSequence(query, false, true, false, false, false);
 
@@ -1735,7 +1854,9 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
         for (size_t i = 0; i < nb_colors; ++i) {
 
             color_query_out += '\t';
-            color_query_out += to_string(get_ratio_found_km ? color_occ[i] : (static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query)));
+
+            if (get_nb_found_km) color_query_out += to_string(color_occ[i]);
+            else color_query_out += to_string(static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query));
         }
 
         const size_t l_color_query_out = color_query_out.length();
@@ -1772,7 +1893,9 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
         for (size_t i = 0; i < nb_colors; ++i) {
 
             color_query_out += '\t';
-            color_query_out += to_string(get_ratio_found_km ? color_occ[i] : (static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query)));
+
+            if (get_nb_found_km) color_query_out += to_string(color_occ[i]);
+            else color_query_out += to_string(static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query));
         }
 
         const size_t l_color_query_out = color_query_out.length();
@@ -1941,7 +2064,7 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
 
                             for (auto& c : buffers_seq[i]) c &= 0xDF;
 
-                            if (get_nb_found_km){
+                            if (get_nb_found_km || get_ratio_found_km){
 
                                 searchQuery(buffers_seq[i], color_occ_r, color_occ_u, nb_km_query);
                                 writeOutQuantMutex(buffers_name[i].c_str(), buffers_name[i].length(), nb_km_query, color_occ_u, buffer_res, pos_buffer_out, mutex_file_out);


=====================================
src/CompactedDBG.hpp
=====================================
@@ -164,7 +164,6 @@ struct CDBG_Build_opt {
     bool get_nb_found_km;
     bool get_ratio_found_km;
 
-
     bool writeIndexFile;
 
     double ratio_kmers;


=====================================
src/strict_fstream.hpp
=====================================
@@ -64,7 +64,7 @@ static std::string strerror()
     } else {
         return "Unknown error (" + std::to_string(err_num) + ")";
     }
-#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE) || defined(__APPLE__) || defined(__MUSL__)
+#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE) || defined(__APPLE__) || defined(__MUSL__) || defined(__FreeBSD__)
 // XSI-compliant strerror_r()
     const int err_num = errno; // See above
     if (strerror_r(err_num, buff.data(), buff.size()) == 0) {



View it on GitLab: https://salsa.debian.org/med-team/bifrost/-/commit/225683eb09b288158e434097233dd16d5a1f7f08

-- 
View it on GitLab: https://salsa.debian.org/med-team/bifrost/-/commit/225683eb09b288158e434097233dd16d5a1f7f08
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240105/d60ae33b/attachment-0001.htm>


More information about the debian-med-commit mailing list