[med-svn] [Git][med-team/bifrost][upstream] New upstream version 1.3.1
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Jan 5 08:21:10 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / bifrost
Commits:
225683eb by Andreas Tille at 2024-01-05T09:12:15+01:00
New upstream version 1.3.1
- - - - -
8 changed files:
- CMakeLists.txt
- Changelog.md
- src/Bifrost.cpp
- src/ColorSet.cpp
- src/ColoredCDBG.hpp
- src/ColoredCDBG.tcc
- src/CompactedDBG.hpp
- src/strict_fstream.hpp
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 2.8.12)
-project(Bifrost)
+project(Bifrost C CXX)
+find_package(Threads REQUIRED)
# To enable a larger default k-mer size, replace MAX_KMER_SIZE with a larger multiple of 32: actual maximum k-mer size will be MAX_KMER_SIZE-1.
SET(MAX_KMER_SIZE "32" CACHE STRING "MAX_KMER_SIZE")
@@ -14,8 +15,17 @@ SET(ENABLE_AVX2 "ON" CACHE STRING "ENABLE_AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(CMAKE_MACOSX_RPATH 1)
+
set_property(SOURCE BlockedBloomFilter.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -funroll-loops")
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+ message("Disabling AVX2 instructions on arm64")
+ set(ENABLE_AVX2 "OFF")
+ set(COMPILATION_ARCH "OFF")
+endif(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+
+
if(COMPILATION_ARCH MATCHES "OFF")
message("Disabling native architecture compilation (including AVX2)")
else(COMPILATION_ARCH MATCHES "OFF")
=====================================
Changelog.md
=====================================
@@ -2,6 +2,8 @@
API only.
+* **17-11-2023**
+ * Class `ColoredCDBG` exposes the function `readGraph()` which enables to load/read the graph in a colored de Bruijn graph without associating any colors to the unitigs. This enables to later color the graph with any input sequences.
* **10-09-2023**
* Function `CompactedDBG()::search()` takes additional arguments:
* `get_nb_found_km`: boolean indicating whether to report in the output the number of found k-mers per query
=====================================
src/Bifrost.cpp
=====================================
@@ -723,7 +723,7 @@ int main(int argc, char **argv){
else success = ccdbg.read(opt.filename_graph_in, opt.filename_index_in, opt.filename_colors_in, opt.nb_threads, opt.verbose);
if (success) success = ccdbg.search(opt.filename_query_in, opt.prefixFilenameOut, opt.ratio_kmers, opt.get_nb_found_km, opt.get_ratio_found_km,
- opt.inexact_search, opt.nb_threads, opt.verbose);
+ opt.inexact_search, opt.nb_threads, opt.verbose);
}
else {
=====================================
src/ColorSet.cpp
=====================================
@@ -1,3 +1,7 @@
+#if defined(__APPLE__)
+#include <unistd.h>
+#endif
+
#include "ColorSet.hpp"
UnitigColors::UnitigColors() : setBits(localBitVector) {}
=====================================
src/ColoredCDBG.hpp
=====================================
@@ -197,13 +197,13 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
ColoredCDBG& operator=(ColoredCDBG&& o);
/** Equality operator.
- * @return a boolean indicating if two compacted de Bruijn graphs have the same colored unitigs (does not
+ * @return a boolean indicating whether two compacted de Bruijn graphs have the same colored unitigs (does not
* compare the data associated with the unitigs).
*/
bool operator==(const ColoredCDBG& o) const;
/** Inequality operator.
- * @return a boolean indicating if two compacted de Bruijn graphs have different colored unitigs (does not
+ * @return a boolean indicating whether two compacted de Bruijn graphs have different colored unitigs (does not
* compare the data associated with the unitigs).
*/
inline bool operator!=(const ColoredCDBG& o) const;
@@ -229,14 +229,14 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
/** Build the Colored and compacted de Bruijn graph (only the unitigs).
* A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
* @param opt is a structure from which the members are parameters of this function. See CCDBG_Build_opt.
- * @return boolean indicating if the graph has been built successfully.
+ * @return boolean indicating whether the graph has been built successfully.
*/
bool buildGraph(const CCDBG_Build_opt& opt);
/** Map the colors to the unitigs. This is done by reading the input files and querying the graph.
* If a color filename is provided in opt.filename_colors_in, colors are loaded from that file instead.
* @param opt is a structure from which the members are parameters of this function. See CCDBG_Build_opt.
- * @return boolean indicating if the colors have been mapped successfully.
+ * @return boolean indicating whether the colors have been mapped successfully.
*/
bool buildColors(const CCDBG_Build_opt& opt);
@@ -248,11 +248,11 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* @param write_meta_file indicates if a graph meta file is written to disk. Graph meta files enable faster graph loading.
* @param compressed_output indicates if the output file is compressed.
* @param verbose is a boolean indicating if information message are printed during writing (true) or not (false).
- * @return a boolean indicating if the graph was successfully written.
+ * @return a boolean indicating whether the graph was successfully written.
*/
bool write(const string& prefix_output_fn, const size_t nb_threads = 1, const bool write_index_file = true, const bool compress_output = false, const bool verbose = false) const;
- /** Read a colored and compacted de Bruijn graph from disk. The graph (in GFA, FASTA or BFG format) must
+ /** Load a colored and compacted de Bruijn graph from disk. The graph (in GFA, FASTA or BFG format) must
* have been produced by Bifrost. By default, the function detects if an index file (BFI format) exists for the
* input graph and will use it to load the graph. Otherwise, reading the graph will be much slower
* than function read() with the index filename in input parameter.
@@ -260,21 +260,40 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* @param input_colors_fn is a string which is the prefix of the color filename to read
* @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
* @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
- * @return a boolean indicating if the graph was successfully read.
+ * @return a boolean indicating whether the graph was successfully read.
*/
bool read(const string& input_graph_fn, const string& input_colors_fn, const size_t nb_threads = 1, const bool verbose = false);
- /** Read a colored and compacted de Bruijn graph from disk using an index file. The graph (in GFA, FASTA or BFG format)
- * must have been produced by Bifrost.
+ /** Load a colored and compacted de Bruijn graph from disk using an index file.
+ * The graph (in GFA, FASTA or BFG format) must have been produced by Bifrost.
* @param input_graph_fn is a string which is the prefix of the graph filename to read
* @param input_index_fn is a string which is the prefix of the index filename to read
* @param input_colors_fn is a string which is the prefix of the color filename to read
* @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
* @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
- * @return a boolean indicating if the graph was successfully read.
+ * @return a boolean indicating whether the graph was successfully read.
*/
bool read(const string& input_graph_fn, const string& input_index_fn, const string& input_colors_fn, const size_t nb_threads = 1, const bool verbose = false);
+ /** Load a colored and compacted de Bruijn graph without its colors from disk.
+ * A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
+ * @param input_graph_fn is a string which is the prefix of the graph filename to read
+ * @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
+ * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
+ * @return a boolean indicating whether the graph was successfully read.
+ */
+ bool readGraph(const string& input_graph_fn, const size_t nb_threads = 1, const bool verbose = false);
+
+ /** Load a colored and compacted de Bruijn graph without its colors from disk using an index file.
+ * A call to ColoredCDBG::mapColors is required afterwards to map colors to unitigs.
+ * @param input_graph_fn is a string which is the prefix of the graph filename to read
+ * @param input_index_fn is a string which is the prefix of the index filename to read
+ * @param nb_threads is the number of threads that can be used to read the graph and its colors from disk.
+ * @param verbose is a boolean indicating if information messages are printed during reading (true) or not (false).
+ * @return a boolean indicating whether the graph was successfully read.
+ */
+ bool readGraph(const string& input_graph_fn, const string& input_index_fn, const size_t nb_threads = 1, const bool verbose = false);
+
/** Merge a colored and compacted de Bruijn graph.
* After merging, all unitigs and colors of the input graph have been added to and compacted with the current
* colored and compacted de Bruijn graph (this). If the unitigs of the input graph had data of type "MyUnitigData"
@@ -285,7 +304,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* @param o is a constant reference to the colored and compacted de Bruijn graph to merge.
* @param nb_threads is an integer indicating how many threads can be used during the merging.
* @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
- * @return a boolean indicating if the graph has been successfully merged.
+ * @return a boolean indicating whether the graph has been successfully merged.
*/
bool merge(const ColoredCDBG& o, const size_t nb_threads = 1, const bool verbose = false);
@@ -301,7 +320,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* std::move(). After merging, the graph pointed by o is cleared.
* @param nb_threads is an integer indicating how many threads can be used during the merging.
* @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
- * @return a boolean indicating if the graph has been successfully merged.
+ * @return a boolean indicating whether the graph has been successfully merged.
*/
bool merge(ColoredCDBG&& o, const size_t nb_threads = 1, const bool verbose = false);
@@ -313,7 +332,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* @param v is a constant reference to a vector of colored and compacted de Bruijn graphs to merge.
* @param nb_threads is an integer indicating how many threads can be used during the merging.
* @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
- * @return a boolean indicating if the graphs have been successfully merged.
+ * @return a boolean indicating whether the graphs have been successfully merged.
*/
bool merge(const vector<ColoredCDBG>& v, const size_t nb_threads = 1, const bool verbose = false);
@@ -327,7 +346,7 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage
* obtained using std::move(). After merging, the graphs in v are cleared.
* @param nb_threads is an integer indicating how many threads can be used during the merging.
* @param verbose is a boolean indicating if information messages must be printed during the execution of the function.
- * @return a boolean indicating if the graphs have been successfully merged.
+ * @return a boolean indicating whether the graphs have been successfully merged.
*/
bool merge(vector<ColoredCDBG>&& v, const size_t nb_threads = 1, const bool verbose = false);
=====================================
src/ColoredCDBG.tcc
=====================================
@@ -719,6 +719,115 @@ bool ColoredCDBG<U>::read(const string& input_graph_fn, const string& input_inde
return valid_input_files;
}
+template<typename U>
+bool ColoredCDBG<U>::readGraph(const string& input_graph_fn, const size_t nb_threads, const bool verbose) {
+
+ bool valid_input_files = true;
+
+ if (input_graph_fn.length() != 0){
+
+ if (check_file_exists(input_graph_fn)){
+
+ FILE* fp = fopen(input_graph_fn.c_str(), "r");
+
+ if (fp == NULL) {
+
+ cerr << "ColoredCDBG::readGraph(): Could not open input graph file " << input_graph_fn << endl;
+ valid_input_files = false;
+ }
+ else fclose(fp);
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): Input graph file " << input_graph_fn << " does not exist." << endl;
+ valid_input_files = false;
+ }
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): No input graph file provided." << endl;
+ valid_input_files = false;
+ }
+
+ if (valid_input_files){
+
+ if (verbose) cout << "ColoredCDBG::readGraph(): Reading graph." << endl;
+
+ invalid = !CompactedDBG<DataAccessor<U>, DataStorage<U>>::read(input_graph_fn, nb_threads, verbose);
+
+ if (invalid) return false; // Read graph
+ }
+
+ return valid_input_files;
+}
+
+template<typename U>
+bool ColoredCDBG<U>::readGraph(const string& input_graph_fn, const string& input_index_fn, const size_t nb_threads, const bool verbose) {
+
+ bool valid_input_files = true;
+
+ if (input_graph_fn.length() != 0){
+
+ if (check_file_exists(input_graph_fn)){
+
+ FILE* fp = fopen(input_graph_fn.c_str(), "r");
+
+ if (fp == NULL) {
+
+ cerr << "ColoredCDBG::readGraph(): Could not open input graph file " << input_graph_fn << endl;
+ valid_input_files = false;
+ }
+ else fclose(fp);
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): Input graph file " << input_graph_fn << " does not exist." << endl;
+ valid_input_files = false;
+ }
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): No input graph file provided." << endl;
+ valid_input_files = false;
+ }
+
+ if (input_index_fn.length() != 0){
+
+ if (check_file_exists(input_index_fn)){
+
+ FILE* fp = fopen(input_index_fn.c_str(), "rb");
+
+ if (fp == NULL) {
+
+ cerr << "ColoredCDBG::readGraph(): Could not open input index file " << input_index_fn << endl;
+ valid_input_files = false;
+ }
+ else fclose(fp);
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): Input index file " << input_index_fn << " does not exist." << endl;
+ valid_input_files = false;
+ }
+ }
+ else {
+
+ cerr << "ColoredCDBG::readGraph(): No input index file provided." << endl;
+ valid_input_files = false;
+ }
+
+ if (valid_input_files){
+
+ if (verbose) cout << "ColoredCDBG::readGraph(): Reading graph." << endl;
+
+ invalid = !CompactedDBG<DataAccessor<U>, DataStorage<U>>::read(input_graph_fn, input_index_fn, nb_threads, verbose);
+
+ if (invalid) return false; // Read graph
+ }
+
+ return valid_input_files;
+}
+
template<typename U>
void ColoredCDBG<U>::initUnitigColors(const CCDBG_Build_opt& opt, const size_t max_nb_hash){
@@ -1421,7 +1530,7 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
if (get_nb_found_km && get_ratio_found_km){
- cerr << "ColoredCDBG::search(): Cannot output at once the number of found k-mers and the ratio of found k-mers." << endl;
+ cerr << "ColoredCDBG::search(): Cannot output number of found k-mers and ratio of found k-mers together." << endl;
return false;
}
@@ -1538,7 +1647,6 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
else {
const Kmer head = um.getUnitigTail().twin();
- const size_t max_pos_um = um.dist + um.len - 1;
it = s_um.find({pos_query, {head, um.dist}});
@@ -1569,6 +1677,8 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
if (inexact_search){
+ const size_t max_pos_um = um.dist + um.len - 1;
+
for (; it_uc != it_uc_end; ++it_uc) color_occ_r[it_uc.getColorID()].add(max_pos_um - it_uc.getKmerPosition() + p.first);
}
else {
@@ -1593,31 +1703,40 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
if (inexact_search){
- size_t nb_color_pres = 0;
+ if (!get_nb_found_km && !get_ratio_found_km) {
- for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+ size_t nb_color_pres = 0;
- if (nb_color_pres == nb_colors) return;
+ for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+
+ if (nb_color_pres == nb_colors) return;
+ }
const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_d = this->searchSequence(query, false, false, true, false, false);
processCounts(v_um_d, color_occ_r, color_occ_u); // Extract k-mer occurrences for each color
- nb_color_pres = 0;
+ if (!get_nb_found_km && !get_ratio_found_km) {
+
+ size_t nb_color_pres = 0;
- for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+ for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
- if (nb_color_pres == nb_colors) return;
+ if (nb_color_pres == nb_colors) return;
+ }
const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_m = this->searchSequence(query, false, false, false, true, false);
processCounts(v_um_m, color_occ_r, color_occ_u); // Extract k-mer occurrences for each color
- nb_color_pres = 0;
+ if (!get_nb_found_km && !get_ratio_found_km) {
+
+ size_t nb_color_pres = 0;
- for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
+ for (size_t j = 0; j < nb_colors; ++j) nb_color_pres += (color_occ_u[j] >= nb_km_min);
- if (nb_color_pres == nb_colors) return;
+ if (nb_color_pres == nb_colors) return;
+ }
const vector<pair<size_t, const_UnitigColorMap<U>>> v_um_i = this->searchSequence(query, false, true, false, false, false);
@@ -1735,7 +1854,9 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
for (size_t i = 0; i < nb_colors; ++i) {
color_query_out += '\t';
- color_query_out += to_string(get_ratio_found_km ? color_occ[i] : (static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query)));
+
+ if (get_nb_found_km) color_query_out += to_string(color_occ[i]);
+ else color_query_out += to_string(static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query));
}
const size_t l_color_query_out = color_query_out.length();
@@ -1772,7 +1893,9 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
for (size_t i = 0; i < nb_colors; ++i) {
color_query_out += '\t';
- color_query_out += to_string(get_ratio_found_km ? color_occ[i] : (static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query)));
+
+ if (get_nb_found_km) color_query_out += to_string(color_occ[i]);
+ else color_query_out += to_string(static_cast<double>(color_occ[i]) / static_cast<double>(nb_km_query));
}
const size_t l_color_query_out = color_query_out.length();
@@ -1941,7 +2064,7 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&
for (auto& c : buffers_seq[i]) c &= 0xDF;
- if (get_nb_found_km){
+ if (get_nb_found_km || get_ratio_found_km){
searchQuery(buffers_seq[i], color_occ_r, color_occ_u, nb_km_query);
writeOutQuantMutex(buffers_name[i].c_str(), buffers_name[i].length(), nb_km_query, color_occ_u, buffer_res, pos_buffer_out, mutex_file_out);
=====================================
src/CompactedDBG.hpp
=====================================
@@ -164,7 +164,6 @@ struct CDBG_Build_opt {
bool get_nb_found_km;
bool get_ratio_found_km;
-
bool writeIndexFile;
double ratio_kmers;
=====================================
src/strict_fstream.hpp
=====================================
@@ -64,7 +64,7 @@ static std::string strerror()
} else {
return "Unknown error (" + std::to_string(err_num) + ")";
}
-#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE) || defined(__APPLE__) || defined(__MUSL__)
+#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE) || defined(__APPLE__) || defined(__MUSL__) || defined(__FreeBSD__)
// XSI-compliant strerror_r()
const int err_num = errno; // See above
if (strerror_r(err_num, buff.data(), buff.size()) == 0) {
View it on GitLab: https://salsa.debian.org/med-team/bifrost/-/commit/225683eb09b288158e434097233dd16d5a1f7f08
--
View it on GitLab: https://salsa.debian.org/med-team/bifrost/-/commit/225683eb09b288158e434097233dd16d5a1f7f08
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240105/d60ae33b/attachment-0001.htm>
More information about the debian-med-commit mailing list