[med-svn] [Git][med-team/seqwish][upstream] New upstream version 0.7.1

Steffen Möller (@moeller) gitlab at salsa.debian.org
Mon Aug 30 20:08:51 BST 2021



Steffen Möller pushed to branch upstream at Debian Med / seqwish


Commits:
d133ab50 by Steffen Moeller at 2021-08-30T18:26:03+02:00
New upstream version 0.7.1
- - - - -


6 changed files:

- .gitmodules
- .travis.yml
- CMakeLists.txt
- src/alignments.cpp
- src/main.cpp
- src/seqindex.cpp


Changes:

=====================================
.gitmodules
=====================================
@@ -7,9 +7,6 @@
 [submodule "deps/iitii"]
 	path = deps/iitii
 	url = https://github.com/ekg/iitii.git
-[submodule "deps/mmap_allocator"]
-	path = deps/mmap_allocator
-	url = https://github.com/ekg/mmap_allocator.git
 [submodule "deps/sdsl-lite"]
 	path = deps/sdsl-lite
 	url = https://github.com/simongog/sdsl-lite.git


=====================================
.travis.yml
=====================================
@@ -8,7 +8,7 @@ before_install:
   - git submodule update --init --recursive
   - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
   - sudo apt-get update -qq
-  - sudo apt-get install -qq gcc-9 g++-9
+  - sudo apt-get install -qq gcc-9 g++-9 libjemalloc-dev
   - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9
 script:
   - sed -i 's/CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -mcx16 -g/CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -mcx16 -g -fsanitize=address/g' CMakeLists.txt


=====================================
CMakeLists.txt
=====================================
@@ -77,14 +77,6 @@ ExternalProject_Add(iitii
 ExternalProject_Get_property(iitii SOURCE_DIR)
 set(iitii_INCLUDE "${SOURCE_DIR}/src")
 
-ExternalProject_Add(mmap_allocator
-  SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/mmap_allocator"
-  CMAKE_ARGS "${CMAKE_ARGS};-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>"
-  UPDATE_COMMAND "")
-ExternalProject_Get_property(mmap_allocator INSTALL_DIR)
-set(mmap_allocator_INCLUDE "${INSTALL_DIR}/include/mmap_allocator")
-set(mmap_allocator_LIB "${INSTALL_DIR}/lib")
-
 # In-place Parallel Super Scalar Samplesort (IPS⁴o), header only
 ExternalProject_Add(ips4o
   SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/ips4o"
@@ -184,7 +176,6 @@ add_dependencies(seqwish sdsl-lite)
 add_dependencies(seqwish gzipreader)
 add_dependencies(seqwish mmmulti)
 add_dependencies(seqwish iitii)
-add_dependencies(seqwish mmap_allocator)
 add_dependencies(seqwish ips4o)
 add_dependencies(seqwish bbhash)
 add_dependencies(seqwish atomicbitvector)
@@ -200,7 +191,6 @@ target_include_directories(seqwish PUBLIC
   "${ips4o_INCLUDE}"
   "${mmmulti_INCLUDE}"
   "${iitii_INCLUDE}"
-  "${mmap_allocator_INCLUDE}"
   "${bbhash_INCLUDE}"
   "${atomicbitvector_INCLUDE}"
   "${atomicqueue_INCLUDE}"
@@ -211,9 +201,9 @@ target_link_libraries(seqwish
   "${sdsl-lite_LIB}/libsdsl.a"
   "${sdsl-lite-divsufsort_LIB}/libdivsufsort.a"
   "${sdsl-lite-divsufsort_LIB}/libdivsufsort64.a"
-  "${mmap_allocator_LIB}/libmmap_allocator.a"
   "-latomic"
   Threads::Threads
+  jemalloc
   z)
 if (BUILD_STATIC)
   #set(CMAKE_EXE_LINKER_FLAGS "-static")


=====================================
src/alignments.cpp
=====================================
@@ -31,6 +31,7 @@ void paf_worker(
             switch (c.op) {
             case 'M':
             case '=':
+            case 'X':
             {
                 pos_t q_pos_match_start = q_pos;
                 pos_t t_pos_match_start = t_pos;
@@ -50,7 +51,10 @@ void paf_worker(
                         }
                     };
                 for (size_t i = 0; i < c.len; ++i) {
-                    if (seqidx.at_pos(q_pos) == seqidx.at_pos(t_pos)
+                    char query_base = seqidx.at_pos(q_pos);
+                    char target_base = seqidx.at_pos(t_pos);
+                    if (query_base == target_base
+                        && query_base != 'N'
                         && offset(q_pos) != offset(t_pos)) { // guard against self mappings
                         if (match_len == 0) {
                             q_pos_match_start = q_pos;
@@ -71,10 +75,6 @@ void paf_worker(
                 add_match();
             }
                 break;
-            case 'X':
-                incr_pos(q_pos, c.len);
-                incr_pos(t_pos, c.len);
-                break;
             case 'I':
                 //std::cerr << "ins " << c.len << std::endl;
                 incr_pos(q_pos, c.len);


=====================================
src/main.cpp
=====================================
@@ -80,15 +80,23 @@ int main(int argc, char** argv) {
                 std::cerr << "[seqwish] ERROR: input alignment file " << args::get(paf_alns) << " does not exist" << std::endl;
                 return 4;
             }else {
+                 // Check if the first non-empty line has the CIGAR
+
                 igzstream paf_in(p.first.c_str());
 
-                std::string line;
-                std::getline(paf_in, line);
+                while (!paf_in.eof()) {
+                    std::string line;
+                    std::getline(paf_in, line);
+
+                    if (!line.empty()) {
+                        paf_row_t paf(line);
 
-                paf_row_t paf(line);
-                if (paf.cigar.empty()){
-                    std::cerr << "[seqwish] WARNING: input alignment file " << p.first << " does not have CIGAR strings. "
-                    << "The resulting graph will only represent the input sequences." << std::endl;
+                        if (paf.cigar.empty()){
+                            std::cerr << "[seqwish] WARNING: input alignment file " << p.first << " does not have CIGAR strings. "
+                                      << "The resulting graph will only represent the input sequences." << std::endl;
+                        }
+                        break;
+                    }
                 }
             }
         }


=====================================
src/seqindex.cpp
=====================================
@@ -33,13 +33,11 @@ void seqindex_t::build_index(const std::string& filename, const std::string& idx
     }
     size_t seq_bytes_written = 0;
     size_t seq_names_bytes_written = 0;
+    bool notified_empty_seqs = false;
     while (in.good()) {
-        seqname_offset.push_back(seq_names_bytes_written);
-        seq_offset.push_back(seq_bytes_written);
         line[0] = '>';
-        line = line.substr(0, line.find(" "));
-        seqnames << line << " ";
-        seq_names_bytes_written += line.size() + 1;
+        std::string seq_name = line.substr(0, line.find(" "));
+
         std::string seq;
         // get the sequence
         if (input_is_fasta) {
@@ -52,16 +50,29 @@ void seqindex_t::build_index(const std::string& filename, const std::string& idx
                 }
             }
         } else if (input_is_fastq) {
-            std::getline(in, seq); // sequence
+            std::getline(in, seq);  // sequence
             std::getline(in, line); // delimiter
             std::getline(in, line); // quality
             std::getline(in, line);
         }
-        // force the sequence to be upper-case
-        std::transform(seq.begin(), seq.end(), seq.begin(), [](char c) { return std::toupper(c); });
-        seqout << seq;
-        // record where the sequence starts
-        seq_bytes_written += seq.size();
+        if (seq.empty()){
+            if (!notified_empty_seqs){
+                notified_empty_seqs = true;
+                std::cerr << "[seqwish] WARNING: input FASTA file contains empty sequences, which will be ignored." << std::endl;
+            }
+        } else {
+            seqname_offset.push_back(seq_names_bytes_written);
+            seq_offset.push_back(seq_bytes_written);
+
+            seqnames << seq_name << " ";
+            seq_names_bytes_written += seq_name.size() + 1;
+
+            // force the sequence to be upper-case
+            std::transform(seq.begin(), seq.end(), seq.begin(), [](char c) { return std::toupper(c); });
+            seqout << seq;
+            // record where the sequence starts
+            seq_bytes_written += seq.size();
+        }
     }
     in.close();
     // add the last value so we can get sequence length for the last sequence and name
@@ -96,7 +107,7 @@ void seqindex_t::build_index(const std::string& filename, const std::string& idx
     std::remove(seqnamefile.c_str());
 
     if (duplicated_ids){
-        std::cerr << "[seqwish] ERROR: the input sequences have duplicated IDs." << std::endl;
+        std::cerr << "[seqwish] ERROR: input sequences have duplicated IDs." << std::endl;
         exit(1);
     }
 



View it on GitLab: https://salsa.debian.org/med-team/seqwish/-/commit/d133ab50e5795f04776d0da701b05c11bba69938

-- 
View it on GitLab: https://salsa.debian.org/med-team/seqwish/-/commit/d133ab50e5795f04776d0da701b05c11bba69938
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210830/4a77d6ca/attachment-0001.htm>


More information about the debian-med-commit mailing list