[med-svn] [Git][med-team/bcalm][master] 5 commits: New upstream version 2.2.2
Steffen Möller
gitlab at salsa.debian.org
Tue May 5 23:13:33 BST 2020
Steffen Möller pushed to branch master at Debian Med / bcalm
Commits:
f2de5dc7 by Steffen Moeller at 2020-05-05T21:21:17+02:00
New upstream version 2.2.2
- - - - -
ef0bb801 by Steffen Moeller at 2020-05-05T21:21:17+02:00
routine-update: New upstream version
- - - - -
090cbad9 by Steffen Moeller at 2020-05-05T21:21:18+02:00
Update upstream source from tag 'upstream/2.2.2'
Update to upstream version '2.2.2'
with Debian dir f0a0c31fa96367262765400f2e81b1d2706397c8
- - - - -
b66cc11f by Steffen Moeller at 2020-05-05T21:21:18+02:00
routine-update: Standards-Version: 4.5.0
- - - - -
cc66c614 by Steffen Moeller at 2020-05-05T21:33:31+02:00
Adjusting patches - small quirky bits
- - - - -
17 changed files:
- + .circleci/config.yml
- .travis.yml
- CMakeLists.txt
- README.md
- VERSION
- bidirected-graphs-in-bcalm2/bidirected-graphs-in-bcalm2.md
- + debian/bcalm.docs
- debian/bcalm.install
- debian/changelog
- debian/control
- debian/patches/modify_cmake.patch
- debian/rules
- + scripts/split_unitigs.py
- scripts/unitigEvaluator.cpp
- src/bcalm_1.cpp
- src/main.cpp
- + test/simple_test.sh
Changes:
=====================================
.circleci/config.yml
=====================================
@@ -0,0 +1,75 @@
+version: 1
+jobs:
+ build:
+ docker:
+ - image: "debian:stretch"
+ steps:
+ - checkout
+
+ - run:
+ name: Installing SUDO
+ command: 'apt-get update && apt-get install -y sudo && rm -rf /var/lib/apt/lists/*'
+ - run:
+ name: Installing GCC
+ command: 'apt-get update && apt-get install -y build-essential wget zlib1g zlib1g-dev'
+ - run:
+ name: Installing Git
+ command: 'apt-get update && apt-get install -y git'
+ - run:
+ name: Getting submodules
+ command: |
+ git submodule init
+ git submodule update --remote
+ - run:
+ name: Install CMAKE
+ command: |
+ wget https://cmake.org/files/v3.10/cmake-3.10.2-Linux-x86_64.tar.gz
+ tar -xvzf cmake-3.10.2-Linux-x86_64.tar.gz
+ chmod +x ./cmake-3.10.2-Linux-x86_64/bin/ccmake
+ chmod +x ./cmake-3.10.2-Linux-x86_64/bin/cmake
+ chmod +x ./cmake-3.10.2-Linux-x86_64/bin/cpack
+ chmod +x ./cmake-3.10.2-Linux-x86_64/bin/ctest
+ DIR=$(pwd)
+ ln -s $DIR/cmake-3.10.2-Linux-x86_64/bin/ccmake /usr/bin/ccmake
+ ln -s $DIR/cmake-3.10.2-Linux-x86_64/bin/cmake /usr/bin/cmake
+ ln -s $DIR/cmake-3.10.2-Linux-x86_64/bin/cpack /usr/bin/cpack
+ ln -s $DIR/cmake-3.10.2-Linux-x86_64/bin/ctest /usr/bin/ctest
+ - run:
+ name: Install CPPUnit
+ command: 'apt-get update && sudo apt-get install -y libcppunit-dev'
+ - run:
+ name: Compiling
+ command: 'mkdir build && cd build && cmake .. && make -j 4'
+ - run:
+ name: Making package in case we'll release
+ command: 'cd build && make package && mkdir ../artifacts && mv bcalm-binaries-*.tar.gz ../artifacts'
+ - store_artifacts:
+ path: ./artifacts
+
+ publish-github-release:
+ docker:
+ - image: cibuilds/github:0.10
+ steps:
+ - attach_workspace:
+ at: ./artifacts
+ - run:
+ name: "Publish Release on GitHub"
+ command: |
+ VERSION=$(cat VERSION)
+ ghr -t ${GITHUB_TOKEN} -u ${CIRCLE_PROJECT_USERNAME} -r ${CIRCLE_PROJECT_REPONAME} -c ${CIRCLE_SHA1} -delete ${VERSION} ./artifacts/
+workflows:
+ version: 2
+ main:
+ jobs:
+ - build:
+ filters:
+ tags:
+ only: /^v?\d+\.\d+\.\d+$/
+ - publish-github-release:
+ requires:
+ - build
+ filters:
+ branches:
+ ignore: /.*/
+ tags:
+ only: /^v?\d+\.\d+\.\d+$/
=====================================
.travis.yml
=====================================
@@ -36,7 +36,7 @@ before_deploy:
deploy:
provider: releases
api_key:
- secure: hQsQR/iRTVbDDJlyJpBln98CT+Mm9GHy/H/gYvLwhoh73f+OmUEreB2HS/KsWp3TTgZeI/ANRBpY11I4Xrai1sntqjXO2LG79pEIoL62AcCBEQJ1RsnkGg9s10qFKzQqImuPDzincnVIrSr92L8kcXBuErU2YK7/Ss5ABzxRN+E=
+ secure: KW6qAyrtX1fyCpZEjttW4YaKRGFvioqYBoN3mvlGJeYJqTFvuPbQ4PMA+ueQWvQFImhF74C/WQjnw0uUgRpBUV5mQZOFqA0QIGJNaIkL/dSqs7N9YI97VX3TH/GxPsLpW1KfQPF0DkCJ8pobWEz5k3Itl8woSXcX2HskxtQNKA0=
file: "${RELEASE_FILE}"
skip_cleanup: true
on:
=====================================
CMakeLists.txt
=====================================
@@ -17,16 +17,26 @@ FOREACH (path "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${GATB_CORE_HOME}/gatb-core/c
ENDFOREACH(path)
#############################
-# getting git version
+#getting git version
#from http://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake
exec_program(
"git"
${CMAKE_CURRENT_SOURCE_DIR}
ARGS "rev-parse --short HEAD"
- OUTPUT_VARIABLE VERSION_SHA1 )
+ OUTPUT_VARIABLE VERSION_SHA1
+ RETURN_VALUE ERROR_GIT)
-add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
+if(NOT ${ERROR_GIT})
+ add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
+else()
+ message("Warning: cannot retrieve git version. Bcalm won't display its version. Error value: ${ERROR_GIT})")
+endif(NOT ${ERROR_GIT})
+################################
+#add version nifo
+
+file (STRINGS "VERSION" VERSION)
+add_definitions( -DVERSION="${VERSION}" )
################################################################################
# THIRD PARTIES
@@ -105,7 +115,7 @@ SET (CPACK_SOURCE_IGNORE_FILES
# We copy the project binary to the 'bin' directory
INSTALL (TARGETS ${PROJECT_NAME} DESTINATION bin)
INSTALL (DIRECTORY "${PROJECT_SOURCE_DIR}/example/" DESTINATION example)
-INSTALL (FILES LICENSE README.md DESTINATION bin/..)
+INSTALL (FILES VERSION LICENSE README.md DESTINATION bin/..)
# cmake_system_name for mac is Darwin, i want to change that
if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
@@ -114,9 +124,6 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(PRETTY_SYSTEM_NAME "Mac")
endif()
-
-
-file (STRINGS "VERSION" VERSION)
set (CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME}-binaries-${VERSION}-${PRETTY_SYSTEM_NAME})
include (CPack)
=====================================
README.md
=====================================
@@ -39,6 +39,10 @@ Download the latest [Linux/MacOS binaries](https://github.com/GATB/bcalm/release
git clone --recursive https://github.com/GATB/bcalm
cd bcalm
mkdir build; cd build; cmake ..; make -j 8
+
+You can also install bcalm from [bioconda](https://bioconda.github.io/) with [conda](https://docs.conda.io/en/latest/):
+
+ conda install -c conda-forge -c bioconda bcalm
# Input formats
@@ -97,6 +101,6 @@ BCALM 2 produces some intermediate files: a .h5 file (or a _gatb/ folder), which
Acknowledgements
========
If using BCALM 2, please cite:
-Rayan Chikhi, Antoine Limasset and Paul Medvedev, Compacting de Bruijn graphs from sequencing data quickly and in low memory, Proceedings of ISMB 2016, Bioinformatics, 32 (12): i201-i208. ([Bibtex](http://bioinformatics.oxfordjournals.org/citmgr?type=bibtex&gca=bioinfo%3B32%2F12%2Fi201))
+Rayan Chikhi, Antoine Limasset and Paul Medvedev, Compacting de Bruijn graphs from sequencing data quickly and in low memory, Proceedings of ISMB 2016, Bioinformatics, 32 (12): i201-i208. ([Bibtex](https://academic.oup.com/Citation/Download?resourceId=2289008&resourceType=3&citationFormat=2))
This project has been supported in part by NSF awards DBI-1356529, CCF-1439057, IIS-1453527, and IIS-1421908.
=====================================
VERSION
=====================================
@@ -1 +1 @@
-v2.2.1
+v2.2.2
=====================================
bidirected-graphs-in-bcalm2/bidirected-graphs-in-bcalm2.md
=====================================
@@ -25,7 +25,7 @@ The table contains the 8 edge types, organized such that each row lists a pair o
![Fig3](fig33.png)
-Bi-directed graphs have the constraint that, given two nodes x and y, a label l, and a mirror type, either both or none of the mirror edges with label l exist between x and y. Informally, edges always come in identically-labeled pairs (with one exception listed below). Thus, even though there are 8 type of edges that can be between x and y, there are really only 4 types of *connections*. There are alternate representations of bi-directed graphs that remove this redundancy, but we do not describe them here.
+Bi-directed graphs have the *mirror constraint* that, given two nodes x and y, a label l, and a mirror type, either both or none of the mirror edges with label l exist between x and y. Informally, edges always come in identically-labeled pairs (with one exception listed below). Thus, even though there are 8 type of edges that can be between x and y, there are really only 4 types of *connections*. There are alternate representations of bi-directed graphs that remove this redundancy, but we do not describe them here.
The above rule has an interesting effect on self-loops (i.e. when e.from = e.to). In such cases, observe that the two mirror edges of Type 3 are identical. The same holds for Type 4. We refer to such edges as *self-mirrors*. A bi-directed graph does not allow to represent duplicate edges, and so this connection is represented by only one edge. This case forms an exception to the rule that all edges come in pairs with their mirror. I suspect that this special case has cost humanity thousand of hours in debugging time.
@@ -50,16 +50,16 @@ We label each edge with the length of the equal suffixes/prefixes. For example,
![Fig5](fig5.png)
-Intuitively, an edge implies that the strings of the two nodes can be combined, using an overlap, into a bigger string. The sign at a node indicates whether the label or the label's reverse-complement is used. A '+' indicates that the label is used, while a '-' indicates that the label's reverse-complement is used.
+Intuitively, an edge implies that the strings of the two nodes can be combined, using an overlap, into a bigger string. The sign at a node indicates whether the label or the label's reverse-complement is used. A '+' indicates that the label is used, while a '-' indicates that the label's reverse-complement is used. This is called the *spelling rule*. In this case, for example, the string spelled by the top edge is AAGTCTAC.
-Notice that the two overlaps between the nodes in the previous example are mirror edges of type 4. This is not a coincidence. One can show from the properties of reverse-complements that if there is an overlap implied by an edge, then there must also be an overlap implied by the mirror of that edge.
+Notice that the two overlaps between the nodes in the previous example are mirror edges of type 4. This is not a coincidence. One can show from the properties of reverse-complements that if there is an overlap implied by an edge, then there must also be an overlap implied by the mirror of that edge. Thus, our definition of edges in a bi-directed overlap graph satisfies the mirror constraint of bi-directed graphs.
In this context, a self-mirror edge corresponds to a situation where either a suffix or a prefix of a node is equal to its own reverse-complement. Here is an example of a Type 3 self-mirror. Note that a self-mirror cannot have an overlap with an odd length, because an odd-length string can never be its own reverse-complement.
![Fig6](fig66.png)
-## Bidirected de-Bruijn graph
+## Bi-directed de-Bruijn graph
Given a set of strings S, a *[node-centric](https://www.biostars.org/p/175058/#256741) bi-directed de-Bruijn graph of order k* is a type of bi-directed overlap graph. The nodes correspond to all the distinct k-mer substrings of S, with the caveat that two k-mers that are reverse-complements of each other are represented by a single node. The label of this node is chosen arbitrarily as either the k-mer or its reverse complement; however there is often a convention that the label is chosen as the lexicographically smallest option. The edges correspond to all the overlaps of length k-1. For example, with k = 3 and S={GTATAC}, the graph is:
@@ -68,25 +68,28 @@ Given a set of strings S, a *[node-centric](https://www.biostars.org/p/175058/#2
Here, we gave each edge a name (e.g. e1) so that we can refer to the edges below. Observe that e1 and e2 are mirror edges, while e3 is a self-mirror and e4 is a self-mirror.
-## Walks and unitigs and compaction
+## Walks
+
A sequence of edges e<sub>1</sub>, ... , e<sub>n</sub> in a bi-directed graph is *walk* if
* It is a walk in the directed sense, i.e. e<sub>i</sub>.to = e<sub>i+1</sub>.from, for all 0 < i < n.
* The signs at an internal vertices are equal, i.e. e<sub>i</sub>.toSign = e<sub>i+1</sub>.fromSign, for all 0 < i < n.
A single vertex is also considered a walk. In the previous graph example, the sequence (e2, e3, e4, e3, e1) is a walk, while (e2, e3, e4, e1) is not.
-In a bi-directed overlap graph, a walk spells a corresponding string. For example, (e2, e3, e1) spells the original string GTATAC, while (e3, e1) spells TATAC. Observe that each walk has a corresponding *mirror walk* which spells the reverse-complement. For example, the mirror walk of (e3, e1) is (e2, e3) and spells GTATA.
+In a bi-directed overlap graph, a walk spells a corresponding string using the spelling rule previously described. For example, (e2, e3, e1) spells the original string GTATAC, while (e3, e1) spells TATAC. Observe that each walk has a corresponding *mirror walk* which spells the reverse-complement. For example, the mirror walk of (e3, e1) is (e2, e3) and spells GTATA.
+
+## Unitigs and compaction
Given a walk (e<sub>1</sub>, ..., e<sub>n</sub>), let v<sub>0</sub> = e<sub>1</sub>.from and v<sub>i</sub> = e<sub>i</sub>.to, for all 1 <= i <= n. A walk is a *unitig* if it is either a single vertex or a path (i.e. does not repeat vertices) such that
* for every 0 < i < n, the only edges incident on v<sub>i</sub> are e<sub>i-1</sub>, e<sub>i</sub>, and their mirrors.
* e<sub>1</sub> is the only outgoing edge from v<sub>0</sub> that has the the sign e<sub>1</sub>.fromSign at v<sub>0</sub>,
* e<sub>n</sub> is the only incoming edge to v<sub>n</sub> that has the sign e<sub>n</sub>.toSign at v<sub>n</sub>.
-A unitig is *maximal* if it cannot be extended in either direction. Consider the graph defined by the rectangular nodes and solid edges below (i.e. discarding the rhombus-shaped nodes and dashed edges for now), In this graph, there is a maximal unitig traversing (B, C, D, E). There is also a mirror maximal unitig traversing (E, D, C, B). There are also 4 other maximal unitigs: (A), (H), (K), and (I). The rhombus-shaped nodes and dashed edges represent nodes and edges that, if added to the graph, would destroy the unitig (B, C, D, E) and its mirror.
+A unitig is *maximal* if it cannot be extended in either direction. Consider the graph defined by the rectangular nodes and solid edges below (i.e. discarding the rhombus-shaped nodes and dashed edges for now). In this graph, there is a maximal unitig traversing (B, C, D, E). There is also a mirror maximal unitig traversing (E, D, C, B). There are also 4 other maximal unitigs: (A), (H), (K), and (I). The rhombus-shaped nodes and dashed edges represent nodes and edges that, if added to the graph, would destroy the unitig (B, C, D, E) and its mirror. The maximal unitigs of the graph would then be (K), (A), (B), (C), (D), (E), (I), and (H).
![Fig8](fig8.png)
-In the *compacted* bi-directed graph, every maximal unitig and its mirror is replaced by a single vertex. Formally, the nodes of the compacted graph are the maximal unitigs, and the edges represent all overlaps of length k-1.
+In the *compacted* bi-directed graph, every maximal unitig and its mirror is replaced by a single vertex. Formally, the nodes of the compacted graph are the maximal unitigs (grouped in mirror pairs), and the edges represent all overlaps of length k-1.
The unitig definition is motivated by the desire for it to have the following properties (though I have not seen a formal proof of these in the bi-directed setting):
* Maximal unitigs should be a vertex decomposition of the graph; in particular, two maximal unitigs should not share a vertex
=====================================
debian/bcalm.docs
=====================================
@@ -0,0 +1 @@
+README.md
=====================================
debian/bcalm.install
=====================================
@@ -4,4 +4,3 @@ debian/missing-sources/simple_test.sh usr/share/doc/bcalm/test
example/* usr/share/doc/bcalm/test
test/* usr/share/doc/bcalm/test
debian/missing-sources/compare_fasta.py usr/share/doc/bcalm/test
-README.md usr/share/doc/bcalm
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+bcalm (2.2.2-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * Standards-Version: 4.5.0 (routine-update)
+ * Don't install VERSION file to /usr
+
+ -- Steffen Moeller <moeller at debian.org> Tue, 05 May 2020 21:21:17 +0200
+
bcalm (2.2.1-2) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -9,7 +9,7 @@ Build-Depends: debhelper-compat (= 12),
libhdf5-dev,
zlib1g-dev,
cmake
-Standards-Version: 4.4.1
+Standards-Version: 4.5.0
Vcs-Browser: https://salsa.debian.org/med-team/bcalm
Vcs-Git: https://salsa.debian.org/med-team/bcalm.git
Homepage: https://github.com/GATB/bcalm
=====================================
debian/patches/modify_cmake.patch
=====================================
@@ -6,9 +6,9 @@ Last-Update: 2019-09-23
Index: bcalm/CMakeLists.txt
===================================================================
---- bcalm.orig/CMakeLists.txt 2019-09-23 06:21:21.709546638 +0100
-+++ bcalm/CMakeLists.txt 2019-09-23 06:21:21.705546591 +0100
-@@ -5,12 +5,12 @@
+--- bcalm.orig/CMakeLists.txt
++++ bcalm/CMakeLists.txt
+@@ -5,12 +5,12 @@ cmake_minimum_required(VERSION 2.6)
################################################################################
# Shortcuts
################################################################################
@@ -23,29 +23,38 @@ Index: bcalm/CMakeLists.txt
IF (EXISTS "${path}")
SET (CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${path}")
ENDIF()
-@@ -19,13 +19,15 @@
+@@ -19,18 +19,18 @@ ENDFOREACH(path)
#############################
- # getting git version
+ #getting git version
#from http://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake
-exec_program(
- "git"
- ${CMAKE_CURRENT_SOURCE_DIR}
- ARGS "rev-parse --short HEAD"
-- OUTPUT_VARIABLE VERSION_SHA1 )
+- OUTPUT_VARIABLE VERSION_SHA1
+- RETURN_VALUE ERROR_GIT)
+-
+-if(NOT ${ERROR_GIT})
+- add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
+-else()
+- message("Warning: cannot retrieve git version. Bcalm won't display its version. Error value: ${ERROR_GIT})")
+-endif(NOT ${ERROR_GIT})
+#exec_program(
+# "git"
+# ${CMAKE_CURRENT_SOURCE_DIR}
+# ARGS "rev-parse --short HEAD"
-+# OUTPUT_VARIABLE VERSION_SHA1 )
++# OUTPUT_VARIABLE VERSION_SHA1
++# RETURN_VALUE ERROR_GIT)
+#
++#if(NOT ${ERROR_GIT})
++# add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
++#else()
++# message("Warning: cannot retrieve git version. Bcalm won't display its version. Error value: ${ERROR_GIT})")
++#endif(NOT ${ERROR_GIT})
--add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
-+# NEEDS CHANGING PER UPSTREAM CHANGE
-+add_definitions( -DGIT_SHA1="ed86d2c")
-
-
- ################################################################################
-@@ -39,7 +41,10 @@
+ ################################
+ #add version nifo
+@@ -49,7 +49,10 @@ SET (GATB_CORE_EXCLUDE_EXAMPLES 1)
# GATB CORE
@@ -57,7 +66,7 @@ Index: bcalm/CMakeLists.txt
################################################################################
# TOOLS
-@@ -70,7 +75,7 @@
+@@ -80,7 +83,7 @@ set (PROGRAM_SOURCE_DIR ${PROJECT_SOURCE
include_directories (${PROGRAM_SOURCE_DIR})
file (GLOB_RECURSE ProjectFiles ${PROGRAM_SOURCE_DIR}/*.cpp)
add_executable(${program} ${ProjectFiles})
=====================================
debian/rules
=====================================
@@ -14,4 +14,5 @@ override_dh_install:
dh_install
rm -r debian/bcalm/usr/example \
debian/bcalm/usr/README.md \
- debian/bcalm/usr/LICENSE
+ debian/bcalm/usr/LICENSE \
+ debian/bcalm/usr/VERSION
=====================================
scripts/split_unitigs.py
=====================================
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import sys, os
+if len(sys.argv) < 4:
+ print("split BCALM unitigs at reference extremities. More specifically:")
+ print(" the script considers the sets B and E of all k-mers that are extremities of the reference genomes/contigs, respectively B for beginnings of contigs and E for ends.")
+ print(" output is: modified unitigs such that each k-mer in B should be the beginning of an unitig, and each kmer in E should be end of an unitig.")
+ print(" in order words, unitigs are split at kmers that are extremities of the reference sequences")
+ print("This script is a small modification of pufferize.py")
+ exit("arguments: references.fa unitigs.fa k")
+
+references=sys.argv[1]
+unitigs=sys.argv[2]
+k=int(sys.argv[3])
+
+# https://www.biostars.org/p/710/#1412
+from itertools import groupby
+def fasta_iter(fasta_name):
+ """
+ given a fasta file. yield tuples of header, sequence
+ """
+ fh = open(fasta_name)
+ # ditch the boolean (x[0]) and just keep the header or sequence since
+ # we know they alternate.
+ faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
+ for header in faiter:
+ # drop the ">"
+ header = next(header)[1:].strip()
+ # join all sequence lines to one.
+ seq = "".join(s.strip() for s in next(faiter))
+ yield header, seq
+
+complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+def revcomp(seq):
+ reverse_complement = "".join(complement.get(base, base) for base in reversed(seq))
+ return reverse_complement
+
+
+def normalize(kmer):
+ return kmer if kmer < revcomp(kmer) else revcomp(kmer)
+
+
+#ref_kmers=set()
+# parse references
+#for header, ref in fasta_iter(references):
+# for kmer in [ref[:k], ref[-k:]]:
+# ref_kmers.add(normalize(kmer))
+# print(kmer)
+
+# go through all reference strings and keep the starting and end kmers for each of them in the ref_skmers and ref_ekmers respectively.
+ref_skmers=set()
+ref_ekmers=set()
+# parse references
+for header, ref in fasta_iter(references):
+ ref_skmers.add(ref[:k])
+ ref_ekmers.add(ref[-k:])
+
+
+# parse unitigs and split if necessary
+# ASSUMPTION: we might need to split a unitig multiple times
+# NOTE: the exact position of the split in unitig depends on whether the seen kmer is the first or last kmer in a reference string.
+# NOTE: unitigs are renumbered consecutively
+# NOTE: unitigs links are discarded
+output = open(unitigs+".split.fa","w")
+nb_unitigs=-1
+def save_unitig(header,seq):
+ global output, p, nb_unitigs
+ nb_unitigs += 1
+ output.write(">unitig%s\n%s\n" % (nb_unitigs, seq))
+ return(nb_unitigs)
+
+
+unitig_skmer = {}
+unitig_ekmer = {}
+
+def create_unitig(header, unitig):
+ global unitig_skmer, unitig_ekmer
+ if len(unitig) == k:
+ unitig = normalize(unitig)
+ unitig_id=save_unitig(header, unitig)
+ if normalize(unitig[:k]) in unitig_skmer:
+ print("Warning, start kmer (%s) was also seen at start of unitig %s." % (unitig[:k],str(unitig_skmer[normalize(unitig[:k])])))
+ if normalize(unitig[:k]) in unitig_ekmer:
+ print("Warning, start kmer (%s) was also seen at end of unitig %s." % (unitig[:k],str(unitig_ekmer[normalize(unitig[:k])])))
+ if normalize(unitig[-k:]) in unitig_skmer:
+ print("Warning, last kmer (%s) was also seen at start of unitig %s." % (unitig[-k:],str(unitig_skmer[normalize(unitig[-k:])])))
+ if normalize(unitig[-k:]) in unitig_ekmer:
+ print("Warning, last kmer (%s) was also seen at end of unitig %s." % (unitig[-k:],str(unitig_ekmer[normalize(unitig[-k:])])))
+ unitig_ekmer[normalize(unitig[-k:])] = [unitig_id, len(unitig)]
+ unitig_skmer[normalize(unitig[:k])] = [unitig_id, len(unitig)]
+
+print("Start parsing and spliting unitigs .. ")
+for header, unitig in fasta_iter(unitigs):
+ prev = 0
+ for i in range(0,len(unitig)-k+1):
+ kmer = unitig[i:i+k]
+ # cut up until first kmer but not the kmer itself
+ if kmer in ref_skmers or revcomp(kmer) in ref_ekmers:
+ if i+k-1-prev >= k:
+ create_unitig(header, unitig[prev:i+k-1])
+ prev = i
+ # cut the unitig until the kmer, including it
+ if kmer in ref_ekmers or revcomp(kmer) in ref_skmers:
+ create_unitig(header, unitig[prev:i+k])
+ prev = i+1
+ #add the last and right most unitig:
+ if len(unitig)-prev >= k:
+ create_unitig(header, unitig[prev:])
+
+
+output.close()
+print("done. result is in: %s.split.fa" % unitigs)
=====================================
scripts/unitigEvaluator.cpp
=====================================
@@ -4,6 +4,8 @@
// it can be compiled with just:
// g++ -lomp -o unitigEvaluator unitigEvaluator.cpp
// updated: 27th November 2018
+// 6th june 2019
+// WARNING: reference needs to be one line per sequence
#include <fstream>
#include <cstring>
#include <string>
@@ -11,15 +13,10 @@
#include <iostream>
#include <algorithm>
#include <chrono>
-#include <unordered_set>
-#include "sparsepp/spp.h"
+#include <unordered_map>
#include <omp.h>
-using spp::sparse_hash_map;
-
-
-
uint64_t xs(uint64_t y){
y^=(y<<13); y^=(y>>17);y=(y^=(y<<15)); return y;
}
@@ -109,13 +106,13 @@ int main(int argc, char ** argv){
cout<<"Problem with files opening"<<endl;
exit(1);
}
- uint64_t FP(0),TP(0),FN(0),size(0),number(0),genomicKmersNum(0);
+ uint64_t FP(0),TP(0),FN(0),size(0),number(0),genomicKmersNum(0),genomicDuplicated(0);
omp_lock_t lock[1024];
for (int i=0; i<1024; i++)
omp_init_lock(&(lock[i]));
for(uint HASH(0);HASH<nbHash;++HASH){
- vector<sparse_hash_map<string, bool>> genomicKmers;
+ vector<std::unordered_map<string, bool>> genomicKmers;
genomicKmers.resize(1024);
#pragma omp parallel num_threads(nb_cores)
@@ -179,6 +176,8 @@ int main(int argc, char ** argv){
#pragma omp atomic
TP++;
}
+ else
+ genomicDuplicated++;
}
}
}
@@ -212,6 +211,8 @@ int main(int argc, char ** argv){
cout<<"True positive (kmers in the unitig and the references) GOOD kmers: "<<intToString(TP)<<endl;
cout<<"False positive (kmers in the unitig and NOT in the references) ERRONEOUS kmers: "<<intToString(FP)<<endl;
cout<<"False Negative (kmers NOT in the unitig but in the references) MISSING kmers: "<<intToString(FN)<<endl;
+ if (genomicDuplicated > 0)
+ cout<<"REPEATED kmers in the unitigs (should not happen): "<<intToString(genomicDuplicated)<<endl;
cout<<"Erroneous kmer rate (*10,000): "<<(double)10000*FP/(FP+TP)<<endl;
cout<<"Missing kmer rate (*10,000): "<<(double)10000*FN/genomicKmersNum<<endl;
=====================================
src/bcalm_1.cpp
=====================================
@@ -61,21 +61,34 @@ struct Functor { void operator () (bcalm_1 *bcalm)
throw OptionFailure (bcalm->getParser(), "Specifiy -in");
}
+ // delete the .h5 file
+ bool delete_h5_file = true;
+ if (delete_h5_file)
+ {
+ // copies the h5 naming mechanism in GraphUnitigs.cpp
+ string input = bcalm->getInput()->getStr(STR_URI_INPUT);
+ string prefix;
+ if (bcalm->getInput()->get(STR_URI_OUTPUT))
+ prefix = bcalm->getInput()->getStr(STR_URI_OUTPUT);
+ else
+ prefix = System::file().getBaseName (input) + prefix;
+
+ System::file().remove (prefix + ".h5");
+ }
+
}
};
void bcalm_1::execute (){
#ifdef GIT_SHA1
- std::cout << "BCALM 2, git commit " << GIT_SHA1 << std::endl;
+ std::cout << "BCALM 2, version " << VERSION << ", git commit " << GIT_SHA1 << std::endl;
#endif
/** we get the kmer size chosen by the end user. */
size_t kmerSize = getInput()->getInt (STR_KMER_SIZE);
- if (kmerSize % 4 == 0) {std::cout << "due to a currently known bug, bcalm with a kmer multiple of 4 is temporarily unavailable. please retry with another k-mer size" << std::endl; exit(1);}
-
/** We launch Minia with the correct Integer implementation according to the choosen kmer size. */
Integer::apply<Functor,bcalm_1*> (kmerSize, this);
=====================================
src/main.cpp
=====================================
@@ -28,9 +28,9 @@ int main (int argc, char* argv[])
#ifdef GIT_SHA1
if(argc > 1 && ( strcmp(argv[1],STR_VERSION)==0 || strcmp(argv[1],"-v")==0 ) ){
- std::cout << "BCALM 2, git commit " << GIT_SHA1 << std::endl;
+ std::cout << "BCALM 2, version " << VERSION << ", git commit " << GIT_SHA1 << std::endl;
std::cout << "Using gatb-core version "<< System::info().getVersion() << std::endl;
- return EXIT_FAILURE;
+ return EXIT_SUCCESS;
}
#endif
=====================================
test/simple_test.sh
=====================================
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+rm -f reference.fasta
+wget https://raw.githubusercontent.com/GATB/MindTheGap/f5cb0fec816686c7393772787d736565c4f056a4/test/full_test/reference.fasta >& /dev/null
+../build/bcalm -in reference.fasta -abundance-min 1 > /dev/null >& /dev/null
+rm -rf reference.unitigs.fa.glue*
+rm -f compare_fasta.py
+wget https://raw.githubusercontent.com/GATB/minia/master/test/compare_fasta.py >& /dev/null
+python compare_fasta.py reference.fasta reference.unitigs.fa
+res=$?
+rm -f reference.fasta reference.h5 reference.unitigs.fa compare_fasta.py
+if [ "$res" = "0" ]
+then
+ echo "test OK"
+ exit 0
+else
+ echo "test KO"
+ exit 1
+fi
View it on GitLab: https://salsa.debian.org/med-team/bcalm/-/compare/63dd846f06a6d99653cc7ccab1b981acd3165d6f...cc66c61487335b1a2214bb5c556c25da9989aaff
--
View it on GitLab: https://salsa.debian.org/med-team/bcalm/-/compare/63dd846f06a6d99653cc7ccab1b981acd3165d6f...cc66c61487335b1a2214bb5c556c25da9989aaff
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200505/7fb38cc8/attachment-0001.html>
More information about the debian-med-commit
mailing list