[med-svn] [Git][med-team/racon][master] 11 commits: New upstream version 1.4.0
Andreas Tille
gitlab at salsa.debian.org
Fri Aug 2 21:48:57 BST 2019
Andreas Tille pushed to branch master at Debian Med / racon
Commits:
434b362c by Andreas Tille at 2019-07-15T14:33:40Z
New upstream version 1.4.0
- - - - -
2a657f05 by Andreas Tille at 2019-07-15T14:33:43Z
Update upstream source from tag 'upstream/1.4.0'
Update to upstream version '1.4.0'
with Debian dir e0bc0676adb3066a8cdc641912d03c0d7d71fa21
- - - - -
59a85060 by Andreas Tille at 2019-07-15T14:33:43Z
New upstream version
- - - - -
8ae7f5fb by Andreas Tille at 2019-07-15T14:33:43Z
debhelper 12
- - - - -
e6d34ac9 by Andreas Tille at 2019-07-15T14:34:05Z
Standards-Version: 4.4.0
- - - - -
ca47945c by Andreas Tille at 2019-07-15T14:40:04Z
Refresh patches
- - - - -
30dc4ab6 by Andreas Tille at 2019-08-02T20:44:02Z
New upstream version 1.4.3
- - - - -
fe9f40e3 by Andreas Tille at 2019-08-02T20:44:04Z
Update upstream source from tag 'upstream/1.4.3'
Update to upstream version '1.4.3'
with Debian dir a79445d56d01d687d134b4d322df967e2156d2e8
- - - - -
1d2b1a88 by Andreas Tille at 2019-08-02T20:44:04Z
New upstream version
- - - - -
12474747 by Andreas Tille at 2019-08-02T20:44:04Z
debhelper-compat 12
- - - - -
07bf9560 by Andreas Tille at 2019-08-02T20:47:53Z
Add TODO
- - - - -
23 changed files:
- .gitmodules
- .travis.yml
- CMakeLists.txt
- README.md
- debian/changelog
- − debian/compat
- debian/control
- debian/patches/use_debian_packaged_libs.patch
- + src/cuda/cudaaligner.cpp
- + src/cuda/cudaaligner.hpp
- + src/cuda/cudabatch.cpp
- + src/cuda/cudabatch.hpp
- + src/cuda/cudapolisher.cpp
- + src/cuda/cudapolisher.hpp
- + src/cuda/cudautils.hpp
- src/main.cpp
- src/overlap.cpp
- src/overlap.hpp
- src/polisher.cpp
- src/polisher.hpp
- src/window.cpp
- src/window.hpp
- test/racon_test.cpp
Changes:
=====================================
.gitmodules
=====================================
@@ -16,3 +16,9 @@
[submodule "vendor/rampler"]
path = vendor/rampler
url = https://github.com/rvaser/rampler
+[submodule "vendor/logger"]
+ path = vendor/logger
+ url = https://github.com/rvaser/logger
+[submodule "vendor/ClaraGenomicsAnalysis"]
+ path = vendor/ClaraGenomicsAnalysis
+ url = https://github.com/clara-genomics/ClaraGenomicsAnalysis
=====================================
.travis.yml
=====================================
@@ -1,3 +1,5 @@
+dist: trusty
+
language: cpp
compiler:
=====================================
CMakeLists.txt
=====================================
@@ -8,17 +8,40 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
option(racon_build_tests "Build racon unit tests" OFF)
option(racon_build_wrapper "Build racon wrapper" OFF)
+option(racon_enable_cuda "Build racon with NVIDIA CUDA support" OFF)
-add_executable(racon
+# Check CUDA compatibility.
+if(racon_enable_cuda)
+ find_package(CUDA 9.0 QUIET REQUIRED)
+ if(NOT ${CUDA_FOUND})
+ message(FATAL_ERROR "CUDA not detected on system. Please install")
+ else()
+ message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo")
+ endif()
+endif()
+
+include_directories(${PROJECT_SOURCE_DIR}/src)
+
+set(racon_sources
src/main.cpp
src/polisher.cpp
src/overlap.cpp
src/sequence.cpp
src/window.cpp)
+if(racon_enable_cuda)
+ list(APPEND racon_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
+ cuda_add_executable(racon ${racon_sources})
+ target_compile_definitions(racon PRIVATE CUDA_ENABLED)
+else()
+ add_executable(racon ${racon_sources})
+endif()
+
if (NOT TARGET bioparser)
add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
endif()
@@ -31,8 +54,42 @@ endif()
if (NOT TARGET edlib)
add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
endif()
+if (NOT TARGET logger)
+ add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
+endif()
+if (racon_enable_cuda)
+ if (DEFINED CLARAGENOMICSANALYSIS_SDK_PATH)
+ list(APPEND CMAKE_PREFIX_PATH "${CLARAGENOMICSANALYSIS_SDK_PATH}/cmake")
+ find_package(cudapoa REQUIRED)
+ find_package(cudaaligner REQUIRED)
+ elseif (DEFINED CLARAGENOMICSANALYSIS_SRC_PATH)
+ if (NOT TARGET cudapoa)
+ add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ if (NOT TARGET cudaaligner)
+ add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ClaraGenomicsAnalysis)
+ if (NOT TARGET cudapoa)
+ add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ if (NOT TARGET cudaaligner)
+ add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ else()
+ if (NOT TARGET cudapoa)
+ add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ if (NOT TARGET cudaaligner)
+ add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+ endif()
+ endif()
+endif()
-target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
+target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
+if (racon_enable_cuda)
+ target_link_libraries(racon cudapoa cudaaligner)
+endif()
install(TARGETS racon DESTINATION bin)
@@ -43,18 +100,30 @@ if (racon_build_tests)
include_directories(${PROJECT_BINARY_DIR}/config)
include_directories(${PROJECT_SOURCE_DIR}/src)
- add_executable(racon_test
+ set(racon_test_sources
test/racon_test.cpp
src/polisher.cpp
src/overlap.cpp
src/sequence.cpp
src/window.cpp)
- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+ if (racon_enable_cuda)
+ list(APPEND racon_test_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
+ cuda_add_executable(racon_test ${racon_test_sources})
+ target_compile_definitions(racon_test PRIVATE CUDA_ENABLED)
+ else()
+ add_executable(racon_test ${racon_test_sources})
+ endif()
- target_link_libraries(racon_test bioparser spoa thread_pool pthread
- edlib_static gtest_main)
-endif(racon_build_tests)
+ if (NOT TARGET gtest_main)
+ add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+ endif()
+
+ target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
+ if (racon_enable_cuda)
+ target_link_libraries(racon_test cudapoa cudaaligner)
+ endif()
+endif()
if (racon_build_wrapper)
set(racon_path ${PROJECT_BINARY_DIR}/bin/racon)
@@ -66,5 +135,7 @@ if (racon_build_wrapper)
FILE_PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE
WORLD_READ WORLD_EXECUTE)
- add_subdirectory(vendor/rampler)
-endif(racon_build_wrapper)
+ if (NOT TARGET rampler)
+ add_subdirectory(vendor/rampler)
+ endif()
+endif()
=====================================
README.md
=====================================
@@ -11,7 +11,7 @@ Racon is intended as a standalone consensus module to correct raw contigs genera
Racon can be used as a polishing tool after the assembly with **either Illumina data or data produced by third generation of sequencing**. The type of data inputed is automatically detected.
-Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip**.
+Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip** (which will have impact on parsing time).
Racon can also be used as a read error-correction tool. In this scenario, the MHAP/PAF/SAM file needs to contain pairwise overlaps between reads **including dual overlaps**.
@@ -21,6 +21,11 @@ A **wrapper script** is also available to enable easier usage to the end-user fo
1. gcc 4.8+ or clang 3.4+
2. cmake 3.2+
+### CUDA Support
+1. gcc 5.0+
+2. cmake 3.10+
+3. CUDA 10.0+
+
## Installation
To install Racon run the following commands:
@@ -43,6 +48,20 @@ To build unit tests add `-Dracon_build_tests=ON` while running `cmake`. After in
To build the wrapper script add `-Dracon_build_wrapper=ON` while running `cmake`. After installation, an executable named `racon_wrapper` (python script) will be created in `build/bin`.
+### CUDA Support
+Racon makes use of [NVIDIA's ClaraGenomicsAnalysis SDK](https://github.com/clara-genomics/ClaraGenomicsAnalysis) for CUDA accelerated polishing and alignment.
+
+To build `racon` with CUDA support, add `-Dracon_enable_cuda=ON` while running `cmake`. If CUDA support is unavailable, the `cmake` step will error out.
+Note that the CUDA support flag does not produce a new binary target. Instead it augments the existing `racon` binary itself.
+
+```bash
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -Dracon_enable_cuda=ON ..
+make
+```
+
+***Note***: Short read polishing with CUDA is still in development!
+
## Usage
Usage of `racon` is as following:
@@ -90,6 +109,15 @@ Usage of `racon` is as following:
-h, --help
prints the usage
+ only available when built with CUDA:
+ -c, --cudapoa-batches
+ default: 1
+ number of batches for CUDA accelerated polishing
+ -b, --cuda-banded-alignment
+ use banding approximation for polishing on GPU. Only applicable when -c is used.
+ --cudaaligner-batches (experimental)
+ Number of batches for CUDA accelerated alignment
+
`racon_test` is run without any parameters.
Usage of `racon_wrapper` equals the one of `racon` with two additional parameters:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+racon (1.4.3-1) UNRELEASED; urgency=medium
+
+ * New upstream version
+ * debhelper-compat 12
+ * Standards-Version: 4.4.0
+ TODO: https://salsa.debian.org/med-team/liblogger
+
+ -- Andreas Tille <tille at debian.org> Fri, 02 Aug 2019 22:44:04 +0200
+
racon (1.3.2-1) unstable; urgency=medium
* Team upload.
=====================================
debian/compat deleted
=====================================
@@ -1 +0,0 @@
-11
=====================================
debian/control
=====================================
@@ -4,7 +4,7 @@ Uploaders: Cédric Lood <cedric.lood at kuleuven.be>,
Andreas Tille <tille at debian.org>
Section: science
Priority: optional
-Build-Depends: debhelper (>= 11~),
+Build-Depends: debhelper-compat (= 12),
cmake,
libgtest-dev,
libbioparser-dev,
@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11~),
libspoa-dev,
libthread-pool-dev,
rampler
-Standards-Version: 4.3.0
+Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/racon
Vcs-Git: https://salsa.debian.org/med-team/racon.git
Homepage: https://github.com/isovic/racon
=====================================
debian/patches/use_debian_packaged_libs.patch
=====================================
@@ -4,9 +4,9 @@ Description: Use Debian packaged libraries
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-@@ -19,20 +19,7 @@ add_executable(racon
- src/sequence.cpp
- src/window.cpp)
+@@ -42,18 +42,6 @@ else()
+ add_executable(racon ${racon_sources})
+ endif()
-if (NOT TARGET bioparser)
- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
@@ -20,22 +20,28 @@ Description: Use Debian packaged libraries
-if (NOT TARGET edlib)
- add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
-endif()
--
--target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
-+target_link_libraries(racon spoa thread_pool pthread edlib z)
-
- install(TARGETS racon DESTINATION bin)
+ if (NOT TARGET logger)
+ add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
+ endif()
+@@ -86,7 +74,7 @@ if (racon_enable_cuda)
+ endif()
+ endif()
-@@ -50,10 +37,8 @@ if (racon_build_tests)
- src/sequence.cpp
- src/window.cpp)
+-target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
++target_link_libraries(racon spoa thread_pool pthread edlib logger z)
+ if (racon_enable_cuda)
+ target_link_libraries(racon cudapoa cudaaligner)
+ endif()
+@@ -115,11 +103,7 @@ if (racon_build_tests)
+ add_executable(racon_test ${racon_test_sources})
+ endif()
-- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+- if (NOT TARGET gtest_main)
+- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+- endif()
-
-- target_link_libraries(racon_test bioparser spoa thread_pool pthread
-- edlib_static gtest_main)
-+ target_link_libraries(racon_test spoa thread_pool pthread
-+ edlib gtest_main gtest z)
- endif(racon_build_tests)
-
- if (racon_build_wrapper)
+- target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
++ target_link_libraries(racon_test spoa thread_pool pthread edlib logger gtest_main gtest z)
+ if (racon_enable_cuda)
+ target_link_libraries(racon_test cudapoa cudaaligner)
+ endif()
=====================================
src/cuda/cudaaligner.cpp
=====================================
@@ -0,0 +1,128 @@
+/*!
+ * @file cudaaligner.cpp
+ *
+ * @brief CUDABatchAligner class source file
+ */
+
+#include <cudautils/cudautils.hpp>
+
+#include "cudaaligner.hpp"
+
+namespace racon {
+
+std::atomic<uint32_t> CUDABatchAligner::batches;
+
+std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size,
+ uint32_t max_target_size,
+ uint32_t max_alignments,
+ uint32_t device_id)
+{
+ return std::unique_ptr<CUDABatchAligner>(new CUDABatchAligner(max_query_size,
+ max_target_size,
+ max_alignments,
+ device_id));
+}
+
+CUDABatchAligner::CUDABatchAligner(uint32_t max_query_size,
+ uint32_t max_target_size,
+ uint32_t max_alignments,
+ uint32_t device_id)
+ : aligner_(claragenomics::cudaaligner::create_aligner(max_query_size,
+ max_target_size,
+ max_alignments,
+ claragenomics::cudaaligner::AlignmentType::global,
+ device_id))
+ , overlaps_()
+ , stream_(0)
+{
+ bid_ = CUDABatchAligner::batches++;
+
+ CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
+
+ aligner_->set_cuda_stream(stream_);
+}
+
+CUDABatchAligner::~CUDABatchAligner()
+{
+ CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
+}
+
+bool CUDABatchAligner::addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences)
+{
+ const char* q = !overlap->strand_ ? &(sequences[overlap->q_id_]->data()[overlap->q_begin_]) :
+ &(sequences[overlap->q_id_]->reverse_complement()[overlap->q_length_ - overlap->q_end_]);
+ const char* t = &(sequences[overlap->t_id_]->data()[overlap->t_begin_]);
+
+ claragenomics::cudaaligner::StatusType s =
+ aligner_->add_alignment(q, overlap->q_end_ - overlap->q_begin_,
+ t, overlap->t_end_ - overlap->t_begin_);
+ if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignments)
+ {
+ return false;
+ }
+ else if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignment_difference
+ || s == claragenomics::cudaaligner::StatusType::exceeded_max_length)
+ {
+ cpu_overlap_data_.emplace_back(std::make_pair<std::string, std::string>(std::string(q, q + overlap->q_end_ - overlap->q_begin_),
+ std::string(t, t + overlap->t_end_ - overlap->t_begin_)));
+ cpu_overlaps_.push_back(overlap);
+ }
+ else if (s != claragenomics::cudaaligner::StatusType::success)
+ {
+ fprintf(stderr, "Unknown error in cuda aligner!\n");
+ }
+ else
+ {
+ overlaps_.push_back(overlap);
+ }
+ return true;
+}
+
+void CUDABatchAligner::alignAll()
+{
+ aligner_->align_all();
+ compute_cpu_overlaps();
+}
+
+void CUDABatchAligner::compute_cpu_overlaps()
+{
+ for(std::size_t a = 0; a < cpu_overlaps_.size(); a++)
+ {
+ // Run CPU version of overlap.
+ Overlap* overlap = cpu_overlaps_[a];
+ overlap->align_overlaps(cpu_overlap_data_[a].first.c_str(), cpu_overlap_data_[a].first.length(),
+ cpu_overlap_data_[a].second.c_str(), cpu_overlap_data_[a].second.length());
+ }
+}
+
+void CUDABatchAligner::find_breaking_points(uint32_t window_length)
+{
+ aligner_->sync_alignments();
+
+ const std::vector<std::shared_ptr<claragenomics::cudaaligner::Alignment>>& alignments = aligner_->get_alignments();
+ // Number of alignments should be the same as number of overlaps.
+ if (overlaps_.size() != alignments.size())
+ {
+ throw std::runtime_error("Number of alignments doesn't match number of overlaps in cudaaligner.");
+ }
+ for(std::size_t a = 0; a < alignments.size(); a++)
+ {
+ overlaps_[a]->cigar_ = alignments[a]->convert_to_cigar();
+ overlaps_[a]->find_breaking_points_from_cigar(window_length);
+ }
+ for(Overlap* overlap : cpu_overlaps_)
+ {
+ // Run CPU version of breaking points.
+ overlap->find_breaking_points_from_cigar(window_length);
+ }
+}
+
+void CUDABatchAligner::reset()
+{
+ overlaps_.clear();
+ cpu_overlaps_.clear();
+ cpu_overlap_data_.clear();
+ aligner_->reset();
+}
+
+}
=====================================
src/cuda/cudaaligner.hpp
=====================================
@@ -0,0 +1,96 @@
+/*!
+* @file cudaaligner.hpp
+ *
+ * @brief CUDA aligner class header file
+ */
+#include "cudaaligner/cudaaligner.hpp"
+#include "cudaaligner/aligner.hpp"
+#include "cudaaligner/alignment.hpp"
+
+#include "overlap.hpp"
+#include "sequence.hpp"
+
+#include <vector>
+#include <atomic>
+
+namespace racon {
+
+class CUDABatchAligner;
+std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+
+class CUDABatchAligner
+{
+ public:
+ virtual ~CUDABatchAligner();
+
+ /**
+ * @brief Add a new overlap to the batch.
+ *
+ * @param[in] window : The overlap to add to the batch.
+ * @param[in] sequences: Reference to a database of sequences.
+ *
+ * @return True if overlap could be added to the batch.
+ */
+ virtual bool addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences);
+
+ /**
+ * @brief Checks if batch has any overlaps to process.
+ *
+     * @return True if there are overlaps in the batch.
+ */
+ virtual bool hasOverlaps() const {
+ return overlaps_.size() > 0;
+ };
+
+ /**
+ * @brief Runs batched alignment of overlaps on GPU.
+ *
+ */
+ virtual void alignAll();
+
+ /**
+ * @brief Find breaking points in alignments.
+ *
+ */
+ virtual void find_breaking_points(uint32_t window_length);
+
+ /**
+ * @brief Resets the state of the object, which includes
+ * resetting buffer states and counters.
+ */
+ virtual void reset();
+
+ /**
+ * @brief Get batch ID.
+ */
+ uint32_t getBatchID() const { return bid_; }
+
+ // Builder function to create a new CUDABatchAligner object.
+ friend std::unique_ptr<CUDABatchAligner>
+ createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+
+ protected:
+ CUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+ CUDABatchAligner(const CUDABatchAligner&) = delete;
+ const CUDABatchAligner& operator=(const CUDABatchAligner&) = delete;
+
+ void compute_cpu_overlaps();
+
+ std::unique_ptr<claragenomics::cudaaligner::Aligner> aligner_;
+
+ std::vector<Overlap*> overlaps_;
+
+ std::vector<Overlap*> cpu_overlaps_;
+ std::vector<std::pair<std::string, std::string>> cpu_overlap_data_;
+
+ // Static batch count used to generate batch IDs.
+ static std::atomic<uint32_t> batches;
+
+ // Batch ID.
+ uint32_t bid_ = 0;
+
+ // CUDA stream for batch.
+ cudaStream_t stream_;
+};
+
+}
=====================================
src/cuda/cudabatch.cpp
=====================================
@@ -0,0 +1,260 @@
+/*!
+ * @file cudabatch.cpp
+ *
+ * @brief CUDABatch class source file
+ */
+
+#include <string>
+#include <iostream>
+#include <cstring>
+#include <algorithm>
+
+#include "cudabatch.hpp"
+#include "cudautils.hpp"
+
+#include "spoa/spoa.hpp"
+#include <cudautils/cudautils.hpp>
+
+namespace racon {
+
+std::atomic<uint32_t> CUDABatchProcessor::batches;
+
+std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
+{
+ return std::unique_ptr<CUDABatchProcessor>(new CUDABatchProcessor(max_windows, max_window_depth, device, gap, mismatch, match, cuda_banded_alignment));
+}
+
+CUDABatchProcessor::CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
+ : max_windows_(max_windows)
+ , cudapoa_batch_(claragenomics::cudapoa::create_batch(max_windows, max_window_depth, device, claragenomics::cudapoa::OutputType::consensus, gap, mismatch, match, cuda_banded_alignment))
+ , windows_()
+ , seqs_added_per_window_()
+{
+ bid_ = CUDABatchProcessor::batches++;
+
+ // Create new CUDA stream.
+ CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
+ cudapoa_batch_->set_cuda_stream(stream_);
+}
+
+CUDABatchProcessor::~CUDABatchProcessor()
+{
+ // Destroy CUDA stream.
+ CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
+}
+
+bool CUDABatchProcessor::addWindow(std::shared_ptr<Window> window)
+{
+ if (windows_.size() < max_windows_)
+ {
+ windows_.push_back(window);
+ seqs_added_per_window_.push_back(0);
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+}
+
+bool CUDABatchProcessor::hasWindows() const
+{
+ return (windows_.size() != 0);
+}
+
+void CUDABatchProcessor::convertPhredQualityToWeights(const char* qual,
+ uint32_t qual_length,
+ std::vector<int8_t>& weights)
+{
+ weights.clear();
+ for(uint32_t i = 0; i < qual_length; i++)
+ {
+ weights.push_back(static_cast<uint8_t>(qual[i]) - 33); // PHRED quality
+ }
+}
+
+claragenomics::cudapoa::StatusType CUDABatchProcessor::addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
+ std::pair<const char*, uint32_t>& qualities)
+{
+ // Add sequences to latest poa in batch.
+ std::vector<int8_t> weights;
+ claragenomics::cudapoa::StatusType status = claragenomics::cudapoa::StatusType::success;
+ if (qualities.first == nullptr)
+ {
+ status = cudapoa_batch_->add_seq_to_poa(seq.first, nullptr, seq.second);
+ }
+ else
+ {
+ convertPhredQualityToWeights(qualities.first, qualities.second, weights);
+ status = cudapoa_batch_->add_seq_to_poa(seq.first, weights.data(), seq.second);
+ }
+ return status;
+}
+
+void CUDABatchProcessor::generateMemoryMap()
+{
+ auto num_windows = windows_.size();
+ for(uint32_t w = 0; w < num_windows; w++)
+ {
+ // Add new poa
+ claragenomics::cudapoa::StatusType status = cudapoa_batch_->add_poa();
+ if (status != claragenomics::cudapoa::StatusType::success)
+ {
+ fprintf(stderr, "Failed to add new POA to batch %d.\n",
+ cudapoa_batch_->batch_id());
+ exit(1);
+ }
+
+ std::shared_ptr<Window> window = windows_.at(w);
+ uint32_t num_seqs = window->sequences_.size();
+ std::vector<uint8_t> weights;
+
+ // Add first sequence as backbone to graph.
+ std::pair<const char*, uint32_t> seq = window->sequences_.front();
+ std::pair<const char*, uint32_t> qualities = window->qualities_.front();
+ status = addSequenceToPoa(seq, qualities);
+ if (status != claragenomics::cudapoa::StatusType::success)
+ {
+ fprintf(stderr, "Could not add backbone to window. Fatal error.\n");
+ exit(1);
+ }
+
+ // Add the rest of the sequences in sorted order of starting positions.
+ std::vector<uint32_t> rank;
+ rank.reserve(window->sequences_.size());
+
+ for (uint32_t i = 0; i < num_seqs; ++i) {
+ rank.emplace_back(i);
+ }
+
+ std::sort(rank.begin() + 1, rank.end(), [&](uint32_t lhs, uint32_t rhs) {
+ return window->positions_[lhs].first < window->positions_[rhs].first; });
+
+ // Start from index 1 since first sequence has already been added as backbone.
+ uint32_t long_seq = 0;
+ uint32_t skipped_seq = 0;
+ for(uint32_t j = 1; j < num_seqs; j++)
+ {
+ uint32_t i = rank.at(j);
+ seq = window->sequences_.at(i);
+ qualities = window->qualities_.at(i);
+ // Add sequences to latest poa in batch.
+ status = addSequenceToPoa(seq, qualities);
+ if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequence_size)
+ {
+ long_seq++;
+ continue;
+ }
+ else if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequences_per_poa)
+ {
+ skipped_seq++;
+ continue;
+ }
+ else if (status != claragenomics::cudapoa::StatusType::success)
+ {
+ fprintf(stderr, "Could not add sequence to POA in batch %d.\n",
+ cudapoa_batch_->batch_id());
+ exit(1);
+ }
+
+ seqs_added_per_window_[w] = seqs_added_per_window_[w] + 1;
+ }
+#ifndef NDEBUG
+ if (long_seq > 0)
+ {
+ fprintf(stderr, "Too long (%d / %d)\n", long_seq, num_seqs);
+ }
+ if (skipped_seq > 0)
+ {
+ fprintf(stderr, "Skipped (%d / %d)\n", skipped_seq, num_seqs);
+ }
+#endif
+ }
+}
+
+void CUDABatchProcessor::generatePOA()
+{
+ // call generate poa function
+ cudapoa_batch_->generate_poa();
+}
+
+void CUDABatchProcessor::getConsensus()
+{
+ std::vector<std::string> consensuses;
+ std::vector<std::vector<uint16_t>> coverages;
+ std::vector<claragenomics::cudapoa::StatusType> output_status;
+ cudapoa_batch_->get_consensus(consensuses, coverages, output_status);
+
+ for(uint32_t i = 0; i < windows_.size(); i++)
+ {
+ auto window = windows_.at(i);
+ if (output_status.at(i) != claragenomics::cudapoa::StatusType::success)
+ {
+ // leave the failure cases to CPU polisher
+ window_consensus_status_.emplace_back(false);
+ }
+ else
+ {
+ // This is a special case borrowed from the CPU version.
+ // TODO: We still run this case through the GPU, but could take it out.
+ if (window->sequences_.size() < 3)
+ {
+ window->consensus_ = std::string(window->sequences_.front().first,
+ window->sequences_.front().second);
+
+ // This status is borrowed from the CPU version which considers this
+ // a failed consensus. All other cases are true.
+ window_consensus_status_.emplace_back(false);
+ }
+ else
+ {
+ window->consensus_ = consensuses.at(i);
+ if (window->type_ == WindowType::kTGS)
+ {
+ uint32_t num_seqs_in_window = seqs_added_per_window_[i];
+ uint32_t average_coverage = num_seqs_in_window / 2;
+
+ int32_t begin = 0, end = window->consensus_.size() - 1;
+ for (; begin < static_cast<int32_t>( window->consensus_.size()); ++begin) {
+ if (coverages.at(i).at(begin) >= average_coverage) {
+ break;
+ }
+ }
+ for (; end >= 0; --end) {
+ if (coverages.at(i).at(end) >= average_coverage) {
+ break;
+ }
+ }
+
+ if (begin >= end) {
+ fprintf(stderr, "[CUDABatchProcessor] warning: "
+ "contig might be chimeric in window %lu!\n", window->id_);
+ } else {
+ window->consensus_ = window->consensus_.substr(begin, end - begin + 1);
+ }
+ }
+ window_consensus_status_.emplace_back(true);
+ }
+ }
+ }
+}
+
+const std::vector<bool>& CUDABatchProcessor::generateConsensus()
+{
+ // Generate consensus for all windows in the batch
+ generateMemoryMap();
+ generatePOA();
+ getConsensus();
+
+ return window_consensus_status_;
+}
+
+void CUDABatchProcessor::reset()
+{
+ windows_.clear();
+ window_consensus_status_.clear();
+ seqs_added_per_window_.clear();
+ cudapoa_batch_->reset();
+}
+
+} // namespace racon
=====================================
src/cuda/cudabatch.hpp
=====================================
@@ -0,0 +1,142 @@
+/*!
+* @file cudabatch.hpp
+ *
+ * @brief CUDA batch class header file
+ */
+
+#pragma once
+
+#include <memory>
+#include <cuda_runtime_api.h>
+#include <atomic>
+
+#include "window.hpp"
+#include "cudapoa/batch.hpp"
+
+namespace spoa {
+ class AlignmentEngine;
+}
+
+namespace racon {
+
+class Window;
+
+class CUDABatchProcessor;
+std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+
+class CUDABatchProcessor
+{
+public:
+ ~CUDABatchProcessor();
+
+ /**
+ * @brief Add a new window to the batch.
+ *
+ * @param[in] window : The window to add to the batch.
+ *
+     * @return True if window could be added to the batch.
+ */
+ bool addWindow(std::shared_ptr<Window> window);
+
+ /**
+ * @brief Checks if batch has any windows to process.
+ */
+ bool hasWindows() const;
+
+ /**
+ * @brief Runs the core computation to generate consensus for
+ * all windows in the batch.
+ *
+     * @return Vector of bool indicating successful generation of consensus
+ * for each window in the batch.
+ */
+ const std::vector<bool>& generateConsensus();
+
+ /**
+ * @brief Resets the state of the object, which includes
+ * resetting buffer states and counters.
+ */
+ void reset();
+
+ /**
+ * @brief Get batch ID.
+ */
+ uint32_t getBatchID() const { return bid_; }
+
+ // Builder function to create a new CUDABatchProcessor object.
+ friend std::unique_ptr<CUDABatchProcessor>
+ createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+
+protected:
+ /**
+ * @brief Constructor for CUDABatch class.
+ *
+ * @param[in] max_windows : Maximum number windows in batch
+ * @param[in] max_window_depth : Maximum number of sequences per window
+ * @param[in] cuda_banded_alignment : Use banded POA alignment
+ */
+ CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+ CUDABatchProcessor(const CUDABatchProcessor&) = delete;
+ const CUDABatchProcessor& operator=(const CUDABatchProcessor&) = delete;
+
+ /*
+ * @brief Process all the windows and re-map them into
+ * memory for more efficient processing in the CUDA
+ * kernels.
+ */
+ void generateMemoryMap();
+
+ /*
+ * @brief Run the CUDA kernel for generating POA on the batch.
+ * This call is asynchronous.
+ */
+ void generatePOA();
+
+ /*
+ * @brief Wait for execution to complete and grab the output
+ * consensus from the device.
+ */
+ void getConsensus();
+
+ /*
+ * @brief Convert PHRED quality scores to weights.
+ *
+ */
+ void convertPhredQualityToWeights(const char* qual,
+ uint32_t qual_length,
+ std::vector<int8_t>& weights);
+
+ /*
+ * @brief Add sequence and qualities to cudapoa.
+ *
+ */
+ claragenomics::cudapoa::StatusType addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
+ std::pair<const char*, uint32_t>& quality);
+
+protected:
+ // Static batch count used to generate batch IDs.
+ static std::atomic<uint32_t> batches;
+
+ // Batch ID.
+ uint32_t bid_ = 0;
+
+ // Maximum windows allowed in batch.
+ uint32_t max_windows_;
+
+ // CUDA-POA library object that manages POA batch.
+ std::unique_ptr<claragenomics::cudapoa::Batch> cudapoa_batch_;
+
+ // Stream for running POA batch.
+ cudaStream_t stream_;
+ // Windows belonging to the batch.
+ std::vector<std::shared_ptr<Window>> windows_;
+
+ // Consensus generation status for each window.
+ std::vector<bool> window_consensus_status_;
+
+ // Number of sequences actually added per window.
+ std::vector<uint32_t> seqs_added_per_window_;
+
+};
+
+} // namespace racon
=====================================
src/cuda/cudapolisher.cpp
=====================================
@@ -0,0 +1,396 @@
+/*!
+ * @file cudapolisher.cpp
+ *
+ * @brief CUDA Polisher class source file
+ */
+
+#include <future>
+#include <iostream>
+#include <chrono>
+#include <cuda_profiler_api.h>
+
+#include "sequence.hpp"
+#include "cudapolisher.hpp"
+#include <cudautils/cudautils.hpp>
+
+#include "bioparser/bioparser.hpp"
+#include "logger/logger.hpp"
+
+namespace racon {
+
+// The logger used by racon has a fixed size of 20 bins
+// which is used for the progress bar updates. Hence all
+// updates need to be broken into 20 bins.
+const uint32_t RACON_LOGGER_BIN_SIZE = 20;
+
+CUDAPolisher::CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
+ std::unique_ptr<bioparser::Parser<Overlap>> oparser,
+ std::unique_ptr<bioparser::Parser<Sequence>> tparser,
+ PolisherType type, uint32_t window_length, double quality_threshold,
+ double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+ uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+ uint32_t cudaaligner_batches)
+ : Polisher(std::move(sparser), std::move(oparser), std::move(tparser),
+ type, window_length, quality_threshold,
+ error_threshold, match, mismatch, gap, num_threads)
+ , cudapoa_batches_(cudapoa_batches)
+ , cudaaligner_batches_(cudaaligner_batches)
+ , gap_(gap)
+ , mismatch_(mismatch)
+ , match_(match)
+ , cuda_banded_alignment_(cuda_banded_alignment)
+{
+ claragenomics::cudapoa::Init();
+ claragenomics::cudaaligner::Init();
+
+ CGA_CU_CHECK_ERR(cudaGetDeviceCount(&num_devices_));
+
+ if (num_devices_ < 1)
+ {
+ throw std::runtime_error("No GPU devices found.");
+ }
+
+ std::cerr << "Using " << num_devices_ << " GPU(s) to perform polishing" << std::endl;
+
+ // Run dummy call on each device to initialize CUDA context.
+ for(int32_t dev_id = 0; dev_id < num_devices_; dev_id++)
+ {
+ std::cerr << "Initialize device " << dev_id << std::endl;
+ CGA_CU_CHECK_ERR(cudaSetDevice(dev_id));
+ CGA_CU_CHECK_ERR(cudaFree(0));
+ }
+
+ std::cerr << "[CUDAPolisher] Constructed." << std::endl;
+}
+
+CUDAPolisher::~CUDAPolisher()
+{
+ cudaDeviceSynchronize();
+ cudaProfilerStop();
+}
+
+std::vector<uint32_t> CUDAPolisher::calculate_batches_per_gpu(uint32_t batches, uint32_t gpus)
+{
+ // Bin batches into each GPU.
+ std::vector<uint32_t> batches_per_gpu(gpus, batches / gpus);
+
+ for(uint32_t i = 0; i < batches % gpus; ++i)
+ {
+ ++batches_per_gpu[i];
+ }
+
+ return batches_per_gpu;
+}
+
+void CUDAPolisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
+{
+ if (cudaaligner_batches_ < 1)
+ {
+            // TODO: Kept CPU overlap alignment right now while GPU is a dummy implementation.
+ Polisher::find_overlap_breaking_points(overlaps);
+ }
+ else
+ {
+ // TODO: Experimentally this is giving decent perf
+ const uint32_t MAX_ALIGNMENTS = 50;
+
+ logger_->log();
+ std::mutex mutex_overlaps;
+ uint32_t next_overlap_index = 0;
+
+ // Lambda expression for filling up next batch of alignments.
+ auto fill_next_batch = [&mutex_overlaps, &next_overlap_index, &overlaps, this](CUDABatchAligner* batch) -> std::pair<uint32_t, uint32_t> {
+ batch->reset();
+
+ // Use mutex to read the vector containing windows in a threadsafe manner.
+ std::lock_guard<std::mutex> guard(mutex_overlaps);
+
+ uint32_t initial_count = next_overlap_index;
+ uint32_t count = overlaps.size();
+ while(next_overlap_index < count)
+ {
+ if (batch->addOverlap(overlaps.at(next_overlap_index).get(), sequences_))
+ {
+ next_overlap_index++;
+ }
+ else
+ {
+ break;
+ }
+ }
+ return {initial_count, next_overlap_index};
+ };
+
+ // Variables for keeping track of logger progress bar.
+ uint32_t logger_step = overlaps.size() / RACON_LOGGER_BIN_SIZE;
+ int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
+ uint32_t window_idx = 0;
+ std::mutex mutex_log_bar_idx;
+
+ // Lambda expression for processing a batch of alignments.
+ auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &log_bar_idx_prev, &window_idx, &mutex_log_bar_idx, this](CUDABatchAligner* batch) -> void {
+ while(true)
+ {
+ auto range = fill_next_batch(batch);
+ if (batch->hasOverlaps())
+ {
+ // Launch workload.
+ batch->alignAll();
+ batch->find_breaking_points(window_length_);
+
+ // logging bar
+ {
+ std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
+ window_idx += range.second - range.first;
+ log_bar_idx = window_idx / logger_step;
+ if (log_bar_idx == log_bar_idx_prev) {
+ continue;
+ }
+ else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
+ {
+ logger_->bar("[racon::CUDAPolisher::initialize] aligning overlaps");
+ std::cerr<<std::endl;
+ log_bar_idx_prev = log_bar_idx;
+ }
+ }
+ }
+ else
+ {
+ break;
+ }
+ }
+ };
+
+ // Bin batches into each GPU.
+ std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudaaligner_batches_, num_devices_);
+
+ for(int32_t device = 0; device < num_devices_; device++)
+ {
+ for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
+ {
+ batch_aligners_.emplace_back(createCUDABatchAligner(10000, 10000, MAX_ALIGNMENTS, device));
+ }
+ }
+
+ // Run batched alignment.
+ std::vector<std::future<void>> thread_futures;
+ for(auto& aligner : batch_aligners_)
+ {
+ thread_futures.emplace_back(
+ thread_pool_->submit(
+ process_batch,
+ aligner.get()
+ )
+ );
+ }
+
+ // Wait for threads to finish, and collect their results.
+ for (const auto& future : thread_futures) {
+ future.wait();
+ }
+
+ batch_aligners_.clear();
+ }
+}
+
+void CUDAPolisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
+ bool drop_unpolished_sequences)
+{
+ if (cudapoa_batches_ < 1)
+ {
+ Polisher::polish(dst, drop_unpolished_sequences);
+ }
+ else
+ {
+ // Creation and use of batches.
+ const uint32_t MAX_WINDOWS = 256;
+ const uint32_t MAX_DEPTH_PER_WINDOW = 200;
+
+ // Bin batches into each GPU.
+ std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudapoa_batches_, num_devices_);
+
+ for(int32_t device = 0; device < num_devices_; device++)
+ {
+ for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
+ {
+ batch_processors_.emplace_back(createCUDABatch(MAX_WINDOWS, MAX_DEPTH_PER_WINDOW, device, gap_, mismatch_, match_, cuda_banded_alignment_));
+ }
+ }
+
+ logger_->log("[racon::CUDAPolisher::polish] allocated memory on GPUs");
+
+ // Mutex for accessing the vector of windows.
+ std::mutex mutex_windows;
+
+ // Initialize window consensus statuses.
+ window_consensus_status_.resize(windows_.size(), false);
+
+ // Index of next window to be added to a batch.
+ uint32_t next_window_index = 0;
+
+ // Lambda function for adding windows to batches.
+ auto fill_next_batch = [&mutex_windows, &next_window_index, this](CUDABatchProcessor* batch) -> std::pair<uint32_t, uint32_t> {
+ batch->reset();
+
+ // Use mutex to read the vector containing windows in a threadsafe manner.
+ std::lock_guard<std::mutex> guard(mutex_windows);
+
+            // TODO: Reducing window size by 10 for debugging.
+ uint32_t initial_count = next_window_index;
+ uint32_t count = windows_.size();
+ while(next_window_index < count)
+ {
+ if (batch->addWindow(windows_.at(next_window_index)))
+ {
+ next_window_index++;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ return {initial_count, next_window_index};
+ };
+
+ // Variables for keeping track of logger progress bar.
+ uint32_t logger_step = windows_.size() / RACON_LOGGER_BIN_SIZE;
+ int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
+ uint32_t window_idx = 0;
+ std::mutex mutex_log_bar_idx;
+ logger_->log();
+
+ // Lambda function for processing each batch.
+ auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &mutex_log_bar_idx, &window_idx, &log_bar_idx_prev, this](CUDABatchProcessor* batch) -> void {
+ while(true)
+ {
+ std::pair<uint32_t, uint32_t> range = fill_next_batch(batch);
+ if (batch->hasWindows())
+ {
+ // Launch workload.
+ const std::vector<bool>& results = batch->generateConsensus();
+
+                    // Check if the number of batches processed is the same as the
+                    // range of windows that were added.
+ if (results.size() != (range.second - range.first))
+ {
+ throw std::runtime_error("Windows processed doesn't match \
+ range of windows passed to batch\n");
+ }
+
+ // Copy over the results from the batch into the per window
+ // result vector of the CUDAPolisher.
+ for(uint32_t i = 0; i < results.size(); i++)
+ {
+ window_consensus_status_.at(range.first + i) = results.at(i);
+ }
+
+ // logging bar
+ {
+ std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
+ window_idx += results.size();
+ log_bar_idx = window_idx / logger_step;
+ if (log_bar_idx == log_bar_idx_prev) {
+ continue;
+ }
+ else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
+ {
+ logger_->bar("[racon::CUDAPolisher::polish] generating consensus");
+ std::cerr<<std::endl;
+ log_bar_idx_prev = log_bar_idx;
+ }
+ }
+ }
+ else
+ {
+ break;
+ }
+ }
+ };
+
+ // Process each of the batches in a separate thread.
+ std::vector<std::future<void>> thread_futures;
+ for(auto& batch_processor : batch_processors_)
+ {
+ thread_futures.emplace_back(
+ thread_pool_->submit(
+ process_batch,
+ batch_processor.get()
+ )
+ );
+ }
+
+ // Wait for threads to finish, and collect their results.
+ for (const auto& future : thread_futures) {
+ future.wait();
+ }
+
+ logger_->log("[racon::CUDAPolisher::polish] polished windows on GPU");
+
+ // Start timing CPU time for failed windows on GPU
+ logger_->log();
+ // Process each failed windows in parallel on CPU
+ std::vector<std::future<bool>> thread_failed_windows;
+ for (uint64_t i = 0; i < windows_.size(); ++i) {
+ if (window_consensus_status_.at(i) == false)
+ {
+ thread_failed_windows.emplace_back(thread_pool_->submit(
+ [&](uint64_t j) -> bool {
+ auto it = thread_to_id_.find(std::this_thread::get_id());
+ if (it == thread_to_id_.end()) {
+ fprintf(stderr, "[racon::CUDAPolisher::polish] error: "
+ "thread identifier not present!\n");
+ exit(1);
+ }
+ return window_consensus_status_.at(j) = windows_[j]->generate_consensus(
+ alignment_engines_[it->second]);
+ }, i));
+ }
+ }
+
+ // Wait for threads to finish, and collect their results.
+ for (const auto& t : thread_failed_windows) {
+ t.wait();
+ }
+ if (thread_failed_windows.size() > 0)
+ {
+ logger_->log("[racon::CUDAPolisher::polish] polished remaining windows on CPU");
+ logger_->log();
+ }
+
+ // Collect results from all windows into final output.
+ std::string polished_data = "";
+ uint32_t num_polished_windows = 0;
+
+ for (uint64_t i = 0; i < windows_.size(); ++i) {
+
+ num_polished_windows += window_consensus_status_.at(i) == true ? 1 : 0;
+ polished_data += windows_[i]->consensus();
+
+ if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) {
+ double polished_ratio = num_polished_windows /
+ static_cast<double>(windows_[i]->rank() + 1);
+
+ if (!drop_unpolished_sequences || polished_ratio > 0) {
+ std::string tags = type_ == PolisherType::kF ? "r" : "";
+ tags += " LN:i:" + std::to_string(polished_data.size());
+ tags += " RC:i:" + std::to_string(targets_coverages_[windows_[i]->id()]);
+ tags += " XC:f:" + std::to_string(polished_ratio);
+ dst.emplace_back(createSequence(sequences_[windows_[i]->id()]->name() +
+ tags, polished_data));
+ }
+
+ num_polished_windows = 0;
+ polished_data.clear();
+ }
+ windows_[i].reset();
+ }
+
+ logger_->log("[racon::CUDAPolisher::polish] generated consensus");
+
+ // Clear POA processors.
+ batch_processors_.clear();
+ }
+}
+
+}
=====================================
src/cuda/cudapolisher.hpp
=====================================
@@ -0,0 +1,74 @@
+/*!
+ * @file cudapolisher.hpp
+ *
+ * @brief CUDA Polisher class header file
+ */
+
+#pragma once
+
+#include <mutex>
+
+#include "polisher.hpp"
+#include "cudabatch.hpp"
+#include "cudaaligner.hpp"
+#include "thread_pool/thread_pool.hpp"
+
+
+namespace racon {
+
+class CUDAPolisher : public Polisher {
+public:
+ ~CUDAPolisher();
+
+ virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
+ bool drop_unpolished_sequences) override;
+
+ friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
+ const std::string& overlaps_path, const std::string& target_path,
+ PolisherType type, uint32_t window_length, double quality_threshold,
+ double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+ uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+ uint32_t cudaaligner_batches);
+
+protected:
+ CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
+ std::unique_ptr<bioparser::Parser<Overlap>> oparser,
+ std::unique_ptr<bioparser::Parser<Sequence>> tparser,
+ PolisherType type, uint32_t window_length, double quality_threshold,
+ double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+ uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+ uint32_t cudaaligner_batches);
+ CUDAPolisher(const CUDAPolisher&) = delete;
+ const CUDAPolisher& operator=(const CUDAPolisher&) = delete;
+ virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps) override;
+
+ static std::vector<uint32_t> calculate_batches_per_gpu(uint32_t cudapoa_batches, uint32_t gpus);
+
+ // Vector of POA batches.
+ std::vector<std::unique_ptr<CUDABatchProcessor>> batch_processors_;
+
+ // Vector of aligner batches.
+ std::vector<std::unique_ptr<CUDABatchAligner>> batch_aligners_;
+
+ // Vector of bool indicating consensus generation status for each window.
+ std::vector<bool> window_consensus_status_;
+
+ // Number of batches for POA processing.
+ uint32_t cudapoa_batches_;
+
+    // Number of batches for alignment processing.
+ uint32_t cudaaligner_batches_;
+
+ // Number of GPU devices to run with.
+ int32_t num_devices_;
+
+ // State transition scores.
+ int8_t gap_;
+ int8_t mismatch_;
+ int8_t match_;
+
+ // Use banded POA alignment
+ bool cuda_banded_alignment_;
+};
+
+}
=====================================
src/cuda/cudautils.hpp
=====================================
@@ -0,0 +1,20 @@
+// Implementation file for CUDA POA utilities.
+
+#pragma once
+
+#include <stdlib.h>
+#include <cuda_runtime_api.h>
+
+namespace racon {
+
+void cudaCheckError(std::string &msg)
+{
+ cudaError_t error = cudaGetLastError();
+ if (error != cudaSuccess)
+ {
+ fprintf(stderr, "%s (CUDA error %s)\n", msg.c_str(), cudaGetErrorString(error));
+ exit(-1);
+ }
+}
+
+} // namespace racon
=====================================
src/main.cpp
=====================================
@@ -8,8 +8,12 @@
#include "sequence.hpp"
#include "polisher.hpp"
+#ifdef CUDA_ENABLED
+#include "cuda/cudapolisher.hpp"
+#endif
-static const char* version = "v1.3.2";
+static const char* version = "v1.4.3";
+static const int32_t CUDAALIGNER_INPUT_CODE = 10000;
static struct option options[] = {
{"include-unpolished", no_argument, 0, 'u'},
@@ -23,6 +27,11 @@ static struct option options[] = {
{"threads", required_argument, 0, 't'},
{"version", no_argument, 0, 'v'},
{"help", no_argument, 0, 'h'},
+#ifdef CUDA_ENABLED
+ {"cudapoa-batches", optional_argument, 0, 'c'},
+ {"cuda-banded-alignment", no_argument, 0, 'b'},
+ {"cudaaligner-batches", required_argument, 0, CUDAALIGNER_INPUT_CODE},
+#endif
{0, 0, 0, 0}
};
@@ -44,8 +53,17 @@ int main(int argc, char** argv) {
bool drop_unpolished_sequences = true;
uint32_t num_threads = 1;
- char argument;
- while ((argument = getopt_long(argc, argv, "ufw:q:e:m:x:g:t:h", options, nullptr)) != -1) {
+ uint32_t cudapoa_batches = 0;
+ uint32_t cudaaligner_batches = 0;
+ bool cuda_banded_alignment = false;
+
+ std::string optstring = "ufw:q:e:m:x:g:t:h";
+#ifdef CUDA_ENABLED
+ optstring += "bc::";
+#endif
+
+ int32_t argument;
+ while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1) {
switch (argument) {
case 'u':
drop_unpolished_sequences = false;
@@ -80,6 +98,27 @@ int main(int argc, char** argv) {
case 'h':
help();
exit(0);
+#ifdef CUDA_ENABLED
+ case 'c':
+ //if option c encountered, cudapoa_batches initialized with a default value of 1.
+ cudapoa_batches = 1;
+ // next text entry is not an option, assuming it's the arg for option 'c'
+ if (optarg == NULL && argv[optind] != NULL
+ && argv[optind][0] != '-') {
+ cudapoa_batches = atoi(argv[optind++]);
+ }
+            // optional argument provided in the usual way
+ if (optarg != NULL) {
+ cudapoa_batches = atoi(optarg);
+ }
+ break;
+ case 'b':
+ cuda_banded_alignment = true;
+ break;
+ case CUDAALIGNER_INPUT_CODE: // cudaaligner-batches
+ cudaaligner_batches = atoi(optarg);
+ break;
+#endif
default:
exit(1);
}
@@ -98,7 +137,8 @@ int main(int argc, char** argv) {
auto polisher = racon::createPolisher(input_paths[0], input_paths[1],
input_paths[2], type == 0 ? racon::PolisherType::kC :
racon::PolisherType::kF, window_length, quality_threshold,
- error_threshold, match, mismatch, gap, num_threads);
+ error_threshold, match, mismatch, gap, num_threads,
+ cudapoa_batches, cuda_banded_alignment, cudaaligner_batches);
polisher->initialize();
@@ -156,5 +196,16 @@ void help() {
" --version\n"
" prints the version number\n"
" -h, --help\n"
- " prints the usage\n");
+ " prints the usage\n"
+#ifdef CUDA_ENABLED
+ " -c, --cudapoa-batches\n"
+ " default: 1\n"
+ " number of batches for CUDA accelerated polishing\n"
+ " -b, --cuda-banded-alignment\n"
+ " use banding approximation for alignment on GPU\n"
+ " --cudaaligner-batches (experimental)\n"
+ " Number of batches for CUDA accelerated alignment\n"
+
+#endif
+ );
}
=====================================
src/overlap.cpp
=====================================
@@ -190,29 +190,41 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
}
if (cigar_.empty()) {
- // align overlaps with edlib
const char* q = !strand_ ? &(sequences[q_id_]->data()[q_begin_]) :
&(sequences[q_id_]->reverse_complement()[q_length_ - q_end_]);
const char* t = &(sequences[t_id_]->data()[t_begin_]);
- EdlibAlignResult result = edlibAlign(q, q_end_ - q_begin_, t, t_end_ -
- t_begin_, edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
- nullptr, 0));
+ align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_);
+ }
+
+ find_breaking_points_from_cigar(window_length);
+
+ std::string().swap(cigar_);
+}
+
+void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, uint32_t t_length)
+{
+ // align overlaps with edlib
+ EdlibAlignResult result = edlibAlign(q, q_length, t, t_length,
+ edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
+ nullptr, 0));
- if (result.status == EDLIB_STATUS_OK) {
- char* cigar = edlibAlignmentToCigar(result.alignment,
+ if (result.status == EDLIB_STATUS_OK) {
+ char* cigar = edlibAlignmentToCigar(result.alignment,
result.alignmentLength, EDLIB_CIGAR_STANDARD);
- cigar_ = cigar;
- free(cigar);
- } else {
- fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
+ cigar_ = cigar;
+ free(cigar);
+ } else {
+ fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
"edlib unable to align pair (%zu x %zu)!\n", q_id_, t_id_);
- exit(1);
- }
-
- edlibFreeAlignResult(result);
+ exit(1);
}
+ edlibFreeAlignResult(result);
+}
+
+void Overlap::find_breaking_points_from_cigar(uint32_t window_length)
+{
// find breaking points from cigar
std::vector<int32_t> window_ends;
for (uint32_t i = 0; i < t_end_; i += window_length) {
@@ -277,8 +289,6 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
j = i + 1;
}
}
-
- std::string().swap(cigar_);
}
}
=====================================
src/overlap.hpp
=====================================
@@ -71,6 +71,10 @@ public:
friend bioparser::MhapParser<Overlap>;
friend bioparser::PafParser<Overlap>;
friend bioparser::SamParser<Overlap>;
+
+#ifdef CUDA_ENABLED
+ friend class CUDABatchAligner;
+#endif
private:
Overlap(uint64_t a_id, uint64_t b_id, double accuracy, uint32_t minmers,
uint32_t a_rc, uint32_t a_begin, uint32_t a_end, uint32_t a_length,
@@ -89,6 +93,8 @@ private:
Overlap();
Overlap(const Overlap&) = delete;
const Overlap& operator=(const Overlap&) = delete;
+ virtual void find_breaking_points_from_cigar(uint32_t window_length);
+ virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len);
std::string q_name_;
uint64_t q_id_;
=====================================
src/polisher.cpp
=====================================
@@ -6,15 +6,20 @@
#include <algorithm>
#include <unordered_set>
+#include <iostream>
#include "overlap.hpp"
#include "sequence.hpp"
#include "window.hpp"
#include "polisher.hpp"
+#ifdef CUDA_ENABLED
+#include "cuda/cudapolisher.hpp"
+#endif
#include "bioparser/bioparser.hpp"
#include "thread_pool/thread_pool.hpp"
#include "spoa/spoa.hpp"
+#include "logger/logger.hpp"
namespace racon {
@@ -51,7 +56,8 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
- uint32_t num_threads) {
+ uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+ uint32_t cudaaligner_batches) {
if (type != PolisherType::kC && type != PolisherType::kF) {
fprintf(stderr, "[racon::createPolisher] error: invalid polisher type!\n");
@@ -122,10 +128,30 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
exit(1);
}
- return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
- std::move(oparser), std::move(tparser), type, window_length,
- quality_threshold, error_threshold, match, mismatch, gap,
- num_threads));
+ if (cudapoa_batches > 0 || cudaaligner_batches > 0)
+ {
+#ifdef CUDA_ENABLED
+ // If CUDA is enabled, return an instance of the CUDAPolisher object.
+ return std::unique_ptr<Polisher>(new CUDAPolisher(std::move(sparser),
+ std::move(oparser), std::move(tparser), type, window_length,
+ quality_threshold, error_threshold, match, mismatch, gap,
+ num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches));
+#else
+ fprintf(stderr, "[racon::createPolisher] error: "
+            "Attempting to use CUDA when CUDA support is not available.\n"
+ "Please check logic in %s:%s\n",
+ __FILE__, __func__);
+ exit(1);
+#endif
+ }
+ else
+ {
+ (void) cuda_banded_alignment;
+ return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
+ std::move(oparser), std::move(tparser), type, window_length,
+ quality_threshold, error_threshold, match, mismatch, gap,
+ num_threads));
+ }
}
Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
@@ -140,7 +166,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
alignment_engines_(), sequences_(), dummy_quality_(window_length, '!'),
window_length_(window_length), windows_(),
thread_pool_(thread_pool::createThreadPool(num_threads)),
- thread_to_id_() {
+ thread_to_id_(), logger_(new logger::Logger()) {
uint32_t id = 0;
for (const auto& it: thread_pool_->thread_identifiers()) {
@@ -155,6 +181,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
}
Polisher::~Polisher() {
+ logger_->total("[racon::Polisher::] total =");
}
void Polisher::initialize() {
@@ -165,8 +192,10 @@ void Polisher::initialize() {
return;
}
+ logger_->log();
+
tparser_->reset();
- tparser_->parse_objects(sequences_, -1);
+ tparser_->parse(sequences_, -1);
uint64_t targets_size = sequences_.size();
if (targets_size == 0) {
@@ -186,14 +215,15 @@ void Polisher::initialize() {
std::vector<bool> has_data(targets_size, true);
std::vector<bool> has_reverse_data(targets_size, false);
- fprintf(stderr, "[racon::Polisher::initialize] loaded target sequences\n");
+ logger_->log("[racon::Polisher::initialize] loaded target sequences");
+ logger_->log();
uint64_t sequences_size = 0, total_sequences_length = 0;
sparser_->reset();
while (true) {
uint64_t l = sequences_.size();
- auto status = sparser_->parse_objects(sequences_, kChunkSize);
+ auto status = sparser_->parse(sequences_, kChunkSize);
uint64_t n = 0;
for (uint64_t i = l; i < sequences_.size(); ++i, ++sequences_size) {
@@ -241,7 +271,8 @@ void Polisher::initialize() {
WindowType window_type = static_cast<double>(total_sequences_length) /
sequences_size <= 1000 ? WindowType::kNGS : WindowType::kTGS;
- fprintf(stderr, "[racon::Polisher::initialize] loaded sequences\n");
+ logger_->log("[racon::Polisher::initialize] loaded sequences");
+ logger_->log();
std::vector<std::unique_ptr<Overlap>> overlaps;
@@ -274,7 +305,7 @@ void Polisher::initialize() {
oparser_->reset();
uint64_t l = 0;
while (true) {
- auto status = oparser_->parse_objects(overlaps, kChunkSize);
+ auto status = oparser_->parse(overlaps, kChunkSize);
uint64_t c = l;
for (uint64_t i = l; i < overlaps.size(); ++i) {
@@ -326,11 +357,13 @@ void Polisher::initialize() {
"empty overlap set!\n");
exit(1);
}
- fprintf(stderr, "[racon::Polisher::initialize] loaded overlaps\n");
+
+ logger_->log("[racon::Polisher::initialize] loaded overlaps");
+ logger_->log();
std::vector<std::future<void>> thread_futures;
for (uint64_t i = 0; i < sequences_.size(); ++i) {
- thread_futures.emplace_back(thread_pool_->submit_task(
+ thread_futures.emplace_back(thread_pool_->submit(
[&](uint64_t j) -> void {
sequences_[j]->transmute(has_name[j], has_data[j], has_reverse_data[j]);
}, i));
@@ -339,19 +372,9 @@ void Polisher::initialize() {
it.wait();
}
- thread_futures.clear();
- for (uint64_t i = 0; i < overlaps.size(); ++i) {
- thread_futures.emplace_back(thread_pool_->submit_task(
- [&](uint64_t j) -> void {
- overlaps[j]->find_breaking_points(sequences_, window_length_);
- }, i));
- }
- for (uint64_t i = 0; i < thread_futures.size(); ++i) {
- thread_futures[i].wait();
- fprintf(stderr, "[racon::Polisher::initialize] aligned overlap %zu/%zu\r",
- i + 1, overlaps.size());
- }
- fprintf(stderr, "\n");
+ find_overlap_breaking_points(overlaps);
+
+ logger_->log();
std::vector<uint64_t> id_to_first_window_id(targets_size + 1, 0);
for (uint64_t i = 0; i < targets_size; ++i) {
@@ -428,15 +451,41 @@ void Polisher::initialize() {
overlaps[i].reset();
}
- fprintf(stderr, "[racon::Polisher::initialize] transformed data into windows\n");
+ logger_->log("[racon::Polisher::initialize] transformed data into windows");
+}
+
+void Polisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
+{
+ std::vector<std::future<void>> thread_futures;
+ for (uint64_t i = 0; i < overlaps.size(); ++i) {
+ thread_futures.emplace_back(thread_pool_->submit(
+ [&](uint64_t j) -> void {
+ overlaps[j]->find_breaking_points(sequences_, window_length_);
+ }, i));
+ }
+
+ uint32_t logger_step = thread_futures.size() / 20;
+ for (uint64_t i = 0; i < thread_futures.size(); ++i) {
+ thread_futures[i].wait();
+ if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) {
+ logger_->bar("[racon::Polisher::initialize] aligning overlaps");
+ }
+ }
+ if (logger_step != 0) {
+ logger_->bar("[racon::Polisher::initialize] aligning overlaps");
+ } else {
+ logger_->log("[racon::Polisher::initialize] aligned overlaps");
+ }
}
void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
bool drop_unpolished_sequences) {
+ logger_->log();
+
std::vector<std::future<bool>> thread_futures;
for (uint64_t i = 0; i < windows_.size(); ++i) {
- thread_futures.emplace_back(thread_pool_->submit_task(
+ thread_futures.emplace_back(thread_pool_->submit(
[&](uint64_t j) -> bool {
auto it = thread_to_id_.find(std::this_thread::get_id());
if (it == thread_to_id_.end()) {
@@ -452,6 +501,8 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
std::string polished_data = "";
uint32_t num_polished_windows = 0;
+ uint64_t logger_step = thread_futures.size() / 20;
+
for (uint64_t i = 0; i < thread_futures.size(); ++i) {
thread_futures[i].wait();
@@ -476,12 +527,18 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
}
windows_[i].reset();
- fprintf(stderr, "[racon::Polisher::polish] generated consensus for window %zu/%zu\r",
- i + 1, thread_futures.size());
+ if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) {
+ logger_->bar("[racon::Polisher::polish] generating consensus");
+ }
+ }
+
+ if (logger_step != 0) {
+ logger_->bar("[racon::Polisher::polish] generating consensus");
+ } else {
+ logger_->log("[racon::Polisher::polish] generated consensus");
}
- fprintf(stderr, "\n");
- std::vector<std::unique_ptr<Window>>().swap(windows_);
+ std::vector<std::shared_ptr<Window>>().swap(windows_);
std::vector<std::unique_ptr<Sequence>>().swap(sequences_);
}
=====================================
src/polisher.hpp
=====================================
@@ -25,6 +25,10 @@ namespace spoa {
class AlignmentEngine;
}
+namespace logger {
+ class Logger;
+}
+
namespace racon {
class Sequence;
@@ -41,23 +45,26 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
- uint32_t num_threads);
+ uint32_t num_threads, uint32_t cuda_batches = 0,
+ bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0);
class Polisher {
public:
- ~Polisher();
+ virtual ~Polisher();
- void initialize();
+ virtual void initialize();
- void polish(std::vector<std::unique_ptr<Sequence>>& dst,
+ virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
bool drop_unpolished_sequences);
friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
- uint32_t num_threads);
-private:
+ uint32_t num_threads, uint32_t cuda_batches, bool cuda_banded_alignment,
+ uint32_t cudaaligner_batches);
+
+protected:
Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
std::unique_ptr<bioparser::Parser<Overlap>> oparser,
std::unique_ptr<bioparser::Parser<Sequence>> tparser,
@@ -66,6 +73,7 @@ private:
uint32_t num_threads);
Polisher(const Polisher&) = delete;
const Polisher& operator=(const Polisher&) = delete;
+ virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps);
std::unique_ptr<bioparser::Parser<Sequence>> sparser_;
std::unique_ptr<bioparser::Parser<Overlap>> oparser_;
@@ -81,10 +89,12 @@ private:
std::string dummy_quality_;
uint32_t window_length_;
- std::vector<std::unique_ptr<Window>> windows_;
+ std::vector<std::shared_ptr<Window>> windows_;
std::unique_ptr<thread_pool::ThreadPool> thread_pool_;
std::unordered_map<std::thread::id, uint32_t> thread_to_id_;
+
+ std::unique_ptr<logger::Logger> logger_;
};
}
=====================================
src/window.cpp
=====================================
@@ -12,7 +12,7 @@
namespace racon {
-std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
+std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
const char* backbone, uint32_t backbone_length, const char* quality,
uint32_t quality_length) {
@@ -22,7 +22,7 @@ std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type
exit(1);
}
- return std::unique_ptr<Window>(new Window(id, rank, type, backbone,
+ return std::shared_ptr<Window>(new Window(id, rank, type, backbone,
backbone_length, quality, quality_length));
}
=====================================
src/window.hpp
=====================================
@@ -24,11 +24,12 @@ enum class WindowType {
};
class Window;
-std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
+std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
const char* backbone, uint32_t backbone_length, const char* quality,
uint32_t quality_length);
class Window {
+
public:
~Window();
@@ -49,9 +50,13 @@ public:
const char* quality, uint32_t quality_length, uint32_t begin,
uint32_t end);
- friend std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank,
+ friend std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank,
WindowType type, const char* backbone, uint32_t backbone_length,
const char* quality, uint32_t quality_length);
+
+#ifdef CUDA_ENABLED
+ friend class CUDABatchProcessor;
+#endif
private:
Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone,
uint32_t backbone_length, const char* quality, uint32_t quality_length);
=====================================
test/racon_test.cpp
=====================================
@@ -29,11 +29,12 @@ public:
void SetUp(const std::string& sequences_path, const std::string& overlaps_path,
const std::string& target_path, racon::PolisherType type,
uint32_t window_length, double quality_threshold, double error_threshold,
- int8_t match,int8_t mismatch, int8_t gap) {
+ int8_t match,int8_t mismatch, int8_t gap, uint32_t cuda_batches = 0,
+ bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0) {
polisher = racon::createPolisher(sequences_path, overlaps_path, target_path,
type, window_length, quality_threshold, error_threshold, match,
- mismatch, gap, 4);
+ mismatch, gap, 4, cuda_batches, cuda_banded_alignment, cudaaligner_batches);
}
void TearDown() {}
@@ -99,7 +100,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualities) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -121,7 +122,7 @@ TEST_F(RaconPolishingTest, ConsensusWithoutQualities) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -143,7 +144,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesAndAlignments) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -165,7 +166,7 @@ TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesAndWithAlignments) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -187,7 +188,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesLargerWindow) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -209,7 +210,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesEditDistance) {
auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
racon_test_data_path + "sample_reference.fasta.gz");
- parser->parse_objects(polished_sequences, -1);
+ parser->parse(polished_sequences, -1);
EXPECT_EQ(polished_sequences.size(), 2);
EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -287,3 +288,209 @@ TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullMhap) {
}
EXPECT_EQ(total_length, 1658216);
}
+
+#ifdef CUDA_ENABLED
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 1385); // CPU 1312
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+ "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 1607); // CPU 1566
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesAndAlignmentsCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_overlaps.sam.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 1541); // CPU 1317
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesAndWithAlignmentsCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+ "sample_overlaps.sam.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 1661); // CPU 1770
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesLargerWindowCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 1000, 10, 0.3, 5, -4, -8, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 4168); // CPU 1289
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesEditDistanceCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 1, -1, -1, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 1);
+
+ polished_sequences[0]->create_reverse_complement();
+
+ auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+ racon_test_data_path + "sample_reference.fasta.gz");
+ parser->parse(polished_sequences, -1);
+ EXPECT_EQ(polished_sequences.size(), 2);
+
+ EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+ polished_sequences[1]->data()), 1361); // CPU 1321
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fastq.gz",
+ racon::PolisherType::kC, 500, 10, 0.3, 1, -1, -1, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, true);
+ EXPECT_EQ(polished_sequences.size(), 39);
+
+ uint32_t total_length = 0;
+ for (const auto& it: polished_sequences) {
+ total_length += it->data().size();
+ }
+ EXPECT_EQ(total_length, 385543); // CPU 389394
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fastq.gz",
+ racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, false);
+ EXPECT_EQ(polished_sequences.size(), 236);
+
+ uint32_t total_length = 0;
+ for (const auto& it: polished_sequences) {
+ total_length += it->data().size();
+ }
+ EXPECT_EQ(total_length, 1655505); // CPU 1658216
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithoutQualitiesFullCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+ "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fasta.gz",
+ racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, false);
+ EXPECT_EQ(polished_sequences.size(), 236);
+
+ uint32_t total_length = 0;
+ for (const auto& it: polished_sequences) {
+ total_length += it->data().size();
+ }
+ EXPECT_EQ(total_length, 1663732); // CPU 1663982
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullMhapCUDA) {
+ SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+ "sample_ava_overlaps.mhap.gz", racon_test_data_path + "sample_reads.fastq.gz",
+ racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+ initialize();
+
+ std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+ polish(polished_sequences, false);
+ EXPECT_EQ(polished_sequences.size(), 236);
+
+ uint32_t total_length = 0;
+ for (const auto& it: polished_sequences) {
+ total_length += it->data().size();
+ }
+ EXPECT_EQ(total_length, 1655505); // CPU 1658216
+}
+#endif
View it on GitLab: https://salsa.debian.org/med-team/racon/compare/bc5f1f7386b29a55ba2bfd0af0c8d193e7931db0...07bf95601d09fa77eb6329165e5b121a4e9aa011
--
View it on GitLab: https://salsa.debian.org/med-team/racon/compare/bc5f1f7386b29a55ba2bfd0af0c8d193e7931db0...07bf95601d09fa77eb6329165e5b121a4e9aa011
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190802/1b5416e8/attachment-0001.html>
More information about the debian-med-commit
mailing list