[med-svn] [Git][med-team/racon][master] 11 commits: New upstream version 1.4.0

Andreas Tille gitlab at salsa.debian.org
Fri Aug 2 21:48:57 BST 2019


Andreas Tille pushed to branch master at Debian Med / racon


Commits:
434b362c by Andreas Tille at 2019-07-15T14:33:40Z
New upstream version 1.4.0
- - - - -
2a657f05 by Andreas Tille at 2019-07-15T14:33:43Z
Update upstream source from tag 'upstream/1.4.0'

Update to upstream version '1.4.0'
with Debian dir e0bc0676adb3066a8cdc641912d03c0d7d71fa21
- - - - -
59a85060 by Andreas Tille at 2019-07-15T14:33:43Z
New upstream version

- - - - -
8ae7f5fb by Andreas Tille at 2019-07-15T14:33:43Z
debhelper 12

- - - - -
e6d34ac9 by Andreas Tille at 2019-07-15T14:34:05Z
Standards-Version: 4.4.0

- - - - -
ca47945c by Andreas Tille at 2019-07-15T14:40:04Z
Refresh patches

- - - - -
30dc4ab6 by Andreas Tille at 2019-08-02T20:44:02Z
New upstream version 1.4.3
- - - - -
fe9f40e3 by Andreas Tille at 2019-08-02T20:44:04Z
Update upstream source from tag 'upstream/1.4.3'

Update to upstream version '1.4.3'
with Debian dir a79445d56d01d687d134b4d322df967e2156d2e8
- - - - -
1d2b1a88 by Andreas Tille at 2019-08-02T20:44:04Z
New upstream version

- - - - -
12474747 by Andreas Tille at 2019-08-02T20:44:04Z
debhelper-compat 12

- - - - -
07bf9560 by Andreas Tille at 2019-08-02T20:47:53Z
Add TODO

- - - - -


23 changed files:

- .gitmodules
- .travis.yml
- CMakeLists.txt
- README.md
- debian/changelog
- − debian/compat
- debian/control
- debian/patches/use_debian_packaged_libs.patch
- + src/cuda/cudaaligner.cpp
- + src/cuda/cudaaligner.hpp
- + src/cuda/cudabatch.cpp
- + src/cuda/cudabatch.hpp
- + src/cuda/cudapolisher.cpp
- + src/cuda/cudapolisher.hpp
- + src/cuda/cudautils.hpp
- src/main.cpp
- src/overlap.cpp
- src/overlap.hpp
- src/polisher.cpp
- src/polisher.hpp
- src/window.cpp
- src/window.hpp
- test/racon_test.cpp


Changes:

=====================================
.gitmodules
=====================================
@@ -16,3 +16,9 @@
 [submodule "vendor/rampler"]
 	path = vendor/rampler
 	url = https://github.com/rvaser/rampler
+[submodule "vendor/logger"]
+	path = vendor/logger
+	url = https://github.com/rvaser/logger
+[submodule "vendor/ClaraGenomicsAnalysis"]
+	path = vendor/ClaraGenomicsAnalysis
+	url = https://github.com/clara-genomics/ClaraGenomicsAnalysis
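
Note: with the two new submodules above (vendor/logger and vendor/ClaraGenomicsAnalysis), a fresh
checkout of upstream racon needs its submodules initialised before building. A minimal sketch using
plain git (the clone URL is taken from the package's Homepage field):

```bash
# clone upstream racon together with all vendored submodules,
# including the newly added logger and ClaraGenomicsAnalysis trees
git clone --recursive https://github.com/isovic/racon.git
# or, inside an existing clone:
git submodule update --init --recursive
```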


=====================================
.travis.yml
=====================================
@@ -1,3 +1,5 @@
+dist: trusty
+
 language: cpp
 
 compiler:


=====================================
CMakeLists.txt
=====================================
@@ -8,17 +8,40 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 option(racon_build_tests "Build racon unit tests" OFF)
 option(racon_build_wrapper "Build racon wrapper" OFF)
+option(racon_enable_cuda "Build racon with NVIDIA CUDA support" OFF)
 
-add_executable(racon
+# Check CUDA compatibility.
+if(racon_enable_cuda)
+    find_package(CUDA 9.0 QUIET REQUIRED)
+    if(NOT ${CUDA_FOUND})
+        message(FATAL_ERROR "CUDA not detected on system. Please install")
+    else()
+        message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo")
+    endif()
+endif()
+
+include_directories(${PROJECT_SOURCE_DIR}/src)
+
+set(racon_sources
     src/main.cpp
     src/polisher.cpp
     src/overlap.cpp
     src/sequence.cpp
     src/window.cpp)
 
+if(racon_enable_cuda)
+    list(APPEND racon_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
+    cuda_add_executable(racon ${racon_sources})
+    target_compile_definitions(racon PRIVATE CUDA_ENABLED)
+else()
+    add_executable(racon ${racon_sources})
+endif()
+
 if (NOT TARGET bioparser)
     add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
 endif()
@@ -31,8 +54,42 @@ endif()
 if (NOT TARGET edlib)
     add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
 endif()
+if (NOT TARGET logger)
+    add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
+endif()
+if (racon_enable_cuda)
+    if (DEFINED CLARAGENOMICSANALYSIS_SDK_PATH)
+        list(APPEND CMAKE_PREFIX_PATH "${CLARAGENOMICSANALYSIS_SDK_PATH}/cmake")
+        find_package(cudapoa REQUIRED)
+        find_package(cudaaligner REQUIRED)
+    elseif (DEFINED CLARAGENOMICSANALYSIS_SRC_PATH)
+        if (NOT TARGET cudapoa)
+            add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+        if (NOT TARGET cudaaligner)
+            add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+    elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ClaraGenomicsAnalysis)
+        if (NOT TARGET cudapoa)
+            add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+        if (NOT TARGET cudaaligner)
+            add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+    else()
+        if (NOT TARGET cudapoa)
+            add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+        if (NOT TARGET cudaaligner)
+            add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
+        endif()
+    endif()
+endif()
 
-target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
+target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
+if (racon_enable_cuda)
+    target_link_libraries(racon cudapoa cudaaligner)
+endif()
 
 install(TARGETS racon DESTINATION bin)
 
@@ -43,18 +100,30 @@ if (racon_build_tests)
     include_directories(${PROJECT_BINARY_DIR}/config)
     include_directories(${PROJECT_SOURCE_DIR}/src)
 
-    add_executable(racon_test
+    set(racon_test_sources
         test/racon_test.cpp
         src/polisher.cpp
         src/overlap.cpp
         src/sequence.cpp
         src/window.cpp)
 
-    add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+    if (racon_enable_cuda)
+        list(APPEND racon_test_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
+        cuda_add_executable(racon_test ${racon_test_sources})
+        target_compile_definitions(racon_test PRIVATE CUDA_ENABLED)
+    else()
+        add_executable(racon_test ${racon_test_sources})
+    endif()
 
-    target_link_libraries(racon_test bioparser spoa thread_pool pthread
-        edlib_static gtest_main)
-endif(racon_build_tests)
+    if (NOT TARGET gtest_main)
+        add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+    endif()
+
+    target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
+    if (racon_enable_cuda)
+        target_link_libraries(racon_test cudapoa cudaaligner)
+    endif()
+endif()
 
 if (racon_build_wrapper)
     set(racon_path ${PROJECT_BINARY_DIR}/bin/racon)
@@ -66,5 +135,7 @@ if (racon_build_wrapper)
         FILE_PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE
         WORLD_READ WORLD_EXECUTE)
 
-    add_subdirectory(vendor/rampler)
-endif(racon_build_wrapper)
+    if (NOT TARGET rampler)
+        add_subdirectory(vendor/rampler)
+    endif()
+endif()
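
Note: the CUDA branch above looks for ClaraGenomicsAnalysis in this order: an installed SDK via
CLARAGENOMICSANALYSIS_SDK_PATH, a source tree via CLARAGENOMICSANALYSIS_SRC_PATH, the
vendor/ClaraGenomicsAnalysis submodule, and finally a sibling ../ClaraGenomicsAnalysis directory.
A minimal configuration sketch against a source tree (the path itself is illustrative):

```bash
# configure racon with CUDA support, pointing CMake at a ClaraGenomicsAnalysis
# source checkout via the cache variable introduced above (the path is hypothetical)
mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Release \
      -Dracon_enable_cuda=ON \
      -DCLARAGENOMICSANALYSIS_SRC_PATH=$HOME/src/ClaraGenomicsAnalysis ..
make
```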


=====================================
README.md
=====================================
@@ -11,7 +11,7 @@ Racon is intended as a standalone consensus module to correct raw contigs genera
 
 Racon can be used as a polishing tool after the assembly with **either Illumina data or data produced by third generation of sequencing**. The type of data inputed is automatically detected.
 
-Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip**.
+Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip** (which will have an impact on parsing time).
 
 Racon can also be used as a read error-correction tool. In this scenario, the MHAP/PAF/SAM file needs to contain pairwise overlaps between reads **including dual overlaps**.
 
@@ -21,6 +21,11 @@ A **wrapper script** is also available to enable easier usage to the end-user fo
 1. gcc 4.8+ or clang 3.4+
 2. cmake 3.2+
 
+### CUDA Support
+1. gcc 5.0+
+2. cmake 3.10+
+3. CUDA 10.0+
+
 ## Installation
 To install Racon run the following commands:
 
@@ -43,6 +48,20 @@ To build unit tests add `-Dracon_build_tests=ON` while running `cmake`. After in
 
 To build the wrapper script add `-Dracon_build_wrapper=ON` while running `cmake`. After installation, an executable named `racon_wrapper` (python script) will be created in `build/bin`.
 
+### CUDA Support
+Racon makes use of [NVIDIA's ClaraGenomicsAnalysis SDK](https://github.com/clara-genomics/ClaraGenomicsAnalysis) for CUDA accelerated polishing and alignment.
+
+To build `racon` with CUDA support, add `-Dracon_enable_cuda=ON` while running `cmake`. If CUDA support is unavailable, the `cmake` step will error out.
+Note that the CUDA support flag does not produce a new binary target. Instead, it augments the existing `racon` binary itself.
+
+```bash
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -Dracon_enable_cuda=ON ..
+make
+```
+
+***Note***: Short read polishing with CUDA is still in development!
+
 ## Usage
 Usage of `racon` is as following:
 
@@ -90,6 +109,15 @@ Usage of `racon` is as following:
         -h, --help
             prints the usage
 
+    only available when built with CUDA:
+        -c, --cudapoa-batches
+            default: 1
+            number of batches for CUDA accelerated polishing
+        -b, --cuda-banded-alignment
+            use banding approximation for polishing on GPU. Only applicable when -c is used.
+        --cudaaligner-batches (experimental)
+            Number of batches for CUDA accelerated alignment
+
 `racon_test` is run without any parameters.
 
 Usage of `racon_wrapper` equals the one of `racon` with two additional parameters:
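
Note: a hedged invocation sketch for the new CUDA options; file names are placeholders and the
positional arguments follow racon's usual order (reads, overlaps, target contigs):

```bash
# GPU polishing with 2 cudapoa batches, banded POA alignment, and
# 4 (experimental) cudaaligner batches; input/output files are placeholders
racon -t 8 -c 2 -b --cudaaligner-batches 4 \
      reads.fastq.gz overlaps.paf.gz contigs.fasta > polished.fasta
```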


=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+racon (1.4.3-1) UNRELEASED; urgency=medium
+
+  * New upstream version
+  * debhelper-compat 12
+  * Standards-Version: 4.4.0
+  TODO: https://salsa.debian.org/med-team/liblogger
+
+ -- Andreas Tille <tille at debian.org>  Fri, 02 Aug 2019 22:44:04 +0200
+
 racon (1.3.2-1) unstable; urgency=medium
 
   * Team upload.


=====================================
debian/compat deleted
=====================================
@@ -1 +0,0 @@
-11


=====================================
debian/control
=====================================
@@ -4,7 +4,7 @@ Uploaders: Cédric Lood <cedric.lood at kuleuven.be>,
            Andreas Tille <tille at debian.org>
 Section: science
 Priority: optional
-Build-Depends: debhelper (>= 11~),
+Build-Depends: debhelper-compat (= 12),
                cmake,
                libgtest-dev,
                libbioparser-dev,
@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11~),
                libspoa-dev,
                libthread-pool-dev,
                rampler
-Standards-Version: 4.3.0
+Standards-Version: 4.4.0
 Vcs-Browser: https://salsa.debian.org/med-team/racon
 Vcs-Git: https://salsa.debian.org/med-team/racon.git
 Homepage: https://github.com/isovic/racon


=====================================
debian/patches/use_debian_packaged_libs.patch
=====================================
@@ -4,9 +4,9 @@ Description: Use Debian packaged libraries
 
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -19,20 +19,7 @@ add_executable(racon
-     src/sequence.cpp
-     src/window.cpp)
+@@ -42,18 +42,6 @@ else()
+     add_executable(racon ${racon_sources})
+ endif()
  
 -if (NOT TARGET bioparser)
 -    add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
@@ -20,22 +20,28 @@ Description: Use Debian packaged libraries
 -if (NOT TARGET edlib)
 -    add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
 -endif()
--
--target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
-+target_link_libraries(racon spoa thread_pool pthread edlib z)
- 
- install(TARGETS racon DESTINATION bin)
+ if (NOT TARGET logger)
+     add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
+ endif()
+@@ -86,7 +74,7 @@ if (racon_enable_cuda)
+     endif()
+ endif()
  
-@@ -50,10 +37,8 @@ if (racon_build_tests)
-         src/sequence.cpp
-         src/window.cpp)
+-target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
++target_link_libraries(racon spoa thread_pool pthread edlib logger z)
+ if (racon_enable_cuda)
+     target_link_libraries(racon cudapoa cudaaligner)
+ endif()
+@@ -115,11 +103,7 @@ if (racon_build_tests)
+         add_executable(racon_test ${racon_test_sources})
+     endif()
  
--    add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+-    if (NOT TARGET gtest_main)
+-        add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
+-    endif()
 -
--    target_link_libraries(racon_test bioparser spoa thread_pool pthread
--        edlib_static gtest_main)
-+    target_link_libraries(racon_test spoa thread_pool pthread
-+        edlib gtest_main gtest z)
- endif(racon_build_tests)
- 
- if (racon_build_wrapper)
+-    target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
++    target_link_libraries(racon_test spoa thread_pool pthread edlib logger gtest_main gtest z)
+     if (racon_enable_cuda)
+         target_link_libraries(racon_test cudapoa cudaaligner)
+     endif()


=====================================
src/cuda/cudaaligner.cpp
=====================================
@@ -0,0 +1,128 @@
+/*!
+ * @file cudaaligner.cpp
+ *
+ * @brief CUDABatchAligner class source file
+ */
+
+#include <cudautils/cudautils.hpp>
+
+#include "cudaaligner.hpp"
+
+namespace racon {
+
+std::atomic<uint32_t> CUDABatchAligner::batches;
+
+std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size,
+                                                         uint32_t max_target_size,
+                                                         uint32_t max_alignments,
+                                                         uint32_t device_id)
+{
+    return std::unique_ptr<CUDABatchAligner>(new CUDABatchAligner(max_query_size,
+                                                                  max_target_size,
+                                                                  max_alignments,
+                                                                  device_id));
+}
+
+CUDABatchAligner::CUDABatchAligner(uint32_t max_query_size,
+                                   uint32_t max_target_size,
+                                   uint32_t max_alignments,
+                                   uint32_t device_id)
+    : aligner_(claragenomics::cudaaligner::create_aligner(max_query_size,
+                                                        max_target_size,
+                                                        max_alignments,
+                                                        claragenomics::cudaaligner::AlignmentType::global,
+                                                        device_id))
+    , overlaps_()
+    , stream_(0)
+{
+    bid_ = CUDABatchAligner::batches++;
+
+    CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
+
+    aligner_->set_cuda_stream(stream_);
+}
+
+CUDABatchAligner::~CUDABatchAligner()
+{
+    CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
+}
+
+bool CUDABatchAligner::addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences)
+{
+    const char* q = !overlap->strand_ ? &(sequences[overlap->q_id_]->data()[overlap->q_begin_]) :
+        &(sequences[overlap->q_id_]->reverse_complement()[overlap->q_length_ - overlap->q_end_]);
+    const char* t = &(sequences[overlap->t_id_]->data()[overlap->t_begin_]);
+
+    claragenomics::cudaaligner::StatusType s =
+        aligner_->add_alignment(q, overlap->q_end_ - overlap->q_begin_,
+                                t, overlap->t_end_ - overlap->t_begin_);
+    if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignments)
+    {
+        return false;
+    }
+    else if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignment_difference
+             || s == claragenomics::cudaaligner::StatusType::exceeded_max_length)
+    {
+        cpu_overlap_data_.emplace_back(std::make_pair<std::string, std::string>(std::string(q, q + overlap->q_end_ - overlap->q_begin_),
+                                                                                std::string(t, t + overlap->t_end_ - overlap->t_begin_)));
+        cpu_overlaps_.push_back(overlap);
+    }
+    else if (s != claragenomics::cudaaligner::StatusType::success)
+    {
+        fprintf(stderr, "Unknown error in cuda aligner!\n");
+    }
+    else
+    {
+        overlaps_.push_back(overlap);
+    }
+    return true;
+}
+
+void CUDABatchAligner::alignAll()
+{
+    aligner_->align_all();
+    compute_cpu_overlaps();
+}
+
+void CUDABatchAligner::compute_cpu_overlaps()
+{
+    for(std::size_t a = 0; a < cpu_overlaps_.size(); a++)
+    {
+        // Run CPU version of overlap.
+        Overlap* overlap = cpu_overlaps_[a];
+        overlap->align_overlaps(cpu_overlap_data_[a].first.c_str(), cpu_overlap_data_[a].first.length(),
+                                cpu_overlap_data_[a].second.c_str(), cpu_overlap_data_[a].second.length());
+    }
+}
+
+void CUDABatchAligner::find_breaking_points(uint32_t window_length)
+{
+    aligner_->sync_alignments();
+
+    const std::vector<std::shared_ptr<claragenomics::cudaaligner::Alignment>>& alignments = aligner_->get_alignments();
+    // Number of alignments should be the same as number of overlaps.
+    if (overlaps_.size() != alignments.size())
+    {
+        throw std::runtime_error("Number of alignments doesn't match number of overlaps in cudaaligner.");
+    }
+    for(std::size_t a = 0; a < alignments.size(); a++)
+    {
+        overlaps_[a]->cigar_ = alignments[a]->convert_to_cigar();
+        overlaps_[a]->find_breaking_points_from_cigar(window_length);
+    }
+    for(Overlap* overlap : cpu_overlaps_)
+    {
+        // Run CPU version of breaking points.
+        overlap->find_breaking_points_from_cigar(window_length);
+    }
+}
+
+void CUDABatchAligner::reset()
+{
+    overlaps_.clear();
+    cpu_overlaps_.clear();
+    cpu_overlap_data_.clear();
+    aligner_->reset();
+}
+
+}


=====================================
src/cuda/cudaaligner.hpp
=====================================
@@ -0,0 +1,96 @@
+/*!
+ * @file cudaaligner.hpp
+ *
+ * @brief CUDA aligner class header file
+ */
+#include "cudaaligner/cudaaligner.hpp"
+#include "cudaaligner/aligner.hpp"
+#include "cudaaligner/alignment.hpp"
+
+#include "overlap.hpp"
+#include "sequence.hpp"
+
+#include <vector>
+#include <atomic>
+
+namespace racon {
+
+class CUDABatchAligner;
+std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+
+class CUDABatchAligner
+{
+    public:
+        virtual ~CUDABatchAligner();
+
+        /**
+         * @brief Add a new overlap to the batch.
+         *
+         * @param[in] overlap  : The overlap to add to the batch.
+         * @param[in] sequences: Reference to a database of sequences.
+         *
+         * @return True if overlap could be added to the batch.
+         */
+        virtual bool addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences);
+
+        /**
+         * @brief Checks if batch has any overlaps to process.
+         *
+         * @return True if there are overlaps in the batch.
+         */
+        virtual bool hasOverlaps() const {
+            return overlaps_.size() > 0;
+        };
+
+        /**
+         * @brief Runs batched alignment of overlaps on GPU.
+         *
+         */
+        virtual void alignAll();
+
+        /**
+         * @brief Find breaking points in alignments.
+         *
+         */
+        virtual void find_breaking_points(uint32_t window_length);
+
+        /**
+         * @brief Resets the state of the object, which includes
+         *        resetting buffer states and counters.
+         */
+        virtual void reset();
+
+        /**
+         * @brief Get batch ID.
+         */
+        uint32_t getBatchID() const { return bid_; }
+
+        // Builder function to create a new CUDABatchAligner object.
+        friend std::unique_ptr<CUDABatchAligner>
+        createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+
+    protected:
+        CUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
+        CUDABatchAligner(const CUDABatchAligner&) = delete;
+        const CUDABatchAligner& operator=(const CUDABatchAligner&) = delete;
+
+        void compute_cpu_overlaps();
+
+        std::unique_ptr<claragenomics::cudaaligner::Aligner> aligner_;
+
+        std::vector<Overlap*> overlaps_;
+
+        std::vector<Overlap*> cpu_overlaps_;
+        std::vector<std::pair<std::string, std::string>> cpu_overlap_data_;
+
+        // Static batch count used to generate batch IDs.
+        static std::atomic<uint32_t> batches;
+
+        // Batch ID.
+        uint32_t bid_ = 0;
+
+        // CUDA stream for batch.
+        cudaStream_t stream_;
+};
+
+}


=====================================
src/cuda/cudabatch.cpp
=====================================
@@ -0,0 +1,260 @@
+/*!
+ * @file cudabatch.cpp
+ *
+ * @brief CUDABatch class source file
+ */
+
+#include <string>
+#include <iostream>
+#include <cstring>
+#include <algorithm>
+
+#include "cudabatch.hpp"
+#include "cudautils.hpp"
+
+#include "spoa/spoa.hpp"
+#include <cudautils/cudautils.hpp>
+
+namespace racon {
+
+std::atomic<uint32_t> CUDABatchProcessor::batches;
+
+std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
+{
+    return std::unique_ptr<CUDABatchProcessor>(new CUDABatchProcessor(max_windows, max_window_depth, device, gap, mismatch, match, cuda_banded_alignment));
+}
+
+CUDABatchProcessor::CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
+    : max_windows_(max_windows)
+    , cudapoa_batch_(claragenomics::cudapoa::create_batch(max_windows, max_window_depth, device, claragenomics::cudapoa::OutputType::consensus, gap, mismatch, match, cuda_banded_alignment))
+    , windows_()
+    , seqs_added_per_window_()
+{
+    bid_ = CUDABatchProcessor::batches++;
+    
+    // Create new CUDA stream.
+    CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
+    cudapoa_batch_->set_cuda_stream(stream_);
+}
+
+CUDABatchProcessor::~CUDABatchProcessor()
+{
+    // Destroy CUDA stream.
+    CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
+}
+
+bool CUDABatchProcessor::addWindow(std::shared_ptr<Window> window)
+{
+    if (windows_.size() < max_windows_)
+    {
+        windows_.push_back(window);
+        seqs_added_per_window_.push_back(0);
+        return true;
+    }
+    else
+    {
+        return false;
+    }
+}
+
+bool CUDABatchProcessor::hasWindows() const
+{
+    return (windows_.size() != 0);
+}
+
+void CUDABatchProcessor::convertPhredQualityToWeights(const char* qual,
+                                                      uint32_t qual_length,
+                                                      std::vector<int8_t>& weights)
+{
+    weights.clear();
+    for(uint32_t i = 0; i < qual_length; i++)
+    {
+        weights.push_back(static_cast<uint8_t>(qual[i]) - 33); // PHRED quality
+    }
+}
+
+claragenomics::cudapoa::StatusType CUDABatchProcessor::addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
+                                                                      std::pair<const char*, uint32_t>& qualities)
+{
+    // Add sequences to latest poa in batch.
+    std::vector<int8_t> weights;
+    claragenomics::cudapoa::StatusType status = claragenomics::cudapoa::StatusType::success;
+    if (qualities.first == nullptr)
+    {
+        status = cudapoa_batch_->add_seq_to_poa(seq.first, nullptr, seq.second);
+    }
+    else
+    {
+        convertPhredQualityToWeights(qualities.first, qualities.second, weights);
+        status = cudapoa_batch_->add_seq_to_poa(seq.first, weights.data(), seq.second);
+    }
+    return status;
+}
+
+void CUDABatchProcessor::generateMemoryMap()
+{
+    auto num_windows = windows_.size();
+    for(uint32_t w = 0; w < num_windows; w++)
+    {
+        // Add new poa
+        claragenomics::cudapoa::StatusType status = cudapoa_batch_->add_poa();
+        if (status != claragenomics::cudapoa::StatusType::success)
+        {
+            fprintf(stderr, "Failed to add new POA to batch %d.\n",
+                    cudapoa_batch_->batch_id());
+            exit(1);
+        }
+
+        std::shared_ptr<Window> window = windows_.at(w);
+        uint32_t num_seqs = window->sequences_.size();
+        std::vector<uint8_t> weights;
+
+        // Add first sequence as backbone to graph.
+        std::pair<const char*, uint32_t> seq = window->sequences_.front();
+        std::pair<const char*, uint32_t> qualities = window->qualities_.front();
+        status = addSequenceToPoa(seq, qualities);
+        if (status != claragenomics::cudapoa::StatusType::success)
+        {
+            fprintf(stderr, "Could not add backbone to window. Fatal error.\n");
+            exit(1);
+        }
+
+        // Add the rest of the sequences in sorted order of starting positions.
+        std::vector<uint32_t> rank;
+        rank.reserve(window->sequences_.size());
+
+        for (uint32_t i = 0; i < num_seqs; ++i) {
+            rank.emplace_back(i);
+        }
+
+        std::sort(rank.begin() + 1, rank.end(), [&](uint32_t lhs, uint32_t rhs) {
+                return window->positions_[lhs].first < window->positions_[rhs].first; });
+
+        // Start from index 1 since first sequence has already been added as backbone.
+        uint32_t long_seq = 0;
+        uint32_t skipped_seq = 0;
+        for(uint32_t j = 1; j < num_seqs; j++)
+        {
+            uint32_t i = rank.at(j);
+            seq = window->sequences_.at(i);
+            qualities = window->qualities_.at(i);
+            // Add sequences to latest poa in batch.
+            status = addSequenceToPoa(seq, qualities);
+            if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequence_size)
+            {
+                long_seq++;
+                continue;
+            } 
+            else if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequences_per_poa)
+            {
+                skipped_seq++;
+                continue;
+            } 
+            else if (status != claragenomics::cudapoa::StatusType::success)
+            {
+                fprintf(stderr, "Could not add sequence to POA in batch %d.\n",
+                        cudapoa_batch_->batch_id());
+                exit(1);
+            }
+
+            seqs_added_per_window_[w] = seqs_added_per_window_[w] + 1;
+        }
+#ifndef NDEBUG
+        if (long_seq > 0)
+        {
+            fprintf(stderr, "Too long (%d / %d)\n", long_seq, num_seqs);
+        }
+        if (skipped_seq > 0)
+        {
+            fprintf(stderr, "Skipped (%d / %d)\n", skipped_seq, num_seqs);
+        }
+#endif
+    }
+}
+
+void CUDABatchProcessor::generatePOA()
+{
+    // call generate poa function
+    cudapoa_batch_->generate_poa();
+}
+
+void CUDABatchProcessor::getConsensus()
+{
+    std::vector<std::string> consensuses;
+    std::vector<std::vector<uint16_t>> coverages;
+    std::vector<claragenomics::cudapoa::StatusType> output_status;
+    cudapoa_batch_->get_consensus(consensuses, coverages, output_status);
+
+    for(uint32_t i = 0; i < windows_.size(); i++)
+    {
+        auto window = windows_.at(i);
+        if (output_status.at(i) != claragenomics::cudapoa::StatusType::success)
+        {
+            // leave the failure cases to CPU polisher
+            window_consensus_status_.emplace_back(false);
+        }
+        else
+        {
+            // This is a special case borrowed from the CPU version.
+            // TODO: We still run this case through the GPU, but could take it out.
+            if (window->sequences_.size() < 3)
+            {
+                window->consensus_ = std::string(window->sequences_.front().first,
+                        window->sequences_.front().second);
+
+                // This status is borrowed from the CPU version which considers this
+                // a failed consensus. All other cases are true.
+                window_consensus_status_.emplace_back(false);
+            }
+            else
+            {
+                window->consensus_ = consensuses.at(i);
+                if (window->type_ ==  WindowType::kTGS)
+                {
+                    uint32_t num_seqs_in_window = seqs_added_per_window_[i];
+                    uint32_t average_coverage = num_seqs_in_window / 2;
+
+                    int32_t begin = 0, end =  window->consensus_.size() - 1;
+                    for (; begin < static_cast<int32_t>( window->consensus_.size()); ++begin) {
+                        if (coverages.at(i).at(begin) >= average_coverage) {
+                            break;
+                        }
+                    }
+                    for (; end >= 0; --end) {
+                        if (coverages.at(i).at(end) >= average_coverage) {
+                            break;
+                        }
+                    }
+
+                    if (begin >= end) {
+                        fprintf(stderr, "[CUDABatchProcessor] warning: "
+                                "contig might be chimeric in window %lu!\n", window->id_);
+                    } else {
+                        window->consensus_ =  window->consensus_.substr(begin, end - begin + 1);
+                    }
+                }
+                window_consensus_status_.emplace_back(true);
+            }
+        }
+    }
+}
+
+const std::vector<bool>& CUDABatchProcessor::generateConsensus()
+{
+    // Generate consensus for all windows in the batch
+    generateMemoryMap();
+    generatePOA();
+    getConsensus();
+
+    return window_consensus_status_;
+}
+
+void CUDABatchProcessor::reset()
+{
+    windows_.clear();
+    window_consensus_status_.clear();
+    seqs_added_per_window_.clear();
+    cudapoa_batch_->reset();
+}
+
+} // namespace racon


=====================================
src/cuda/cudabatch.hpp
=====================================
@@ -0,0 +1,142 @@
+/*!
+ * @file cudabatch.hpp
+ *
+ * @brief CUDA batch class header file
+ */
+
+#pragma once
+
+#include <memory>
+#include <cuda_runtime_api.h>
+#include <atomic>
+
+#include "window.hpp"
+#include "cudapoa/batch.hpp"
+
+namespace spoa {
+    class AlignmentEngine;
+}
+
+namespace racon {
+
+class Window;
+
+class CUDABatchProcessor;
+std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+
+class CUDABatchProcessor
+{
+public:
+    ~CUDABatchProcessor();
+
+    /**
+     * @brief Add a new window to the batch.
+     *
+     * @param[in] window : The window to add to the batch.
+     *
+     * @return True if the window could be added to the batch.
+     */
+    bool addWindow(std::shared_ptr<Window> window);
+
+    /**
+     * @brief Checks if batch has any windows to process.
+     */
+    bool hasWindows() const;
+
+    /**
+     * @brief Runs the core computation to generate consensus for
+     *        all windows in the batch.
+     *
+     * @return Vector of bool indicating successful generation of consensus
+     *         for each window in the batch.
+     */
+    const std::vector<bool>& generateConsensus();
+
+    /**
+     * @brief Resets the state of the object, which includes
+     *        resetting buffer states and counters.
+     */
+    void reset();
+
+    /**
+     * @brief Get batch ID.
+     */
+    uint32_t getBatchID() const { return bid_; }
+
+    // Builder function to create a new CUDABatchProcessor object.
+    friend std::unique_ptr<CUDABatchProcessor>
+    createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+
+protected:
+    /**
+     * @brief Constructor for CUDABatch class.
+     *
+     * @param[in] max_windows      : Maximum number of windows in batch
+     * @param[in] max_window_depth : Maximum number of sequences per window
+     * @param[in] cuda_banded_alignment : Use banded POA alignment
+     */
+    CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
+    CUDABatchProcessor(const CUDABatchProcessor&) = delete;
+    const CUDABatchProcessor& operator=(const CUDABatchProcessor&) = delete;
+
+    /*
+     * @brief Process all the windows and re-map them into
+     *        memory for more efficient processing in the CUDA
+     *        kernels.
+     */
+    void generateMemoryMap();
+
+    /*
+     * @brief Run the CUDA kernel for generating POA on the batch.
+     *        This call is asynchronous.
+     */
+    void generatePOA();
+
+    /*
+     * @brief Wait for execution to complete and grab the output
+     *        consensus from the device.
+     */
+    void getConsensus();
+
+    /*
+     * @brief Convert PHRED quality scores to weights.
+     *
+     */
+    void convertPhredQualityToWeights(const char* qual,
+                                      uint32_t qual_length,
+                                      std::vector<int8_t>& weights);
+
+    /*
+     * @brief Add sequence and qualities to cudapoa.
+     *
+     */
+    claragenomics::cudapoa::StatusType addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
+                                                      std::pair<const char*, uint32_t>& quality);
+
+protected:
+    // Static batch count used to generate batch IDs.
+    static std::atomic<uint32_t> batches;
+
+    // Batch ID.
+    uint32_t bid_ = 0;
+
+    // Maximum windows allowed in batch.
+    uint32_t max_windows_;
+
+    // CUDA-POA library object that manages POA batch.
+    std::unique_ptr<claragenomics::cudapoa::Batch> cudapoa_batch_;
+
+    // Stream for running POA batch.
+    cudaStream_t stream_;
+    // Windows belonging to the batch.
+    std::vector<std::shared_ptr<Window>> windows_;
+
+    // Consensus generation status for each window.
+    std::vector<bool> window_consensus_status_;
+
+    // Number of sequences actually added per window.
+    std::vector<uint32_t> seqs_added_per_window_;
+
+};
+
+} // namespace racon


=====================================
src/cuda/cudapolisher.cpp
=====================================
@@ -0,0 +1,396 @@
+/*!
+ * @file cudapolisher.cpp
+ *
+ * @brief CUDA Polisher class source file
+ */
+
+#include <future>
+#include <iostream>
+#include <chrono>
+#include <cuda_profiler_api.h>
+
+#include "sequence.hpp"
+#include "cudapolisher.hpp"
+#include <cudautils/cudautils.hpp>
+
+#include "bioparser/bioparser.hpp"
+#include "logger/logger.hpp"
+
+namespace racon {
+
+// The logger used by racon has a fixed size of 20 bins
+// which is used for the progress bar updates. Hence all
+// updates need to be broken into 20 bins.
+const uint32_t RACON_LOGGER_BIN_SIZE = 20;
+
+CUDAPolisher::CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
+    std::unique_ptr<bioparser::Parser<Overlap>> oparser,
+    std::unique_ptr<bioparser::Parser<Sequence>> tparser,
+    PolisherType type, uint32_t window_length, double quality_threshold,
+    double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+    uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+    uint32_t cudaaligner_batches)
+        : Polisher(std::move(sparser), std::move(oparser), std::move(tparser),
+                type, window_length, quality_threshold,
+                error_threshold, match, mismatch, gap, num_threads)
+        , cudapoa_batches_(cudapoa_batches)
+        , cudaaligner_batches_(cudaaligner_batches)
+        , gap_(gap)
+        , mismatch_(mismatch)
+        , match_(match)
+        , cuda_banded_alignment_(cuda_banded_alignment)
+{
+    claragenomics::cudapoa::Init();
+    claragenomics::cudaaligner::Init();
+
+    CGA_CU_CHECK_ERR(cudaGetDeviceCount(&num_devices_));
+
+    if (num_devices_ < 1)
+    {
+        throw std::runtime_error("No GPU devices found.");
+    }
+
+    std::cerr << "Using " << num_devices_ << " GPU(s) to perform polishing" << std::endl;
+
+    // Run dummy call on each device to initialize CUDA context.
+    for(int32_t dev_id = 0; dev_id < num_devices_; dev_id++)
+    {
+        std::cerr << "Initialize device " << dev_id << std::endl;
+        CGA_CU_CHECK_ERR(cudaSetDevice(dev_id));
+        CGA_CU_CHECK_ERR(cudaFree(0));
+    }
+
+    std::cerr << "[CUDAPolisher] Constructed." << std::endl;
+}
+
+CUDAPolisher::~CUDAPolisher()
+{
+    cudaDeviceSynchronize();
+    cudaProfilerStop();
+}
+
+std::vector<uint32_t> CUDAPolisher::calculate_batches_per_gpu(uint32_t batches, uint32_t gpus)
+{
+    // Bin batches into each GPU.
+    std::vector<uint32_t> batches_per_gpu(gpus, batches / gpus);
+
+    for(uint32_t i = 0; i < batches % gpus; ++i)
+    {
+        ++batches_per_gpu[i];
+    }
+
+    return batches_per_gpu;
+}
+
+void CUDAPolisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
+{
+    if (cudaaligner_batches_ < 1)
+    {
+        // TODO: Kept CPU overlap alignment right now while GPU is a dummy implementation.
+        Polisher::find_overlap_breaking_points(overlaps);
+    }
+    else
+    {
+        // TODO: Experimentally this is giving decent perf
+        const uint32_t MAX_ALIGNMENTS = 50;
+
+        logger_->log();
+        std::mutex mutex_overlaps;
+        uint32_t next_overlap_index = 0;
+
+        // Lambda expression for filling up next batch of alignments.
+        auto fill_next_batch = [&mutex_overlaps, &next_overlap_index, &overlaps, this](CUDABatchAligner* batch) -> std::pair<uint32_t, uint32_t> {
+            batch->reset();
+
+            // Use mutex to read the vector containing windows in a threadsafe manner.
+            std::lock_guard<std::mutex> guard(mutex_overlaps);
+
+            uint32_t initial_count = next_overlap_index;
+            uint32_t count = overlaps.size();
+            while(next_overlap_index < count)
+            {
+                if (batch->addOverlap(overlaps.at(next_overlap_index).get(), sequences_))
+                {
+                    next_overlap_index++;
+                }
+                else
+                {
+                    break;
+                }
+            }
+            return {initial_count, next_overlap_index};
+        };
+
+        // Variables for keeping track of logger progress bar.
+        uint32_t logger_step = overlaps.size() / RACON_LOGGER_BIN_SIZE;
+        int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
+        uint32_t window_idx = 0;
+        std::mutex mutex_log_bar_idx;
+
+        // Lambda expression for processing a batch of alignments.
+        auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &log_bar_idx_prev, &window_idx, &mutex_log_bar_idx, this](CUDABatchAligner* batch) -> void {
+            while(true)
+            {
+                auto range = fill_next_batch(batch);
+                if (batch->hasOverlaps())
+                {
+                    // Launch workload.
+                    batch->alignAll();
+                    batch->find_breaking_points(window_length_);
+
+                    // logging bar
+                    {
+                        std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
+                        window_idx += range.second - range.first;
+                        log_bar_idx = window_idx / logger_step;
+                        if (log_bar_idx == log_bar_idx_prev) {
+                            continue;
+                        }
+                        else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
+                        {
+                            logger_->bar("[racon::CUDAPolisher::initialize] aligning overlaps");
+                            std::cerr<<std::endl;
+                            log_bar_idx_prev = log_bar_idx;
+                        }
+                    }
+                }
+                else
+                {
+                    break;
+                }
+            }
+        };
+
+        // Bin batches into each GPU.
+        std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudaaligner_batches_, num_devices_);
+
+        for(int32_t device = 0; device < num_devices_; device++)
+        {
+            for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
+            {
+                batch_aligners_.emplace_back(createCUDABatchAligner(10000, 10000, MAX_ALIGNMENTS, device));
+            }
+        }
+
+        // Run batched alignment.
+        std::vector<std::future<void>> thread_futures;
+        for(auto& aligner : batch_aligners_)
+        {
+            thread_futures.emplace_back(
+                    thread_pool_->submit(
+                        process_batch,
+                        aligner.get()
+                        )
+                    );
+        }
+
+        // Wait for threads to finish, and collect their results.
+        for (const auto& future : thread_futures) {
+            future.wait();
+        }
+
+        batch_aligners_.clear();
+    }
+}
+
+void CUDAPolisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
+    bool drop_unpolished_sequences)
+{
+    if (cudapoa_batches_ < 1)
+    {
+        Polisher::polish(dst, drop_unpolished_sequences);
+    }
+    else
+    {
+        // Creation and use of batches.
+        const uint32_t MAX_WINDOWS = 256;
+        const uint32_t MAX_DEPTH_PER_WINDOW = 200;
+
+        // Bin batches into each GPU.
+        std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudapoa_batches_, num_devices_);
+
+        for(int32_t device = 0; device < num_devices_; device++)
+        {
+            for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
+            {
+                batch_processors_.emplace_back(createCUDABatch(MAX_WINDOWS, MAX_DEPTH_PER_WINDOW, device, gap_, mismatch_, match_, cuda_banded_alignment_));
+            }
+        }
+
+        logger_->log("[racon::CUDAPolisher::polish] allocated memory on GPUs");
+
+        // Mutex for accessing the vector of windows.
+        std::mutex mutex_windows;
+
+        // Initialize window consensus statuses.
+        window_consensus_status_.resize(windows_.size(), false);
+
+        // Index of next window to be added to a batch.
+        uint32_t next_window_index = 0;
+
+        // Lambda function for adding windows to batches.
+        auto fill_next_batch = [&mutex_windows, &next_window_index, this](CUDABatchProcessor* batch) -> std::pair<uint32_t, uint32_t> {
+            batch->reset();
+
+            // Use mutex to read the vector containing windows in a threadsafe manner.
+            std::lock_guard<std::mutex> guard(mutex_windows);
+
+            // TODO: Reducing window size by 10 for debugging.
+            uint32_t initial_count = next_window_index;
+            uint32_t count = windows_.size();
+            while(next_window_index < count)
+            {
+                if (batch->addWindow(windows_.at(next_window_index)))
+                {
+                    next_window_index++;
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return {initial_count, next_window_index};
+        };
+
+        // Variables for keeping track of logger progress bar.
+        uint32_t logger_step = windows_.size() / RACON_LOGGER_BIN_SIZE;
+        int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
+        uint32_t window_idx = 0;
+        std::mutex mutex_log_bar_idx;
+        logger_->log();
+
+        // Lambda function for processing each batch.
+        auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &mutex_log_bar_idx, &window_idx, &log_bar_idx_prev, this](CUDABatchProcessor* batch) -> void {
+            while(true)
+            {
+                std::pair<uint32_t, uint32_t> range = fill_next_batch(batch);
+                if (batch->hasWindows())
+                {
+                    // Launch workload.
+                    const std::vector<bool>& results = batch->generateConsensus();
+
+                    // Check if the number of windows processed is the same as
+                    // the range of windows that were added.
+                    if (results.size() != (range.second - range.first))
+                    {
+                        throw std::runtime_error("Windows processed doesn't match \
+                                range of windows passed to batch\n");
+                    }
+
+                    // Copy over the results from the batch into the per window
+                    // result vector of the CUDAPolisher.
+                    for(uint32_t i = 0; i < results.size(); i++)
+                    {
+                        window_consensus_status_.at(range.first + i) = results.at(i);
+                    }
+
+                    // logging bar
+                    {
+                        std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
+                        window_idx += results.size();
+                        log_bar_idx = window_idx / logger_step;
+                        if (log_bar_idx == log_bar_idx_prev) {
+                            continue;
+                        }
+                        else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
+                        {
+                            logger_->bar("[racon::CUDAPolisher::polish] generating consensus");
+                            std::cerr<<std::endl;
+                            log_bar_idx_prev = log_bar_idx;
+                        }
+                    }
+                }
+                else
+                {
+                    break;
+                }
+            }
+        };
+
+        // Process each of the batches in a separate thread.
+        std::vector<std::future<void>> thread_futures;
+        for(auto& batch_processor : batch_processors_)
+        {
+            thread_futures.emplace_back(
+                    thread_pool_->submit(
+                        process_batch,
+                        batch_processor.get()
+                        )
+                    );
+        }
+
+        // Wait for threads to finish, and collect their results.
+        for (const auto& future : thread_futures) {
+            future.wait();
+        }
+
+        logger_->log("[racon::CUDAPolisher::polish] polished windows on GPU");
+
+        // Start timing CPU time for failed windows on GPU
+        logger_->log();
+        // Process each failed windows in parallel on CPU
+        std::vector<std::future<bool>> thread_failed_windows;
+        for (uint64_t i = 0; i < windows_.size(); ++i) {
+            if (window_consensus_status_.at(i) == false)
+            {
+                thread_failed_windows.emplace_back(thread_pool_->submit(
+                            [&](uint64_t j) -> bool {
+                            auto it = thread_to_id_.find(std::this_thread::get_id());
+                            if (it == thread_to_id_.end()) {
+                            fprintf(stderr, "[racon::CUDAPolisher::polish] error: "
+                                    "thread identifier not present!\n");
+                            exit(1);
+                            }
+                            return window_consensus_status_.at(j) = windows_[j]->generate_consensus(
+                                    alignment_engines_[it->second]);
+                            }, i));
+            }
+        }
+
+        // Wait for threads to finish, and collect their results.
+        for (const auto& t : thread_failed_windows) {
+            t.wait();
+        }
+        if (thread_failed_windows.size() > 0)
+        {
+            logger_->log("[racon::CUDAPolisher::polish] polished remaining windows on CPU");
+            logger_->log();
+        }
+
+        // Collect results from all windows into final output.
+        std::string polished_data = "";
+        uint32_t num_polished_windows = 0;
+
+        for (uint64_t i = 0; i < windows_.size(); ++i) {
+
+            num_polished_windows += window_consensus_status_.at(i) == true ? 1 : 0;
+            polished_data += windows_[i]->consensus();
+
+            if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) {
+                double polished_ratio = num_polished_windows /
+                    static_cast<double>(windows_[i]->rank() + 1);
+
+                if (!drop_unpolished_sequences || polished_ratio > 0) {
+                    std::string tags = type_ == PolisherType::kF ? "r" : "";
+                    tags += " LN:i:" + std::to_string(polished_data.size());
+                    tags += " RC:i:" + std::to_string(targets_coverages_[windows_[i]->id()]);
+                    tags += " XC:f:" + std::to_string(polished_ratio);
+                    dst.emplace_back(createSequence(sequences_[windows_[i]->id()]->name() +
+                                tags, polished_data));
+                }
+
+                num_polished_windows = 0;
+                polished_data.clear();
+            }
+            windows_[i].reset();
+        }
+
+        logger_->log("[racon::CUDAPolisher::polish] generated consensus");
+
+        // Clear POA processors.
+        batch_processors_.clear();
+    }
+}
+
+}


=====================================
src/cuda/cudapolisher.hpp
=====================================
@@ -0,0 +1,74 @@
+/*!
+ * @file cudapolisher.hpp
+ *
+ * @brief CUDA Polisher class header file
+ */
+
+#pragma once
+
+#include <mutex>
+
+#include "polisher.hpp"
+#include "cudabatch.hpp"
+#include "cudaaligner.hpp"
+#include "thread_pool/thread_pool.hpp"
+
+
+namespace racon {
+
+class CUDAPolisher : public Polisher {
+public:
+    ~CUDAPolisher();
+
+    virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
+        bool drop_unpolished_sequences) override;
+
+    friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
+        const std::string& overlaps_path, const std::string& target_path,
+        PolisherType type, uint32_t window_length, double quality_threshold,
+        double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+        uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+        uint32_t cudaaligner_batches);
+
+protected:
+    CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
+        std::unique_ptr<bioparser::Parser<Overlap>> oparser,
+        std::unique_ptr<bioparser::Parser<Sequence>> tparser,
+        PolisherType type, uint32_t window_length, double quality_threshold,
+        double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
+        uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+        uint32_t cudaaligner_batches);
+    CUDAPolisher(const CUDAPolisher&) = delete;
+    const CUDAPolisher& operator=(const CUDAPolisher&) = delete;
+    virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps) override;
+
+    static std::vector<uint32_t> calculate_batches_per_gpu(uint32_t cudapoa_batches, uint32_t gpus);
+
+    // Vector of POA batches.
+    std::vector<std::unique_ptr<CUDABatchProcessor>> batch_processors_;
+
+    // Vector of aligner batches.
+    std::vector<std::unique_ptr<CUDABatchAligner>> batch_aligners_;
+
+    // Vector of bool indicating consensus generation status for each window.
+    std::vector<bool> window_consensus_status_;
+
+    // Number of batches for POA processing.
+    uint32_t cudapoa_batches_;
+
+    // Number of batches for alignment processing.
+    uint32_t cudaaligner_batches_;
+
+    // Number of GPU devices to run with.
+    int32_t num_devices_;
+
+    // State transition scores.
+    int8_t gap_;
+    int8_t mismatch_;
+    int8_t match_;
+
+    // Use banded POA alignment
+    bool cuda_banded_alignment_;
+};
+
+}


=====================================
src/cuda/cudautils.hpp
=====================================
@@ -0,0 +1,20 @@
+// Header-only utilities for CUDA error checking.
+
+#pragma once
+
+#include <stdlib.h>
+#include <cuda_runtime_api.h>
+
+namespace racon {
+
+void cudaCheckError(std::string &msg)
+{
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess)
+    {
+        fprintf(stderr, "%s (CUDA error %s)\n", msg.c_str(), cudaGetErrorString(error));
+        exit(-1);
+    }
+}
+
+} // namespace racon


=====================================
src/main.cpp
=====================================
@@ -8,8 +8,12 @@
 
 #include "sequence.hpp"
 #include "polisher.hpp"
+#ifdef CUDA_ENABLED
+#include "cuda/cudapolisher.hpp"
+#endif
 
-static const char* version = "v1.3.2";
+static const char* version = "v1.4.3";
+static const int32_t CUDAALIGNER_INPUT_CODE = 10000;
 
 static struct option options[] = {
     {"include-unpolished", no_argument, 0, 'u'},
@@ -23,6 +27,11 @@ static struct option options[] = {
     {"threads", required_argument, 0, 't'},
     {"version", no_argument, 0, 'v'},
     {"help", no_argument, 0, 'h'},
+#ifdef CUDA_ENABLED
+    {"cudapoa-batches", optional_argument, 0, 'c'},
+    {"cuda-banded-alignment", no_argument, 0, 'b'},
+    {"cudaaligner-batches", required_argument, 0, CUDAALIGNER_INPUT_CODE},
+#endif
     {0, 0, 0, 0}
 };
 
@@ -44,8 +53,17 @@ int main(int argc, char** argv) {
     bool drop_unpolished_sequences = true;
     uint32_t num_threads = 1;
 
-    char argument;
-    while ((argument = getopt_long(argc, argv, "ufw:q:e:m:x:g:t:h", options, nullptr)) != -1) {
+    uint32_t cudapoa_batches = 0;
+    uint32_t cudaaligner_batches = 0;
+    bool cuda_banded_alignment = false;
+
+    std::string optstring = "ufw:q:e:m:x:g:t:h";
+#ifdef CUDA_ENABLED
+    optstring += "bc::";
+#endif
+
+    int32_t argument;
+    while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1) {
         switch (argument) {
             case 'u':
                 drop_unpolished_sequences = false;
@@ -80,6 +98,27 @@ int main(int argc, char** argv) {
             case 'h':
                 help();
                 exit(0);
+#ifdef CUDA_ENABLED
+            case 'c':
+                // if option 'c' is encountered, cudapoa_batches is initialized with a default value of 1.
+                cudapoa_batches = 1;
+                // if the next argv entry is not an option, assume it is the argument for option 'c'
+                if (optarg == NULL && argv[optind] != NULL
+                    && argv[optind][0] != '-') {
+                    cudapoa_batches = atoi(argv[optind++]);
+                }
+                // optional argument provided in the usual way
+                if (optarg != NULL) {
+                    cudapoa_batches = atoi(optarg);
+                }
+                break;
+            case 'b':
+                cuda_banded_alignment = true;
+                break;
+            case CUDAALIGNER_INPUT_CODE: // cudaaligner-batches
+                cudaaligner_batches = atoi(optarg);
+                break;
+#endif
             default:
                 exit(1);
         }
@@ -98,7 +137,8 @@ int main(int argc, char** argv) {
     auto polisher = racon::createPolisher(input_paths[0], input_paths[1],
         input_paths[2], type == 0 ? racon::PolisherType::kC :
         racon::PolisherType::kF, window_length, quality_threshold,
-        error_threshold, match, mismatch, gap, num_threads);
+        error_threshold, match, mismatch, gap, num_threads,
+        cudapoa_batches, cuda_banded_alignment, cudaaligner_batches);
 
     polisher->initialize();
 
@@ -156,5 +196,16 @@ void help() {
         "        --version\n"
         "            prints the version number\n"
         "        -h, --help\n"
-        "            prints the usage\n");
+        "            prints the usage\n"
+#ifdef CUDA_ENABLED
+        "        -c, --cudapoa-batches\n"
+        "            default: 1\n"
+        "            number of batches for CUDA accelerated polishing\n"
+        "        -b, --cuda-banded-alignment\n"
+        "            use banding approximation for alignment on GPU\n"
+        "        --cudaaligner-batches (experimental)\n"
+        "            Number of batches for CUDA accelerated alignment\n"
+
+#endif
+    );
 }
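
Note, not part of the patch: GNU getopt_long only fills optarg for an optional_argument option when the value is attached to the option itself (-c4 or --cudapoa-batches=4); a space-separated value (-c 4) is left in argv, which is why the 'c' case above also peeks at argv[optind]. A self-contained sketch of the same workaround, with a made-up option name:

    #include <getopt.h>
    #include <cstdio>
    #include <cstdlib>

    int main(int argc, char** argv) {
        static struct option opts[] = {
            {"batches", optional_argument, 0, 'c'},   // hypothetical option
            {0, 0, 0, 0}
        };
        int batches = 0;
        int arg;
        while ((arg = getopt_long(argc, argv, "c::", opts, nullptr)) != -1) {
            if (arg == 'c') {
                batches = 1;  // bare -c enables the feature with a default of 1
                if (optarg == NULL && argv[optind] != NULL && argv[optind][0] != '-') {
                    batches = atoi(argv[optind++]);   // separated value: -c 4
                } else if (optarg != NULL) {
                    batches = atoi(optarg);           // attached value: -c4, --batches=4
                }
            }
        }
        printf("batches = %d\n", batches);
        return 0;
    }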


=====================================
src/overlap.cpp
=====================================
@@ -190,29 +190,41 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
     }
 
     if (cigar_.empty()) {
-        // align overlaps with edlib
         const char* q = !strand_ ? &(sequences[q_id_]->data()[q_begin_]) :
             &(sequences[q_id_]->reverse_complement()[q_length_ - q_end_]);
         const char* t = &(sequences[t_id_]->data()[t_begin_]);
 
-        EdlibAlignResult result = edlibAlign(q, q_end_ - q_begin_, t, t_end_ -
-            t_begin_, edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
-            nullptr, 0));
+        align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_);
+    }
+
+    find_breaking_points_from_cigar(window_length);
+
+    std::string().swap(cigar_);
+}
+
+void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, uint32_t t_length)
+{
+    // align overlaps with edlib
+    EdlibAlignResult result = edlibAlign(q, q_length, t, t_length,
+            edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
+                nullptr, 0));
 
-        if (result.status == EDLIB_STATUS_OK) {
-            char* cigar = edlibAlignmentToCigar(result.alignment,
+    if (result.status == EDLIB_STATUS_OK) {
+        char* cigar = edlibAlignmentToCigar(result.alignment,
                 result.alignmentLength, EDLIB_CIGAR_STANDARD);
-            cigar_ = cigar;
-            free(cigar);
-        } else {
-            fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
+        cigar_ = cigar;
+        free(cigar);
+    } else {
+        fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
                 "edlib unable to align pair (%zu x %zu)!\n", q_id_, t_id_);
-            exit(1);
-        }
-
-        edlibFreeAlignResult(result);
+        exit(1);
     }
 
+    edlibFreeAlignResult(result);
+}
+
+void Overlap::find_breaking_points_from_cigar(uint32_t window_length)
+{
     // find breaking points from cigar
     std::vector<int32_t> window_ends;
     for (uint32_t i = 0; i < t_end_; i += window_length) {
@@ -277,8 +289,6 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
             j = i + 1;
         }
     }
-
-    std::string().swap(cigar_);
 }
 
 }
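
Note, not part of the patch: the edlib calls factored into align_overlaps() can be exercised in isolation. A minimal sketch with toy sequences, assuming the edlib header is available as edlib/edlib.h (the include path is an assumption):

    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include "edlib/edlib.h"

    int main() {
        std::string q = "ACGTACGTTT";   // toy query
        std::string t = "ACGTTACGTT";   // toy target

        EdlibAlignResult result = edlibAlign(q.c_str(), (int) q.size(),
            t.c_str(), (int) t.size(),
            edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH, nullptr, 0));

        if (result.status == EDLIB_STATUS_OK) {
            char* cigar = edlibAlignmentToCigar(result.alignment,
                result.alignmentLength, EDLIB_CIGAR_STANDARD);
            printf("edit distance %d, CIGAR %s\n", result.editDistance, cigar);
            free(cigar);   // edlibAlignmentToCigar allocates with malloc
        }

        edlibFreeAlignResult(result);
        return 0;
    }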


=====================================
src/overlap.hpp
=====================================
@@ -71,6 +71,10 @@ public:
     friend bioparser::MhapParser<Overlap>;
     friend bioparser::PafParser<Overlap>;
     friend bioparser::SamParser<Overlap>;
+
+#ifdef CUDA_ENABLED
+    friend class CUDABatchAligner;
+#endif
 private:
     Overlap(uint64_t a_id, uint64_t b_id, double accuracy, uint32_t minmers,
         uint32_t a_rc, uint32_t a_begin, uint32_t a_end, uint32_t a_length,
@@ -89,6 +93,8 @@ private:
     Overlap();
     Overlap(const Overlap&) = delete;
     const Overlap& operator=(const Overlap&) = delete;
+    virtual void find_breaking_points_from_cigar(uint32_t window_length);
+    virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len);
 
     std::string q_name_;
     uint64_t q_id_;


=====================================
src/polisher.cpp
=====================================
@@ -6,15 +6,20 @@
 
 #include <algorithm>
 #include <unordered_set>
+#include <iostream>
 
 #include "overlap.hpp"
 #include "sequence.hpp"
 #include "window.hpp"
 #include "polisher.hpp"
+#ifdef CUDA_ENABLED
+#include "cuda/cudapolisher.hpp"
+#endif
 
 #include "bioparser/bioparser.hpp"
 #include "thread_pool/thread_pool.hpp"
 #include "spoa/spoa.hpp"
+#include "logger/logger.hpp"
 
 namespace racon {
 
@@ -51,7 +56,8 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
     const std::string& overlaps_path, const std::string& target_path,
     PolisherType type, uint32_t window_length, double quality_threshold,
     double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
-    uint32_t num_threads) {
+    uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
+    uint32_t cudaaligner_batches) {
 
     if (type != PolisherType::kC && type != PolisherType::kF) {
         fprintf(stderr, "[racon::createPolisher] error: invalid polisher type!\n");
@@ -122,10 +128,30 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
         exit(1);
     }
 
-    return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
-        std::move(oparser), std::move(tparser), type, window_length,
-        quality_threshold, error_threshold, match, mismatch, gap,
-        num_threads));
+    if (cudapoa_batches > 0 || cudaaligner_batches > 0)
+    {
+#ifdef CUDA_ENABLED
+        // If CUDA is enabled, return an instance of the CUDAPolisher object.
+        return std::unique_ptr<Polisher>(new CUDAPolisher(std::move(sparser),
+                    std::move(oparser), std::move(tparser), type, window_length,
+                    quality_threshold, error_threshold, match, mismatch, gap,
+                    num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches));
+#else
+        fprintf(stderr, "[racon::createPolisher] error: "
+                "Attemping to use CUDA when CUDA support is not available.\n"
+                "Please check logic in %s:%s\n",
+                __FILE__, __func__);
+        exit(1);
+#endif
+    }
+    else
+    {
+        (void) cuda_banded_alignment;
+        return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
+                    std::move(oparser), std::move(tparser), type, window_length,
+                    quality_threshold, error_threshold, match, mismatch, gap,
+                    num_threads));
+    }
 }
 
 Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
@@ -140,7 +166,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
         alignment_engines_(), sequences_(), dummy_quality_(window_length, '!'),
         window_length_(window_length), windows_(),
         thread_pool_(thread_pool::createThreadPool(num_threads)),
-        thread_to_id_() {
+        thread_to_id_(), logger_(new logger::Logger()) {
 
     uint32_t id = 0;
     for (const auto& it: thread_pool_->thread_identifiers()) {
@@ -155,6 +181,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
 }
 
 Polisher::~Polisher() {
+    logger_->total("[racon::Polisher::] total =");
 }
 
 void Polisher::initialize() {
@@ -165,8 +192,10 @@ void Polisher::initialize() {
         return;
     }
 
+    logger_->log();
+
     tparser_->reset();
-    tparser_->parse_objects(sequences_, -1);
+    tparser_->parse(sequences_, -1);
 
     uint64_t targets_size = sequences_.size();
     if (targets_size == 0) {
@@ -186,14 +215,15 @@ void Polisher::initialize() {
     std::vector<bool> has_data(targets_size, true);
     std::vector<bool> has_reverse_data(targets_size, false);
 
-    fprintf(stderr, "[racon::Polisher::initialize] loaded target sequences\n");
+    logger_->log("[racon::Polisher::initialize] loaded target sequences");
+    logger_->log();
 
     uint64_t sequences_size = 0, total_sequences_length = 0;
 
     sparser_->reset();
     while (true) {
         uint64_t l = sequences_.size();
-        auto status = sparser_->parse_objects(sequences_, kChunkSize);
+        auto status = sparser_->parse(sequences_, kChunkSize);
 
         uint64_t n = 0;
         for (uint64_t i = l; i < sequences_.size(); ++i, ++sequences_size) {
@@ -241,7 +271,8 @@ void Polisher::initialize() {
     WindowType window_type = static_cast<double>(total_sequences_length) /
         sequences_size <= 1000 ? WindowType::kNGS : WindowType::kTGS;
 
-    fprintf(stderr, "[racon::Polisher::initialize] loaded sequences\n");
+    logger_->log("[racon::Polisher::initialize] loaded sequences");
+    logger_->log();
 
     std::vector<std::unique_ptr<Overlap>> overlaps;
 
@@ -274,7 +305,7 @@ void Polisher::initialize() {
     oparser_->reset();
     uint64_t l = 0;
     while (true) {
-        auto status = oparser_->parse_objects(overlaps, kChunkSize);
+        auto status = oparser_->parse(overlaps, kChunkSize);
 
         uint64_t c = l;
         for (uint64_t i = l; i < overlaps.size(); ++i) {
@@ -326,11 +357,13 @@ void Polisher::initialize() {
             "empty overlap set!\n");
         exit(1);
     }
-    fprintf(stderr, "[racon::Polisher::initialize] loaded overlaps\n");
+
+    logger_->log("[racon::Polisher::initialize] loaded overlaps");
+    logger_->log();
 
     std::vector<std::future<void>> thread_futures;
     for (uint64_t i = 0; i < sequences_.size(); ++i) {
-        thread_futures.emplace_back(thread_pool_->submit_task(
+        thread_futures.emplace_back(thread_pool_->submit(
             [&](uint64_t j) -> void {
                 sequences_[j]->transmute(has_name[j], has_data[j], has_reverse_data[j]);
             }, i));
@@ -339,19 +372,9 @@ void Polisher::initialize() {
         it.wait();
     }
 
-    thread_futures.clear();
-    for (uint64_t i = 0; i < overlaps.size(); ++i) {
-        thread_futures.emplace_back(thread_pool_->submit_task(
-            [&](uint64_t j) -> void {
-                overlaps[j]->find_breaking_points(sequences_, window_length_);
-            }, i));
-    }
-    for (uint64_t i = 0; i < thread_futures.size(); ++i) {
-        thread_futures[i].wait();
-        fprintf(stderr, "[racon::Polisher::initialize] aligned overlap %zu/%zu\r",
-            i + 1, overlaps.size());
-    }
-    fprintf(stderr, "\n");
+    find_overlap_breaking_points(overlaps);
+
+    logger_->log();
 
     std::vector<uint64_t> id_to_first_window_id(targets_size + 1, 0);
     for (uint64_t i = 0; i < targets_size; ++i) {
@@ -428,15 +451,41 @@ void Polisher::initialize() {
         overlaps[i].reset();
     }
 
-    fprintf(stderr, "[racon::Polisher::initialize] transformed data into windows\n");
+    logger_->log("[racon::Polisher::initialize] transformed data into windows");
+}
+
+void Polisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
+{
+    std::vector<std::future<void>> thread_futures;
+    for (uint64_t i = 0; i < overlaps.size(); ++i) {
+        thread_futures.emplace_back(thread_pool_->submit(
+            [&](uint64_t j) -> void {
+                overlaps[j]->find_breaking_points(sequences_, window_length_);
+            }, i));
+    }
+
+    uint32_t logger_step = thread_futures.size() / 20;
+    for (uint64_t i = 0; i < thread_futures.size(); ++i) {
+        thread_futures[i].wait();
+        if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) {
+            logger_->bar("[racon::Polisher::initialize] aligning overlaps");
+        }
+    }
+    if (logger_step != 0) {
+        logger_->bar("[racon::Polisher::initialize] aligning overlaps");
+    } else {
+        logger_->log("[racon::Polisher::initialize] aligned overlaps");
+    }
 }
 
 void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
     bool drop_unpolished_sequences) {
 
+    logger_->log();
+
     std::vector<std::future<bool>> thread_futures;
     for (uint64_t i = 0; i < windows_.size(); ++i) {
-        thread_futures.emplace_back(thread_pool_->submit_task(
+        thread_futures.emplace_back(thread_pool_->submit(
             [&](uint64_t j) -> bool {
                 auto it = thread_to_id_.find(std::this_thread::get_id());
                 if (it == thread_to_id_.end()) {
@@ -452,6 +501,8 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
     std::string polished_data = "";
     uint32_t num_polished_windows = 0;
 
+    uint64_t logger_step = thread_futures.size() / 20;
+
     for (uint64_t i = 0; i < thread_futures.size(); ++i) {
         thread_futures[i].wait();
 
@@ -476,12 +527,18 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
         }
         windows_[i].reset();
 
-        fprintf(stderr, "[racon::Polisher::polish] generated consensus for window %zu/%zu\r",
-            i + 1, thread_futures.size());
+        if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) {
+            logger_->bar("[racon::Polisher::polish] generating consensus");
+        }
+    }
+
+    if (logger_step != 0) {
+        logger_->bar("[racon::Polisher::polish] generating consensus");
+    } else {
+        logger_->log("[racon::Polisher::polish] generated consensus");
     }
-    fprintf(stderr, "\n");
 
-    std::vector<std::unique_ptr<Window>>().swap(windows_);
+    std::vector<std::shared_ptr<Window>>().swap(windows_);
     std::vector<std::unique_ptr<Sequence>>().swap(sequences_);
 }
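
Note, not part of the patch: the logger_step logic above replaces per-item progress lines with roughly 5% increments (size / 20 items per tick). An illustration of the same pattern using plain fprintf instead of the logger library; the names are made up for the example:

    #include <cstdio>
    #include <vector>

    void process_with_progress(const std::vector<int>& work) {
        size_t step = work.size() / 20;   // ~5% of the items per progress tick
        for (size_t i = 0; i < work.size(); ++i) {
            // ... process work[i] ...
            if (step != 0 && (i + 1) % step == 0 && (i + 1) / step < 20) {
                fprintf(stderr, "progress: %zu%%\r", 5 * ((i + 1) / step));
            }
        }
        fprintf(stderr, "progress: 100%%\n");   // final tick, also covers small inputs
    }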
 


=====================================
src/polisher.hpp
=====================================
@@ -25,6 +25,10 @@ namespace spoa {
     class AlignmentEngine;
 }
 
+namespace logger {
+    class Logger;
+}
+
 namespace racon {
 
 class Sequence;
@@ -41,23 +45,26 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
     const std::string& overlaps_path, const std::string& target_path,
     PolisherType type, uint32_t window_length, double quality_threshold,
     double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
-    uint32_t num_threads);
+    uint32_t num_threads, uint32_t cuda_batches = 0,
+    bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0);
 
 class Polisher {
 public:
-    ~Polisher();
+    virtual ~Polisher();
 
-    void initialize();
+    virtual void initialize();
 
-    void polish(std::vector<std::unique_ptr<Sequence>>& dst,
+    virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
         bool drop_unpolished_sequences);
 
     friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
         const std::string& overlaps_path, const std::string& target_path,
         PolisherType type, uint32_t window_length, double quality_threshold,
         double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
-        uint32_t num_threads);
-private:
+        uint32_t num_threads, uint32_t cuda_batches, bool cuda_banded_alignment,
+        uint32_t cudaaligner_batches);
+
+protected:
     Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
         std::unique_ptr<bioparser::Parser<Overlap>> oparser,
         std::unique_ptr<bioparser::Parser<Sequence>> tparser,
@@ -66,6 +73,7 @@ private:
         uint32_t num_threads);
     Polisher(const Polisher&) = delete;
     const Polisher& operator=(const Polisher&) = delete;
+    virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps);
 
     std::unique_ptr<bioparser::Parser<Sequence>> sparser_;
     std::unique_ptr<bioparser::Parser<Overlap>> oparser_;
@@ -81,10 +89,12 @@ private:
     std::string dummy_quality_;
 
     uint32_t window_length_;
-    std::vector<std::unique_ptr<Window>> windows_;
+    std::vector<std::shared_ptr<Window>> windows_;
 
     std::unique_ptr<thread_pool::ThreadPool> thread_pool_;
     std::unordered_map<std::thread::id, uint32_t> thread_to_id_;
+
+    std::unique_ptr<logger::Logger> logger_;
 };
 
 }
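
Note, not part of the patch: because the new createPolisher() parameters default to 0/false in the declaration above, existing CPU-only call sites keep compiling unchanged. A minimal illustrative call, with placeholder file names and values mirroring the test suite:

    auto polisher = racon::createPolisher("reads.fastq", "overlaps.paf", "layout.fasta",
        racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 4);   // no CUDA arguments needed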


=====================================
src/window.cpp
=====================================
@@ -12,7 +12,7 @@
 
 namespace racon {
 
-std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
+std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
     const char* backbone, uint32_t backbone_length, const char* quality,
     uint32_t quality_length) {
 
@@ -22,7 +22,7 @@ std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type
         exit(1);
     }
 
-    return std::unique_ptr<Window>(new Window(id, rank, type, backbone,
+    return std::shared_ptr<Window>(new Window(id, rank, type, backbone,
         backbone_length, quality, quality_length));
 }
 


=====================================
src/window.hpp
=====================================
@@ -24,11 +24,12 @@ enum class WindowType {
 };
 
 class Window;
-std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
+std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank, WindowType type,
     const char* backbone, uint32_t backbone_length, const char* quality,
     uint32_t quality_length);
 
 class Window {
+
 public:
     ~Window();
 
@@ -49,9 +50,13 @@ public:
         const char* quality, uint32_t quality_length, uint32_t begin,
         uint32_t end);
 
-    friend std::unique_ptr<Window> createWindow(uint64_t id, uint32_t rank,
+    friend std::shared_ptr<Window> createWindow(uint64_t id, uint32_t rank,
         WindowType type, const char* backbone, uint32_t backbone_length,
         const char* quality, uint32_t quality_length);
+
+#ifdef CUDA_ENABLED
+    friend class CUDABatchProcessor;
+#endif
 private:
     Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone,
         uint32_t backbone_length, const char* quality, uint32_t quality_length);


=====================================
test/racon_test.cpp
=====================================
@@ -29,11 +29,12 @@ public:
     void SetUp(const std::string& sequences_path, const std::string& overlaps_path,
         const std::string& target_path, racon::PolisherType type,
         uint32_t window_length, double quality_threshold, double error_threshold,
-        int8_t match,int8_t mismatch, int8_t gap) {
+        int8_t match,int8_t mismatch, int8_t gap, uint32_t cuda_batches = 0,
+        bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0) {
 
         polisher = racon::createPolisher(sequences_path, overlaps_path, target_path,
             type, window_length, quality_threshold, error_threshold, match,
-            mismatch, gap, 4);
+            mismatch, gap, 4, cuda_batches, cuda_banded_alignment, cudaaligner_batches);
     }
 
     void TearDown() {}
@@ -99,7 +100,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualities) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -121,7 +122,7 @@ TEST_F(RaconPolishingTest, ConsensusWithoutQualities) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -143,7 +144,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesAndAlignments) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -165,7 +166,7 @@ TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesAndWithAlignments) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -187,7 +188,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesLargerWindow) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -209,7 +210,7 @@ TEST_F(RaconPolishingTest, ConsensusWithQualitiesEditDistance) {
 
     auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
         racon_test_data_path + "sample_reference.fasta.gz");
-    parser->parse_objects(polished_sequences, -1);
+    parser->parse(polished_sequences, -1);
     EXPECT_EQ(polished_sequences.size(), 2);
 
     EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
@@ -287,3 +288,209 @@ TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullMhap) {
     }
     EXPECT_EQ(total_length, 1658216);
 }
+
+#ifdef CUDA_ENABLED
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 1385); // CPU 1312
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+        "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 1607); // CPU 1566
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesAndAlignmentsCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_overlaps.sam.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 1541); // CPU 1317
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithoutQualitiesAndWithAlignmentsCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+        "sample_overlaps.sam.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 5, -4, -8, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 1661); // CPU 1770
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesLargerWindowCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 1000, 10, 0.3, 5, -4, -8, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 4168); // CPU 1289
+}
+
+TEST_F(RaconPolishingTest, ConsensusWithQualitiesEditDistanceCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_overlaps.paf.gz", racon_test_data_path + "sample_layout.fasta.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 1, -1, -1, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 1);
+
+    polished_sequences[0]->create_reverse_complement();
+
+    auto parser = bioparser::createParser<bioparser::FastaParser, racon::Sequence>(
+        racon_test_data_path + "sample_reference.fasta.gz");
+    parser->parse(polished_sequences, -1);
+    EXPECT_EQ(polished_sequences.size(), 2);
+
+    EXPECT_EQ(calculateEditDistance(polished_sequences[0]->reverse_complement(),
+        polished_sequences[1]->data()), 1361); // CPU 1321
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fastq.gz",
+        racon::PolisherType::kC, 500, 10, 0.3, 1, -1, -1, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, true);
+    EXPECT_EQ(polished_sequences.size(), 39);
+
+    uint32_t total_length = 0;
+    for (const auto& it: polished_sequences) {
+        total_length += it->data().size();
+    }
+    EXPECT_EQ(total_length, 385543); // CPU 389394
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fastq.gz",
+        racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, false);
+    EXPECT_EQ(polished_sequences.size(), 236);
+
+    uint32_t total_length = 0;
+    for (const auto& it: polished_sequences) {
+        total_length += it->data().size();
+    }
+    EXPECT_EQ(total_length, 1655505); // CPU 1658216
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithoutQualitiesFullCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fasta.gz", racon_test_data_path +
+        "sample_ava_overlaps.paf.gz", racon_test_data_path + "sample_reads.fasta.gz",
+        racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, false);
+    EXPECT_EQ(polished_sequences.size(), 236);
+
+    uint32_t total_length = 0;
+    for (const auto& it: polished_sequences) {
+        total_length += it->data().size();
+    }
+    EXPECT_EQ(total_length, 1663732); // CPU 1663982
+}
+
+TEST_F(RaconPolishingTest, FragmentCorrectionWithQualitiesFullMhapCUDA) {
+    SetUp(racon_test_data_path + "sample_reads.fastq.gz", racon_test_data_path +
+        "sample_ava_overlaps.mhap.gz", racon_test_data_path + "sample_reads.fastq.gz",
+        racon::PolisherType::kF, 500, 10, 0.3, 1, -1, -1, 1);
+
+    initialize();
+
+    std::vector<std::unique_ptr<racon::Sequence>> polished_sequences;
+    polish(polished_sequences, false);
+    EXPECT_EQ(polished_sequences.size(), 236);
+
+    uint32_t total_length = 0;
+    for (const auto& it: polished_sequences) {
+        total_length += it->data().size();
+    }
+    EXPECT_EQ(total_length, 1655505); // CPU 1658216 
+}
+#endif



View it on GitLab: https://salsa.debian.org/med-team/racon/compare/bc5f1f7386b29a55ba2bfd0af0c8d193e7931db0...07bf95601d09fa77eb6329165e5b121a4e9aa011
