[med-svn] [Git][med-team/rampler][master] 6 commits: routine-update: New upstream version

Michael R. Crusoe gitlab at salsa.debian.org
Tue Jan 19 07:02:39 GMT 2021



Michael R. Crusoe pushed to branch master at Debian Med / rampler


Commits:
67b948b5 by Michael R. Crusoe at 2021-01-19T07:36:41+01:00
routine-update: New upstream version

- - - - -
6976c32e by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
New upstream version 2.0.0
- - - - -
0f88ebc5 by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
Update upstream source from tag 'upstream/2.0.0'

Update to upstream version '2.0.0'
with Debian dir beebcf81f52b4afaf56fa44ebfe31fd2d81d0e63
- - - - -
4631d4d1 by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
routine-update: Standards-Version: 4.5.1

- - - - -
8cf56564 by Michael R. Crusoe at 2021-01-19T08:01:36+01:00
Refreshed patches, removed arm64.patch.

- - - - -
6583bdba by Michael R. Crusoe at 2021-01-19T08:01:37+01:00
routine-update: Ready to upload to unstable

- - - - -


13 changed files:

- .travis.yml
- CMakeLists.txt
- README.md
- debian/changelog
- debian/control
- − debian/patches/arm64.patch
- debian/patches/series
- debian/patches/use_debian_packaged_libs.patch
- src/main.cpp
- src/sampler.cpp
- src/sampler.hpp
- − src/sequence.cpp
- − src/sequence.hpp


Changes:

=====================================
.travis.yml
=====================================
@@ -2,41 +2,46 @@ dist: trusty
 
 language: cpp
 
-compiler:
-    - clang
-    - gcc
+matrix:
+  include:
+    - name: "GCC 4.8 (Linux)"  # GCC 4.8.5 & CMake 3.9.2
+      os: linux
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - cmake
+      env:
+        - SET_COMPILER="export CC=gcc-4.8 && export CXX=g++-4.8"
+
+    - name: "Clang 4.0 (Linux)"  # Clang 4.0 & CMake 3.9.2
+      os: linux
+      addons:
+        apt:
+          sources:
+            - llvm-toolchain-trusty-4.0
+          packages:
+            - clang-4.0
+            - cmake
+      env:
+        - SET_COMPILER="export CC=clang-4.0 && export CXX=clang++-4.0"
+
+    - name: "Clang Xcode 9.0 (OSX)"  # Clang 9.0.0 & CMake 3.9.2
+      os: osx
+      osx_image: xcode9
 
 before_install:
-    # cmake 3.2
-    - sudo add-apt-repository ppa:george-edison55/cmake-3.x -y
-
-    # g++4.8.1
-    - if [ "$CXX" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi
-
-    # clang 3.4
-    - if [ "$CXX" == "clang++" ]; then sudo add-apt-repository -y ppa:h-rayflood/llvm; fi
-
-    - sudo apt-get update -qq
+  - eval "${SET_COMPILER}"
 
 install:
-    # cmake 3.2
-    - sudo apt-get install cmake cmake-data
-
-    # g++4.8.1
-    - if [ "$CXX" == "g++" ]; then sudo apt-get install -qq g++-4.8; fi
-    - if [ "$CXX" == "g++" ]; then export CXX="g++-4.8"; fi
-
-    # clang 3.4
-    - if [ "$CXX" == "clang++" ]; then sudo apt-get install --allow-unauthenticated -qq clang-3.4; fi
-    - if [ "$CXX" == "clang++" ]; then export CXX="clang++-3.4"; fi
+  - mkdir build && cd build
+  - cmake -DCMAKE_BUILD_TYPE=Release .. && make
 
 script:
-    - mkdir build
-    - cd build
-    - cmake -DCMAKE_BUILD_TYPE=Release ..
-    - make
+  - ./bin/rampler --version
 
 notifications:
-    email:
-        on_success: change
-        on_failure: always
+  email:
+    on_failure: always


=====================================
CMakeLists.txt
=====================================
@@ -1,23 +1,33 @@
-cmake_minimum_required(VERSION 3.2)
-project(rampler)
+cmake_minimum_required(VERSION 3.9)
 
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
+project(rampler VERSION 2.0.0
+                LANGUAGES CXX
+                DESCRIPTION "Rampler is a tool for subsampling or splitting FASTA/Q files.")
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
-add_executable(rampler
-    src/main.cpp
-    src/sequence.cpp
-    src/sampler.cpp)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
 
 if (NOT TARGET bioparser)
-    add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
-endif()
+  add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
+endif ()
+if (NOT TARGET biosoup)
+  add_subdirectory(vendor/bioparser/vendor/biosoup EXCLUDE_FROM_ALL)
+endif ()
 
-target_link_libraries(rampler bioparser)
+add_executable(${PROJECT_NAME}
+  src/main.cpp
+  src/sampler.cpp)
+target_link_libraries(${PROJECT_NAME}
+  bioparser
+  biosoup)
+target_compile_definitions(${PROJECT_NAME} PRIVATE
+  RAMPLER_VERSION="v${PROJECT_VERSION}")
 
-install(TARGETS rampler DESTINATION bin)
+include(GNUInstallDirs)
+install(TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})


=====================================
README.md
=====================================
@@ -1,73 +1,57 @@
 # Rampler
 
 [![Latest GitHub release](https://img.shields.io/github/release/rvaser/rampler.svg)](https://github.com/rvaser/rampler/releases/latest)
-[![Build status for c++/clang++](https://travis-ci.org/rvaser/rampler.svg?branch=master)](https://travis-ci.org/rvaser/rampler)
+[![Build status for c++/clang++](https://travis-ci.com/rvaser/rampler.svg?branch=master)](https://travis-ci.com/rvaser/rampler)
 
-Standalone module for sampling genomic sequences. It supports two modes, random subsampling of sequencer data to a desired depth (given the reference length) and file splitting to desired size in bytes.
+Rampler is a standalone module for sampling genomic sequences. It supports two modes, random subsampling of sequencing data to a desired depth (given the reference length) and file splitting to desired size in bytes.
 
-Rampler takes as first input argument a file in FASTA/FASTQ format which can be compressed with gzip. The rest of input parameters depend on the mode of operation. The output is stored into a file(s) which is in the same format as the input file but uncompressed.
-
-## Dependencies
-1. gcc 4.8+ or clang 3.4+
-2. cmake 3.2+
-
-## Installation
-To install Rampler run the following commands:
+## Usage
 
+To build rampler run the following commands:
 ```bash
 git clone --recursive https://github.com/rvaser/rampler.git rampler
-cd rampler
-mkdir build
-cd build
-cmake -DCMAKE_BUILD_TYPE=Release ..
-make
+cd rampler && mkdir build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release .. && make
+./bin/rampler
 ```
 
-After successful installation, an executable named `rampler` will appear in `build/bin`.
-
-Optionally, you can run `sudo make install` to install rampler executable to your machine.
-
-***Note***: if you omitted `--recursive` from `git clone`, run `git submodule update --init --recursive` before proceeding with compilation.
-
-## Usage
-Usage of rampler is as following:
-
-    usage: rampler [options ...] <mode>
+which will display the following usage:
 
-    <mode>
-        subsample <sequences> <reference length> <coverage> [<coverage> ...]
-
-            <sequences>
-                input file in FASTA/FASTQ format (can be compressed with gzip)
-                containing sequences to be subsampled
-            <reference length>
-                integer denoting length of the reference genome (or
-                assembly) from which the sequences originate
-            <coverage>
-                integer denoting desired coverage of the subsampled
-                sequences
-
-        split <sequences> <chunk size>
-
-            <sequences>
-                input file in FASTA/FASTQ format (can be compressed with gzip)
-                containing sequences which will be split into smaller chunks
-            <chunk size>
-                integer denoting the desired chunk size in bytes
-
-    options:
-        -o, --out-directory
-            default: current directory
-            path in which sampled files will be created
-        --version
-            prints the version number
-        -h, --help
-            prints out the help
-
-## Contact information
+```bash
+usage: rampler [options ...] <mode>
+
+  <mode>
+    subsample <sequences> <reference length> <coverage> [<coverage> ...]
+
+      <sequences>
+        input file in FASTA/FASTQ format (can be compressed with gzip)
+      <reference length>
+        integer denoting length of the reference genome (or assembly)
+      <coverage>
+        integer denoting desired coverage of the subsampled sequences
+
+    split <sequences> <chunk size>
+
+      <sequences>
+        input file in FASTA/FASTQ format (can be compressed with gzip)
+        containing sequences which will be split into smaller chunks
+      <chunk size>
+        integer denoting the desired chunk size in bytes
+
+  options:
+    -o, --out-directory <string>
+      default: current directory
+      path in which sampled files will be created
+    --version
+      prints the version number
+    -h, --help
+      prints out the help
+```
 
-For additional information, help and bug reports please send an email to one of the following: robert.vaser at fer.hr
+#### Dependencies
+1. gcc 4.8+ or clang 4.0+
+2. cmake 3.9+
+3. zlib
 
 ## Acknowledgment
-
 This work has been supported in part by Croatian Science Foundation under the project UIP-11-2013-7353.


=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+rampler (2.0.0-1) unstable; urgency=medium
+
+  * Team upload.
+  * New upstream version
+  * Standards-Version: 4.5.1 (routine-update)
+  * Refreshed patches, removed arm64.patch.
+
+ -- Michael R. Crusoe <crusoe at debian.org>  Tue, 19 Jan 2021 07:40:24 +0100
+
 rampler (1.1.1-3) unstable; urgency=medium
 
   * Team upload.


=====================================
debian/control
=====================================
@@ -7,7 +7,7 @@ Priority: optional
 Build-Depends: debhelper-compat (= 13),
                cmake,
                libbioparser-dev
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
 Vcs-Browser: https://salsa.debian.org/med-team/rampler
 Vcs-Git: https://salsa.debian.org/med-team/rampler.git
 Homepage: https://github.com/rvaser/rampler


=====================================
debian/patches/arm64.patch deleted
=====================================
@@ -1,16 +0,0 @@
-Author: Nilesh Patra <npatra974 at gmail.com>
-Description: Change char to signed for comparison - fixes arm64 tests
-Last-Changed: October 1, 2020
-Forwarded: https://github.com/rvaser/rampler/pull/3
---- a/src/main.cpp
-+++ b/src/main.cpp
-@@ -24,7 +24,7 @@
- 
-     std::string out_directory = ".";
- 
--    char argument;
-+    signed char argument;
-     while ((argument = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
-         switch (argument) {
-             case 'o':
-


=====================================
debian/patches/series
=====================================
@@ -1,2 +1 @@
 use_debian_packaged_libs.patch
-arm64.patch


=====================================
debian/patches/use_debian_packaged_libs.patch
=====================================
@@ -1,18 +1,27 @@
 Author: Andreas Tille <tille at debian.org>
-Last-Update:  Fri, 08 Jun 2018 13:20:51 +0200
+Last-Update:  2021-01-19
 Description: Use Debian packaged libraries
-
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -14,10 +14,6 @@ add_executable(rampler
-     src/sequence.cpp
-     src/sampler.cpp)
+Forwarded: not-needed
+--- rampler.orig/CMakeLists.txt
++++ rampler/CMakeLists.txt
+@@ -13,19 +13,11 @@
+ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
  
 -if (NOT TARGET bioparser)
--    add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
--endif()
+-  add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
+-endif ()
+-if (NOT TARGET biosoup)
+-  add_subdirectory(vendor/bioparser/vendor/biosoup EXCLUDE_FROM_ALL)
+-endif ()
 -
--target_link_libraries(rampler bioparser)
-+target_link_libraries(rampler z)
+ add_executable(${PROJECT_NAME}
+   src/main.cpp
+   src/sampler.cpp)
+ target_link_libraries(${PROJECT_NAME}
+-  bioparser
+-  biosoup)
++  z)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE
+   RAMPLER_VERSION="v${PROJECT_VERSION}")
  
- install(TARGETS rampler DESTINATION bin)


=====================================
src/main.cpp
=====================================
@@ -1,120 +1,164 @@
-#include <stdio.h>
-#include <stdlib.h>
+// Copyright (c) 2021 Robert Vaser
+
 #include <getopt.h>
 
-#include "sequence.hpp"
+#include <iostream>
+
+#include "bioparser/fasta_parser.hpp"
+#include "bioparser/fastq_parser.hpp"
+
 #include "sampler.hpp"
 
-#include "bioparser/bioparser.hpp"
+std::atomic<std::uint32_t> biosoup::Sequence::num_objects{0};
 
-static const char* version = "v1.1.1";
+namespace {
+
+static const char* rampler_version = RAMPLER_VERSION;
 
 static struct option options[] = {
-    {"out-directory", required_argument, 0, 'o'},
-    {"version", no_argument, 0, 'v'},
-    {"help", no_argument, 0, 'h'},
-    {0, 0, 0, 0}
+  {"out-directory", required_argument, nullptr, 'o'},
+  {"version", no_argument, nullptr, 'v'},
+  {"help", no_argument, nullptr, 'h'},
+  {nullptr, 0, nullptr, 0}
 };
 
-void help();
-
-int main(int argc, char** argv) {
-
-    std::vector<std::string> input_parameters;
-
-    std::string out_directory = ".";
-
-    char argument;
-    while ((argument = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
-        switch (argument) {
-            case 'o':
-                out_directory = optarg;
-                break;
-            case 'v':
-                printf("%s\n", version);
-                exit(0);
-            case 'h':
-                help();
-                exit(0);
-            default:
-                exit(1);
-        }
+std::unique_ptr<bioparser::Parser<biosoup::Sequence>> CreateParser(
+    const std::string& path, std::string* name, std::string* ext) {
+  auto is_suffix = [] (const std::string& s, const std::string& suff) {
+    return s.size() < suff.size() ? false :
+        s.compare(s.size() - suff.size(), suff.size(), suff) == 0;
+  };
+
+  std::size_t c = path.rfind('/');
+  *name = (c == std::string::npos ? path : path.substr(c + 1));
+
+  c = name->find('.');
+  *ext = (c == std::string::npos ? "" : name->substr(c, name->find('.', c + 1) - c));  // NOLINT
+  *name = (c == std::string::npos ? *name : name->substr(0, c));
+
+  if (is_suffix(path, ".fasta")    || is_suffix(path, ".fa") ||
+      is_suffix(path, ".fasta.gz") || is_suffix(path, ".fa.gz")) {
+    try {
+      return bioparser::Parser<biosoup::Sequence>::Create<bioparser::FastaParser>(path);  // NOLINT
+    } catch (const std::invalid_argument& exception) {
+      std::cerr << exception.what() << std::endl;
+      return nullptr;
     }
-
-    if (optind == argc) {
-        fprintf(stderr, "[rampler::] error: too few arguments!\n");
-        help();
-        exit(1);
+  }
+  if (is_suffix(path, ".fastq")    || is_suffix(path, ".fq") ||
+      is_suffix(path, ".fastq.gz") || is_suffix(path, ".fq.gz")) {
+    try {
+      return bioparser::Parser<biosoup::Sequence>::Create<bioparser::FastqParser>(path);  // NOLINT
+    } catch (const std::invalid_argument& exception) {
+      std::cerr << exception.what() << std::endl;
+      return nullptr;
     }
+  }
 
-    for (int32_t i = optind; i < argc; ++i) {
-        input_parameters.emplace_back(argv[i]);
-    }
-
-    bool do_subsample = false, do_split = false;
-    if (input_parameters[0] == "subsample") {
-        do_subsample = true;
-    } else if (input_parameters[0] == "split") {
-        do_split = true;
-    } else {
-        fprintf(stderr, "[rampler::] error: unkown mode %s!\n", input_parameters[0].c_str());
-        exit(1);
-    }
+  std::cerr << "[rampler::CreateParser] error: file " << path
+            << " has unsupported format extension (valid extensions: .fasta, "
+            << ".fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)"
+            << std::endl;
+  return nullptr;
+}
 
-    if ((do_subsample && input_parameters.size() < 4) ||
-        (do_split && input_parameters.size() < 3)) {
+void Help() {
+  std::cout <<
+      "usage: rampler [options ...] <mode>\n"
+      "\n"
+      "  <mode>\n"
+      "    subsample <sequences> <reference length> <coverage> [<coverage> ...]\n"  // NOLINT
+      "\n"
+      "      <sequences>\n"
+      "        input file in FASTA/FASTQ format (can be compressed with gzip)\n"
+      "      <reference length>\n"
+      "        integer denoting length of the reference genome (or assembly)\n"
+      "      <coverage>\n"
+      "        integer denoting desired coverage of the subsampled sequences\n"
+      "\n"
+      "    split <sequences> <chunk size>\n"
+      "\n"
+      "      <sequences>\n"
+      "        input file in FASTA/FASTQ format (can be compressed with gzip)\n"
+      "      <chunk size>\n"
+      "        integer denoting the desired chunk size in bytes\n"
+      "\n"
+      "  options:\n"
+      "    -o, --out-directory <string>\n"
+      "      default: current directory\n"
+      "      path in which sampled files will be created\n"
+      "    --version\n"
+      "      prints the version number\n"
+      "    -h, --help\n"
+      "      prints the usage\n";
+}
 
-        fprintf(stderr, "[rampler::] error: missing input parameter(s)!\n");
-        exit(1);
-    }
+}  // namespace
 
-    auto sampler = rampler::createSampler(input_parameters[1]);
-    sampler->initialize();
-
-    if (do_split) {
-        sampler->split(out_directory, atoi(input_parameters[2].c_str()));
-    } else if (do_subsample) {
-        uint32_t reference_length = atoi(input_parameters[2].c_str());
-        for (uint32_t i = 3; i < input_parameters.size(); ++i) {
-            sampler->subsample(out_directory, reference_length,
-                atoi(input_parameters[i].c_str()));
-        }
+int main(int argc, char** argv) {
+  std::vector<std::string> input_parameters;
+  std::string out_directory = ".";
+
+  int arg;
+  while ((arg = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
+    switch (arg) {
+      case 'o': out_directory = optarg; break;
+      case 'v': std::cerr << rampler_version << std::endl; return 0;
+      case 'h': Help(); return 0;
+      default: return 1;
     }
+  }
 
+  if (argc == 1) {
+    Help();
     return 0;
-}
+  }
+
+  if (optind == argc) {
+    std::cerr << "[rampler::] error: missing arguments!" << std::endl;
+    return 1;
+  }
+
+  for (std::int32_t i = optind; i < argc; ++i) {
+    input_parameters.emplace_back(argv[i]);
+  }
+
+  bool subsample = false, split = false;
+  if (input_parameters[0] == "subsample") {
+    subsample = true;
+  } else if (input_parameters[0] == "split") {
+    split = true;
+  } else {
+    std::cerr << "[rampler::] error: unknown mode!" << std::endl;
+    return 1;
+  }
+
+  if ((subsample && input_parameters.size() < 4) ||
+      (split && input_parameters.size() < 3)) {
+    std::cerr << "[rampler::] error: missing arguments!" << std::endl;
+    return 1;
+  }
+
+  std::string name{}, ext{};
+  auto sparser = CreateParser(input_parameters[1], &name, &ext);
+  if (sparser == nullptr) {
+    return 1;
+  }
+
+  rampler::Sampler sampler{std::move(sparser), name, ext};
+  sampler.Initialize();
+
+  if (split) {
+    sampler.Split(out_directory, atoi(input_parameters[2].c_str()));
+  } else if (subsample) {
+    std::uint32_t reference_length = atoi(input_parameters[2].c_str());
+    for (std::uint32_t i = 3; i < input_parameters.size(); ++i) {
+      sampler.Subsample(
+          out_directory,
+          reference_length,
+          atoi(input_parameters[i].c_str()));
+    }
+  }
 
-void help() {
-    printf(
-        "usage: rampler [options ...] <mode>\n"
-        "\n"
-        "    <mode>\n"
-        "        subsample <sequences> <reference length> <coverage> [<coverage> ...]\n"
-        "\n"
-        "            <sequences>\n"
-        "                input file in FASTA/FASTQ format (can be compressed with gzip)\n"
-        "                containing sequences to be subsampled\n"
-        "            <reference length>\n"
-        "                integer denoting length of the reference genome (or\n"
-        "                assembly) from which the sequences originate\n"
-        "            <coverage>\n"
-        "                integer denoting desired coverage of the subsampled\n"
-        "                sequences\n"
-        "\n"
-        "        split <sequences> <chunk size>\n"
-        "\n"
-        "            <sequences>\n"
-        "                input file in FASTA/FASTQ format (can be compressed with gzip)\n"
-        "                containing sequences which will be split into smaller chunks\n"
-        "            <chunk size>\n"
-        "                integer denoting the desired chunk size in bytes\n"
-        "\n"
-        "    options:\n"
-        "        -o, --out-directory\n"
-        "            default: current directory\n"
-        "            path in which sampled files will be created\n"
-        "        --version\n"
-        "            prints the version number\n"
-        "        -h, --help\n"
-        "            prints the usage\n");
+  return 0;
 }


=====================================
src/sampler.cpp
=====================================
@@ -1,181 +1,138 @@
-/*!
- * @file sampler.cpp
- *
- * @brief Sampler class source file
- */
+// Copyright (c) 2021 Robert Vaser
 
-#include <random>
-
-#include "sequence.hpp"
 #include "sampler.hpp"
 
-#include "bioparser/bioparser.hpp"
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <utility>
 
 namespace rampler {
 
-constexpr uint32_t kChunkSize = 1024 * 1024 * 1024; // ~ 1GB
-
-std::unique_ptr<Sampler> createSampler(const std::string& sequences_path) {
-
-    std::unique_ptr<bioparser::Parser<Sequence>> sparser = nullptr;
-
-    std::string base_name = sequences_path.substr(sequences_path.rfind('/') + 1);
-    base_name = base_name.substr(0, base_name.find('.'));
-    std::string extension;
-
-    auto is_suffix = [](const std::string& src, const std::string& suffix) -> bool {
-        if (src.size() < suffix.size()) {
-            return false;
-        }
-        return src.compare(src.size() - suffix.size(), suffix.size(), suffix) == 0;
-    };
-
-    if (is_suffix(sequences_path, ".fasta") || is_suffix(sequences_path, ".fa") ||
-        is_suffix(sequences_path, ".fasta.gz") || is_suffix(sequences_path, ".fa.gz")) {
-        sparser = bioparser::createParser<bioparser::FastaParser, Sequence>(
-            sequences_path);
-        extension = ".fasta";
-    } else if (is_suffix(sequences_path, ".fastq") || is_suffix(sequences_path, ".fq") ||
-        is_suffix(sequences_path, ".fastq.gz") || is_suffix(sequences_path, ".fq.gz")) {
-        sparser = bioparser::createParser<bioparser::FastqParser, Sequence>(
-            sequences_path);
-        extension = ".fastq";
-    } else {
-        fprintf(stderr, "[rampler::createSampler] error: "
-            "file %s has unsupported format extension (valid extensions: "
-            ".fasta, .fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)!\n",
-            sequences_path.c_str());
-        exit(1);
-    }
-
-    return std::unique_ptr<Sampler>(new Sampler(std::move(sparser), base_name,
-        extension));
-}
+constexpr uint32_t kChunkSize = 1024 * 1024 * 1024;  // ~ 1GB
 
-Sampler::Sampler(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
-    const std::string& base_name, const std::string& extension)
-        : sparser_(std::move(sparser)), sequences_length_(0), base_name_(base_name),
-        extension_(extension) {
+Sampler::Sampler(
+    std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser,
+    const std::string& base_name,
+    const std::string& extension)
+    : sparser_(std::move(sparser)),
+      sequences_length_(0),
+      base_name_(base_name),
+      extension_(extension) {
 }
 
-Sampler::~Sampler() {
-}
+void Sampler::Initialize() {
+  if (sequences_length_ != 0) {
+    return;
+  }
 
-void Sampler::initialize() {
-
-    if (sequences_length_ != 0) {
-        fprintf(stderr, "[rampler::Sampler::initialize] warning: "
-            "object already initialized!\n");
-        return;
+  sparser_->Reset();
+  while (true) {
+    auto sequences = sparser_->Parse(1ULL << 30);
+    if (sequences.empty()) {
+      break;
     }
 
-    sparser_->reset();
-    while (true) {
-        std::vector<std::unique_ptr<Sequence>> sequences;
-        auto status = sparser_->parse(sequences, kChunkSize);
-
-        for (const auto& it: sequences) {
-            sequences_length_ += it->data().size();
-        }
-
-        if (!status) {
-            break;
-        }
+    for (const auto& it : sequences) {
+      sequences_length_ += it->data.size();
     }
+  }
 }
 
-void Sampler::subsample(const std::string& out_directory, uint32_t reference_length,
-    uint32_t coverage) {
-
-    if (coverage * reference_length > sequences_length_) {
-        fprintf(stderr, "[rampler::Sampler::subsample] warning: "
-            "insufficient data for coverage %u!\n", coverage);
-        return;
+void Sampler::Subsample(
+    const std::string& out_directory,
+    std::uint32_t reference_length,
+    std::uint32_t coverage) {
+  if (coverage * reference_length > sequences_length_) {
+    std::cerr << "[rampler::Sampler::subsample] warning: "
+              << "insufficient data for coverage of " << coverage
+              << std::endl;
+    return;
+  }
+
+  std::random_device r;
+  std::mt19937 generator(r());
+  std::uniform_real_distribution<double> distribution(0.0, 1.0);
+
+  double ratio = (coverage * reference_length) / static_cast<double>(sequences_length_);  // NOLINT
+
+  std::string out_path =
+      out_directory + "/" + base_name_ + "_" +
+      std::to_string(coverage) + "x" + extension_;
+
+  std::ofstream ofs(out_path);
+  if (!ofs.is_open()) {
+    throw std::runtime_error(
+        "[rampler::Sampler::subsample] error: unable to create file on disk!");
+  }
+
+  sparser_->Reset();
+  while (true) {
+    auto sequences = sparser_->Parse(1ULL << 30);
+    if (sequences.empty()) {
+      break;
     }
 
-    std::random_device r;
-    std::mt19937 generator(r());
-    std::uniform_real_distribution<double> distribution(0.0, 1.0);
-
-    double ratio = (coverage * reference_length) / static_cast<double>(
-        sequences_length_);
-
-    std::string out_path = out_directory + "/" + base_name_ + "_" +
-        std::to_string(coverage) + "x" + extension_;
-    auto out = fopen(out_path.c_str(), "w");
-    if (out == nullptr) {
-        fprintf(stderr, "rampler::Sampler::subsample] error: "
-            "unable to create file %s!\n", out_path.c_str());
-        exit(1);
-    }
-
-    sparser_->reset();
-    while (true) {
-        std::vector<std::unique_ptr<Sequence>> sequences;
-        auto status = sparser_->parse(sequences, kChunkSize);
-
-        for (const auto& it: sequences) {
-            if (distribution(generator) < ratio) {
-                if (it->quality().empty()) {
-                    fprintf(out, ">%s\n%s\n", it->name().c_str(),
-                        it->data().c_str());
-                } else {
-                    fprintf(out, "@%s\n%s\n+\n%s\n", it->name().c_str(),
-                        it->data().c_str(), it->quality().c_str());
-                }
-            }
-        }
-
-        if (!status) {
-            break;
+    for (const auto& it : sequences) {
+      if (distribution(generator) < ratio) {
+        if (it->quality.empty()) {
+          ofs << ">" << it->name << std::endl
+              << it->data << std::endl;
+        } else {
+          ofs << "@" << it->name << std::endl
+              << it->data << std::endl
+              << "+" << std::endl
+              << it->quality << std::endl;
         }
+      }
     }
+  }
 
-    fclose(out);
+  ofs.close();
 }
 
-void Sampler::split(const std::string& out_directory, uint32_t chunk_size) {
-
-    if (chunk_size > sequences_length_) {
-        fprintf(stderr, "[rampler::Sampler::split] warning: "
-            "insufficient data for chunk size %u!\n", chunk_size);
-        return;
+void Sampler::Split(const std::string& out_directory, std::uint32_t chunk_size) {  // NOLINT
+  if (chunk_size > sequences_length_) {
+    std::cerr << "[rampler::Sampler::split] warning: "
+              << "insufficient data for chunk size " << chunk_size
+              << std::endl;
+    return;
+  }
+
+  uint32_t chunk_number = 0;
+
+  sparser_->Reset();
+  while (true) {
+    auto sequences = sparser_->Parse(chunk_size);
+    if (sequences.empty()) {
+      break;
     }
 
-    uint32_t chunk_number = 0;
+    std::string out_path =
+        out_directory + "/" + base_name_ + "_" +
+        std::to_string(chunk_number++) + extension_;
 
-    sparser_->reset();
-    while (true) {
-        std::vector<std::unique_ptr<Sequence>> sequences;
-        auto status = sparser_->parse(sequences, chunk_size);
-
-        std::string out_path = out_directory + "/" + base_name_ + "_" +
-            std::to_string(chunk_number) + extension_;
-        auto out = fopen(out_path.c_str(), "w");
-        if (out == nullptr) {
-            fprintf(stderr, "rampler::Sampler::subsample] error: "
-                "unable to create file %s!\n", out_path.c_str());
-            exit(1);
-        }
-
-        for (const auto& it: sequences) {
-            if (it->quality().empty()) {
-                fprintf(out, ">%s\n%s\n", it->name().c_str(),
-                    it->data().c_str());
-            } else {
-                fprintf(out, "@%s\n%s\n+\n%s\n", it->name().c_str(),
-                    it->data().c_str(), it->quality().c_str());
-            }
-        }
-
-        fclose(out);
-
-        ++chunk_number;
+    std::ofstream ofs(out_path);
+    if (!ofs.is_open()) {
+      throw std::runtime_error(
+          "[rampler::Sampler::subsample] error: unable to create file on disk!");  // NOLINT
+    }
 
-        if (!status) {
-            break;
-        }
+    for (const auto& it : sequences) {
+      if (it->quality.empty()) {
+        ofs << ">" << it->name << std::endl
+            << it->data << std::endl;
+      } else {
+        ofs << "@" << it->name << std::endl
+            << it->data << std::endl
+            << "+" << std::endl
+            << it->quality << std::endl;
+      }
     }
-}
 
+    ofs.close();
+  }
 }
+
+}  // namespace rampler


=====================================
src/sampler.hpp
=====================================
@@ -1,19 +1,15 @@
-/*!
- * @file sampler.hpp
- *
- * @brief Sampler class header file
- */
+// Copyright (c) 2021 Robert Vaser
 
-#pragma once
+#ifndef RAMPLER_SAMPLER_HPP_
+#define RAMPLER_SAMPLER_HPP_
 
-#include <stdlib.h>
-#include <vector>
+#include <cstdint>
 #include <memory>
+#include <string>
+#include <vector>
 
-namespace bioparser {
-    template<class T>
-    class Parser;
-}
+#include "bioparser/parser.hpp"
+#include "biosoup/sequence.hpp"
 
 namespace rampler {
 
@@ -21,27 +17,36 @@ class Sampler;
 std::unique_ptr<Sampler> createSampler(const std::string& sequences_path);
 
 class Sampler {
-public:
-    ~Sampler();
+ public:
+  Sampler(
+      std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser,
+      const std::string& base_name,
+      const std::string& extension);
+
+  Sampler(const Sampler&) = delete;
+  Sampler& operator=(const Sampler&) = delete;
 
-    void initialize();
+  Sampler(Sampler&&) = delete;
+  Sampler& operator=(Sampler&&) = delete;
 
-    void subsample(const std::string& out_directory, uint32_t reference_length,
-        uint32_t coverage);
+  ~Sampler() = default;
 
-    void split(const std::string& out_directory, uint32_t chunk_size);
+  void Initialize();
 
-    friend std::unique_ptr<Sampler> createSampler(const std::string& sequences_path);
-private:
-    Sampler(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
-        const std::string& base_name, const std::string& extension);
-    Sampler(const Sampler&) = delete;
-    const Sampler& operator=(const Sampler&) = delete;
+  void Subsample(
+      const std::string& out_directory,
+      std::uint32_t reference_length,
+      std::uint32_t coverage);
 
-    std::unique_ptr<bioparser::Parser<Sequence>> sparser_;
-    uint64_t sequences_length_;
-    std::string base_name_;
-    std::string extension_;
+  void Split(const std::string& out_directory, std::uint32_t chunk_size);
+
+ private:
+  std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser_;
+  std::uint64_t sequences_length_;
+  std::string base_name_;
+  std::string extension_;
 };
 
-}
+}  // namespace rampler
+
+#endif  // RAMPLER_SAMPLER_HPP_


=====================================
src/sequence.cpp deleted
=====================================
@@ -1,22 +0,0 @@
-/*!
- * @file sequence.cpp
- *
- * @brief Sequence class source file
- */
-
-#include "sequence.hpp"
-
-namespace rampler {
-
-Sequence::Sequence(const char* name, uint32_t name_length, const char* data,
-    uint32_t data_length)
-        : name_(name, name_length), data_(data, data_length), quality_() {
-}
-
-Sequence::Sequence(const char* name, uint32_t name_length, const char* data,
-    uint32_t data_length, const char* quality, uint32_t quality_length)
-        : name_(name, name_length), data_(data, data_length), quality_(quality,
-        quality_length) {
-}
-
-}


=====================================
src/sequence.hpp deleted
=====================================
@@ -1,53 +0,0 @@
-/*!
- * @file sequence.hpp
- *
- * @brief Sequence class header file
- */
-
-#pragma once
-
-#include <stdint.h>
-#include <string>
-
-namespace bioparser {
-    template<class T>
-    class FastaParser;
-
-    template<class T>
-    class FastqParser;
-}
-
-namespace rampler {
-
-class Sequence {
-public:
-    ~Sequence() = default;
-
-    const std::string& name() const {
-        return name_;
-    }
-
-    const std::string& data() const {
-        return data_;
-    }
-
-    const std::string& quality() const {
-        return quality_;
-    }
-
-    friend bioparser::FastaParser<Sequence>;
-    friend bioparser::FastqParser<Sequence>;
-private:
-    Sequence(const char* name, uint32_t name_length, const char* data,
-        uint32_t data_length);
-    Sequence(const char* name, uint32_t name_length, const char* data,
-        uint32_t data_length, const char* quality, uint32_t quality_length);
-    Sequence(const Sequence&) = delete;
-    const Sequence& operator=(const Sequence&) = delete;
-
-    std::string name_;
-    std::string data_;
-    std::string quality_;
-};
-
-}



View it on GitLab: https://salsa.debian.org/med-team/rampler/-/compare/3e81d73a55dc4e719cae81d756843de6b6d4506c...6583bdba24466729710867275c1fa96d0e7d363b

-- 
View it on GitLab: https://salsa.debian.org/med-team/rampler/-/compare/3e81d73a55dc4e719cae81d756843de6b6d4506c...6583bdba24466729710867275c1fa96d0e7d363b
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210119/5cb6138a/attachment-0001.html>


More information about the debian-med-commit mailing list