[med-svn] [Git][med-team/rampler][master] 6 commits: routine-update: New upstream version
Michael R. Crusoe
gitlab at salsa.debian.org
Tue Jan 19 07:02:39 GMT 2021
Michael R. Crusoe pushed to branch master at Debian Med / rampler
Commits:
67b948b5 by Michael R. Crusoe at 2021-01-19T07:36:41+01:00
routine-update: New upstream version
- - - - -
6976c32e by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
New upstream version 2.0.0
- - - - -
0f88ebc5 by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
Update upstream source from tag 'upstream/2.0.0'
Update to upstream version '2.0.0'
with Debian dir beebcf81f52b4afaf56fa44ebfe31fd2d81d0e63
- - - - -
4631d4d1 by Michael R. Crusoe at 2021-01-19T07:36:42+01:00
routine-update: Standards-Version: 4.5.1
- - - - -
8cf56564 by Michael R. Crusoe at 2021-01-19T08:01:36+01:00
Refreshed patches, removed arm64.patch.
- - - - -
6583bdba by Michael R. Crusoe at 2021-01-19T08:01:37+01:00
routine-update: Ready to upload to unstable
- - - - -
13 changed files:
- .travis.yml
- CMakeLists.txt
- README.md
- debian/changelog
- debian/control
- − debian/patches/arm64.patch
- debian/patches/series
- debian/patches/use_debian_packaged_libs.patch
- src/main.cpp
- src/sampler.cpp
- src/sampler.hpp
- − src/sequence.cpp
- − src/sequence.hpp
Changes:
=====================================
.travis.yml
=====================================
@@ -2,41 +2,46 @@ dist: trusty
language: cpp
-compiler:
- - clang
- - gcc
+matrix:
+ include:
+ - name: "GCC 4.8 (Linux)" # GCC 4.8.5 & CMake 3.9.2
+ os: linux
+ addons:
+ apt:
+ sources:
+ - ubuntu-toolchain-r-test
+ packages:
+ - g++-4.8
+ - cmake
+ env:
+ - SET_COMPILER="export CC=gcc-4.8 && export CXX=g++-4.8"
+
+ - name: "Clang 4.0 (Linux)" # Clang 4.0 & CMake 3.9.2
+ os: linux
+ addons:
+ apt:
+ sources:
+ - llvm-toolchain-trusty-4.0
+ packages:
+ - clang-4.0
+ - cmake
+ env:
+ - SET_COMPILER="export CC=clang-4.0 && export CXX=clang++-4.0"
+
+ - name: "Clang Xcode 9.0 (OSX)" # Clang 9.0.0 & CMake 3.9.2
+ os: osx
+ osx_image: xcode9
before_install:
- # cmake 3.2
- - sudo add-apt-repository ppa:george-edison55/cmake-3.x -y
-
- # g++4.8.1
- - if [ "$CXX" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi
-
- # clang 3.4
- - if [ "$CXX" == "clang++" ]; then sudo add-apt-repository -y ppa:h-rayflood/llvm; fi
-
- - sudo apt-get update -qq
+ - eval "${SET_COMPILER}"
install:
- # cmake 3.2
- - sudo apt-get install cmake cmake-data
-
- # g++4.8.1
- - if [ "$CXX" == "g++" ]; then sudo apt-get install -qq g++-4.8; fi
- - if [ "$CXX" == "g++" ]; then export CXX="g++-4.8"; fi
-
- # clang 3.4
- - if [ "$CXX" == "clang++" ]; then sudo apt-get install --allow-unauthenticated -qq clang-3.4; fi
- - if [ "$CXX" == "clang++" ]; then export CXX="clang++-3.4"; fi
+ - mkdir build && cd build
+ - cmake -DCMAKE_BUILD_TYPE=Release .. && make
script:
- - mkdir build
- - cd build
- - cmake -DCMAKE_BUILD_TYPE=Release ..
- - make
+ - ./bin/rampler --version
notifications:
- email:
- on_success: change
- on_failure: always
+ email:
+ on_failure: always
=====================================
CMakeLists.txt
=====================================
@@ -1,23 +1,33 @@
-cmake_minimum_required(VERSION 3.2)
-project(rampler)
+cmake_minimum_required(VERSION 3.9)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
+project(rampler VERSION 2.0.0
+ LANGUAGES CXX
+ DESCRIPTION "Rampler is a tool for subsampling or splitting FASTA/Q files.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
-add_executable(rampler
- src/main.cpp
- src/sequence.cpp
- src/sampler.cpp)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
if (NOT TARGET bioparser)
- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
-endif()
+ add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
+endif ()
+if (NOT TARGET biosoup)
+ add_subdirectory(vendor/bioparser/vendor/biosoup EXCLUDE_FROM_ALL)
+endif ()
-target_link_libraries(rampler bioparser)
+add_executable(${PROJECT_NAME}
+ src/main.cpp
+ src/sampler.cpp)
+target_link_libraries(${PROJECT_NAME}
+ bioparser
+ biosoup)
+target_compile_definitions(${PROJECT_NAME} PRIVATE
+ RAMPLER_VERSION="v${PROJECT_VERSION}")
-install(TARGETS rampler DESTINATION bin)
+include(GNUInstallDirs)
+install(TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
=====================================
README.md
=====================================
@@ -1,73 +1,57 @@
# Rampler
[![Latest GitHub release](https://img.shields.io/github/release/rvaser/rampler.svg)](https://github.com/rvaser/rampler/releases/latest)
-[![Build status for c++/clang++](https://travis-ci.org/rvaser/rampler.svg?branch=master)](https://travis-ci.org/rvaser/rampler)
+[![Build status for c++/clang++](https://travis-ci.com/rvaser/rampler.svg?branch=master)](https://travis-ci.com/rvaser/rampler)
-Standalone module for sampling genomic sequences. It supports two modes, random subsampling of sequencer data to a desired depth (given the reference length) and file splitting to desired size in bytes.
+Rampler is a standalone module for sampling genomic sequences. It supports two modes, random subsampling of sequencing data to a desired depth (given the reference length) and file splitting to desired size in bytes.
-Rampler takes as first input argument a file in FASTA/FASTQ format which can be compressed with gzip. The rest of input parameters depend on the mode of operation. The output is stored into a file(s) which is in the same format as the input file but uncompressed.
-
-## Dependencies
-1. gcc 4.8+ or clang 3.4+
-2. cmake 3.2+
-
-## Installation
-To install Rampler run the following commands:
+## Usage
+To build rampler run the following commands:
```bash
git clone --recursive https://github.com/rvaser/rampler.git rampler
-cd rampler
-mkdir build
-cd build
-cmake -DCMAKE_BUILD_TYPE=Release ..
-make
+cd rampler && mkdir build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release .. && make
+./bin/rampler
```
-After successful installation, an executable named `rampler` will appear in `build/bin`.
-
-Optionally, you can run `sudo make install` to install rampler executable to your machine.
-
-***Note***: if you omitted `--recursive` from `git clone`, run `git submodule update --init --recursive` before proceeding with compilation.
-
-## Usage
-Usage of rampler is as following:
-
- usage: rampler [options ...] <mode>
+which will display the following usage:
- <mode>
- subsample <sequences> <reference length> <coverage> [<coverage> ...]
-
- <sequences>
- input file in FASTA/FASTQ format (can be compressed with gzip)
- containing sequences to be subsampled
- <reference length>
- integer denoting length of the reference genome (or
- assembly) from which the sequences originate
- <coverage>
- integer denoting desired coverage of the subsampled
- sequences
-
- split <sequences> <chunk size>
-
- <sequences>
- input file in FASTA/FASTQ format (can be compressed with gzip)
- containing sequences which will be split into smaller chunks
- <chunk size>
- integer denoting the desired chunk size in bytes
-
- options:
- -o, --out-directory
- default: current directory
- path in which sampled files will be created
- --version
- prints the version number
- -h, --help
- prints out the help
-
-## Contact information
+```bash
+usage: rampler [options ...] <mode>
+
+ <mode>
+ subsample <sequences> <reference length> <coverage> [<coverage> ...]
+
+ <sequences>
+ input file in FASTA/FASTQ format (can be compressed with gzip)
+ <reference length>
+ integer denoting length of the reference genome (or assembly)
+ <coverage>
+ integer denoting desired coverage of the subsampled sequences
+
+ split <sequences> <chunk size>
+
+ <sequences>
+ input file in FASTA/FASTQ format (can be compressed with gzip)
+ containing sequences which will be split into smaller chunks
+ <chunk size>
+ integer denoting the desired chunk size in bytes
+
+ options:
+ -o, --out-directory <string>
+ default: current directory
+ path in which sampled files will be created
+ --version
+ prints the version number
+ -h, --help
+ prints out the help
+```
-For additional information, help and bug reports please send an email to one of the following: robert.vaser at fer.hr
+#### Dependencies
+1. gcc 4.8+ or clang 4.0+
+2. cmake 3.9+
+3. zlib
## Acknowledgment
-
This work has been supported in part by Croatian Science Foundation under the project UIP-11-2013-7353.
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+rampler (2.0.0-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * Standards-Version: 4.5.1 (routine-update)
+ * Refreshed patches, removed arm64.patch.
+
+ -- Michael R. Crusoe <crusoe at debian.org> Tue, 19 Jan 2021 07:40:24 +0100
+
rampler (1.1.1-3) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -7,7 +7,7 @@ Priority: optional
Build-Depends: debhelper-compat (= 13),
cmake,
libbioparser-dev
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
Vcs-Browser: https://salsa.debian.org/med-team/rampler
Vcs-Git: https://salsa.debian.org/med-team/rampler.git
Homepage: https://github.com/rvaser/rampler
=====================================
debian/patches/arm64.patch deleted
=====================================
@@ -1,16 +0,0 @@
-Author: Nilesh Patra <npatra974 at gmail.com>
-Description: Change char to signed for comparison - fixes arm64 tests
-Last-Changed: October 1, 2020
-Forwarded: https://github.com/rvaser/rampler/pull/3
---- a/src/main.cpp
-+++ b/src/main.cpp
-@@ -24,7 +24,7 @@
-
- std::string out_directory = ".";
-
-- char argument;
-+ signed char argument;
- while ((argument = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
- switch (argument) {
- case 'o':
-
=====================================
debian/patches/series
=====================================
@@ -1,2 +1 @@
use_debian_packaged_libs.patch
-arm64.patch
=====================================
debian/patches/use_debian_packaged_libs.patch
=====================================
@@ -1,18 +1,27 @@
Author: Andreas Tille <tille at debian.org>
-Last-Update: Fri, 08 Jun 2018 13:20:51 +0200
+Last-Update: 2021-01-19
Description: Use Debian packaged libraries
-
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -14,10 +14,6 @@ add_executable(rampler
- src/sequence.cpp
- src/sampler.cpp)
+Forwarded: not-needed
+--- rampler.orig/CMakeLists.txt
++++ rampler/CMakeLists.txt
+@@ -13,19 +13,11 @@
+ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
-if (NOT TARGET bioparser)
-- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
--endif()
+- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
+-endif ()
+-if (NOT TARGET biosoup)
+- add_subdirectory(vendor/bioparser/vendor/biosoup EXCLUDE_FROM_ALL)
+-endif ()
-
--target_link_libraries(rampler bioparser)
-+target_link_libraries(rampler z)
+ add_executable(${PROJECT_NAME}
+ src/main.cpp
+ src/sampler.cpp)
+ target_link_libraries(${PROJECT_NAME}
+- bioparser
+- biosoup)
++ z)
+ target_compile_definitions(${PROJECT_NAME} PRIVATE
+ RAMPLER_VERSION="v${PROJECT_VERSION}")
- install(TARGETS rampler DESTINATION bin)
=====================================
src/main.cpp
=====================================
@@ -1,120 +1,164 @@
-#include <stdio.h>
-#include <stdlib.h>
+// Copyright (c) 2021 Robert Vaser
+
#include <getopt.h>
-#include "sequence.hpp"
+#include <iostream>
+
+#include "bioparser/fasta_parser.hpp"
+#include "bioparser/fastq_parser.hpp"
+
#include "sampler.hpp"
-#include "bioparser/bioparser.hpp"
+std::atomic<std::uint32_t> biosoup::Sequence::num_objects{0};
-static const char* version = "v1.1.1";
+namespace {
+
+static const char* rampler_version = RAMPLER_VERSION;
static struct option options[] = {
- {"out-directory", required_argument, 0, 'o'},
- {"version", no_argument, 0, 'v'},
- {"help", no_argument, 0, 'h'},
- {0, 0, 0, 0}
+ {"out-directory", required_argument, nullptr, 'o'},
+ {"version", no_argument, nullptr, 'v'},
+ {"help", no_argument, nullptr, 'h'},
+ {nullptr, 0, nullptr, 0}
};
-void help();
-
-int main(int argc, char** argv) {
-
- std::vector<std::string> input_parameters;
-
- std::string out_directory = ".";
-
- char argument;
- while ((argument = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
- switch (argument) {
- case 'o':
- out_directory = optarg;
- break;
- case 'v':
- printf("%s\n", version);
- exit(0);
- case 'h':
- help();
- exit(0);
- default:
- exit(1);
- }
+std::unique_ptr<bioparser::Parser<biosoup::Sequence>> CreateParser(
+ const std::string& path, std::string* name, std::string* ext) {
+ auto is_suffix = [] (const std::string& s, const std::string& suff) {
+ return s.size() < suff.size() ? false :
+ s.compare(s.size() - suff.size(), suff.size(), suff) == 0;
+ };
+
+ std::size_t c = path.rfind('/');
+ *name = (c == std::string::npos ? path : path.substr(c + 1));
+
+ c = name->find('.');
+ *ext = (c == std::string::npos ? "" : name->substr(c, name->find('.', c + 1) - c)); // NOLINT
+ *name = (c == std::string::npos ? *name : name->substr(0, c));
+
+ if (is_suffix(path, ".fasta") || is_suffix(path, ".fa") ||
+ is_suffix(path, ".fasta.gz") || is_suffix(path, ".fa.gz")) {
+ try {
+ return bioparser::Parser<biosoup::Sequence>::Create<bioparser::FastaParser>(path); // NOLINT
+ } catch (const std::invalid_argument& exception) {
+ std::cerr << exception.what() << std::endl;
+ return nullptr;
}
-
- if (optind == argc) {
- fprintf(stderr, "[rampler::] error: too few arguments!\n");
- help();
- exit(1);
+ }
+ if (is_suffix(path, ".fastq") || is_suffix(path, ".fq") ||
+ is_suffix(path, ".fastq.gz") || is_suffix(path, ".fq.gz")) {
+ try {
+ return bioparser::Parser<biosoup::Sequence>::Create<bioparser::FastqParser>(path); // NOLINT
+ } catch (const std::invalid_argument& exception) {
+ std::cerr << exception.what() << std::endl;
+ return nullptr;
}
+ }
- for (int32_t i = optind; i < argc; ++i) {
- input_parameters.emplace_back(argv[i]);
- }
-
- bool do_subsample = false, do_split = false;
- if (input_parameters[0] == "subsample") {
- do_subsample = true;
- } else if (input_parameters[0] == "split") {
- do_split = true;
- } else {
- fprintf(stderr, "[rampler::] error: unkown mode %s!\n", input_parameters[0].c_str());
- exit(1);
- }
+ std::cerr << "[rampler::CreateParser] error: file " << path
+ << " has unsupported format extension (valid extensions: .fasta, "
+ << ".fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)"
+ << std::endl;
+ return nullptr;
+}
- if ((do_subsample && input_parameters.size() < 4) ||
- (do_split && input_parameters.size() < 3)) {
+void Help() {
+ std::cout <<
+ "usage: rampler [options ...] <mode>\n"
+ "\n"
+ " <mode>\n"
+ " subsample <sequences> <reference length> <coverage> [<coverage> ...]\n" // NOLINT
+ "\n"
+ " <sequences>\n"
+ " input file in FASTA/FASTQ format (can be compressed with gzip)\n"
+ " <reference length>\n"
+ " integer denoting length of the reference genome (or assembly)\n"
+ " <coverage>\n"
+ " integer denoting desired coverage of the subsampled sequences\n"
+ "\n"
+ " split <sequences> <chunk size>\n"
+ "\n"
+ " <sequences>\n"
+ " input file in FASTA/FASTQ format (can be compressed with gzip)\n"
+ " <chunk size>\n"
+ " integer denoting the desired chunk size in bytes\n"
+ "\n"
+ " options:\n"
+ " -o, --out-directory <string>\n"
+ " default: current directory\n"
+ " path in which sampled files will be created\n"
+ " --version\n"
+ " prints the version number\n"
+ " -h, --help\n"
+ " prints the usage\n";
+}
- fprintf(stderr, "[rampler::] error: missing input parameter(s)!\n");
- exit(1);
- }
+} // namespace
- auto sampler = rampler::createSampler(input_parameters[1]);
- sampler->initialize();
-
- if (do_split) {
- sampler->split(out_directory, atoi(input_parameters[2].c_str()));
- } else if (do_subsample) {
- uint32_t reference_length = atoi(input_parameters[2].c_str());
- for (uint32_t i = 3; i < input_parameters.size(); ++i) {
- sampler->subsample(out_directory, reference_length,
- atoi(input_parameters[i].c_str()));
- }
+int main(int argc, char** argv) {
+ std::vector<std::string> input_parameters;
+ std::string out_directory = ".";
+
+ int arg;
+ while ((arg = getopt_long(argc, argv, "o:h", options, nullptr)) != -1) {
+ switch (arg) {
+ case 'o': out_directory = optarg; break;
+ case 'v': std::cerr << rampler_version << std::endl; return 0;
+ case 'h': Help(); return 0;
+ default: return 1;
}
+ }
+ if (argc == 1) {
+ Help();
return 0;
-}
+ }
+
+ if (optind == argc) {
+ std::cerr << "[rampler::] error: missing arguments!" << std::endl;
+ return 1;
+ }
+
+ for (std::int32_t i = optind; i < argc; ++i) {
+ input_parameters.emplace_back(argv[i]);
+ }
+
+ bool subsample = false, split = false;
+ if (input_parameters[0] == "subsample") {
+ subsample = true;
+ } else if (input_parameters[0] == "split") {
+ split = true;
+ } else {
+ std::cerr << "[rampler::] error: unknown mode!" << std::endl;
+ return 1;
+ }
+
+ if ((subsample && input_parameters.size() < 4) ||
+ (split && input_parameters.size() < 3)) {
+ std::cerr << "[rampler::] error: missing arguments!" << std::endl;
+ return 1;
+ }
+
+ std::string name{}, ext{};
+ auto sparser = CreateParser(input_parameters[1], &name, &ext);
+ if (sparser == nullptr) {
+ return 1;
+ }
+
+ rampler::Sampler sampler{std::move(sparser), name, ext};
+ sampler.Initialize();
+
+ if (split) {
+ sampler.Split(out_directory, atoi(input_parameters[2].c_str()));
+ } else if (subsample) {
+ std::uint32_t reference_length = atoi(input_parameters[2].c_str());
+ for (std::uint32_t i = 3; i < input_parameters.size(); ++i) {
+ sampler.Subsample(
+ out_directory,
+ reference_length,
+ atoi(input_parameters[i].c_str()));
+ }
+ }
-void help() {
- printf(
- "usage: rampler [options ...] <mode>\n"
- "\n"
- " <mode>\n"
- " subsample <sequences> <reference length> <coverage> [<coverage> ...]\n"
- "\n"
- " <sequences>\n"
- " input file in FASTA/FASTQ format (can be compressed with gzip)\n"
- " containing sequences to be subsampled\n"
- " <reference length>\n"
- " integer denoting length of the reference genome (or\n"
- " assembly) from which the sequences originate\n"
- " <coverage>\n"
- " integer denoting desired coverage of the subsampled\n"
- " sequences\n"
- "\n"
- " split <sequences> <chunk size>\n"
- "\n"
- " <sequences>\n"
- " input file in FASTA/FASTQ format (can be compressed with gzip)\n"
- " containing sequences which will be split into smaller chunks\n"
- " <chunk size>\n"
- " integer denoting the desired chunk size in bytes\n"
- "\n"
- " options:\n"
- " -o, --out-directory\n"
- " default: current directory\n"
- " path in which sampled files will be created\n"
- " --version\n"
- " prints the version number\n"
- " -h, --help\n"
- " prints the usage\n");
+ return 0;
}
=====================================
src/sampler.cpp
=====================================
@@ -1,181 +1,138 @@
-/*!
- * @file sampler.cpp
- *
- * @brief Sampler class source file
- */
+// Copyright (c) 2021 Robert Vaser
-#include <random>
-
-#include "sequence.hpp"
#include "sampler.hpp"
-#include "bioparser/bioparser.hpp"
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <utility>
namespace rampler {
-constexpr uint32_t kChunkSize = 1024 * 1024 * 1024; // ~ 1GB
-
-std::unique_ptr<Sampler> createSampler(const std::string& sequences_path) {
-
- std::unique_ptr<bioparser::Parser<Sequence>> sparser = nullptr;
-
- std::string base_name = sequences_path.substr(sequences_path.rfind('/') + 1);
- base_name = base_name.substr(0, base_name.find('.'));
- std::string extension;
-
- auto is_suffix = [](const std::string& src, const std::string& suffix) -> bool {
- if (src.size() < suffix.size()) {
- return false;
- }
- return src.compare(src.size() - suffix.size(), suffix.size(), suffix) == 0;
- };
-
- if (is_suffix(sequences_path, ".fasta") || is_suffix(sequences_path, ".fa") ||
- is_suffix(sequences_path, ".fasta.gz") || is_suffix(sequences_path, ".fa.gz")) {
- sparser = bioparser::createParser<bioparser::FastaParser, Sequence>(
- sequences_path);
- extension = ".fasta";
- } else if (is_suffix(sequences_path, ".fastq") || is_suffix(sequences_path, ".fq") ||
- is_suffix(sequences_path, ".fastq.gz") || is_suffix(sequences_path, ".fq.gz")) {
- sparser = bioparser::createParser<bioparser::FastqParser, Sequence>(
- sequences_path);
- extension = ".fastq";
- } else {
- fprintf(stderr, "[rampler::createSampler] error: "
- "file %s has unsupported format extension (valid extensions: "
- ".fasta, .fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)!\n",
- sequences_path.c_str());
- exit(1);
- }
-
- return std::unique_ptr<Sampler>(new Sampler(std::move(sparser), base_name,
- extension));
-}
+constexpr uint32_t kChunkSize = 1024 * 1024 * 1024; // ~ 1GB
-Sampler::Sampler(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
- const std::string& base_name, const std::string& extension)
- : sparser_(std::move(sparser)), sequences_length_(0), base_name_(base_name),
- extension_(extension) {
+Sampler::Sampler(
+ std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser,
+ const std::string& base_name,
+ const std::string& extension)
+ : sparser_(std::move(sparser)),
+ sequences_length_(0),
+ base_name_(base_name),
+ extension_(extension) {
}
-Sampler::~Sampler() {
-}
+void Sampler::Initialize() {
+ if (sequences_length_ != 0) {
+ return;
+ }
-void Sampler::initialize() {
-
- if (sequences_length_ != 0) {
- fprintf(stderr, "[rampler::Sampler::initialize] warning: "
- "object already initialized!\n");
- return;
+ sparser_->Reset();
+ while (true) {
+ auto sequences = sparser_->Parse(1ULL << 30);
+ if (sequences.empty()) {
+ break;
}
- sparser_->reset();
- while (true) {
- std::vector<std::unique_ptr<Sequence>> sequences;
- auto status = sparser_->parse(sequences, kChunkSize);
-
- for (const auto& it: sequences) {
- sequences_length_ += it->data().size();
- }
-
- if (!status) {
- break;
- }
+ for (const auto& it : sequences) {
+ sequences_length_ += it->data.size();
}
+ }
}
-void Sampler::subsample(const std::string& out_directory, uint32_t reference_length,
- uint32_t coverage) {
-
- if (coverage * reference_length > sequences_length_) {
- fprintf(stderr, "[rampler::Sampler::subsample] warning: "
- "insufficient data for coverage %u!\n", coverage);
- return;
+void Sampler::Subsample(
+ const std::string& out_directory,
+ std::uint32_t reference_length,
+ std::uint32_t coverage) {
+ if (coverage * reference_length > sequences_length_) {
+ std::cerr << "[rampler::Sampler::subsample] warning: "
+ << "insufficient data for coverage of " << coverage
+ << std::endl;
+ return;
+ }
+
+ std::random_device r;
+ std::mt19937 generator(r());
+ std::uniform_real_distribution<double> distribution(0.0, 1.0);
+
+ double ratio = (coverage * reference_length) / static_cast<double>(sequences_length_); // NOLINT
+
+ std::string out_path =
+ out_directory + "/" + base_name_ + "_" +
+ std::to_string(coverage) + "x" + extension_;
+
+ std::ofstream ofs(out_path);
+ if (!ofs.is_open()) {
+ throw std::runtime_error(
+ "[rampler::Sampler::subsample] error: unable to create file on disk!");
+ }
+
+ sparser_->Reset();
+ while (true) {
+ auto sequences = sparser_->Parse(1ULL << 30);
+ if (sequences.empty()) {
+ break;
}
- std::random_device r;
- std::mt19937 generator(r());
- std::uniform_real_distribution<double> distribution(0.0, 1.0);
-
- double ratio = (coverage * reference_length) / static_cast<double>(
- sequences_length_);
-
- std::string out_path = out_directory + "/" + base_name_ + "_" +
- std::to_string(coverage) + "x" + extension_;
- auto out = fopen(out_path.c_str(), "w");
- if (out == nullptr) {
- fprintf(stderr, "rampler::Sampler::subsample] error: "
- "unable to create file %s!\n", out_path.c_str());
- exit(1);
- }
-
- sparser_->reset();
- while (true) {
- std::vector<std::unique_ptr<Sequence>> sequences;
- auto status = sparser_->parse(sequences, kChunkSize);
-
- for (const auto& it: sequences) {
- if (distribution(generator) < ratio) {
- if (it->quality().empty()) {
- fprintf(out, ">%s\n%s\n", it->name().c_str(),
- it->data().c_str());
- } else {
- fprintf(out, "@%s\n%s\n+\n%s\n", it->name().c_str(),
- it->data().c_str(), it->quality().c_str());
- }
- }
- }
-
- if (!status) {
- break;
+ for (const auto& it : sequences) {
+ if (distribution(generator) < ratio) {
+ if (it->quality.empty()) {
+ ofs << ">" << it->name << std::endl
+ << it->data << std::endl;
+ } else {
+ ofs << "@" << it->name << std::endl
+ << it->data << std::endl
+ << "+" << std::endl
+ << it->quality << std::endl;
}
+ }
}
+ }
- fclose(out);
+ ofs.close();
}
-void Sampler::split(const std::string& out_directory, uint32_t chunk_size) {
-
- if (chunk_size > sequences_length_) {
- fprintf(stderr, "[rampler::Sampler::split] warning: "
- "insufficient data for chunk size %u!\n", chunk_size);
- return;
+void Sampler::Split(const std::string& out_directory, std::uint32_t chunk_size) { // NOLINT
+ if (chunk_size > sequences_length_) {
+ std::cerr << "[rampler::Sampler::split] warning: "
+ << "insufficient data for chunk size " << chunk_size
+ << std::endl;
+ return;
+ }
+
+ uint32_t chunk_number = 0;
+
+ sparser_->Reset();
+ while (true) {
+ auto sequences = sparser_->Parse(chunk_size);
+ if (sequences.empty()) {
+ break;
}
- uint32_t chunk_number = 0;
+ std::string out_path =
+ out_directory + "/" + base_name_ + "_" +
+ std::to_string(chunk_number++) + extension_;
- sparser_->reset();
- while (true) {
- std::vector<std::unique_ptr<Sequence>> sequences;
- auto status = sparser_->parse(sequences, chunk_size);
-
- std::string out_path = out_directory + "/" + base_name_ + "_" +
- std::to_string(chunk_number) + extension_;
- auto out = fopen(out_path.c_str(), "w");
- if (out == nullptr) {
- fprintf(stderr, "rampler::Sampler::subsample] error: "
- "unable to create file %s!\n", out_path.c_str());
- exit(1);
- }
-
- for (const auto& it: sequences) {
- if (it->quality().empty()) {
- fprintf(out, ">%s\n%s\n", it->name().c_str(),
- it->data().c_str());
- } else {
- fprintf(out, "@%s\n%s\n+\n%s\n", it->name().c_str(),
- it->data().c_str(), it->quality().c_str());
- }
- }
-
- fclose(out);
-
- ++chunk_number;
+ std::ofstream ofs(out_path);
+ if (!ofs.is_open()) {
+ throw std::runtime_error(
+ "[rampler::Sampler::subsample] error: unable to create file on disk!"); // NOLINT
+ }
- if (!status) {
- break;
- }
+ for (const auto& it : sequences) {
+ if (it->quality.empty()) {
+ ofs << ">" << it->name << std::endl
+ << it->data << std::endl;
+ } else {
+ ofs << "@" << it->name << std::endl
+ << it->data << std::endl
+ << "+" << std::endl
+ << it->quality << std::endl;
+ }
}
-}
+ ofs.close();
+ }
}
+
+} // namespace rampler
=====================================
src/sampler.hpp
=====================================
@@ -1,19 +1,15 @@
-/*!
- * @file sampler.hpp
- *
- * @brief Sampler class header file
- */
+// Copyright (c) 2021 Robert Vaser
-#pragma once
+#ifndef RAMPLER_SAMPLER_HPP_
+#define RAMPLER_SAMPLER_HPP_
-#include <stdlib.h>
-#include <vector>
+#include <cstdint>
#include <memory>
+#include <string>
+#include <vector>
-namespace bioparser {
- template<class T>
- class Parser;
-}
+#include "bioparser/parser.hpp"
+#include "biosoup/sequence.hpp"
namespace rampler {
@@ -21,27 +17,36 @@ class Sampler;
std::unique_ptr<Sampler> createSampler(const std::string& sequences_path);
class Sampler {
-public:
- ~Sampler();
+ public:
+ Sampler(
+ std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser,
+ const std::string& base_name,
+ const std::string& extension);
+
+ Sampler(const Sampler&) = delete;
+ Sampler& operator=(const Sampler&) = delete;
- void initialize();
+ Sampler(Sampler&&) = delete;
+ Sampler& operator=(Sampler&&) = delete;
- void subsample(const std::string& out_directory, uint32_t reference_length,
- uint32_t coverage);
+ ~Sampler() = default;
- void split(const std::string& out_directory, uint32_t chunk_size);
+ void Initialize();
- friend std::unique_ptr<Sampler> createSampler(const std::string& sequences_path);
-private:
- Sampler(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
- const std::string& base_name, const std::string& extension);
- Sampler(const Sampler&) = delete;
- const Sampler& operator=(const Sampler&) = delete;
+ void Subsample(
+ const std::string& out_directory,
+ std::uint32_t reference_length,
+ std::uint32_t coverage);
- std::unique_ptr<bioparser::Parser<Sequence>> sparser_;
- uint64_t sequences_length_;
- std::string base_name_;
- std::string extension_;
+ void Split(const std::string& out_directory, std::uint32_t chunk_size);
+
+ private:
+ std::unique_ptr<bioparser::Parser<biosoup::Sequence>> sparser_;
+ std::uint64_t sequences_length_;
+ std::string base_name_;
+ std::string extension_;
};
-}
+} // namespace rampler
+
+#endif // RAMPLER_SAMPLER_HPP_
=====================================
src/sequence.cpp deleted
=====================================
@@ -1,22 +0,0 @@
-/*!
- * @file sequence.cpp
- *
- * @brief Sequence class source file
- */
-
-#include "sequence.hpp"
-
-namespace rampler {
-
-Sequence::Sequence(const char* name, uint32_t name_length, const char* data,
- uint32_t data_length)
- : name_(name, name_length), data_(data, data_length), quality_() {
-}
-
-Sequence::Sequence(const char* name, uint32_t name_length, const char* data,
- uint32_t data_length, const char* quality, uint32_t quality_length)
- : name_(name, name_length), data_(data, data_length), quality_(quality,
- quality_length) {
-}
-
-}
=====================================
src/sequence.hpp deleted
=====================================
@@ -1,53 +0,0 @@
-/*!
- * @file sequence.hpp
- *
- * @brief Sequence class header file
- */
-
-#pragma once
-
-#include <stdint.h>
-#include <string>
-
-namespace bioparser {
- template<class T>
- class FastaParser;
-
- template<class T>
- class FastqParser;
-}
-
-namespace rampler {
-
-class Sequence {
-public:
- ~Sequence() = default;
-
- const std::string& name() const {
- return name_;
- }
-
- const std::string& data() const {
- return data_;
- }
-
- const std::string& quality() const {
- return quality_;
- }
-
- friend bioparser::FastaParser<Sequence>;
- friend bioparser::FastqParser<Sequence>;
-private:
- Sequence(const char* name, uint32_t name_length, const char* data,
- uint32_t data_length);
- Sequence(const char* name, uint32_t name_length, const char* data,
- uint32_t data_length, const char* quality, uint32_t quality_length);
- Sequence(const Sequence&) = delete;
- const Sequence& operator=(const Sequence&) = delete;
-
- std::string name_;
- std::string data_;
- std::string quality_;
-};
-
-}
View it on GitLab: https://salsa.debian.org/med-team/rampler/-/compare/3e81d73a55dc4e719cae81d756843de6b6d4506c...6583bdba24466729710867275c1fa96d0e7d363b
--
View it on GitLab: https://salsa.debian.org/med-team/rampler/-/compare/3e81d73a55dc4e719cae81d756843de6b6d4506c...6583bdba24466729710867275c1fa96d0e7d363b
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210119/5cb6138a/attachment-0001.html>
More information about the debian-med-commit mailing list