[med-svn] [fast5] 01/03: Imported Upstream version 0.5.9
Afif Elghraoui
afif at moszumanska.debian.org
Wed Dec 28 07:32:52 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository fast5.
commit d798d488a7920eb6290069c20b81cd231889d547
Author: Afif Elghraoui <afif at debian.org>
Date: Tue Dec 27 21:24:10 2016 -0800
Imported Upstream version 0.5.9
---
.travis.yml | 2 +-
README.org | 49 ++++-
VERSION | 2 +-
python/fast5/version.py | 2 +-
python/setup.py | 2 +-
src/.gitignore | 5 +-
src/Makefile | 14 +-
src/f5dump.cpp | 366 +++++++++++++++++++--------------
src/{f5dump-full.cpp => f5ls-full.cpp} | 0
src/{f5dump.cpp => f5ls.cpp} | 0
src/fast5.hpp | 169 +++++++++++----
src/hdf5-mod.cpp | 2 +-
12 files changed, 415 insertions(+), 198 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 58c3113..2d95ffb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,5 +15,5 @@ install:
- docker run --rm -v $PWD:/data fast5 bash -c 'virtualenv build-venv && source build-venv/bin/activate && make -C python -e develop'
script:
- - docker run --rm -v $PWD:/data fast5 bash -c 'src/hdf5-mod -f file.000.fast5 && src/f5-mod file.000.fast5 && src/f5dump file.000.fast5 && src/f5dump-full file.000.fast5'
+ - docker run --rm -v $PWD:/data fast5 bash -c 'src/hdf5-mod -f file.000.fast5 && src/f5-mod file.000.fast5 && src/f5ls file.000.fast5 && src/f5ls-full file.000.fast5'
- docker run --rm -v $PWD:/data fast5 bash -c 'source build-venv/bin/activate && python -c "import fast5; f = fast5.File(\"file.000.fast5\"); print(f.file_version()); print(f.have_eventdetection_events())"'
diff --git a/README.org b/README.org
index 14a23ff..27a8903 100644
--- a/README.org
+++ b/README.org
@@ -14,7 +14,7 @@ This is a header-only library. You only need to copy [[file:src/fast5.hpp][src/f
**** Usage
-See [[file:src/f5dump.cpp][src/f5dump.cpp]] for an example.
+See [[file:src/f5ls.cpp][src/f5ls.cpp]] for an example.
*** Python Wrapper
@@ -42,6 +42,53 @@ print(f.file_version())
print(f.have_eventdetection_events())
#+END_EXAMPLE
+*** f5dump
+
+The program =f5dump= can be used to list and extract some of the contents of =fast5= files, including: raw signals, event-detection events, basecall events, and basecall fastq.
+
+**** Installation
+
+In addition to this =fast5= repository, you will need HDF5 (headers and libraries), as well the the header-only libraries [[https://github.com/mateidavid/tclap.git][TCLAP]] and [[https://github.com/mateidavid/hpptools.git][HPPTOOLS]]. To build =f5dump=, run =make f5dump [VAR1=VALUE1] ...=, where =VAR=-s are used to instruct the [[file:src/Makefile][Makefile]] where to find various dependencies.
+
+**** Usage
+
+In each run, =f5dump= requires exactly one command among: =--ls/--id/--rw/--ed/--ev/--fq=. If no command is given, =--ls= is assumed. It also requires exactly one =fast5= file to inspect.
+
+- In =--ls= mode, =f5dump= lists some of the contents of the file. Sample output:
+
+ #+BEGIN_EXAMPLE
+rw Read_1019
+ed 000 Read_1019
+ed 001 Read_1019
+bc2d 2D_000 2 1 1 1D_000
+bc1d 1D_000 0 1 1 001
+bc1d 1D_000 1 1 1 001
+#+END_EXAMPLE
+
+ Explanations:
+
+ - =rw=: the file contains raw samples from one read, =Read_1019=.
+
+ - =ed=: the file contains 2 event-detection groups, =000= and =001=, both run on raw samples from =Read_1019=.
+
+ - =bc2d=: the file contains 1 basecall group =2D_000= with 2D data (=2=); this group has both fastq data and events (=1 1=); its corresponding 1D basecall group is =1D_000=.
+
+ - =bc1d=: the file contains 1 basecall group =1D_000= with 1D data for each strand (=0= and =1=); each contains fastq data and events (=1 1=); its corresponding event-detection group is =001=.
+
+ Notes:
+
+ - The group names are suffixes understood by the =fast5= library. E.g., the basecall group =RNN_1D_000= would correspond to the HDF5 group =/Analyses/Basecall_RNN_1D_000=.
+
+ - Not all the links between groups are always available. Notably, some =fast5= files are missing the link between a 1D basecall group and its original event-detection group.
+
+- In =--id= mode, =f5dump= dumps =channel_id= and =tracking_id= metadata.
+
+- In =--rw/--ed/--ev/--fq= mode, =f5dump= dumps: raw signal data/event-detection events/basecall events/basecall fastq data.
+
+- Optional selector flags =--gr/--st/--rn= can be used to specify a group name, strand (=0/1/2=), or read name. Not all combinations make sense: e.g, =--st= is ignored for event-detection data.
+
+- Optional output flags =--time-int/--curr-int/--rw-time= can modify the output: convert times into integers, dump raw signal currents in internal integer encoding, and add time stamps to raw signals.
+
*** License
[[file:LICENSE][MIT License]].
diff --git a/VERSION b/VERSION
index 659914a..416bfb0 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.5.8
+0.5.9
diff --git a/python/fast5/version.py b/python/fast5/version.py
index 91201fc..eaddd12 100644
--- a/python/fast5/version.py
+++ b/python/fast5/version.py
@@ -1 +1 @@
-__version__ = '0.5.8'
+__version__ = '0.5.9'
diff --git a/python/setup.py b/python/setup.py
old mode 100755
new mode 100644
index c15cabf..487dede
--- a/python/setup.py
+++ b/python/setup.py
@@ -16,7 +16,7 @@ exec(open('fast5/version.py').read())
hdf5_dir = os.environ.get('HDF5_DIR', '/usr')
hdf5_include_dir = os.environ.get('HDF5_INCLUDE_DIR', os.path.join(hdf5_dir, 'include'))
hdf5_lib_dir = os.environ.get('HDF5_LIB_DIR', os.path.join(hdf5_dir, 'lib'))
-hdf5_lib = os.environ.get('HDF_LIB', 'hdf5')
+hdf5_lib = os.environ.get('HDF5_LIB', 'hdf5')
if not os.path.isfile(os.path.join(hdf5_include_dir, 'H5pubconf.h')):
sys.exit(hdf5_include_dir + ': could not find HDF5 header files; use HDF5_DIR or HDF5_INCLUDE_DIR')
if (not os.path.isfile(os.path.join(hdf5_lib_dir, 'lib' + hdf5_lib + '.so'))
diff --git a/src/.gitignore b/src/.gitignore
index 869f46c..70906a8 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1,4 +1,5 @@
-f5dump
-f5dump-full
+f5ls
+f5ls-full
hdf5-mod
f5-mod
+f5dump
diff --git a/src/Makefile b/src/Makefile
index f9e2325..03b93d6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -8,8 +8,10 @@ HDF5_DIR = /usr/local
HDF5_INCLUDE_DIR = ${HDF5_DIR}/include
HDF5_LIB_DIR = ${HDF5_DIR}/lib
HDF5_LIB = hdf5
+TCLAP_DIR = tclap
+HPPTOOLS_DIR = hpptools
-TARGETS = f5dump f5dump-full hdf5-mod f5-mod
+TARGETS = f5ls f5ls-full hdf5-mod f5-mod
all: ${TARGETS}
@@ -29,5 +31,15 @@ check_hdf5:
@[ -f "${HDF5_INCLUDE_DIR}/H5pubconf.h" ] || { echo "HDF5 headers not found" >&2; exit 1; }
@[ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.so" ] || [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.a" ] || { echo "HDF5 library not found" >&2; exit 1; }
+check_tclap:
+ @[ -f "${TCLAP_DIR}/include/tclap/CmdLine.h" ] || { echo "TCLAP not found; get it from https://github.com/mateidavid/tclap.git" >&2; exit 1; }
+
+check_hpptools:
+ @[ -f "${HPPTOOLS_DIR}/include/alg.hpp" ] || { echo "HPPTOOLS not found; get it from https://github.com/mateidavid/hpptools.git" >&2; exit 1; }
+
%: %.cpp fast5.hpp hdf5_tools.hpp | check_hdf5
${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+
+f5dump: f5dump.cpp fast5.hpp hdf5_tools.hpp | check_hdf5 check_tclap check_hpptools
+ ${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -isystem ${TCLAP_DIR}/include -I ${HPPTOOLS_DIR}/include -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+
diff --git a/src/f5dump.cpp b/src/f5dump.cpp
index af655dd..1dd8984 100644
--- a/src/f5dump.cpp
+++ b/src/f5dump.cpp
@@ -1,20 +1,40 @@
#include <cassert>
#include <iostream>
+#include <iomanip>
#include <string>
+#include <tclap/CmdLine.h>
+#include "alg.hpp"
+
#include "fast5.hpp"
using namespace std;
-template < typename T >
-void print_vector(ostream& os, const vector< T >& v, const string& delim)
+
+namespace opts
{
- for (auto it = v.begin(); it != v.end(); ++it)
- {
- if (it != v.begin()) os << delim;
- os << *it;
- }
+ using namespace TCLAP;
+ string description = "Dump contents of ONT fast5 files.";
+ CmdLine cmd_parser(description);
+ //
+ ValueArg< unsigned > float_prec("", "float-prec", "Float precision.", false, 10, "int", cmd_parser);
+ SwitchArg rw_time("", "rw-time", "Add timepoints to raw data.", cmd_parser);
+ SwitchArg curr_int("", "curr-int", "Dump current data encoded as int (raw samples only).", cmd_parser);
+ SwitchArg time_int("", "time-int", "Dump start/length data encoded as int.", cmd_parser);
+ //
+ ValueArg< string > rn("", "rn", "Read name.", false, "", "Read_1015|...", cmd_parser);
+ ValueArg< unsigned > st("", "st", "Strand.", false, 0, "0|1|2", cmd_parser);
+ ValueArg< string > gr("", "gr", "Group name suffix.", false, "", "000|RNN_001|...", cmd_parser);
+ //
+ SwitchArg fq("", "fq", "Dump basecall fastq data.", cmd_parser);
+ SwitchArg ev("", "ev", "Dump basecall event data.", cmd_parser);
+ SwitchArg ed("", "ed", "Dump event detection data.", cmd_parser);
+ SwitchArg rw("", "rw", "Dump raw samples data.", cmd_parser);
+ SwitchArg id("", "id", "Dump channel/tracking id data.", cmd_parser);
+ SwitchArg ls("", "ls", "List groups/read names.", cmd_parser);
+ UnlabeledValueArg< string > input_fn("input", "Fast5 file.", true, "", "file", cmd_parser);
}
+
template < typename U, typename V >
void print_map(ostream& os, const map< U, V >& m, const string& prefix)
{
@@ -24,174 +44,218 @@ void print_map(ostream& os, const map< U, V >& m, const string& prefix)
}
}
-int main(int argc, char* argv[])
+unsigned time_int(double f, fast5::Channel_Id_Parameters const & channel_id_params)
{
- if (argc != 2)
- {
- cerr << "use: " << argv[0] << " <fast5_file>" << endl;
- return EXIT_FAILURE;
- }
- string file_name(argv[1]);
- //
- // open the FAST5 file for reading
- //
- if (not fast5::File::is_valid_file(file_name))
- {
- cout << "not a fast5 file [" << file_name << "]" << endl;
- return EXIT_SUCCESS;
- }
+ return f * channel_id_params.sampling_rate;
+}
+
+void real_main()
+{
+ fast5::File f;
+ try
{
- fast5::File f;
+ // open file
+ f.open(opts::input_fn);
+ auto channel_id_params = f.get_channel_id_params();
//
- // All fast5 operations are performed inside a try-catch block. This should
- // resist various hdf5 errors without leaking memory.
+ // list
//
- try
+ if (opts::ls)
{
- //
- // open file
- //
- f.open(file_name);
- assert(f.is_open());
- //
- // extract version information for the ONT software used to generate this dataset
- //
- cout << "file_version=" << f.file_version() << endl;
- //
- // inspect channel_id params
- //
- if (f.have_channel_id_params())
+ // rw
{
- auto channel_id_params = f.get_channel_id_params();
- cout << "channel_id/channel_number=" << channel_id_params.channel_number << endl
- << "channel_id/digitisation=" << channel_id_params.digitisation << endl
- << "channel_id/offset=" << channel_id_params.offset << endl
- << "channel_id/range=" << channel_id_params.range << endl
- << "channel_id/sampling_rate=" << channel_id_params.sampling_rate << endl;
+ auto rn_l = f.get_raw_samples_read_name_list();
+ for (auto const & rn : rn_l)
+ {
+ cout << "rw\t" << rn << endl;
+ }
}
- //
- // inspect tracking_id params
- //
- if (f.have_tracking_id_params())
+ // ed
{
- auto tracking_id_params = f.get_tracking_id_params();
- print_map(cout, tracking_id_params, "tracking_id/");
+ auto gr_l = f.get_eventdetection_group_list();
+ for (auto const & gr : gr_l)
+ {
+ auto rn_l = f.get_eventdetection_read_name_list(gr);
+ cout << "ed\t" << gr << "\t" << alg::os_join(rn_l, ",") << endl;
+ }
}
- //
- // inspect sequences params
- //
- if (f.have_sequences_params())
+ // bc
{
- auto sequences_params = f.get_sequences_params();
- print_map(cout, sequences_params, "sequences/");
+ for (auto st : vector< unsigned >({ 2, 0, 1 }))
+ {
+ auto gr_l = f.get_basecall_strand_group_list(st);
+ for (auto const & gr : gr_l)
+ {
+ int have_fastq = f.have_basecall_fastq(st, gr);
+ int have_events = (st == 2
+ ? f.have_basecall_events(0, gr) and f.have_basecall_events(1, gr)
+ : f.have_basecall_events(st, gr));
+ string link = (st == 2? f.get_basecall_group_1d(gr) : f.get_basecall_eventdetection_group(gr));
+ cout
+ << (st == 2? "bc2d" : "bc1d") << "\t"
+ << gr << "\t"
+ << st << "\t"
+ << have_fastq << "\t"
+ << have_events << "\t"
+ << (not link.empty()? link : string(".")) << endl
+ ;
+ }
+ }
}
- //
- // inspect raw samples
- //
- if (f.have_raw_samples())
+ }
+ //
+ // id
+ //
+ if (opts::id)
+ {
+ cout
+ << "channel_id/channel_number=" << channel_id_params.channel_number << endl
+ << "channel_id/digitisation=" << channel_id_params.digitisation << endl
+ << "channel_id/offset=" << channel_id_params.offset << endl
+ << "channel_id/range=" << channel_id_params.range << endl
+ << "channel_id/sampling_rate=" << channel_id_params.sampling_rate << endl
+ ;
+ if (f.have_tracking_id_params())
{
- auto rs_params = f.get_raw_samples_params();
- auto rs = f.get_raw_samples();
- cout << "raw_samples/read_id=" << rs_params.read_id << endl
- << "raw_samples/read_number=" << rs_params.read_number << endl
- << "raw_samples/start_mux=" << rs_params.start_mux << endl
- << "raw_samples/start_time=" << rs_params.start_time << endl
- << "raw_samples/duration=" << rs_params.duration << endl
- << "raw_samples/size=" << rs.size() << endl;
- const auto& e = rs.front();
- cout << " (" << e << ")" << endl;
+ auto tracking_id_params = f.get_tracking_id_params();
+ print_map(cout, tracking_id_params, "tracking_id/");
}
- //
- // inspect eventdetection events
- //
- cout << "eventdetection_group_list=";
- print_vector(cout, f.get_eventdetection_group_list(), ",");
- cout << endl;
- if (f.have_eventdetection_events())
- {
- auto ed_params = f.get_eventdetection_params();
- print_map(cout, ed_params, "eventdetection/");
- auto ed_ev_params = f.get_eventdetection_event_params();
- auto ed_ev = f.get_eventdetection_events();
- cout << "eventdetection/events/abasic_found=" << ed_ev_params.abasic_found << endl
- << "eventdetection/events/duration=" << ed_ev_params.duration << endl
- << "eventdetection/events/median_before=" << ed_ev_params.median_before << endl
- << "eventdetection/events/read_id=" << ed_ev_params.read_id << endl
- << "eventdetection/events/read_number=" << ed_ev_params.read_number << endl
- << "eventdetection/events/scaling_used=" << ed_ev_params.scaling_used << endl
- << "eventdetection/events/start_mux=" << ed_ev_params.start_mux << endl
- << "eventdetection/events/start_time=" << ed_ev_params.start_time << endl
- << "eventdetection/events/size=" << ed_ev.size() << endl;
- const auto& e = ed_ev.front();
- cout << " (mean=" << e.mean
- << ", stdv=" << e.stdv
- << ", start=" << e.start
- << ", length=" << e.length << ")" << endl;
- } // if have_eventdetection_events
- //
- // inspect basecall groups
- //
- for (unsigned st = 0; st < 3; ++st)
+ }
+ //
+ // raw samples
+ //
+ if (opts::rw and f.have_raw_samples(opts::rn))
+ {
+ auto rs_params = f.get_raw_samples_params(opts::rn);
+ cout
+ << "#read_id=" << rs_params.read_id << endl
+ << "#read_number=" << rs_params.read_number << endl
+ << "#start_mux=" << rs_params.start_mux << endl
+ << "#start_time=" << rs_params.start_time << endl
+ << "#duration=" << rs_params.duration << endl
+ ;
+ if (not opts::curr_int)
{
- cout << "basecall(" << st << ")/group_list=";
- print_vector(cout, f.get_basecall_strand_group_list(st), ",");
- cout << endl;
- // basecall sequence
- if (f.have_basecall_seq(st))
+ auto rs = f.get_raw_samples(opts::rn);
+ if (opts::rw_time)
{
- cout << "basecall(" << st << ")/seq_size=" << f.get_basecall_seq(st).size() << endl;
+ cout << "start\t";
}
- // basecall model
- if (f.have_basecall_model(st))
+ cout << "curr_f" << endl;
+ for (unsigned i = 0; i < rs.size(); ++i)
{
- cout << "basecall(" << st << ")/model_file=" << f.get_basecall_model_file(st) << endl;
- auto m_params = f.get_basecall_model_params(st);
- auto m = f.get_basecall_model(st);
- cout << "basecall(" << st << ")/model/scale=" << m_params.scale << endl
- << "basecall(" << st << ")/model/shift=" << m_params.shift << endl
- << "basecall(" << st << ")/model/drift=" << m_params.drift << endl
- << "basecall(" << st << ")/model/var=" << m_params.var << endl
- << "basecall(" << st << ")/model/scale_sd=" << m_params.scale_sd << endl
- << "basecall(" << st << ")/model/var_sd=" << m_params.var_sd << endl
- << "basecall(" << st << ")/model/size=" << m.size() << endl;
- const auto& e = m.front();
- cout << " (kmer=" << e.get_kmer()
- << ", level_mean=" << e.level_mean
- << ", level_stdv=" << e.level_stdv << ")" << endl;
+ if (opts::rw_time)
+ {
+ cout << rs_params.start_time + i << "\t";
+ }
+ cout << rs[i] << endl;
}
- // basecall events
- if (f.have_basecall_events(st))
+ }
+ else
+ {
+ auto rs_int = f.get_raw_samples_int(opts::rn);
+ if (opts::rw_time)
{
- auto ev = f.get_basecall_events(st);
- cout << "basecall(" << st << ")/events/size=" << ev.size() << endl;
- const auto& e = ev.front();
- cout << " (mean=" << e.mean
- << ", stdv=" << e.stdv
- << ", start=" << e.start
- << ", length=" << e.length
- << ", model_state=" << e.get_model_state()
- << ", p_model_state=" << e.p_model_state
- << ", move=" << e.move << ")" << endl;
+ cout << "start\t";
}
- // basecall event alignment
- if (st == 2 and f.have_basecall_event_alignment())
+ cout << "curr_i" << endl;
+ for (unsigned i = 0; i < rs_int.size(); ++i)
{
- auto al = f.get_basecall_event_alignment();
- cout << "basecall(2)/event_alignment/size=" << al.size() << endl;
- const auto& e = al.front();
- cout << " (template_index=" << e.template_index
- << ", complement_index=" << e.complement_index
- << ", kmer=" << e.get_kmer() << ")" << endl;
+ if (opts::rw_time)
+ {
+ cout << rs_params.start_time + i << "\t";
+ }
+ cout << rs_int[i] << endl;
}
- } // for st
- }
- catch (hdf5_tools::Exception& e)
+ }
+ } // if opts::rw
+ //
+ // event detection
+ //
+ if (opts::ed and f.have_eventdetection_events(opts::gr, opts::rn))
{
- cout << "hdf5 error: " << e.what() << endl;
- }
+ auto ede_params = f.get_eventdetection_event_params(opts::gr, opts::rn);
+ cout
+ << "#read_id=" << ede_params.read_id << endl
+ << "#read_number=" << ede_params.read_number << endl
+ << "#start_mux=" << ede_params.start_mux << endl
+ << "#start_time=" << ede_params.start_time << endl
+ << "#duration=" << ede_params.duration << endl
+ << "#scaling_used=" << ede_params.scaling_used << endl
+ ;
+ auto ede = f.get_eventdetection_events(opts::gr, opts::rn);
+ cout
+ << "start\tlength\tmean\tstdv" << endl
+ << alg::os_join(ede, "\n", [] (fast5::EventDetection_Event_Entry const & e) {
+ ostringstream oss;
+ oss.precision(opts::float_prec);
+ oss << e.start << "\t" << e.length << "\t" << e.mean << "\t" << e.stdv;
+ return oss.str();
+ })
+ << endl;
+ } // if opts::ed
//
- // fast5 file is closed by its destructor at the end of this scope
+ // basecall events
+ //
+ if (opts::ev and f.have_basecall_events(opts::st, opts::gr))
+ {
+ auto bce = f.get_basecall_events(opts::st, opts::gr);
+ cout
+ << alg::os_join(bce, "\n", [&channel_id_params] (fast5::Event_Entry const & e) {
+ ostringstream oss;
+ oss.precision(opts::float_prec);
+ if (not opts::time_int)
+ {
+ oss << e.start << "\t" << e.length << "\t";
+ }
+ else
+ {
+ oss
+ << time_int(e.start, channel_id_params) << "\t"
+ << time_int(e.length, channel_id_params) << "\t";
+ }
+ oss
+ << e.mean << "\t"
+ << e.stdv << "\t"
+ << string(e.model_state.begin(), e.model_state.end()).data() << "\t"
+ << e.move;
+ return oss.str();
+ })
+ << endl;
+ } // if opts::ev
//
+ // basecall fastq
+ //
+ if (opts::fq and f.have_basecall_fastq(opts::st, opts::gr))
+ {
+ auto fq = f.get_basecall_fastq(opts::st, opts::gr);
+ cout << fq << endl;
+ } // if opts::fq
+ }
+ catch (hdf5_tools::Exception& e)
+ {
+ cerr << opts::input_fn.get() << ": HDF5 error: " << e.what() << endl;
+ exit(EXIT_FAILURE);
+ }
+ f.close();
+} // real_main()
+
+int main(int argc, char * argv[])
+{
+ opts::cmd_parser.parse(argc, argv);
+ //clog
+ // << "program: " << opts::cmd_parser.getProgramName() << endl
+ // << "version: " << opts::cmd_parser.getVersion() << endl
+ // << "args: " << opts::cmd_parser.getOrigArgv() << endl;
+ if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq == 0)
+ {
+ opts::ls.set(true);
+ }
+ else if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq > 1)
+ {
+ cerr << "at most one of --ls/--id/--rw/--ed/--ev/--fq must be given" << endl;
+ exit(EXIT_FAILURE);
}
- assert(fast5::File::get_object_count() == 0);
+ cout.precision(opts::float_prec);
+ real_main();
}
diff --git a/src/f5dump-full.cpp b/src/f5ls-full.cpp
similarity index 100%
rename from src/f5dump-full.cpp
rename to src/f5ls-full.cpp
diff --git a/src/f5dump.cpp b/src/f5ls.cpp
similarity index 100%
copy from src/f5dump.cpp
copy to src/f5ls.cpp
diff --git a/src/fast5.hpp b/src/fast5.hpp
index 62ef69a..14ae3cd 100644
--- a/src/fast5.hpp
+++ b/src/fast5.hpp
@@ -42,6 +42,7 @@ typedef std::map< std::string, std::string > Tracking_Id_Parameters;
typedef std::map< std::string, std::string > Sequences_Parameters;
typedef float Raw_Samples_Entry;
+typedef int16_t Raw_Samples_Int_Entry;
struct Raw_Samples_Parameters
{
@@ -305,10 +306,24 @@ public:
}
/**
* Check if raw samples exist.
+ * If _rn non-empty, check if raw samples exist for given read.
*/
- bool have_raw_samples() const
+ bool have_raw_samples(const std::string& _rn = std::string()) const
{
- return have_channel_id_params() and not get_raw_samples_read_name_list().empty();
+ if (not have_channel_id_params())
+ {
+ return false;
+ }
+ auto rn_l = get_raw_samples_read_name_list();
+ if (_rn.empty())
+ {
+ return not rn_l.empty();
+ }
+ else
+ {
+ std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
+ return rn_d.count(_rn) > 0;
+ }
}
/**
* Get raw samples attributes for given read name (default: first read name).
@@ -326,20 +341,29 @@ public:
return res;
}
/**
+ * Get raw samples for given read name as ints (default: first read name).
+ */
+ std::vector< Raw_Samples_Int_Entry > get_raw_samples_int(const std::string& _rn = std::string()) const
+ {
+ // get raw samples
+ std::vector< Raw_Samples_Int_Entry > res;
+ const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
+ Base::read(raw_samples_path(rn), res);
+ return res;
+ }
+ /**
* Get raw samples for given read name (default: first read name).
*/
std::vector< Raw_Samples_Entry > get_raw_samples(const std::string& _rn = std::string()) const
{
// get raw samples
- std::vector< uint16_t > raw_samples;
- const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
- Base::read(raw_samples_path(rn), raw_samples);
+ auto raw_samples_int = get_raw_samples_int(_rn);
// get scaling parameters
auto channel_id_params = get_channel_id_params();
// decode levels
std::vector< Raw_Samples_Entry > res;
- res.reserve(raw_samples.size());
- for (auto int_level : raw_samples)
+ res.reserve(raw_samples_int.size());
+ for (auto int_level : raw_samples_int)
{
res.push_back((static_cast< float >(int_level) + channel_id_params.offset)
* channel_id_params.range / channel_id_params.digitisation);
@@ -370,9 +394,13 @@ public:
return detect_eventdetection_read_name_list(ed_gr);
}
/**
- * Check if EventDetection events exist for given EventDetection group (default: first EventDetection group).
+ * Check if EventDetection events exist.
+ * If _ed_gr given: check if events exist for given group; else: check first EventDetection group.
+ * If _rn given: check if events exist for given group and read name.
*/
- bool have_eventdetection_events(const std::string& _ed_gr = std::string()) const
+ bool have_eventdetection_events(
+ const std::string& _ed_gr = std::string(),
+ const std::string& _rn = std::string()) const
{
std::string ed_gr;
if (_ed_gr.empty())
@@ -385,7 +413,16 @@ public:
{
ed_gr = _ed_gr;
}
- return not get_eventdetection_read_name_list(ed_gr).empty();
+ auto rn_l = get_eventdetection_read_name_list(ed_gr);
+ if (_rn.empty())
+ {
+ return not rn_l.empty();
+ }
+ else
+ {
+ std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
+ return rn_d.count(_rn) > 0;
+ }
}
/**
* Get EventDetection params for given EventDetection group (default: first EventDetection group).
@@ -597,7 +634,8 @@ public:
{
if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- return Base::dataset_exists(basecall_model_path(bc_gr, st));
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ return Base::dataset_exists(basecall_model_path(bc_gr_1d, st));
}
/**
* Get Basecall model file name for given Basecall group and given strand.
@@ -606,13 +644,15 @@ public:
{
std::string res;
const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- assert(Base::exists(basecall_model_file_path(bc_gr, st)));
- Base::read(basecall_model_file_path(bc_gr, st), res);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ assert(Base::exists(basecall_model_file_path(bc_gr_1d, st)));
+ Base::read(basecall_model_file_path(bc_gr_1d, st), res);
return res;
}
void add_basecall_model_file(unsigned st, const std::string& bc_gr, const std::string& file_name) const
{
- std::string path = basecall_model_file_path(bc_gr, st);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ std::string path = basecall_model_file_path(bc_gr_1d, st);
Base::write(path, false, file_name);
}
/**
@@ -622,7 +662,8 @@ public:
{
Model_Parameters res;
const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- std::string path = basecall_model_path(bc_gr, st);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ std::string path = basecall_model_path(bc_gr_1d, st);
Base::read(path + "/scale", res.scale);
Base::read(path + "/shift", res.shift);
Base::read(path + "/drift", res.drift);
@@ -634,7 +675,8 @@ public:
template < typename T >
void add_basecall_model_params(unsigned st, const std::string& bc_gr, const T& params) const
{
- std::string path = basecall_model_path(bc_gr, st);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ std::string path = basecall_model_path(bc_gr_1d, st);
Base::write(path + "/scale", false, params.scale);
Base::write(path + "/shift", false, params.shift);
Base::write(path + "/drift", false, params.drift);
@@ -655,7 +697,8 @@ public:
m.add_member("level_stdv", &Model_Entry::level_stdv);
m.add_member("sd_mean", &Model_Entry::sd_mean);
m.add_member("sd_stdv", &Model_Entry::sd_stdv);
- Base::read(basecall_model_path(bc_gr, st), res, m);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ Base::read(basecall_model_path(bc_gr_1d, st), res, m);
return res;
}
/**
@@ -670,7 +713,8 @@ public:
cm.add_member("level_stdv", &T::level_stdv);
cm.add_member("sd_mean", &T::sd_mean);
cm.add_member("sd_stdv", &T::sd_stdv);
- Base::write(basecall_model_path(bc_gr, st), true, m, cm);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ Base::write(basecall_model_path(bc_gr_1d, st), true, m, cm);
}
/**
* Check if Basecall events exist for given Basecall group and given strand.
@@ -679,7 +723,8 @@ public:
{
if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- return Base::dataset_exists(basecall_events_path(bc_gr, st));
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ return Base::dataset_exists(basecall_events_path(bc_gr_1d, st));
}
/**
* Get Basecall events for given Basecall group and given strand.
@@ -696,7 +741,8 @@ public:
m.add_member("p_model_state", &Event_Entry::p_model_state);
m.add_member("model_state", &Event_Entry::model_state);
m.add_member("move", &Event_Entry::move);
- Base::read(basecall_events_path(bc_gr, st), res, m);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ Base::read(basecall_events_path(bc_gr_1d, st), res, m);
return res;
}
/**
@@ -713,7 +759,8 @@ public:
cm.add_member("p_model_state", &T::p_model_state);
cm.add_member("model_state", &T::model_state);
cm.add_member("move", &T::move);
- Base::write(basecall_events_path(bc_gr, st), true, ev, cm);
+ auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+ Base::write(basecall_events_path(bc_gr_1d, st), true, ev, cm);
}
/**
* Check if Basecall event alignment exist for given Basecall group.
@@ -739,32 +786,78 @@ public:
return res;
}
- static std::string fq2seq(const std::string& fq)
- {
- size_t nl1_pos = fq.find_first_of('\n');
- if (nl1_pos == std::string::npos) return std::string();
- size_t nl2_pos = fq.find_first_of('\n', nl1_pos + 1);
- if (nl2_pos == std::string::npos) return std::string();
- return fq.substr(nl1_pos + 1, nl2_pos - nl1_pos - 1);
- }
-
/**
- * Access alternate 1d basecall group.
+ * Get basecall group holding 1d calls.
*/
- static bool have_basecall_alt_1d_group(const std::string& bc_gr)
+ std::string get_basecall_group_1d(const std::string& bc_gr) const
{
- return bc_gr.substr(0, 3) == "2D_";
+ std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/basecall_1d";
+ if (Base::attribute_exists(path))
+ {
+ std::string tmp;
+ Base::read(path, tmp);
+ auto tmp1 = tmp.substr(0, 18);
+ auto tmp2 = tmp.substr(18);
+ if (tmp1 == "Analyses/Basecall_"
+ and Base::group_exists(basecall_root_path() + "/" + basecall_group_prefix() + tmp2))
+ {
+ return tmp2;
+ }
+ }
+ return bc_gr;
}
- static std::string get_basecall_alt_1d_group(const std::string& bc_gr)
+ /**
+ * Get EventDetection group for given Basecall group, if available.
+ */
+ std::string get_basecall_eventdetection_group(const std::string& bc_gr) const
{
- if (have_basecall_alt_1d_group(bc_gr))
+ std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/event_detection";
+ if (Base::attribute_exists(path))
{
- return std::string("1D_") + bc_gr.substr(3);
+ std::string tmp;
+ Base::read(path, tmp);
+ auto pos = tmp.find(eventdetection_group_prefix());
+ if (pos != std::string::npos)
+ {
+ pos += eventdetection_group_prefix().size();
+ auto end_pos = tmp.find("/", pos);
+ if (end_pos == std::string::npos)
+ {
+ end_pos = tmp.size();
+ }
+ return tmp.substr(pos, end_pos - pos);
+ }
}
- else
+ return std::string();
+ }
+
+ static std::string fq2seq(const std::string& fq)
+ {
+ return split_fq(fq)[1];
+ }
+ static std::array< std::string, 4 > split_fq(const std::string& fq)
+ {
+ std::array< std::string, 4 > res = {{"", "", "", ""}};
+ size_t i = 0;
+ for (int k = 0; k < 4; ++k)
{
- return bc_gr;
+ if (k % 2 == 0) ++i;
+ size_t j = fq.find_first_of('\n', i);
+ if (j == std::string::npos)
+ {
+ if (k == 3)
+ {
+ j = fq.size();
+ }
+ else
+ {
+ return {{"", "", "", ""}};
+ }
+ }
+ res[k] = fq.substr(i, j - i);
+ i = j + 1;
}
+ return res;
}
private:
diff --git a/src/hdf5-mod.cpp b/src/hdf5-mod.cpp
index 60dc01c..314bdb3 100644
--- a/src/hdf5-mod.cpp
+++ b/src/hdf5-mod.cpp
@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
assert(f.is_open());
assert(f.is_rw());
//
- // write a /file_version to allow f5dump to work
+ // write a /file_version to allow f5ls to work
//
string file_version("42");
f.write("/file_version", false, file_version);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fast5.git
More information about the debian-med-commit
mailing list