[med-svn] [fast5] 01/03: Imported Upstream version 0.5.9

Afif Elghraoui afif at moszumanska.debian.org
Wed Dec 28 07:32:52 UTC 2016


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository fast5.

commit d798d488a7920eb6290069c20b81cd231889d547
Author: Afif Elghraoui <afif at debian.org>
Date:   Tue Dec 27 21:24:10 2016 -0800

    Imported Upstream version 0.5.9
---
 .travis.yml                            |   2 +-
 README.org                             |  49 ++++-
 VERSION                                |   2 +-
 python/fast5/version.py                |   2 +-
 python/setup.py                        |   2 +-
 src/.gitignore                         |   5 +-
 src/Makefile                           |  14 +-
 src/f5dump.cpp                         | 366 +++++++++++++++++++--------------
 src/{f5dump-full.cpp => f5ls-full.cpp} |   0
 src/{f5dump.cpp => f5ls.cpp}           |   0
 src/fast5.hpp                          | 169 +++++++++++----
 src/hdf5-mod.cpp                       |   2 +-
 12 files changed, 415 insertions(+), 198 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 58c3113..2d95ffb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,5 +15,5 @@ install:
     - docker run --rm -v $PWD:/data fast5 bash -c 'virtualenv build-venv && source build-venv/bin/activate && make -C python -e develop'
 
 script:
-    - docker run --rm -v $PWD:/data fast5 bash -c 'src/hdf5-mod -f file.000.fast5 && src/f5-mod file.000.fast5 && src/f5dump file.000.fast5 && src/f5dump-full file.000.fast5'
+    - docker run --rm -v $PWD:/data fast5 bash -c 'src/hdf5-mod -f file.000.fast5 && src/f5-mod file.000.fast5 && src/f5ls file.000.fast5 && src/f5ls-full file.000.fast5'
     - docker run --rm -v $PWD:/data fast5 bash -c 'source build-venv/bin/activate && python -c "import fast5; f = fast5.File(\"file.000.fast5\"); print(f.file_version()); print(f.have_eventdetection_events())"'
diff --git a/README.org b/README.org
index 14a23ff..27a8903 100644
--- a/README.org
+++ b/README.org
@@ -14,7 +14,7 @@ This is a header-only library. You only need to copy [[file:src/fast5.hpp][src/f
 
 **** Usage
 
-See [[file:src/f5dump.cpp][src/f5dump.cpp]] for an example.
+See [[file:src/f5ls.cpp][src/f5ls.cpp]] for an example.
 
 *** Python Wrapper
 
@@ -42,6 +42,53 @@ print(f.file_version())
 print(f.have_eventdetection_events())
 #+END_EXAMPLE
 
+*** f5dump
+
+The program =f5dump= can be used to list and extract some of the contents of =fast5= files, including: raw signals, event-detection events, basecall events, and basecall fastq.
+
+**** Installation
+
+In addition to this =fast5= repository, you will need HDF5 (headers and libraries), as well the the header-only libraries [[https://github.com/mateidavid/tclap.git][TCLAP]] and [[https://github.com/mateidavid/hpptools.git][HPPTOOLS]]. To build =f5dump=, run =make f5dump [VAR1=VALUE1] ...=, where =VAR=-s are used to instruct the [[file:src/Makefile][Makefile]] where to find various dependencies.
+
+**** Usage
+
+In each run, =f5dump= requires exactly one command among: =--ls/--id/--rw/--ed/--ev/--fq=. If no command is given, =--ls= is assumed. It also requires exactly one =fast5= file to inspect.
+
+- In =--ls= mode, =f5dump= lists some of the contents of the file. Sample output:
+
+  #+BEGIN_EXAMPLE
+rw      Read_1019
+ed      000     Read_1019
+ed      001     Read_1019
+bc2d    2D_000  2       1       1       1D_000
+bc1d    1D_000  0       1       1       001
+bc1d    1D_000  1       1       1       001
+#+END_EXAMPLE
+
+  Explanations:
+
+  - =rw=: the file contains raw samples from one read, =Read_1019=.
+
+  - =ed=: the file contains 2 event-detection groups, =000= and =001=, both run on raw samples from =Read_1019=.
+
+  - =bc2d=: the file contains 1 basecall group =2D_000= with 2D data (=2=); this group has both fastq data and events (=1 1=); its corresponding 1D basecall group is =1D_000=.
+
+  - =bc1d=: the file contains 1 basecall group =1D_000= with 1D data for each strand (=0= and =1=); each contains fastq data and events (=1 1=); its corresponding event-detection group is =001=.
+
+  Notes:
+
+  - The group names are suffixes understood by the =fast5= library. E.g., the basecall group =RNN_1D_000= would correspond to the HDF5 group =/Analyses/Basecall_RNN_1D_000=.
+
+  - Not all the links between groups are always available. Notably, some =fast5= files are missing the link between a 1D basecall group and its original event-detection group.
+
+- In =--id= mode, =f5dump= dumps =channel_id= and =tracking_id= metadata.
+
+- In =--rw/--ed/--ev/--fq= mode, =f5dump= dumps: raw signal data/event-detection events/basecall events/basecall fastq data.
+
+- Optional selector flags =--gr/--st/--rn= can be used to specify a group name, strand (=0/1/2=), or read name. Not all combinations make sense: e.g, =--st= is ignored for event-detection data.
+
+- Optional output flags =--time-int/--curr-int/--rw-time= can modify the output: convert times into integers, dump raw signal currents in internal integer encoding, and add time stamps to raw signals.
+
 *** License
 
 [[file:LICENSE][MIT License]].
diff --git a/VERSION b/VERSION
index 659914a..416bfb0 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.5.8
+0.5.9
diff --git a/python/fast5/version.py b/python/fast5/version.py
index 91201fc..eaddd12 100644
--- a/python/fast5/version.py
+++ b/python/fast5/version.py
@@ -1 +1 @@
-__version__ = '0.5.8'
+__version__ = '0.5.9'
diff --git a/python/setup.py b/python/setup.py
old mode 100755
new mode 100644
index c15cabf..487dede
--- a/python/setup.py
+++ b/python/setup.py
@@ -16,7 +16,7 @@ exec(open('fast5/version.py').read())
 hdf5_dir = os.environ.get('HDF5_DIR', '/usr')
 hdf5_include_dir = os.environ.get('HDF5_INCLUDE_DIR', os.path.join(hdf5_dir, 'include'))
 hdf5_lib_dir = os.environ.get('HDF5_LIB_DIR', os.path.join(hdf5_dir, 'lib'))
-hdf5_lib = os.environ.get('HDF_LIB', 'hdf5')
+hdf5_lib = os.environ.get('HDF5_LIB', 'hdf5')
 if not os.path.isfile(os.path.join(hdf5_include_dir, 'H5pubconf.h')):
     sys.exit(hdf5_include_dir + ': could not find HDF5 header files; use HDF5_DIR or HDF5_INCLUDE_DIR')
 if (not os.path.isfile(os.path.join(hdf5_lib_dir, 'lib' + hdf5_lib + '.so'))
diff --git a/src/.gitignore b/src/.gitignore
index 869f46c..70906a8 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1,4 +1,5 @@
-f5dump
-f5dump-full
+f5ls
+f5ls-full
 hdf5-mod
 f5-mod
+f5dump
diff --git a/src/Makefile b/src/Makefile
index f9e2325..03b93d6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -8,8 +8,10 @@ HDF5_DIR = /usr/local
 HDF5_INCLUDE_DIR = ${HDF5_DIR}/include
 HDF5_LIB_DIR = ${HDF5_DIR}/lib
 HDF5_LIB = hdf5
+TCLAP_DIR = tclap
+HPPTOOLS_DIR = hpptools
 
-TARGETS = f5dump f5dump-full hdf5-mod f5-mod
+TARGETS = f5ls f5ls-full hdf5-mod f5-mod
 
 all: ${TARGETS}
 
@@ -29,5 +31,15 @@ check_hdf5:
 	@[ -f "${HDF5_INCLUDE_DIR}/H5pubconf.h" ] || { echo "HDF5 headers not found" >&2; exit 1; }
 	@[ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.so" ] || [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.a" ] || { echo "HDF5 library not found" >&2; exit 1; }
 
+check_tclap:
+	@[ -f "${TCLAP_DIR}/include/tclap/CmdLine.h" ] || { echo "TCLAP not found; get it from https://github.com/mateidavid/tclap.git" >&2; exit 1; }
+
+check_hpptools:
+	@[ -f "${HPPTOOLS_DIR}/include/alg.hpp" ] || { echo "HPPTOOLS not found; get it from https://github.com/mateidavid/hpptools.git" >&2; exit 1; }
+
 %: %.cpp fast5.hpp hdf5_tools.hpp | check_hdf5
 	${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+
+f5dump: f5dump.cpp fast5.hpp hdf5_tools.hpp | check_hdf5 check_tclap check_hpptools
+	${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -isystem ${TCLAP_DIR}/include -I ${HPPTOOLS_DIR}/include -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+
diff --git a/src/f5dump.cpp b/src/f5dump.cpp
index af655dd..1dd8984 100644
--- a/src/f5dump.cpp
+++ b/src/f5dump.cpp
@@ -1,20 +1,40 @@
 #include <cassert>
 #include <iostream>
+#include <iomanip>
 #include <string>
 
+#include <tclap/CmdLine.h>
+#include "alg.hpp"
+
 #include "fast5.hpp"
 
 using namespace std;
 
-template < typename T >
-void print_vector(ostream& os, const vector< T >& v, const string& delim)
+
+namespace opts
 {
-    for (auto it = v.begin(); it != v.end(); ++it)
-    {
-        if (it != v.begin()) os << delim;
-        os << *it;
-    }
+    using namespace TCLAP;
+    string description = "Dump contents of ONT fast5 files.";
+    CmdLine cmd_parser(description);
+    //
+    ValueArg< unsigned > float_prec("", "float-prec", "Float precision.", false, 10, "int", cmd_parser);
+    SwitchArg rw_time("", "rw-time", "Add timepoints to raw data.", cmd_parser);
+    SwitchArg curr_int("", "curr-int", "Dump current data encoded as int (raw samples only).", cmd_parser);
+    SwitchArg time_int("", "time-int", "Dump start/length data encoded as int.", cmd_parser);
+    //
+    ValueArg< string > rn("", "rn", "Read name.", false, "", "Read_1015|...", cmd_parser);
+    ValueArg< unsigned > st("", "st", "Strand.", false, 0, "0|1|2", cmd_parser);
+    ValueArg< string > gr("", "gr", "Group name suffix.", false, "", "000|RNN_001|...", cmd_parser);
+    //
+    SwitchArg fq("", "fq", "Dump basecall fastq data.", cmd_parser);
+    SwitchArg ev("", "ev", "Dump basecall event data.", cmd_parser);
+    SwitchArg ed("", "ed", "Dump event detection data.", cmd_parser);
+    SwitchArg rw("", "rw", "Dump raw samples data.", cmd_parser);
+    SwitchArg id("", "id", "Dump channel/tracking id data.", cmd_parser);
+    SwitchArg ls("", "ls", "List groups/read names.", cmd_parser);
+    UnlabeledValueArg< string > input_fn("input", "Fast5 file.", true, "", "file", cmd_parser);
 }
+
 template < typename U, typename V >
 void print_map(ostream& os, const map< U, V >& m, const string& prefix)
 {
@@ -24,174 +44,218 @@ void print_map(ostream& os, const map< U, V >& m, const string& prefix)
     }
 }
 
-int main(int argc, char* argv[])
+unsigned time_int(double f, fast5::Channel_Id_Parameters const & channel_id_params)
 {
-    if (argc != 2)
-    {
-        cerr << "use: " << argv[0] << " <fast5_file>" << endl;
-        return EXIT_FAILURE;
-    }
-    string file_name(argv[1]);
-    //
-    // open the FAST5 file for reading
-    //
-    if (not fast5::File::is_valid_file(file_name))
-    {
-        cout << "not a fast5 file [" << file_name << "]" << endl;
-        return EXIT_SUCCESS;
-    }
+    return f * channel_id_params.sampling_rate;
+}
+
+void real_main()
+{
+    fast5::File f;
+    try
     {
-        fast5::File f;
+        // open file
+        f.open(opts::input_fn);
+        auto channel_id_params = f.get_channel_id_params();
         //
-        // All fast5 operations are performed inside a try-catch block. This should
-        // resist various hdf5 errors without leaking memory.
+        // list
         //
-        try
+        if (opts::ls)
         {
-            //
-            // open file
-            //
-            f.open(file_name);
-            assert(f.is_open());
-            //
-            // extract version information for the ONT software used to generate this dataset
-            //
-            cout << "file_version=" << f.file_version() << endl;
-            //
-            // inspect channel_id params
-            //
-            if (f.have_channel_id_params())
+            // rw
             {
-                auto channel_id_params = f.get_channel_id_params();
-                cout << "channel_id/channel_number=" << channel_id_params.channel_number << endl
-                     << "channel_id/digitisation=" << channel_id_params.digitisation << endl
-                     << "channel_id/offset=" << channel_id_params.offset << endl
-                     << "channel_id/range=" << channel_id_params.range << endl
-                     << "channel_id/sampling_rate=" << channel_id_params.sampling_rate << endl;
+                auto rn_l = f.get_raw_samples_read_name_list();
+                for (auto const & rn : rn_l)
+                {
+                    cout << "rw\t" << rn << endl;
+                }
             }
-            //
-            // inspect tracking_id params
-            //
-            if (f.have_tracking_id_params())
+            // ed
             {
-                auto tracking_id_params = f.get_tracking_id_params();
-                print_map(cout, tracking_id_params, "tracking_id/");
+                auto gr_l = f.get_eventdetection_group_list();
+                for (auto const & gr : gr_l)
+                {
+                    auto rn_l = f.get_eventdetection_read_name_list(gr);
+                    cout << "ed\t" << gr << "\t" << alg::os_join(rn_l, ",") << endl;
+                }
             }
-            //
-            // inspect sequences params
-            //
-            if (f.have_sequences_params())
+            // bc
             {
-                auto sequences_params = f.get_sequences_params();
-                print_map(cout, sequences_params, "sequences/");
+                for (auto st : vector< unsigned >({ 2, 0, 1 }))
+                {
+                    auto gr_l = f.get_basecall_strand_group_list(st);
+                    for (auto const & gr : gr_l)
+                    {
+                        int have_fastq = f.have_basecall_fastq(st, gr);
+                        int have_events = (st == 2
+                                           ? f.have_basecall_events(0, gr) and f.have_basecall_events(1, gr)
+                                           : f.have_basecall_events(st, gr));
+                        string link = (st == 2? f.get_basecall_group_1d(gr) : f.get_basecall_eventdetection_group(gr));
+                        cout
+                            << (st == 2? "bc2d" : "bc1d") << "\t"
+                            << gr << "\t"
+                            << st << "\t"
+                            << have_fastq << "\t"
+                            << have_events << "\t"
+                            << (not link.empty()? link : string(".")) << endl
+                            ;
+                    }
+                }
             }
-            //
-            // inspect raw samples
-            //
-            if (f.have_raw_samples())
+        }
+        //
+        // id
+        //
+        if (opts::id)
+        {
+            cout
+                << "channel_id/channel_number=" << channel_id_params.channel_number << endl
+                << "channel_id/digitisation=" << channel_id_params.digitisation << endl
+                << "channel_id/offset=" << channel_id_params.offset << endl
+                << "channel_id/range=" << channel_id_params.range << endl
+                << "channel_id/sampling_rate=" << channel_id_params.sampling_rate << endl
+                ;
+            if (f.have_tracking_id_params())
             {
-                auto rs_params = f.get_raw_samples_params();
-                auto rs = f.get_raw_samples();
-                cout << "raw_samples/read_id=" << rs_params.read_id << endl
-                     << "raw_samples/read_number=" << rs_params.read_number << endl
-                     << "raw_samples/start_mux=" << rs_params.start_mux << endl
-                     << "raw_samples/start_time=" << rs_params.start_time << endl
-                     << "raw_samples/duration=" << rs_params.duration << endl
-                     << "raw_samples/size=" << rs.size() << endl;
-                const auto& e = rs.front();
-                cout << "  (" << e << ")" << endl;
+                auto tracking_id_params = f.get_tracking_id_params();
+                print_map(cout, tracking_id_params, "tracking_id/");
             }
-            //
-            // inspect eventdetection events
-            //
-            cout << "eventdetection_group_list=";
-            print_vector(cout, f.get_eventdetection_group_list(), ",");
-            cout << endl;
-            if (f.have_eventdetection_events())
-            {
-                auto ed_params = f.get_eventdetection_params();
-                print_map(cout, ed_params, "eventdetection/");
-                auto ed_ev_params = f.get_eventdetection_event_params();
-                auto ed_ev = f.get_eventdetection_events();
-                cout << "eventdetection/events/abasic_found=" << ed_ev_params.abasic_found << endl
-                     << "eventdetection/events/duration=" << ed_ev_params.duration << endl
-                     << "eventdetection/events/median_before=" << ed_ev_params.median_before << endl
-                     << "eventdetection/events/read_id=" << ed_ev_params.read_id << endl
-                     << "eventdetection/events/read_number=" << ed_ev_params.read_number << endl
-                     << "eventdetection/events/scaling_used=" << ed_ev_params.scaling_used << endl
-                     << "eventdetection/events/start_mux=" << ed_ev_params.start_mux << endl
-                     << "eventdetection/events/start_time=" << ed_ev_params.start_time << endl
-                     << "eventdetection/events/size=" << ed_ev.size() << endl;
-                const auto& e = ed_ev.front();
-                cout << "  (mean=" << e.mean
-                     << ", stdv=" << e.stdv
-                     << ", start=" << e.start
-                     << ", length=" << e.length << ")" << endl;
-            } // if have_eventdetection_events
-            //
-            // inspect basecall groups
-            //
-            for (unsigned st = 0; st < 3; ++st)
+        }
+        //
+        // raw samples
+        //
+        if (opts::rw and f.have_raw_samples(opts::rn))
+        {
+            auto rs_params = f.get_raw_samples_params(opts::rn);
+            cout
+                << "#read_id=" << rs_params.read_id << endl
+                << "#read_number=" << rs_params.read_number << endl
+                << "#start_mux=" << rs_params.start_mux << endl
+                << "#start_time=" << rs_params.start_time << endl
+                << "#duration=" << rs_params.duration << endl
+                ;
+            if (not opts::curr_int)
             {
-                cout << "basecall(" << st << ")/group_list=";
-                print_vector(cout, f.get_basecall_strand_group_list(st), ",");
-                cout << endl;
-                // basecall sequence
-                if (f.have_basecall_seq(st))
+                auto rs = f.get_raw_samples(opts::rn);
+                if (opts::rw_time)
                 {
-                    cout << "basecall(" << st << ")/seq_size=" << f.get_basecall_seq(st).size() << endl;
+                    cout << "start\t";
                 }
-                // basecall model
-                if (f.have_basecall_model(st))
+                cout << "curr_f" << endl;
+                for (unsigned i = 0; i < rs.size(); ++i)
                 {
-                    cout << "basecall(" << st << ")/model_file=" << f.get_basecall_model_file(st) << endl;
-                    auto m_params = f.get_basecall_model_params(st);
-                    auto m = f.get_basecall_model(st);
-                    cout << "basecall(" << st << ")/model/scale=" << m_params.scale << endl
-                         << "basecall(" << st << ")/model/shift=" << m_params.shift << endl
-                         << "basecall(" << st << ")/model/drift=" << m_params.drift << endl
-                         << "basecall(" << st << ")/model/var=" << m_params.var << endl
-                         << "basecall(" << st << ")/model/scale_sd=" << m_params.scale_sd << endl
-                         << "basecall(" << st << ")/model/var_sd=" << m_params.var_sd << endl
-                         << "basecall(" << st << ")/model/size=" << m.size() << endl;
-                    const auto& e = m.front();
-                    cout << "  (kmer=" << e.get_kmer()
-                         << ", level_mean=" << e.level_mean
-                         << ", level_stdv=" << e.level_stdv << ")" << endl;
+                    if (opts::rw_time)
+                    {
+                        cout << rs_params.start_time + i << "\t";
+                    }
+                    cout << rs[i] << endl;
                 }
-                // basecall events
-                if (f.have_basecall_events(st))
+            }
+            else
+            {
+                auto rs_int = f.get_raw_samples_int(opts::rn);
+                if (opts::rw_time)
                 {
-                    auto ev = f.get_basecall_events(st);
-                    cout << "basecall(" << st << ")/events/size=" << ev.size() << endl;
-                    const auto& e = ev.front();
-                    cout << "  (mean=" << e.mean
-                         << ", stdv=" << e.stdv
-                         << ", start=" << e.start
-                         << ", length=" << e.length
-                         << ", model_state=" << e.get_model_state()
-                         << ", p_model_state=" << e.p_model_state
-                         << ", move=" << e.move << ")" << endl;
+                    cout << "start\t";
                 }
-                // basecall event alignment
-                if (st == 2 and f.have_basecall_event_alignment())
+                cout << "curr_i" << endl;
+                for (unsigned i = 0; i < rs_int.size(); ++i)
                 {
-                    auto al = f.get_basecall_event_alignment();
-                    cout << "basecall(2)/event_alignment/size=" << al.size() << endl;
-                    const auto& e = al.front();
-                    cout << "  (template_index=" << e.template_index
-                         << ", complement_index=" << e.complement_index
-                         << ", kmer=" << e.get_kmer() << ")" << endl;
+                    if (opts::rw_time)
+                    {
+                        cout << rs_params.start_time + i << "\t";
+                    }
+                    cout << rs_int[i] << endl;
                 }
-            } // for st
-        }
-        catch (hdf5_tools::Exception& e)
+            }
+        } // if opts::rw
+        //
+        // event detection
+        //
+        if (opts::ed and f.have_eventdetection_events(opts::gr, opts::rn))
         {
-            cout << "hdf5 error: " << e.what() << endl;
-        }
+            auto ede_params = f.get_eventdetection_event_params(opts::gr, opts::rn);
+            cout
+                << "#read_id=" << ede_params.read_id << endl
+                << "#read_number=" << ede_params.read_number << endl
+                << "#start_mux=" << ede_params.start_mux << endl
+                << "#start_time=" << ede_params.start_time << endl
+                << "#duration=" << ede_params.duration << endl
+                << "#scaling_used=" << ede_params.scaling_used << endl
+                ;
+            auto ede = f.get_eventdetection_events(opts::gr, opts::rn);
+            cout
+                << "start\tlength\tmean\tstdv" << endl
+                << alg::os_join(ede, "\n", [] (fast5::EventDetection_Event_Entry const & e) {
+                        ostringstream oss;
+                        oss.precision(opts::float_prec);
+                        oss << e.start << "\t" << e.length << "\t" << e.mean << "\t" << e.stdv;
+                        return oss.str();
+                    })
+                << endl;
+        } // if opts::ed
         //
-        // fast5 file is closed by its destructor at the end of this scope
+        // basecall events
+        //
+        if (opts::ev and f.have_basecall_events(opts::st, opts::gr))
+        {
+            auto bce = f.get_basecall_events(opts::st, opts::gr);
+            cout
+                << alg::os_join(bce, "\n", [&channel_id_params] (fast5::Event_Entry const & e) {
+                        ostringstream oss;
+                        oss.precision(opts::float_prec);
+                        if (not opts::time_int)
+                        {
+                            oss << e.start << "\t" << e.length << "\t";
+                        }
+                        else
+                        {
+                            oss
+                                << time_int(e.start, channel_id_params) << "\t"
+                                << time_int(e.length, channel_id_params) << "\t";
+                        }
+                        oss
+                            << e.mean << "\t"
+                            << e.stdv << "\t"
+                            << string(e.model_state.begin(), e.model_state.end()).data() << "\t"
+                            << e.move;
+                        return oss.str();
+                    })
+                << endl;
+        } // if opts::ev
         //
+        // basecall fastq
+        //
+        if (opts::fq and f.have_basecall_fastq(opts::st, opts::gr))
+        {
+            auto fq = f.get_basecall_fastq(opts::st, opts::gr);
+            cout << fq << endl;
+        } // if opts::fq
+    }
+    catch (hdf5_tools::Exception& e)
+    {
+        cerr << opts::input_fn.get() << ": HDF5 error: " << e.what() << endl;
+        exit(EXIT_FAILURE);
+    }
+    f.close();
+} // real_main()
+
+int main(int argc, char * argv[])
+{
+    opts::cmd_parser.parse(argc, argv);
+    //clog
+    //    << "program: " << opts::cmd_parser.getProgramName() << endl
+    //    << "version: " << opts::cmd_parser.getVersion() << endl
+    //    << "args: " << opts::cmd_parser.getOrigArgv() << endl;
+    if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq == 0)
+    {
+        opts::ls.set(true);
+    }
+    else if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq > 1)
+    {
+        cerr << "at most one of --ls/--id/--rw/--ed/--ev/--fq must be given" << endl;
+        exit(EXIT_FAILURE);
     }
-    assert(fast5::File::get_object_count() == 0);
+    cout.precision(opts::float_prec);
+    real_main();
 }
diff --git a/src/f5dump-full.cpp b/src/f5ls-full.cpp
similarity index 100%
rename from src/f5dump-full.cpp
rename to src/f5ls-full.cpp
diff --git a/src/f5dump.cpp b/src/f5ls.cpp
similarity index 100%
copy from src/f5dump.cpp
copy to src/f5ls.cpp
diff --git a/src/fast5.hpp b/src/fast5.hpp
index 62ef69a..14ae3cd 100644
--- a/src/fast5.hpp
+++ b/src/fast5.hpp
@@ -42,6 +42,7 @@ typedef std::map< std::string, std::string > Tracking_Id_Parameters;
 typedef std::map< std::string, std::string > Sequences_Parameters;
 
 typedef float Raw_Samples_Entry;
+typedef int16_t Raw_Samples_Int_Entry;
 
 struct Raw_Samples_Parameters
 {
@@ -305,10 +306,24 @@ public:
     }
     /**
      * Check if raw samples exist.
+     * If _rn non-empty, check if raw samples exist for given read.
      */
-    bool have_raw_samples() const
+    bool have_raw_samples(const std::string& _rn = std::string()) const
     {
-        return have_channel_id_params() and not get_raw_samples_read_name_list().empty();
+        if (not have_channel_id_params())
+        {
+            return false;
+        }
+        auto rn_l = get_raw_samples_read_name_list();
+        if (_rn.empty())
+        {
+            return not rn_l.empty();
+        }
+        else
+        {
+            std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
+            return rn_d.count(_rn) > 0;
+        }
     }
     /**
      * Get raw samples attributes for given read name (default: first read name).
@@ -326,20 +341,29 @@ public:
         return res;
     }
     /**
+     * Get raw samples for given read name as ints (default: first read name).
+     */
+    std::vector< Raw_Samples_Int_Entry > get_raw_samples_int(const std::string& _rn = std::string()) const
+    {
+        // get raw samples
+        std::vector< Raw_Samples_Int_Entry > res;
+        const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
+        Base::read(raw_samples_path(rn), res);
+        return res;
+    }
+    /**
      * Get raw samples for given read name (default: first read name).
      */
     std::vector< Raw_Samples_Entry > get_raw_samples(const std::string& _rn = std::string()) const
     {
         // get raw samples
-        std::vector< uint16_t > raw_samples;
-        const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
-        Base::read(raw_samples_path(rn), raw_samples);
+        auto raw_samples_int = get_raw_samples_int(_rn);
         // get scaling parameters
         auto channel_id_params = get_channel_id_params();
         // decode levels
         std::vector< Raw_Samples_Entry > res;
-        res.reserve(raw_samples.size());
-        for (auto int_level : raw_samples)
+        res.reserve(raw_samples_int.size());
+        for (auto int_level : raw_samples_int)
         {
             res.push_back((static_cast< float >(int_level) + channel_id_params.offset)
                           * channel_id_params.range / channel_id_params.digitisation);
@@ -370,9 +394,13 @@ public:
         return detect_eventdetection_read_name_list(ed_gr);
     }
     /**
-     * Check if EventDetection events exist for given EventDetection group (default: first EventDetection group).
+     * Check if EventDetection events exist.
+     * If _ed_gr given: check if events exist for given group; else: check first EventDetection group.
+     * If _rn given: check if events exist for given group and read name.
      */
-    bool have_eventdetection_events(const std::string& _ed_gr = std::string()) const
+    bool have_eventdetection_events(
+        const std::string& _ed_gr = std::string(),
+        const std::string& _rn = std::string()) const
     {
         std::string ed_gr;
         if (_ed_gr.empty())
@@ -385,7 +413,16 @@ public:
         {
             ed_gr = _ed_gr;
         }
-        return not get_eventdetection_read_name_list(ed_gr).empty();
+        auto rn_l = get_eventdetection_read_name_list(ed_gr);
+        if (_rn.empty())
+        {
+            return not rn_l.empty();
+        }
+        else
+        {
+            std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
+            return rn_d.count(_rn) > 0;
+        }
     }
     /**
      * Get EventDetection params for given EventDetection group (default: first EventDetection group).
@@ -597,7 +634,8 @@ public:
     {
         if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
         const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
-        return Base::dataset_exists(basecall_model_path(bc_gr, st));
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        return Base::dataset_exists(basecall_model_path(bc_gr_1d, st));
     }
     /**
      * Get Basecall model file name for given Basecall group and given strand.
@@ -606,13 +644,15 @@ public:
     {
         std::string res;
         const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
-        assert(Base::exists(basecall_model_file_path(bc_gr, st)));
-        Base::read(basecall_model_file_path(bc_gr, st), res);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        assert(Base::exists(basecall_model_file_path(bc_gr_1d, st)));
+        Base::read(basecall_model_file_path(bc_gr_1d, st), res);
         return res;
     }
     void add_basecall_model_file(unsigned st, const std::string& bc_gr, const std::string& file_name) const
     {
-        std::string path = basecall_model_file_path(bc_gr, st);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        std::string path = basecall_model_file_path(bc_gr_1d, st);
         Base::write(path, false, file_name);
     }
     /**
@@ -622,7 +662,8 @@ public:
     {
         Model_Parameters res;
         const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
-        std::string path = basecall_model_path(bc_gr, st);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        std::string path = basecall_model_path(bc_gr_1d, st);
         Base::read(path + "/scale", res.scale);
         Base::read(path + "/shift", res.shift);
         Base::read(path + "/drift", res.drift);
@@ -634,7 +675,8 @@ public:
     template < typename T >
     void add_basecall_model_params(unsigned st, const std::string& bc_gr, const T& params) const
     {
-        std::string path = basecall_model_path(bc_gr, st);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        std::string path = basecall_model_path(bc_gr_1d, st);
         Base::write(path + "/scale", false, params.scale);
         Base::write(path + "/shift", false, params.shift);
         Base::write(path + "/drift", false, params.drift);
@@ -655,7 +697,8 @@ public:
         m.add_member("level_stdv", &Model_Entry::level_stdv);
         m.add_member("sd_mean", &Model_Entry::sd_mean);
         m.add_member("sd_stdv", &Model_Entry::sd_stdv);
-        Base::read(basecall_model_path(bc_gr, st), res, m);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        Base::read(basecall_model_path(bc_gr_1d, st), res, m);
         return res;
     }
     /**
@@ -670,7 +713,8 @@ public:
         cm.add_member("level_stdv", &T::level_stdv);
         cm.add_member("sd_mean", &T::sd_mean);
         cm.add_member("sd_stdv", &T::sd_stdv);
-        Base::write(basecall_model_path(bc_gr, st), true, m, cm);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        Base::write(basecall_model_path(bc_gr_1d, st), true, m, cm);
     }
     /**
      * Check if Basecall events exist for given Basecall group and given strand.
@@ -679,7 +723,8 @@ public:
     {
         if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
         const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
-        return Base::dataset_exists(basecall_events_path(bc_gr, st));
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        return Base::dataset_exists(basecall_events_path(bc_gr_1d, st));
     }
     /**
      * Get Basecall events for given Basecall group and given strand.
@@ -696,7 +741,8 @@ public:
         m.add_member("p_model_state", &Event_Entry::p_model_state);
         m.add_member("model_state", &Event_Entry::model_state);
         m.add_member("move", &Event_Entry::move);
-        Base::read(basecall_events_path(bc_gr, st), res, m);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        Base::read(basecall_events_path(bc_gr_1d, st), res, m);
         return res;
     }
     /**
@@ -713,7 +759,8 @@ public:
         cm.add_member("p_model_state", &T::p_model_state);
         cm.add_member("model_state", &T::model_state);
         cm.add_member("move", &T::move);
-        Base::write(basecall_events_path(bc_gr, st), true, ev, cm);
+        auto bc_gr_1d = get_basecall_group_1d(bc_gr);
+        Base::write(basecall_events_path(bc_gr_1d, st), true, ev, cm);
     }
     /**
      * Check if Basecall event alignment exist for given Basecall group.
@@ -739,32 +786,78 @@ public:
         return res;
     }
 
-    static std::string fq2seq(const std::string& fq)
-    {
-        size_t nl1_pos = fq.find_first_of('\n');
-        if (nl1_pos == std::string::npos) return std::string();
-        size_t nl2_pos = fq.find_first_of('\n', nl1_pos + 1);
-        if (nl2_pos == std::string::npos) return std::string();
-        return fq.substr(nl1_pos + 1, nl2_pos - nl1_pos - 1);
-    }
-
     /**
-     * Access alternate 1d basecall group.
+     * Get basecall group holding 1d calls.
      */
-    static bool have_basecall_alt_1d_group(const std::string& bc_gr)
+    std::string get_basecall_group_1d(const std::string& bc_gr) const
     {
-        return bc_gr.substr(0, 3) == "2D_";
+        std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/basecall_1d";
+        if (Base::attribute_exists(path))
+        {
+            std::string tmp;
+            Base::read(path, tmp);
+            auto tmp1 = tmp.substr(0, 18);
+            auto tmp2 = tmp.substr(18);
+            if (tmp1 == "Analyses/Basecall_"
+                and Base::group_exists(basecall_root_path() + "/" + basecall_group_prefix() + tmp2))
+            {
+                return tmp2;
+            }
+        }
+        return bc_gr;
     }
-    static std::string get_basecall_alt_1d_group(const std::string& bc_gr)
+    /**
+     * Get EventDetection group for given Basecall group, if available.
+     */
+    std::string get_basecall_eventdetection_group(const std::string& bc_gr) const
     {
-        if (have_basecall_alt_1d_group(bc_gr))
+        std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/event_detection";
+        if (Base::attribute_exists(path))
         {
-            return std::string("1D_") + bc_gr.substr(3);
+            std::string tmp;
+            Base::read(path, tmp);
+            auto pos = tmp.find(eventdetection_group_prefix());
+            if (pos != std::string::npos)
+            {
+                pos += eventdetection_group_prefix().size();
+                auto end_pos = tmp.find("/", pos);
+                if (end_pos == std::string::npos)
+                {
+                    end_pos = tmp.size();
+                }
+                return tmp.substr(pos, end_pos - pos);
+            }
         }
-        else
+        return std::string();
+    }
+
+    static std::string fq2seq(const std::string& fq)
+    {
+        return split_fq(fq)[1];
+    }
+    static std::array< std::string, 4 > split_fq(const std::string& fq)
+    {
+        std::array< std::string, 4 > res = {{"", "", "", ""}};
+        size_t i = 0;
+        for (int k = 0; k < 4; ++k)
         {
-            return bc_gr;
+            if (k % 2 == 0) ++i;
+            size_t j = fq.find_first_of('\n', i);
+            if (j == std::string::npos)
+            {
+                if (k == 3)
+                {
+                    j = fq.size();
+                }
+                else
+                {
+                    return {{"", "", "", ""}};
+                }
+            }
+            res[k] = fq.substr(i, j - i);
+            i = j + 1;
         }
+        return res;
     }
 
 private:
diff --git a/src/hdf5-mod.cpp b/src/hdf5-mod.cpp
index 60dc01c..314bdb3 100644
--- a/src/hdf5-mod.cpp
+++ b/src/hdf5-mod.cpp
@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
             assert(f.is_open());
             assert(f.is_rw());
             //
-            // write a /file_version to allow f5dump to work
+            // write a /file_version to allow f5ls to work
             //
             string file_version("42");
             f.write("/file_version", false, file_version);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fast5.git



More information about the debian-med-commit mailing list