[med-svn] [fast5] 01/03: New upstream version 0.6.2
Steffen Möller
moeller at moszumanska.debian.org
Fri Sep 15 12:18:30 UTC 2017
This is an automated email from the git hooks/post-receive script.
moeller pushed a commit to branch master
in repository fast5.
commit 1a33a450a6cf3f504cc66a0a1fc6c8956df3cfa1
Author: Steffen Moeller <moeller at debian.org>
Date: Fri Sep 15 13:45:50 2017 +0200
New upstream version 0.6.2
---
.travis.Dockerfile.in | 17 +-
.travis.yml | 2 +-
.version_files | 2 +-
README.org | 91 +-
VERSION | 2 +-
python/.gitignore | 1 +
python/Makefile | 20 +-
python/bin/f5ls | 249 ++++
python/bin/f5pack | 240 ++++
python/fast5/.version.py.in | 1 -
python/fast5/__init__.py | 10 -
python/fast5/fast5.pyx | 523 +++++++
python/fast5/source/fast5.cpp | 182 ---
python/fast5/version.py | 1 -
python/setup.py | 86 +-
src/.fast5_version.hpp.in | 16 +
src/.gitignore | 1 +
src/Bit_Packer.hpp | 152 ++
src/File_Packer.hpp | 982 +++++++++++++
src/Huffman_Packer.hpp | 357 +++++
src/Makefile | 42 +-
src/cwmap.fast5_ed_len_1.inl | 103 ++
src/cwmap.fast5_ed_skip_1.inl | 4 +
src/cwmap.fast5_ev_move_1.inl | 6 +
src/cwmap.fast5_ev_rel_skip_1.inl | 4 +
src/cwmap.fast5_fq_bp_1.inl | 7 +
src/cwmap.fast5_fq_qv_1.inl | 35 +
src/cwmap.fast5_rw_1.inl | 204 +++
src/f5-mod.cpp | 14 +-
src/f5dump.cpp | 96 +-
src/f5ls-full.cpp | 35 +-
src/f5ls.cpp | 17 +-
src/f5pack.cpp | 185 +++
src/fast5.hpp | 2788 ++++++++++++++++++++++++++++---------
src/fast5_version.hpp | 16 +
src/hdf5-mod.cpp | 12 +
src/hdf5_tools.hpp | 316 ++++-
src/huffman-decode.cpp | 55 +
src/huffman-encode.cpp | 44 +
src/hufftk | 171 +++
src/logger.hpp | 378 +++++
src/tmp.cpp | 7 +
42 files changed, 6340 insertions(+), 1134 deletions(-)
diff --git a/.travis.Dockerfile.in b/.travis.Dockerfile.in
index 18e40bb..d90ecb5 100644
--- a/.travis.Dockerfile.in
+++ b/.travis.Dockerfile.in
@@ -11,21 +11,22 @@ RUN apt-get update && \
apt-get install -y \
build-essential \
libhdf5-dev \
- libboost-python-dev \
- python2.7-minimal \
- python-setuptools \
- python-virtualenv
+ libpython2.7-dev \
+ python2.7-minimal
+RUN curl https://bootstrap.pypa.io/get-pip.py | python - && \
+ pip install \
+ cython \
+ setuptools \
+ virtualenv
# expose prerequisites settings
ENV HDF5_INCLUDE_DIR=/usr/include/hdf5/serial
ENV HDF5_LIB_DIR=/usr/lib/x86_64-linux-gnu/hdf5/serial
-ENV BOOST_INCLUDE_DIR=/usr/include
-ENV BOOST_LIB_DIR=/usr/lib/x86_64-linux-gnu
# if necessary, specify compiler
#RUN apt-get install -y g++-4.9 g++-5 g++-6
-#ENV CC=gcc-4.9
-#ENV CXX=g++-4.9
+#ENV CC=gcc-6
+#ENV CXX=g++-6
# use host id
RUN groupadd --gid ${GROUP_ID} ${GROUP_NAME}
diff --git a/.travis.yml b/.travis.yml
index 2d95ffb..b1bd9f1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ before_install:
install:
- docker run --rm -v $PWD:/data fast5 make -C src -e
- - docker run --rm -v $PWD:/data fast5 bash -c 'virtualenv build-venv && source build-venv/bin/activate && make -C python -e develop'
+ - docker run --rm -v $PWD:/data fast5 bash -c 'virtualenv build-venv --system-site-packages && source build-venv/bin/activate && make -C python -e develop'
script:
- docker run --rm -v $PWD:/data fast5 bash -c 'src/hdf5-mod -f file.000.fast5 && src/f5-mod file.000.fast5 && src/f5ls file.000.fast5 && src/f5ls-full file.000.fast5'
diff --git a/.version_files b/.version_files
index c822d11..3491213 100644
--- a/.version_files
+++ b/.version_files
@@ -1,2 +1,2 @@
VERSION
-python/fast5/version.py
+src/fast5_version.hpp
diff --git a/README.org b/README.org
index 27a8903..bd589ba 100644
--- a/README.org
+++ b/README.org
@@ -4,91 +4,38 @@
[[http://travis-ci.org/mateidavid/fast5][http://travis-ci.org/mateidavid/fast5.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]]
-A lightweight C++11 library to read raw signal data from Oxford Nanopore's Fast5 files.
+A lightweight C++ library for accessing Oxford Nanopore Technologies sequencing data.
-*** C++
+*** Installation
-**** Installation
+**** Core C++ Library
-This is a header-only library. You only need to copy [[file:src/fast5.hpp][src/fast5.hpp]] and [[file:src/hdf5_tools.hpp][src/hdf5_tools.hpp]] into your C++ project.
+The core library is written in header-only C++11, and it enables read-write access to fast5 files from C++ code.
-**** Usage
+The core library requires no installation, other than setting the compiler's include path to find [[file:src/fast5.hpp][fast5.hpp]]. See [[file:src/f5ls-full.cpp][f5ls-full.cpp]] for an example.
-See [[file:src/f5ls.cpp][src/f5ls.cpp]] for an example.
+The core library is built on top of the HDF5 C API, so the compiler must also be able to find the HDF5 headers and libraries. See the project's Travis CI [[file:.travis.Dockerfile.in][Dockerfile]] for an example of how to install prerequisites on Debian Jessie.
-*** Python Wrapper
+**** Python Wrapper
-An optional python wrapper for this library is available through Boost.Python. The wrapper currently implements only read-only access.
+The Python wrapper for the core library enables read-only access to fast5 files from Python code. The wrapper also adds several Python scripts:
-**** Installation
+- [[file:python/bin/f5ls][f5ls]] :: Summarize contents of fast5 files.
+- [[file:python/bin/f5pack][f5pack]] :: Pack and unpack fast5 files. For a detailed description of this tool, see our [[http://simpsonlab.github.io/2017/02/27/packing_fast5/][blog post]].
-#+BEGIN_EXAMPLE
-cd python
-HDF5_DIR=/usr/local BOOST_DIR=/usr/local make develop-user
-#+END_EXAMPLE
-
-Notes:
-
-- HDF5 and Boost.Python must be available, and their locations can be passed on to the Python setup process using the environment variables =HDF5_DIR= and =BOOST_DIR=. Alternatively, the respective include directories, library directories, and library names may be specified explicitly with: =HDF5_INCLUDE_DIR=, =HDF5_LIB_DIR=, =HDF5_LIB=, =BOOST_INCLUDE_DIR=, =BOOST_LIB_DIR=, =BOOST_PYTHON_LIB=. For details, see [[file:python/setup.py][python/setup.py]] and [[file:.travis.yml][.travis.yml]].
-
-- To install =fast5= as a package in a virtualenv, use the target =develop=. To install as a user package, use the target =develop-user=. For details, see [[file:python/Makefile][python/Makefile]].
-
-**** Usage
+The Python wrapper also depends on Cython. To build the Python wrapper:
#+BEGIN_EXAMPLE
-import fast5
-f = fast5.File("file.000.fast5")
-print(f.file_version())
-print(f.have_eventdetection_events())
+# set paths to the directories that contain hdf5.h and libhdf5.so
+export HDF5_INCLUDE_DIR=/path/to/hdf5.h
+export HDF5_LIB_DIR=/path/to/libhdf5.so
+
+# either
+make -C python develop # to install in a virtualenv
+# or
+make -C python develop-user # to install in user mode
#+END_EXAMPLE
-*** f5dump
-
-The program =f5dump= can be used to list and extract some of the contents of =fast5= files, including: raw signals, event-detection events, basecall events, and basecall fastq.
-
-**** Installation
-
-In addition to this =fast5= repository, you will need HDF5 (headers and libraries), as well the the header-only libraries [[https://github.com/mateidavid/tclap.git][TCLAP]] and [[https://github.com/mateidavid/hpptools.git][HPPTOOLS]]. To build =f5dump=, run =make f5dump [VAR1=VALUE1] ...=, where =VAR=-s are used to instruct the [[file:src/Makefile][Makefile]] where to find various dependencies.
-
-**** Usage
-
-In each run, =f5dump= requires exactly one command among: =--ls/--id/--rw/--ed/--ev/--fq=. If no command is given, =--ls= is assumed. It also requires exactly one =fast5= file to inspect.
-
-- In =--ls= mode, =f5dump= lists some of the contents of the file. Sample output:
-
- #+BEGIN_EXAMPLE
-rw Read_1019
-ed 000 Read_1019
-ed 001 Read_1019
-bc2d 2D_000 2 1 1 1D_000
-bc1d 1D_000 0 1 1 001
-bc1d 1D_000 1 1 1 001
-#+END_EXAMPLE
-
- Explanations:
-
- - =rw=: the file contains raw samples from one read, =Read_1019=.
-
- - =ed=: the file contains 2 event-detection groups, =000= and =001=, both run on raw samples from =Read_1019=.
-
- - =bc2d=: the file contains 1 basecall group =2D_000= with 2D data (=2=); this group has both fastq data and events (=1 1=); its corresponding 1D basecall group is =1D_000=.
-
- - =bc1d=: the file contains 1 basecall group =1D_000= with 1D data for each strand (=0= and =1=); each contains fastq data and events (=1 1=); its corresponding event-detection group is =001=.
-
- Notes:
-
- - The group names are suffixes understood by the =fast5= library. E.g., the basecall group =RNN_1D_000= would correspond to the HDF5 group =/Analyses/Basecall_RNN_1D_000=.
-
- - Not all the links between groups are always available. Notably, some =fast5= files are missing the link between a 1D basecall group and its original event-detection group.
-
-- In =--id= mode, =f5dump= dumps =channel_id= and =tracking_id= metadata.
-
-- In =--rw/--ed/--ev/--fq= mode, =f5dump= dumps: raw signal data/event-detection events/basecall events/basecall fastq data.
-
-- Optional selector flags =--gr/--st/--rn= can be used to specify a group name, strand (=0/1/2=), or read name. Not all combinations make sense: e.g, =--st= is ignored for event-detection data.
-
-- Optional output flags =--time-int/--curr-int/--rw-time= can modify the output: convert times into integers, dump raw signal currents in internal integer encoding, and add time stamps to raw signals.
-
*** License
[[file:LICENSE][MIT License]].
diff --git a/VERSION b/VERSION
index 416bfb0..b616048 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.5.9
+0.6.2
diff --git a/python/.gitignore b/python/.gitignore
index 44dafa0..f227ead 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,3 +1,4 @@
+fast5.cpp
build/
dist/
*.egg-info/
diff --git a/python/Makefile b/python/Makefile
index 8c4047e..e3fc9b8 100755
--- a/python/Makefile
+++ b/python/Makefile
@@ -1,3 +1,10 @@
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
+
.SUFFIXES:
MAKEFLAGS += -r
SHELL := /bin/bash
@@ -16,16 +23,21 @@ help: ## This help.
clean: ## Remove build products
${PYTHON} setup.py clean
- rm -rf fast5.egg-info build dist
- find fast5/ \( -name '*.pyc' -o -name '*.so' \) -delete
+ rm -rf build dist fast5.egg-info fast5*.so
check_virtualenv:
@[ "$$VIRTUAL_ENV" ] || { echo "not in a virtualenv" >&2; exit 1; }
-develop: check_virtualenv clean ## Install in develop mode to current virtualenv
+install: check_virtualenv ## Install to current virtualenv
+ ${PYTHON} setup.py install
+
+install-user: ## Install to current user
+ ${PYTHON} setup.py install --user
+
+develop: check_virtualenv ## Install in develop mode to current virtualenv
${PYTHON} setup.py develop
-develop-user: clean ## Install in develop mode to current user
+develop-user: ## Install in develop mode to current user
${PYTHON} setup.py develop --user
develop-uninstall: check_virtualenv clean ## Uninstall from current virtualenv
diff --git a/python/bin/f5ls b/python/bin/f5ls
new file mode 100755
index 0000000..14e78fc
--- /dev/null
+++ b/python/bin/f5ls
@@ -0,0 +1,249 @@
+#!/usr/bin/env python
+
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
+
+import argparse
+import datetime
+import dateutil.parser
+import logging
+import math
+import os
+import sys
+
+import fast5
+
+import signal
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+def add_fast5(fn, rel_dn, args):
+ logger.info("adding fast5 fn=" + fn + " rel_dn=" + rel_dn)
+ return [fn]
+
+def add_dir(dn, args):
+ l = list()
+ logger.info("processing dir dn=" + dn)
+ for t in os.walk(dn):
+ rel_dn = os.path.relpath(t[0], dn)
+ for rel_fn in t[2]:
+ fn = os.path.join(t[0], rel_fn)
+ if fast5.File.is_valid_file(fn):
+ l += add_fast5(fn, rel_dn, args)
+ if not args.recurse:
+ break
+ return l
+
+def add_fofn(fn, args):
+ l = list()
+ logger.info("processing fofn fn=" + fn)
+ if fn != "-":
+ f = open(fn)
+ else:
+ f = sys.stdin
+ for p in f:
+ p = p.strip()
+ if fast5.File.is_valid_file(p):
+ l += add_fast5(p, "", args)
+ else:
+ logger.warning("fofn line not a fast5 file: " + p)
+ if fn != "-":
+ f.close()
+ return l
+
+def add_paths(pl, args):
+ l = list()
+ if len(pl) == 0:
+ pl.append("-")
+ for p in pl:
+ if os.path.isdir(p):
+ l += add_dir(p, args)
+ elif fast5.File.is_valid_file(p):
+ l += add_fast5(p, "", args)
+ else:
+ l += add_fofn(p, args)
+ return l
+
+def stat_file(ifn, args):
+ d = dict()
+ try:
+ f = fast5.File(ifn)
+ # cid params
+ d["cid"] = f.get_channel_id_params()
+ d["tid"] = f.get_tracking_id_params()
+ # raw samples
+ d["rs_rn_l"] = f.get_raw_samples_read_name_list()
+ d["rs"] = dict()
+ for rn in d["rs_rn_l"]:
+ d["rs"][rn] = dict()
+ d["rs"][rn]["params"] = f.get_raw_samples_params(rn)
+ d["rs"][rn]["packed"] = not f.have_raw_samples_unpack(rn)
+ # basecall groups
+ d["bc_gr_l"] = f.get_basecall_group_list()
+ d["bc"] = dict()
+ d["bc_desc"] = dict()
+ d["bc_summary"] = dict()
+ for gr in d["bc_gr_l"]:
+ d["bc"][gr] = dict()
+ d["bc"][gr]["desc"] = f.get_basecall_group_description(gr)
+ d["bc"][gr]["summary"] = f.get_basecall_summary(gr)
+ d["bc"][gr]["start"] = dict()
+ d["bc"][gr]["length"] = dict()
+ d["bc"][gr]["count"] = dict()
+ d["bc"][gr]["packed_fastq"] = dict()
+ d["bc"][gr]["packed_events"] = dict()
+ for st in [0, 1, 2]:
+ d["bc"][gr]["packed_fastq"][st] = not f.have_basecall_fastq_unpack(st, gr)
+ if st < 2:
+ d["bc"][gr]["packed_events"][st] = not f.have_basecall_events_unpack(st, gr)
+ if d["bc"][gr]["desc"]["have_events"][st]:
+ ev_params = f.get_basecall_events_params(st, gr)
+ d["bc"][gr]["start"][st] = ev_params["start_time"]
+ d["bc"][gr]["length"][st] = ev_params["duration"]
+ if d["bc"][gr]["start"][st] < 1e-3:
+ d["bc"][gr]["start"][st] = float('nan')
+ if d["bc"][gr]["length"][st] < 1e-3:
+ d["bc"][gr]["length"][st] = float('nan')
+ if False:
+ e = f.get_basecall_events(st, gr)
+ d["bc"][gr]["start"][st] = e[0]["start"]
+ d["bc"][gr]["length"][st] = e[-1]["start"] + e[-1]["length"] - e[0]["start"]
+ d["bc"][gr]["count"][st] = len(e)
+ else:
+ d["bc"][gr]["packed_alignment"] = not f.have_basecall_alignment_unpack(gr)
+ # eventdetection groups
+ d["ed_gr_l"] = f.get_eventdetection_group_list()
+ d["ed"] = dict()
+ for gr in d["ed_gr_l"]:
+ d["ed"][gr] = dict()
+ d["ed"][gr]["rn_l"] = f.get_eventdetection_read_name_list(gr)
+ d["ed"][gr]["rn"] = dict()
+ for rn in d["ed"][gr]["rn_l"]:
+ d["ed"][gr]["rn"][rn] = dict()
+ d["ed"][gr]["rn"][rn]["packed"] = not f.have_eventdetection_events_unpack(gr, rn)
+ except RuntimeError as e:
+ d = dict()
+ return d
+
+def as_time(v, r):
+ if math.isnan(v):
+ return 'nan'
+ x = float(v)/r
+ m, s = divmod(x, 60)
+ h, m = divmod(m, 60)
+ return "%d:%02d:%02d.%03d" % (h, m, s, (x * 1000) % 1000)
+
+def print_path(p, v, args):
+ if type(v) == list:
+ print(args.delim[1].join(str(e) for e in p) + args.delim[0] + args.delim[1].join(str(e) for e in v))
+ else:
+ print(args.delim[1].join(str(e) for e in p) + args.delim[0] + str(v))
+
+def list_file(ifn, include_fn, args):
+ d = stat_file(ifn, args)
+ if include_fn:
+ print_path(["file"], ifn, args)
+ if "cid" not in d:
+ return
+ # tid
+ for k in ["device_id", "asic_id", "flow_cell_id", "exp_script_purpose"]:
+ if k not in d["tid"]:
+ continue
+ print_path(["tid", k], d["tid"][k], args)
+ if "exp_start_time" in d["tid"]:
+ if 'T' in d["tid"]["exp_start_time"]:
+ exp_start_time = dateutil.parser.parse(d["tid"]["exp_start_time"])
+ else:
+ exp_start_time = datetime.datetime.fromtimestamp(int(d["tid"]["exp_start_time"]))
+ print_path(["tid", "exp_start_date"], exp_start_time.date().isoformat(), args)
+ print_path(["tid", "exp_start_time"], exp_start_time.time().isoformat(), args)
+ # cid
+ for k in ["channel_number", "sampling_rate"]:
+ if k not in d["cid"]:
+ continue
+ print_path(["cid", k], d["cid"][k], args)
+ sampling_rate = d["cid"]["sampling_rate"]
+ # rs
+ for rn in d["rs_rn_l"]:
+ print_path(["rs", rn, "packed"], int(d["rs"][rn]["packed"]), args)
+ print_path(["rs", rn, "read_number"], d["rs"][rn]["params"]["read_number"], args)
+ print_path(["rs", rn, "read_id"], d["rs"][rn]["params"]["read_id"], args)
+ print_path(["rs", rn, "start"], as_time(d["rs"][rn]["params"]["start_time"], sampling_rate), args)
+ print_path(["rs", rn, "length"], as_time(d["rs"][rn]["params"]["duration"], sampling_rate), args)
+ # bc
+ for gr in d["bc_gr_l"]:
+ print_path(["bc", gr, "id"], d["bc"][gr]["desc"]["name"] + ":" + d["bc"][gr]["desc"]["version"], args)
+ for st in [0, 1, 2]:
+ if not d["bc"][gr]["desc"]["have_subgroup"][st]:
+ continue
+ # fastq
+ fq_len = 0
+ print_path(["bc", gr, st, "fastq", "packed"], int(d["bc"][gr]["packed_fastq"][st]), args)
+ if d["bc"][gr]["desc"]["have_fastq"][st]:
+ for k in ["sequence_length", "mean_qscore"]:
+ fk = ["basecall_1d_template", "basecall_1d_complement", "basecall_2d"][st] + "/" + k
+ if fk not in d["bc"][gr]["summary"]:
+ continue
+ print_path(["bc", gr, st, "fastq", k], d["bc"][gr]["summary"][fk], args)
+ if k == "sequence_length":
+ fq_len = d["bc"][gr]["summary"][fk]
+ if st < 2:
+ # events
+ print_path(["bc", gr, st, "events", "packed"], int(d["bc"][gr]["packed_events"][st]), args)
+ if d["bc"][gr]["desc"]["have_events"][st]:
+ for k in ["start", "length"]:
+ print_path(["bc", gr, st, "events", k], as_time(float(d["bc"][gr][k][st]), 1.0), args)
+ if st in d["bc"][gr]["count"]:
+ print_path(["bc", gr, st, "events", "count"], d["bc"][gr]["count"][st], args)
+ print_path(["bc", gr, st, "bps"], "%.2f" % (float(fq_len) / d["bc"][gr]["length"][st]), args)
+ # model
+ print_path(["bc", gr, st, "model"], int(d["bc"][gr]["desc"]["have_model"][st]), args)
+ else:
+ print_path(["bc", gr, st, "alignment", "packed"], int(d["bc"][gr]["packed_alignment"]), args)
+ if d["bc"][gr]["desc"]["have_subgroup"][2]:
+ print_path(["bc", gr, "bc_1d_gr"], d["bc"][gr]["desc"]["bc_1d_gr"], args)
+ if d["bc"][gr]["desc"]["have_subgroup"][0] or d["bc"][gr]["desc"]["have_subgroup"][1]:
+ print_path(["bc", gr, "ed_gr"], d["bc"][gr]["desc"]["ed_gr"], args)
+ # ed
+ for gr in d["ed_gr_l"]:
+ for rn in d["ed"][gr]["rn_l"]:
+ print_path(["ed", gr, rn, "packed"], int(d["ed"][gr]["rn"][rn]["packed"]), args)
+
+
+if __name__ == "__main__":
+ description = """
+ Summarize contents of ONT fast5 files.
+ """
+ parser = argparse.ArgumentParser(description=description, epilog="")
+ parser.add_argument("--log-level", default="warning",
+ help="log level")
+ #
+ parser.add_argument("--delim", default="\t/",
+ help="Delimiters list; first char used between path and value, second char used between path elements.")
+ parser.add_argument("-R", "--recurse", action="store_true",
+ help="Recurse in input directories.")
+ #
+ parser.add_argument("inputs", nargs="*", default=[], action="append",
+ help="Input directories, fast5 files, or files of fast5 file names.")
+ args = parser.parse_args()
+
+ numeric_log_level = getattr(logging, args.log_level.upper(), None)
+ if not isinstance(numeric_log_level, int):
+ raise ValueError("Invalid log level: '%s'" % args.log_level)
+ logging.basicConfig(level=numeric_log_level,
+ format="%(asctime)s %(name)s.%(levelname)s %(message)s",
+ datefmt="%Y/%m/%d %H:%M:%S")
+ logger = logging.getLogger(os.path.basename(__file__))
+ fast5.Logger.set_levels_from_options([args.log_level.lower()])
+ # fix delim
+ args.delim = list(args.delim)
+ while len(args.delim) < 2:
+ args.delim.append("")
+ logger.debug("args: " + str(args))
+
+ fl = add_paths(args.inputs[0], args)
+ for ifn in fl:
+ list_file(ifn, len(fl) > 1, args)
diff --git a/python/bin/f5pack b/python/bin/f5pack
new file mode 100755
index 0000000..e31bb90
--- /dev/null
+++ b/python/bin/f5pack
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
+
+import argparse
+import logging
+import os
+import sys
+
+import fast5
+
+import signal
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+policy_d = {
+ "drop": 0,
+ "pack": 1,
+ "unpack": 2,
+ "copy": 3,
+}
+
+def add_fast5(fn, rel_dn, args):
+ logger.info("adding fast5 fn=" + fn + " rel_dn=" + rel_dn)
+ return [(fn, os.path.normpath(os.path.join(args.output, rel_dn, os.path.basename(fn))))]
+
+def add_dir(dn, args):
+ l = list()
+ logger.info("processing dir dn=" + dn)
+ for t in os.walk(dn):
+ rel_dn = os.path.relpath(t[0], dn)
+ for rel_fn in t[2]:
+ fn = os.path.join(t[0], rel_fn)
+ if fast5.File.is_valid_file(fn):
+ l += add_fast5(fn, rel_dn, args)
+ if not args.recurse:
+ break
+ return l
+
+def add_fofn(fn, args):
+ l = list()
+ logger.info("processing fofn fn=" + fn)
+ if fn != "-":
+ f = open(fn)
+ else:
+ f = sys.stdin
+ for p in f:
+ p = p.strip()
+ if fast5.File.is_valid_file(p):
+ l += add_fast5(p, "", args)
+ else:
+ logger.warning("fofn line not a fast5 file: " + p)
+ if fn != "-":
+ f.close()
+ return l
+
+def add_paths(pl, args):
+ l = list()
+ if len(pl) == 0:
+ pl.append("-")
+ for p in pl:
+ if os.path.isdir(p):
+ l += add_dir(p, args)
+ elif fast5.File.is_valid_file(p):
+ l += add_fast5(p, "", args)
+ else:
+ l += add_fofn(p, args)
+ return l
+
+if __name__ == "__main__":
+ description = """
+ Pack and unpack ONT fast5 files.
+ """
+ parser = argparse.ArgumentParser(description=description, epilog="")
+ parser.add_argument("--log", default="warning",
+ help="log level")
+ #
+ parser.add_argument("--pack", action="store_true",
+ help="Pack data (default).")
+ parser.add_argument("--unpack", action="store_true",
+ help="Unpack data.")
+ parser.add_argument("--archive", action="store_true",
+ help="Pack raw samples data, drop rest.")
+ parser.add_argument("--fastq", action="store_true",
+ help="Pack fastq data, drop rest.")
+ #
+ parser.add_argument("--rs", choices=["drop", "pack", "unpack", "copy"],
+ help="Policy for raw samples.")
+ parser.add_argument("--ed", choices=["drop", "pack", "unpack", "copy"],
+ help="Policy for eventdetection events.")
+ parser.add_argument("--fq", choices=["drop", "pack", "unpack", "copy"],
+ help="Policy for fastq.")
+ parser.add_argument("--ev", choices=["drop", "pack", "unpack", "copy"],
+ help="Policy for basecall events.")
+ parser.add_argument("--al", choices=["drop", "pack", "unpack", "copy"],
+ help="Policy for basecall alignment.")
+ #
+ parser.add_argument("--force", action="store_true",
+ help="Overwrite existing destination files.")
+ parser.add_argument("--qv-bits", type=int,
+ help="QV bits to keep.")
+ parser.add_argument("--p-model-state-bits", type=int,
+ help="p_model_state bits to keep.")
+ parser.add_argument("-R", "--recurse", action="store_true",
+ help="Recurse in input directories.")
+ parser.add_argument("-o", "--output", required=True,
+ help="Output directory.")
+ #
+ parser.add_argument("inputs", nargs="*", default=[], action="append",
+ help="Input directories, fast5 files, or files of fast5 file names. For input directories, the subdirectory hierarchy (if traversed with --recurse) is recreated in the output directory.")
+ args = parser.parse_args()
+
+ numeric_log_level = getattr(logging, args.log.upper(), None)
+ if not isinstance(numeric_log_level, int):
+ raise ValueError("Invalid log level: '%s'" % args.log)
+ logging.basicConfig(level=numeric_log_level,
+ format="%(asctime)s %(name)s.%(levelname)s %(message)s",
+ datefmt="%Y/%m/%d %H:%M:%S")
+ logger = logging.getLogger(os.path.basename(__file__))
+ fast5.Logger.set_levels_from_options([args.log.lower()])
+ logger.debug("args: " + str(args))
+
+ if args.pack + args.unpack + args.archive + args.fastq > 1:
+ sys.exit("At most one of --pack/--unpack/--archive/--fastq may be specified")
+ if (not args.pack and
+ not args.unpack and
+ not args.archive and
+ not args.fastq and
+ args.rs is None and
+ args.ed is None and
+ args.fq is None and
+ args.ev is None and
+ args.al is None):
+ args.pack = True
+ if args.pack:
+ if args.rs is None: args.rs = "pack"
+ if args.ed is None: args.ed = "pack"
+ if args.fq is None: args.fq = "pack"
+ if args.ev is None: args.ev = "pack"
+ if args.al is None: args.al = "pack"
+ if args.unpack:
+ if args.rs is None: args.rs = "unpack"
+ if args.ed is None: args.ed = "unpack"
+ if args.fq is None: args.fq = "unpack"
+ if args.ev is None: args.ev = "unpack"
+ if args.al is None: args.al = "unpack"
+ if args.archive:
+ if args.rs is None: args.rs = "pack"
+ if args.ed is None: args.ed = "drop"
+ if args.fq is None: args.fq = "drop"
+ if args.ev is None: args.ev = "drop"
+ if args.al is None: args.al = "drop"
+ if args.fastq:
+ if args.rs is None: args.rs = "drop"
+ if args.ed is None: args.ed = "drop"
+ if args.fq is None: args.fq = "pack"
+ if args.ev is None: args.ev = "drop"
+ if args.al is None: args.al = "drop"
+ if args.rs is None: args.rs = "drop"
+ if args.ed is None: args.ed = "drop"
+ if args.fq is None: args.fq = "drop"
+ if args.ev is None: args.ev = "drop"
+ if args.al is None: args.al = "drop"
+ logger.info("rs: " + args.rs)
+ logger.info("ed: " + args.ed)
+ logger.info("fq: " + args.fq)
+ logger.info("ev: " + args.ev)
+ logger.info("al: " + args.al)
+ fp = fast5.File_Packer(
+ policy_d[args.rs],
+ policy_d[args.ed],
+ policy_d[args.fq],
+ policy_d[args.ev],
+ policy_d[args.al],
+ )
+ if args.force: fp.set_force(True)
+ if args.qv_bits: fp.set_qv_bits(args.qv_bits)
+ if args.p_model_state_bits: fp.set_p_model_state_bits(args.p_model_state_bits)
+ fl = add_paths(args.inputs[0], args)
+ errored_files_cnt = 0
+ input_bytes = 0
+ output_bytes = 0
+ for t in fl:
+ ifn = t[0]
+ ofn = t[1]
+ odn = os.path.dirname(t[1])
+ if not os.path.isdir(odn):
+ os.makedirs(odn)
+ logger.info("packing ifn=" + ifn + " ofn=" + ofn)
+ try:
+ fp.run(ifn, ofn)
+ except RuntimeError as e:
+ logger.warning("error packing " + ifn + ": " + str(e))
+ os.remove(ofn)
+ errored_files_cnt += 1
+ continue
+ input_bytes += os.stat(ifn).st_size
+ output_bytes += os.stat(ofn).st_size
+
+ cnt = fp.get_counts()
+ cnt_total_bits = dict()
+ output_ds_bytes = 0
+ print("bp_seq_count\t%d" % cnt["bp_seq_count"])
+ if cnt["bp_seq_count"] == 0:
+ cnt["bp_seq_count"] = float('nan')
+ for cl in [["rs_count", "rs_bits"],
+ ["ed_count", "ed_skip_bits", "ed_len_bits"],
+ ["fq_count", "fq_bp_bits", "fq_qv_bits"],
+ ["ev_count", "ev_rel_skip_bits", "ev_skip_bits", "ev_len_bits", "ev_move_bits", "ev_p_model_state_bits"],
+ ["al_count", "al_template_step_bits", "al_complement_step_bits", "al_move_bits"]]:
+ cnt_total_bits[cl[0]] = 0
+ if cnt[cl[0]] == 0:
+ continue
+ print(cl[0] + "\t%d" % cnt[cl[0]])
+ for c in cl[1:]:
+ cnt_total_bits[cl[0]] += cnt[c]
+ if cnt[c] == 0:
+ continue
+ print((c + "\t%d\t%.2f\t%.2f") % (cnt[c], float(cnt[c]) / cnt[cl[0]], float(cnt[c])/cnt["bp_seq_count"]))
+ output_ds_bytes += cnt_total_bits[cl[0]] / 8
+ print(cl[0].split('_')[0] + "_total_bits\t%d\t%.2f\t%.2f" % (cnt_total_bits[cl[0]], float(cnt_total_bits[cl[0]])/cnt[cl[0]], float(cnt_total_bits[cl[0]])/cnt["bp_seq_count"]))
+
+ if cnt["rs_total_duration"] > .001 and cnt["rs_called_duration"] > .001:
+ print("rs_total_duration\t%.2f" % cnt["rs_total_duration"])
+ print("rs_called_duration\t%.2f" % cnt["rs_called_duration"])
+ print("rs_frac_called\t%.2f" % (cnt["rs_called_duration"] / cnt["rs_total_duration"]))
+ print("bp_per_sec\t%.2f" % (float(cnt["bp_seq_count"]) / cnt["rs_called_duration"]))
+ print("input_bytes\t%d" % input_bytes)
+ print("output_bytes\t%d" % output_bytes)
+ print("output_overhead_bytes\t%d" % (output_bytes - output_ds_bytes))
+
+ print("processed_files\t%d" % len(fl))
+ if errored_files_cnt > 0:
+ print("errored_files\t%d" % errored_files_cnt)
+
+ sys.exit(errored_files_cnt > 0)
diff --git a/python/fast5/.version.py.in b/python/fast5/.version.py.in
deleted file mode 100644
index d8ed4d2..0000000
--- a/python/fast5/.version.py.in
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '${VERSION}'
diff --git a/python/fast5/__init__.py b/python/fast5/__init__.py
deleted file mode 100755
index 14e4b5d..0000000
--- a/python/fast5/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-"""
-fast5.__init__.py
-(c) 2016: Matei David, Ontario Institute for Cancer Research
-MIT License
-"""
-
-from .version import __version__
-from fast5 import *
-
-__version_info__ = tuple([int(num) for num in __version__.split('.')])
diff --git a/python/fast5/fast5.pyx b/python/fast5/fast5.pyx
new file mode 100644
index 0000000..aa64764
--- /dev/null
+++ b/python/fast5/fast5.pyx
@@ -0,0 +1,523 @@
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
+
+from cython.operator cimport dereference as deref
+
+from libc.stdint cimport int16_t
+from libcpp cimport bool
+from libcpp.map cimport map as cmap
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
+cdef extern from "fast5.hpp" namespace "fast5":
+
+ cdef string cpp_version "fast5::version"
+
+ cppclass Cpp_Logger "logger::Logger":
+ @staticmethod
+ void set_levels_from_options(vector[string]) except +
+
+ ctypedef cmap[string, string] Attr_Map
+
+ struct Channel_Id_Params:
+ string channel_number
+ double digitisation
+ double offset
+ double range
+ double sampling_rate
+
+ ctypedef Attr_Map Tracking_Id_Params
+
+ ctypedef Attr_Map Sequences_Params
+
+ struct Raw_Samples_Params:
+ string read_id
+ long long read_number
+ long long start_mux
+ long long start_time
+ long long duration
+
+ ctypedef float Raw_Sample
+
+ ctypedef int16_t Raw_Int_Sample
+
+ struct EventDetection_Events_Params:
+ string read_id
+ long long read_number
+ long long scaling_used
+ long long start_mux
+ long long start_time
+ long long duration
+ double median_before
+ unsigned abasic_found
+
+ struct EventDetection_Event:
+ double mean
+ double stdv
+ long long start
+ long long length
+
+ struct Basecall_Model_Params:
+ double scale
+ double shift
+ double drift
+ double var
+ double scale_sd
+ double var_sd
+
+ struct Basecall_Model_State:
+ double level_mean
+ double level_stdv
+ double sd_mean
+ double sd_stdv
+ #char kmer[8]
+
+ struct Basecall_Events_Params:
+ double start_time
+ double duration
+
+ struct Basecall_Event:
+ double mean
+ double stdv
+ double start
+ double length
+ double p_model_state
+ long long move
+ #char model_state[8]
+
+ struct Basecall_Alignment_Entry:
+ long long template_index
+ long long complement_index
+ #char kmer[8]
+
+ struct Basecall_Group_Description:
+ string name
+ string version
+ string ed_gr
+ string bc_1d_gr
+ bool have_subgroup[3]
+ bool have_fastq[3]
+ bool have_events[3]
+ bool have_model[2]
+ bool have_alignment
+
+ cppclass Cpp_File "fast5::File":
+ Cpp_File() except +
+ Cpp_File(string) except +
+ Cpp_File(string, bool) except +
+
+ bool is_open()
+ bool is_rw()
+ string file_name()
+ void open(string) except +
+ void open(string, bool) except +
+ void create(string) except +
+ void create(string, bool) except +
+ void close() except +
+ @staticmethod
+ bool is_valid_file(string)
+
+ string file_version() except +
+
+ bool have_channel_id_params()
+ Channel_Id_Params get_channel_id_params()
+ bool have_sampling_rate()
+ double get_sampling_rate()
+
+ bool have_tracking_id_params()
+ Tracking_Id_Params get_tracking_id_params() except +
+
+ bool have_sequences_params()
+ Sequences_Params get_sequences_params() except +
+
+ vector[string] get_raw_samples_read_name_list()
+ bool have_raw_samples()
+ bool have_raw_samples(string)
+ bool have_raw_samples_unpack(string) except +
+ bool have_raw_samples_pack(string) except +
+ Raw_Samples_Params get_raw_samples_params() except +
+ Raw_Samples_Params get_raw_samples_params(string) except +
+ vector[Raw_Int_Sample] get_raw_int_samples() except +
+ vector[Raw_Int_Sample] get_raw_int_samples(string) except +
+ vector[Raw_Sample] get_raw_samples() except +
+ vector[Raw_Sample] get_raw_samples(string) except +
+
+ vector[string] get_eventdetection_group_list()
+ bool have_eventdetection_group()
+ bool have_eventdetection_group(string)
+ vector[string] get_eventdetection_read_name_list()
+ vector[string] get_eventdetection_read_name_list(string)
+ bool have_eventdetection_events()
+ bool have_eventdetection_events(string)
+ bool have_eventdetection_events(string, string)
+ bool have_eventdetection_events_unpack(string, string) except +
+ bool have_eventdetection_events_pack(string, string) except +
+ Attr_Map get_eventdetection_params() except +
+ Attr_Map get_eventdetection_params(string) except +
+ EventDetection_Events_Params get_eventdetection_events_params() except +
+ EventDetection_Events_Params get_eventdetection_events_params(string) except +
+ EventDetection_Events_Params get_eventdetection_events_params(string, string) except +
+ vector[EventDetection_Event] get_eventdetection_events() except +
+ vector[EventDetection_Event] get_eventdetection_events(string) except +
+ vector[EventDetection_Event] get_eventdetection_events(string, string) except +
+
+ vector[string] get_basecall_group_list()
+ bool have_basecall_group()
+ bool have_basecall_group(string)
+ vector[string] get_basecall_strand_group_list(unsigned)
+ bool have_basecall_strand_group(unsigned)
+ bool have_basecall_strand_group(unsigned, string)
+ Basecall_Group_Description get_basecall_group_description(string) except +
+ string get_basecall_1d_group(string)
+ string get_basecall_eventdetection_group(string)
+ Attr_Map get_basecall_params(string) except +
+ bool have_basecall_log(string)
+ string get_basecall_log(string) except +
+ Attr_Map get_basecall_config(string) except +
+ Attr_Map get_basecall_summary(string) except +
+
+ bool have_basecall_fastq(unsigned)
+ bool have_basecall_fastq(unsigned, string)
+ bool have_basecall_fastq_unpack(unsigned, string) except +
+ bool have_basecall_fastq_pack(unsigned, string) except +
+ string get_basecall_fastq(unsigned) except +
+ string get_basecall_fastq(unsigned, string) except +
+ bool have_basecall_seq(unsigned)
+ bool have_basecall_seq(unsigned, string)
+ string get_basecall_seq(unsigned) except +
+ string get_basecall_seq(unsigned, string) except +
+
+ bool have_basecall_model(unsigned)
+ bool have_basecall_model(unsigned, string)
+ string get_basecall_model_file(unsigned) except +
+ string get_basecall_model_file(unsigned, string) except +
+ Basecall_Model_Params get_basecall_model_params(unsigned) except +
+ Basecall_Model_Params get_basecall_model_params(unsigned, string) except +
+ vector[Basecall_Model_State] get_basecall_model(unsigned) except +
+ vector[Basecall_Model_State] get_basecall_model(unsigned, string) except +
+
+ bool have_basecall_events(unsigned)
+ bool have_basecall_events(unsigned, string)
+ bool have_basecall_events_unpack(unsigned, string) except +
+ bool have_basecall_events_pack(unsigned, string) except +
+ Basecall_Events_Params get_basecall_events_params(unsigned) except +
+ Basecall_Events_Params get_basecall_events_params(unsigned, string) except +
+ vector[Basecall_Event] get_basecall_events(unsigned) except +
+ vector[Basecall_Event] get_basecall_events(unsigned, string) except +
+
+ bool have_basecall_alignment()
+ bool have_basecall_alignment(string)
+ bool have_basecall_alignment_unpack(string) except +
+ bool have_basecall_alignment_pack(string) except +
+ vector[Basecall_Alignment_Entry] get_basecall_alignment() except +
+ vector[Basecall_Alignment_Entry] get_basecall_alignment(string) except +
+
+cdef extern from "File_Packer.hpp" namespace "fast5":
+
+ struct Counts "fast5::File_Packer::Counts":
+ size_t rs_count
+ size_t rs_bits
+ size_t ed_count
+ size_t ed_skip_bits
+ size_t ed_len_bits
+ size_t fq_count
+ size_t bp_seq_count
+ size_t fq_bp_bits
+ size_t fq_qv_bits
+ size_t ev_count
+ size_t ev_rel_skip_bits
+ size_t ev_skip_bits
+ size_t ev_len_bits
+ size_t ev_move_bits
+ size_t ev_p_model_state_bits
+ size_t al_count
+ size_t al_template_step_bits
+ size_t al_complement_step_bits
+ size_t al_move_bits
+ double rs_total_duration
+ double rs_called_duration
+
+ cppclass Cpp_File_Packer "fast5::File_Packer":
+
+ Cpp_File_Packer()
+ Cpp_File_Packer(int)
+ Cpp_File_Packer(int, int, int, int, int)
+
+ void set_check(bool)
+ void set_force(bool)
+ void set_qv_bits(unsigned)
+ void set_p_model_state_bits(unsigned)
+
+ void run(string, string) except +
+ void reset_counts()
+ Counts get_counts()
+
+__version__ = cpp_version
+
+cdef class Logger:  # Python-visible wrapper exposing the static interface of the C++ logger::Logger (declared above as Cpp_Logger)
+ @staticmethod
+ def set_levels_from_options(s):  # s is forwarded unchanged; the C++ side is declared as taking vector[string]
+ Cpp_Logger.set_levels_from_options(s)
+
+cdef class File:
+ cdef unique_ptr[Cpp_File] thisptr
+
+ def __init__(self, name=None, rw=None):  # selects the Cpp_File constructor overload from which arguments were supplied
+ if name is None:
+ self.thisptr.reset(new Cpp_File())  # no name: default-constructed Cpp_File (rw is ignored in this case)
+ elif rw is None:
+ self.thisptr.reset(new Cpp_File(name))  # name only: Cpp_File(string) overload
+ else:
+ self.thisptr.reset(new Cpp_File(name, rw))  # name and rw: Cpp_File(string, bool) overload
+
+ def is_open(self):
+ return deref(self.thisptr).is_open()
+ def is_rw(self):
+ return deref(self.thisptr).is_rw()
+ def file_name(self):
+ return deref(self.thisptr).file_name()
+ def open(self, file_name, rw=None):
+ if rw is None:
+ return deref(self.thisptr).open(file_name)
+ else:
+ return deref(self.thisptr).open(file_name, rw)
+ def create(self, file_name, trunc=None):  # forwards to Cpp_File.create; trunc (presumably a truncate flag - confirm in fast5.hpp) selects the (string, bool) overload when given
+     if trunc is None:
+         return deref(self.thisptr).create(file_name)  # fixed: previously forwarded to open(), so Cpp_File.create was never reached
+     else:
+         return deref(self.thisptr).create(file_name, trunc)  # fixed: previously open(file_name, trunc), which passed trunc as open()'s rw argument
+ def close(self):
+ return deref(self.thisptr).close()
+ @staticmethod
+ def is_valid_file(s):
+ return Cpp_File.is_valid_file(s)
+
+ def file_version(self):
+ return deref(self.thisptr).file_version()
+
+ def have_channel_id_params(self):
+ return deref(self.thisptr).have_channel_id_params()
+ def get_channel_id_params(self):
+ return deref(self.thisptr).get_channel_id_params()
+
+ def have_tracking_id_params(self):
+ return deref(self.thisptr).have_tracking_id_params()
+ def get_tracking_id_params(self):
+ return deref(self.thisptr).get_tracking_id_params()
+
+ def have_sequences_params(self):
+ return deref(self.thisptr).have_sequences_params()
+ def get_sequences_params(self):
+ return deref(self.thisptr).get_sequences_params()
+
+ def get_raw_samples_read_name_list(self):
+ return deref(self.thisptr).get_raw_samples_read_name_list()
+ def have_raw_samples(self, rn=None):
+ if rn is None:
+ return deref(self.thisptr).have_raw_samples()
+ else:
+ return deref(self.thisptr).have_raw_samples(rn)
+ def have_raw_samples_unpack(self, rn):
+ return deref(self.thisptr).have_raw_samples_unpack(rn)
+ def have_raw_samples_pack(self, rn):
+ return deref(self.thisptr).have_raw_samples_pack(rn)
+ def get_raw_samples_params(self, rn=None):
+ if rn is None:
+ return deref(self.thisptr).get_raw_samples_params()
+ else:
+ return deref(self.thisptr).get_raw_samples_params(rn)
+ def get_raw_int_samples(self, rn=None):
+ if rn is None:
+ return deref(self.thisptr).get_raw_int_samples()
+ else:
+ return deref(self.thisptr).get_raw_int_samples(rn)
+ def get_raw_samples(self, rn=None):
+ if rn is None:
+ return deref(self.thisptr).get_raw_samples()
+ else:
+ return deref(self.thisptr).get_raw_samples(rn)
+
+ def get_eventdetection_group_list(self):
+ return deref(self.thisptr).get_eventdetection_group_list()
+ def have_eventdetection_group(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_eventdetection_group()
+ else:
+ return deref(self.thisptr).have_eventdetection_group(gr)
+ def get_eventdetection_params(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_eventdetection_params()
+ else:
+ return deref(self.thisptr).get_eventdetection_params(gr)
+ def get_eventdetection_read_name_list(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_eventdetection_read_name_list()
+ else:
+ return deref(self.thisptr).get_eventdetection_read_name_list(gr)
+ def have_eventdetection_events(self, gr=None, rn=None):
+ if gr is None:
+ return deref(self.thisptr).have_eventdetection_events()
+ elif rn is None:
+ return deref(self.thisptr).have_eventdetection_events(gr)
+ else:
+ return deref(self.thisptr).have_eventdetection_events(gr, rn)
+ def have_eventdetection_events_unpack(self, gr, rn):
+ return deref(self.thisptr).have_eventdetection_events_unpack(gr, rn)
+ def have_eventdetection_events_pack(self, gr, rn):
+ return deref(self.thisptr).have_eventdetection_events_pack(gr, rn)
+ def get_eventdetection_events_params(self, gr=None, rn=None):
+ if gr is None:
+ return deref(self.thisptr).get_eventdetection_events_params()
+ elif rn is None:
+ return deref(self.thisptr).get_eventdetection_events_params(gr)
+ else:
+ return deref(self.thisptr).get_eventdetection_events_params(gr, rn)
+ def get_eventdetection_events(self, gr=None, rn=None):
+ if gr is None:
+ return deref(self.thisptr).get_eventdetection_events()
+ elif rn is None:
+ return deref(self.thisptr).get_eventdetection_events(gr)
+ else:
+ return deref(self.thisptr).get_eventdetection_events(gr, rn)
+
+ def get_basecall_group_list(self):
+ return deref(self.thisptr).get_basecall_group_list()
+ def get_basecall_strand_group_list(self, st):
+ return deref(self.thisptr).get_basecall_strand_group_list(st)
+ def have_basecall_group(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_group()
+ else:
+ return deref(self.thisptr).have_basecall_group(gr)
+ def have_basecall_strand_group(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_strand_group(st)
+ else:
+ return deref(self.thisptr).have_basecall_strand_group(st, gr)
+ def get_basecall_group_description(self, gr):
+ return deref(self.thisptr).get_basecall_group_description(gr)
+ def get_basecall_1d_group(self, gr):
+ return deref(self.thisptr).get_basecall_1d_group(gr)
+ def get_basecall_eventdetection_group(self, gr):
+ return deref(self.thisptr).get_basecall_eventdetection_group(gr)
+ def get_basecall_params(self, gr):
+ return deref(self.thisptr).get_basecall_params(gr)
+ def get_basecall_log(self, gr):
+ return deref(self.thisptr).get_basecall_log(gr)
+ def get_basecall_config(self, gr):
+ return deref(self.thisptr).get_basecall_config(gr)
+ def get_basecall_summary(self, gr):
+ return deref(self.thisptr).get_basecall_summary(gr)
+
+ def have_basecall_fastq(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_fastq(st)
+ else:
+ return deref(self.thisptr).have_basecall_fastq(st, gr)
+ def have_basecall_fastq_unpack(self, st, gr):
+ return deref(self.thisptr).have_basecall_fastq_unpack(st, gr)
+ def have_basecall_fastq_pack(self, st, gr):
+ return deref(self.thisptr).have_basecall_fastq_pack(st, gr)
+ def get_basecall_fastq(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_fastq(st)
+ else:
+ return deref(self.thisptr).get_basecall_fastq(st, gr)
+ def have_basecall_seq(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_seq(st)
+ else:
+ return deref(self.thisptr).have_basecall_seq(st, gr)
+ def get_basecall_seq(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_seq(st)
+ else:
+ return deref(self.thisptr).get_basecall_seq(st, gr)
+
+ def have_basecall_model(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_model(st)
+ else:
+ return deref(self.thisptr).have_basecall_model(st, gr)
+ def get_basecall_model_file(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_model_file(st)
+ else:
+ return deref(self.thisptr).get_basecall_model_file(st, gr)
+ def get_basecall_model_params(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_model_params(st)
+ else:
+ return deref(self.thisptr).get_basecall_model_params(st, gr)
+ def get_basecall_model(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_model(st)
+ else:
+ return deref(self.thisptr).get_basecall_model(st, gr)
+
+ def have_basecall_events(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_events(st)
+ else:
+ return deref(self.thisptr).have_basecall_events(st, gr)
+ def have_basecall_events_unpack(self, st, gr):
+ return deref(self.thisptr).have_basecall_events_unpack(st, gr)
+ def have_basecall_events_pack(self, st, gr):
+ return deref(self.thisptr).have_basecall_events_pack(st, gr)
+ def get_basecall_events_params(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_events_params(st)
+ else:
+ return deref(self.thisptr).get_basecall_events_params(st, gr)
+ def get_basecall_events(self, st, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_events(st)
+ else:
+ return deref(self.thisptr).get_basecall_events(st, gr)
+
+ def have_basecall_alignment(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).have_basecall_alignment()
+ else:
+ return deref(self.thisptr).have_basecall_alignment(gr)
+ def have_basecall_alignment_unpack(self, gr):
+ return deref(self.thisptr).have_basecall_alignment_unpack(gr)
+ def have_basecall_alignment_pack(self, gr):
+ return deref(self.thisptr).have_basecall_alignment_pack(gr)
+ def get_basecall_alignment(self, gr=None):
+ if gr is None:
+ return deref(self.thisptr).get_basecall_alignment()
+ else:
+ return deref(self.thisptr).get_basecall_alignment(gr)
+
+cdef class File_Packer:
+ cdef unique_ptr[Cpp_File_Packer] thisptr
+
+ def __init__(self, a1=None, a2=None, a3=None, a4=None, a5=None):  # selects the Cpp_File_Packer constructor overload from which arguments were supplied
+ if a1 is None:
+ self.thisptr.reset(new Cpp_File_Packer())  # no arguments: default constructor (a2..a5 are ignored in this case)
+ elif a2 is None:
+ self.thisptr.reset(new Cpp_File_Packer(a1))  # a1 only: Cpp_File_Packer(int) overload
+ else:
+ self.thisptr.reset(new Cpp_File_Packer(a1, a2, a3, a4, a5))  # five-int overload; a3..a5 are passed as given, with no default substituted for None
+
+ def set_check(self, _check):
+ deref(self.thisptr).set_check(_check)
+ def set_force(self, _force):
+ deref(self.thisptr).set_force(_force)
+ def set_qv_bits(self, _qv_bits):
+ deref(self.thisptr).set_qv_bits(_qv_bits)
+ def set_p_model_state_bits(self, _p_model_state_bits):
+ deref(self.thisptr).set_p_model_state_bits(_p_model_state_bits)
+
+ def run(self, ifn, ofn):
+ deref(self.thisptr).run(ifn, ofn)
+ def reset_counts(self):
+ deref(self.thisptr).reset_counts()
+ def get_counts(self):
+ return deref(self.thisptr).get_counts()
diff --git a/python/fast5/source/fast5.cpp b/python/fast5/source/fast5.cpp
deleted file mode 100644
index 2e51dd8..0000000
--- a/python/fast5/source/fast5.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-#include <boost/python.hpp>
-#include <boost/python/suite/indexing/map_indexing_suite.hpp>
-#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
-#include <boost/python/overloads.hpp>
-
-#include "fast5.hpp"
-
-namespace bp = boost::python;
-
-// member functions with default arguments
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_raw_samples_params_overloads, get_raw_samples_params, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_raw_samples_overloads, get_raw_samples, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_eventdetection_read_name_list_overloads, get_eventdetection_read_name_list, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_eventdetection_events_overloads, have_eventdetection_events, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_eventdetection_params_overloads, get_eventdetection_params, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_eventdetection_event_params_overloads, get_eventdetection_event_params, 0, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_eventdetection_events_overloads, get_eventdetection_events, 0, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_basecall_fastq_overlords, have_basecall_fastq, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_fastq_overlords, get_basecall_fastq, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_basecall_seq_overlords, have_basecall_seq, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_seq_overlords, get_basecall_seq, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_basecall_model_overlords, have_basecall_model, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_model_file_overlords, get_basecall_model_file, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_model_params_overlords, get_basecall_model_params, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_model_overlords, get_basecall_model, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_basecall_events_overlords, have_basecall_events, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_events_overlords, get_basecall_events, 1, 2)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(have_basecall_event_alignment_overlords, have_basecall_event_alignment, 0, 1)
-BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(get_basecall_event_alignment_overlords, get_basecall_event_alignment, 0, 1)
-
-BOOST_PYTHON_MODULE(fast5)
-{
- bp::class_<fast5::Channel_Id_Parameters>("Channel_Id_Parameters")
- .def_readwrite("channel_number", &fast5::Channel_Id_Parameters::channel_number)
- .def_readwrite("digitisation", &fast5::Channel_Id_Parameters::digitisation)
- .def_readwrite("offset", &fast5::Channel_Id_Parameters::offset)
- .def_readwrite("range", &fast5::Channel_Id_Parameters::range)
- .def_readwrite("sampling_rate", &fast5::Channel_Id_Parameters::sampling_rate)
- ;
- bp::class_<fast5::Raw_Samples_Parameters>("Raw_Samples_Parameters")
- .def_readwrite("read_id", &fast5::Raw_Samples_Parameters::read_id)
- .def_readwrite("read_number", &fast5::Raw_Samples_Parameters::read_number)
- .def_readwrite("start_mux", &fast5::Raw_Samples_Parameters::start_mux)
- .def_readwrite("start_time", &fast5::Raw_Samples_Parameters::start_time)
- .def_readwrite("duration", &fast5::Raw_Samples_Parameters::duration)
- ;;
- bp::class_<fast5::EventDetection_Event_Parameters>("EventDetection_Event_Parameters")
- .def_readwrite("read_id", &fast5::EventDetection_Event_Parameters::read_id)
- .def_readwrite("read_number", &fast5::EventDetection_Event_Parameters::read_number)
- .def_readwrite("scaling_used", &fast5::EventDetection_Event_Parameters::scaling_used)
- .def_readwrite("start_mux", &fast5::EventDetection_Event_Parameters::start_mux)
- .def_readwrite("start_time", &fast5::EventDetection_Event_Parameters::start_time)
- .def_readwrite("duration", &fast5::EventDetection_Event_Parameters::duration)
- .def_readwrite("median_before", &fast5::EventDetection_Event_Parameters::median_before)
- .def_readwrite("abasic_found", &fast5::EventDetection_Event_Parameters::abasic_found)
- ;
- bp::class_<fast5::EventDetection_Event_Entry>("EventDetection_Event_Entry")
- .def_readwrite("mean", &fast5::EventDetection_Event_Entry::mean)
- .def_readwrite("stdv", &fast5::EventDetection_Event_Entry::stdv)
- .def_readwrite("start", &fast5::EventDetection_Event_Entry::start)
- .def_readwrite("length", &fast5::EventDetection_Event_Entry::length)
- ;
- bp::class_<fast5::Model_Entry>("Model_Entry")
- .def_readwrite("variant", &fast5::Model_Entry::variant)
- .def_readwrite("level_mean", &fast5::Model_Entry::level_mean)
- .def_readwrite("level_stdv", &fast5::Model_Entry::level_stdv)
- .def_readwrite("sd_mean", &fast5::Model_Entry::sd_mean)
- .def_readwrite("sd_stdv", &fast5::Model_Entry::sd_stdv)
- .def_readwrite("weight", &fast5::Model_Entry::weight)
- .def_readwrite("kmer", &fast5::Model_Entry::kmer)
- ;
- bp::class_<fast5::Model_Parameters>("Model_Parameters")
- .def_readwrite("scale", &fast5::Model_Parameters::scale)
- .def_readwrite("shift", &fast5::Model_Parameters::shift)
- .def_readwrite("drift", &fast5::Model_Parameters::drift)
- .def_readwrite("var", &fast5::Model_Parameters::var)
- .def_readwrite("scale_sd", &fast5::Model_Parameters::scale_sd)
- .def_readwrite("var_sd", &fast5::Model_Parameters::var_sd)
- ;
- bp::class_<fast5::Event_Entry>("Event_Entry")
- .def_readwrite("mean", &fast5::Event_Entry::mean)
- .def_readwrite("stdv", &fast5::Event_Entry::stdv)
- .def_readwrite("start", &fast5::Event_Entry::start)
- .def_readwrite("length", &fast5::Event_Entry::length)
- .def_readwrite("p_model_state", &fast5::Event_Entry::p_model_state)
- .def_readwrite("p_mp_state", &fast5::Event_Entry::p_mp_state)
- .def_readwrite("p_A", &fast5::Event_Entry::p_A)
- .def_readwrite("p_C", &fast5::Event_Entry::p_C)
- .def_readwrite("p_G", &fast5::Event_Entry::p_G)
- .def_readwrite("p_T", &fast5::Event_Entry::p_T)
- .def_readwrite("move", &fast5::Event_Entry::move)
- .def_readwrite("model_state", &fast5::Event_Entry::model_state)
- .def_readwrite("mp_state", &fast5::Event_Entry::mp_state)
- ;;
- bp::class_<fast5::Event_Alignment_Entry>("Event_Alignment_Entry")
- .def_readwrite("template_index", &fast5::Event_Alignment_Entry::template_index)
- .def_readwrite("complement_index", &fast5::Event_Alignment_Entry::complement_index)
- .def("get_kmer", &fast5::Event_Alignment_Entry::get_kmer)
- ;;
-
- bp::class_<std::map<std::string, std::string>>("Map_Str_Str")
- .def(bp::map_indexing_suite<std::map<std::string, std::string>>())
- ;
- bp::class_<std::vector<std::string>>("Vec_Str")
- .def(bp::vector_indexing_suite<std::vector<std::string>>())
- ;
- bp::class_<std::vector<fast5::Raw_Samples_Entry>>("Vec_Raw_Samples_Entry")
- .def(bp::vector_indexing_suite<std::vector<fast5::Raw_Samples_Entry>>())
- ;
- bp::class_<std::vector<fast5::EventDetection_Event_Entry>>("Vec_EventDetection_Event_Entry")
- .def(bp::vector_indexing_suite<std::vector<fast5::EventDetection_Event_Entry>>())
- ;
- bp::class_<std::vector<fast5::Model_Entry>>("Vec_Model_Entry")
- .def(bp::vector_indexing_suite<std::vector<fast5::Model_Entry>>())
- ;
- bp::class_<std::vector<fast5::Event_Entry>>("Vec_Event_Entry")
- .def(bp::vector_indexing_suite<std::vector<fast5::Event_Entry>>())
- ;
- bp::class_<std::vector<fast5::Event_Alignment_Entry>>("Vec_Event_Alignment_Entry")
- .def(bp::vector_indexing_suite<std::vector<fast5::Event_Alignment_Entry>>())
- ;
-
- bp::class_<fast5::File, boost::noncopyable>("File")
- .def(bp::init<std::string, bp::optional<bool>>())
- .def("is_open", &fast5::File::is_open)
- .def("is_rw", &fast5::File::is_rw)
- .def("file_name", &fast5::File::file_name, bp::return_value_policy<bp::copy_const_reference>())
- .def("open", &fast5::File::open)
- .def("create", &fast5::File::create)
- .def("close", &fast5::File::close)
- .def("is_valid_file", &hdf5_tools::File::is_valid_file).staticmethod("is_valid_file")
- .def("get_object_count", &hdf5_tools::File::get_object_count).staticmethod("get_object_count")
- //
- .def("file_version", &fast5::File::file_version)
- //
- .def("have_channel_id_params", &fast5::File::have_channel_id_params)
- .def("get_channel_id_params", &fast5::File::get_channel_id_params)
- //
- .def("have_sampling_rate", &fast5::File::have_sampling_rate)
- .def("get_sampling_rate", &fast5::File::get_sampling_rate)
- //
- .def("have_tracking_id_params", &fast5::File::have_tracking_id_params)
- .def("get_tracking_id_params", &fast5::File::get_tracking_id_params)
- //
- .def("have_sequences_params", &fast5::File::have_sequences_params)
- .def("get_sequences_params", &fast5::File::get_sequences_params)
- //
- .def("get_raw_samples_read_name_list", &fast5::File::get_raw_samples_read_name_list, bp::return_value_policy<bp::copy_const_reference>())
- .def("have_raw_samples", &fast5::File::have_raw_samples)
- .def("get_raw_samples_params", &fast5::File::get_raw_samples_params, get_raw_samples_params_overloads())
- .def("get_raw_samples", &fast5::File::get_raw_samples, get_raw_samples_overloads())
- //
- .def("get_eventdetection_group_list", &fast5::File::get_eventdetection_group_list, bp::return_value_policy<bp::copy_const_reference>())
- .def("have_eventdetection_groups", &fast5::File::have_eventdetection_groups)
- .def("get_eventdetection_read_name_list", &fast5::File::get_eventdetection_read_name_list, get_eventdetection_read_name_list_overloads())
- .def("have_eventdetection_events", &fast5::File::have_eventdetection_events, have_eventdetection_events_overloads())
- .def("get_eventdetection_params", &fast5::File::get_eventdetection_params, get_eventdetection_params_overloads())
- .def("get_eventdetection_event_params", &fast5::File::get_eventdetection_event_params, get_eventdetection_event_params_overloads())
- .def("get_eventdetection_events", &fast5::File::get_eventdetection_events, get_eventdetection_events_overloads())
- //
- .def("get_basecall_group_list", &fast5::File::get_basecall_group_list, bp::return_value_policy<bp::copy_const_reference>())
- .def("have_basecall_groups", &fast5::File::have_basecall_groups)
- .def("get_basecall_strand_group_list", &fast5::File::get_basecall_strand_group_list, bp::return_value_policy<bp::copy_const_reference>())
- .def("have_basecall_strand_groups", &fast5::File::have_basecall_strand_groups)
- .def("have_basecall_log", &fast5::File::have_basecall_log)
- .def("get_basecall_log", &fast5::File::get_basecall_log)
- .def("have_basecall_fastq", &fast5::File::have_basecall_fastq, have_basecall_fastq_overlords())
- .def("get_basecall_fastq", &fast5::File::get_basecall_fastq, get_basecall_fastq_overlords())
- .def("add_basecall_fastq", &fast5::File::add_basecall_fastq)
- .def("have_basecall_seq", &fast5::File::have_basecall_seq, have_basecall_seq_overlords())
- .def("get_basecall_seq", &fast5::File::get_basecall_seq, get_basecall_seq_overlords())
- .def("add_basecall_seq", &fast5::File::add_basecall_seq)
- .def("have_basecall_model", &fast5::File::have_basecall_model, have_basecall_model_overlords())
- .def("get_basecall_model_file", &fast5::File::get_basecall_model_file, get_basecall_model_file_overlords())
- .def("get_basecall_model_params", &fast5::File::get_basecall_model_params, get_basecall_model_params_overlords())
- .def("get_basecall_model", &fast5::File::get_basecall_model, get_basecall_model_overlords())
- .def("have_basecall_events", &fast5::File::have_basecall_events, have_basecall_events_overlords())
- .def("get_basecall_events", &fast5::File::get_basecall_events, get_basecall_events_overlords())
- .def("have_basecall_event_alignment", &fast5::File::have_basecall_event_alignment, have_basecall_event_alignment_overlords())
- .def("get_basecall_event_alignment", &fast5::File::get_basecall_event_alignment, get_basecall_event_alignment_overlords())
- ;
-}
diff --git a/python/fast5/version.py b/python/fast5/version.py
deleted file mode 100644
index eaddd12..0000000
--- a/python/fast5/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '0.5.9'
diff --git a/python/setup.py b/python/setup.py
old mode 100644
new mode 100755
index 487dede..f969514
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,16 +1,18 @@
-"""
-fast5.setup.py
-(c) 2016: Matei David, Ontario Institute for Cancer Research
-MIT License
-"""
+#!/usr/bin/env python
+
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
import os
-import re
-import pkg_resources
import sys
+
from setuptools import setup, Extension
-exec(open('fast5/version.py').read())
+use_cython = True #os.environ.get('USE_CYTHON', '') != ''
# check HDF5 include and lib dirs
hdf5_dir = os.environ.get('HDF5_DIR', '/usr')
@@ -23,18 +25,9 @@ if (not os.path.isfile(os.path.join(hdf5_lib_dir, 'lib' + hdf5_lib + '.so'))
and not os.path.isfile(os.path.join(hdf5_lib_dir, 'lib' + hdf5_lib + '.a'))):
sys.exit(hdf5_lib_dir + ': could not find HDF5 library file; use HDF5_DIR or HDF5_LIB_DIR/HDF5_LIB')
-# check Boost.Python include and lib dirs
-boost_dir = os.environ.get('BOOST_DIR', '/usr')
-boost_include_dir = os.environ.get('BOOST_INCLUDE_DIR', os.path.join(boost_dir, 'include'))
-boost_lib_dir = os.environ.get('BOOST_LIB_DIR', os.path.join(boost_dir, 'lib'))
-boost_python_lib = os.environ.get('BOOST_PYTHON_LIB', 'boost_python')
-if not os.path.isfile(os.path.join(boost_include_dir, 'boost', 'python.hpp')):
- sys.exit(boost_include_dir + ': could not find Boost Python header files; use BOOST_DIR or BOOST_INCLUDE_DIR')
-if (not os.path.isfile(os.path.join(boost_lib_dir, 'lib' + boost_python_lib + '.so'))
- and not os.path.isfile(os.path.join(boost_lib_dir, 'lib' + boost_python_lib + '.a'))):
- sys.exit(boost_lib_dir + ': could not find Boost Python library file; use BOOST_DIR or BOOST_LIB_DIR/BOOST_PYTHON_LIB')
-
-fast5_dir = os.environ.get('FAST5_DIR', os.path.join('..', 'src'))
+fast5_dir = os.environ.get('FAST5_DIR', '..')
+fast5_src_dir = os.path.join(fast5_dir, 'src')
+fast5_version = open(os.path.join(fast5_dir, 'VERSION')).readline().strip()
extra_compile_args = [
'-std=c++11',
@@ -42,57 +35,46 @@ extra_compile_args = [
]
# don't indiscriminately add /usr/include to work around bug:
# https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/Q5SWCUUMWQ4EMS7CU2CBOZHV3WZYOOTT/
-for d in [hdf5_include_dir, boost_include_dir]:
+for d in [hdf5_include_dir]:
if d != '/usr/include':
extra_compile_args += ['-isystem', d]
-
#extra_compile_args += ['-O0', '-g3', '-ggdb', '-fno-eliminate-unused-debug-types', '-v']
+
extra_link_args = []
#extra_link_args += ['-v']
+#if sys.platform == 'darwin':
+# extra_compile_args.append('-mmacosx-version-min=10.7')
+
extensions = [
Extension(
- 'fast5.fast5',
- include_dirs=[
- fast5_dir,
- ],
- sources=[
- os.path.join('fast5', 'source', 'fast5.cpp'),
- ],
- depends=[
- os.path.join(fast5_dir, fn)
- for fn in ['fast5.hpp', 'hdf5_tools.hpp']
- ],
+ 'fast5',
+ language='c++',
+ sources=['fast5/fast5.' + ['cpp', 'pyx'][use_cython]],
+ include_dirs=[fast5_src_dir],
+ library_dirs=[hdf5_lib_dir],
+ runtime_library_dirs=[hdf5_lib_dir],
+ libraries=[hdf5_lib],
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
- library_dirs=[
- hdf5_lib_dir,
- boost_lib_dir,
- ],
- runtime_library_dirs=[
- hdf5_lib_dir,
- boost_lib_dir,
- ],
- libraries=[
- hdf5_lib,
- boost_python_lib,
- ],
),
]
+if use_cython:
+ from Cython.Build import cythonize
+ extensions = cythonize(extensions)
+
setup(
name='fast5',
description='Fast5 file interface.',
- version=__version__,
- #long_description=open('README').read(),
+ version=fast5_version,
author='Matei David, Ontario Institute for Cancer Research',
author_email='matei.david at oicr.on.ca',
license='MIT',
url='https://github.com/mateidavid/fast5',
- packages=['fast5'],
- exclude_package_data={
- '': ['*.c', '*.cpp', '*.h', '*.hpp'],
- },
ext_modules=extensions,
- scripts=[],
+ scripts=[
+ os.path.join('bin', 'f5ls'),
+ os.path.join('bin', 'f5pack'),
+ ],
)
diff --git a/src/.fast5_version.hpp.in b/src/.fast5_version.hpp.in
new file mode 100644
index 0000000..1d485dc
--- /dev/null
+++ b/src/.fast5_version.hpp.in
@@ -0,0 +1,16 @@
+#ifndef __FAST5_VERSION_HPP
+#define __FAST5_VERSION_HPP
+
+namespace fast5
+{
+
+// Unnamed namespace: the constant has internal linkage, so every translation
+// unit that includes this header gets its own copy.
+namespace
+{
+
+// Library version string. "${VERSION}" is a placeholder in this template
+// (presumably substituted when fast5_version.hpp is generated -- TODO confirm).
+const char * const version = "${VERSION}";
+
+}
+
+}
+
+#endif
diff --git a/src/.gitignore b/src/.gitignore
index 70906a8..1126707 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -3,3 +3,4 @@ f5ls-full
hdf5-mod
f5-mod
f5dump
+f5pack
diff --git a/src/Bit_Packer.hpp b/src/Bit_Packer.hpp
new file mode 100644
index 0000000..eb92f49
--- /dev/null
+++ b/src/Bit_Packer.hpp
@@ -0,0 +1,152 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#ifndef __BIT_PACKER_HPP
+#define __BIT_PACKER_HPP
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <map>
+#include <limits>
+#include <stdexcept>
+#include <cassert>
+
+#include "logger.hpp"
+
+namespace fast5
+{
+
+/// Fixed-width bit packer.
+///
+/// Stores every integer of a sequence using exactly `num_bits` bits; the bit
+/// fields are concatenated least-significant-bit first into a byte vector.
+/// The code assumes `long long unsigned` is 64 bits wide.
+class Bit_Packer
+{
+public:
+    typedef std::vector< std::uint8_t > Code_Type;
+    typedef std::map< std::string, std::string > Code_Params_Type;
+
+    /// Pack the low `num_bits` bits of every element of `v`.
+    ///
+    /// `num_bits` is clamped to the bit width of Int_Type. Returns the packed
+    /// bytes together with the parameters "packer", "num_bits" and "size";
+    /// decode() reads the latter two back.
+    template < typename Int_Type >
+    std::pair< Code_Type, Code_Params_Type >
+    encode(std::vector< Int_Type > const & v, unsigned num_bits) const
+    {
+        Code_Type res;
+        Code_Params_Type res_params;
+        res_params["packer"] = "bit_packer";
+        num_bits = std::min(num_bits, (unsigned)sizeof(Int_Type) * 8);
+        std::ostringstream oss;
+        oss << num_bits;
+        res_params["num_bits"] = oss.str();
+        oss.str("");
+        oss << v.size();
+        res_params["size"] = oss.str();
+        long long unsigned buff = 0;
+        unsigned buff_len = 0;
+        auto const val_mask = low_bits_mask(num_bits);
+        for (size_t i = 0; i < v.size(); ++i)
+        {
+            // flush out buff
+            while (buff_len >= 8)
+            {
+                res.push_back(buff & 0xFF);
+                buff >>= 8;
+                buff_len -= 8;
+            }
+            assert(buff_len < 8);
+            long long unsigned x = v[i];
+            if (buff_len + num_bits <= 64)
+            {
+                buff |= (x & val_mask) << buff_len;
+                buff_len += num_bits;
+            }
+            else
+            {
+                // the value does not fit next to the pending bits: emit its
+                // low byte right away, then buffer the remaining num_bits - 8
+                assert(num_bits > 56);
+                buff |= (x & 0xFF) << buff_len;
+                res.push_back(buff & 0xFF);
+                buff >>= 8;
+                x >>= 8;
+                buff |= (x & (val_mask >> 8)) << buff_len;
+                buff_len += num_bits - 8;
+            }
+        }
+        while (buff_len >= 8)
+        {
+            res.push_back(buff & 0xFF);
+            buff >>= 8;
+            buff_len -= 8;
+        }
+        if (buff_len > 0)
+        {
+            // final partial byte; its unused high bits are zero
+            res.push_back(buff & 0xFF);
+        }
+        return std::make_pair(std::move(res), std::move(res_params));
+    } // encode()
+
+    /// Unpack a byte vector produced by encode().
+    ///
+    /// Reads "num_bits" and "size" from `v_params` (std::map::at throws
+    /// std::out_of_range when a key is missing). Raises via LOG_THROW when
+    /// `num_bits` exceeds 64 or when `v` does not have exactly the number of
+    /// bytes needed for `size` values of `num_bits` bits.
+    template < typename Int_Type >
+    std::vector< Int_Type >
+    decode(Code_Type const & v, Code_Params_Type const & v_params) const
+    {
+        std::vector< Int_Type > res;
+        unsigned num_bits = 0;
+        size_t sz = 0;
+        std::istringstream(v_params.at("num_bits")) >> num_bits;
+        std::istringstream(v_params.at("size")) >> sz;
+        if (num_bits > 64)
+        {
+            // wider fields cannot be held in the 64-bit working buffer
+            LOG_THROW
+                << "incorrect num_bits: num_bits=" << num_bits;
+        }
+        if (v.size() != (sz * num_bits) / 8 + ((sz * num_bits) % 8 > 0? 1 : 0))
+        {
+            LOG_THROW
+                << "incorrect size: v_size=" << v.size();
+        }
+        long long unsigned buff = 0;
+        unsigned buff_len = 0;
+        size_t j = 0;
+        auto const val_mask = low_bits_mask(num_bits);
+        for (size_t i = 0; i < sz; ++i)
+        {
+            // refill: load whole bytes while at least 8 free bits remain
+            while (j < v.size() and buff_len <= 64 - 8)
+            {
+                buff |= ((long long unsigned)v.at(j) << buff_len);
+                ++j;
+                buff_len += 8;
+            }
+            long long unsigned x;
+            if (buff_len >= num_bits)
+            {
+                x = buff & val_mask;
+                // shifting a 64-bit word by 64 is undefined; clear it instead
+                buff = num_bits < 64? buff >> num_bits : 0;
+                buff_len -= num_bits;
+            }
+            else
+            {
+                // 56 < buff_len < num_bits
+                x = buff & 0xFF;
+                buff >>= 8;
+                buff_len -= 8;
+                // widen before shifting: buff_len is 49..55 here, which is
+                // out of range for the int a uint8_t would be promoted to
+                buff |= ((long long unsigned)v.at(j) << buff_len);
+                ++j;
+                buff_len += 8;
+                x |= ((buff & (val_mask >> 8)) << 8);
+                buff >>= (num_bits - 8);
+                buff_len -= num_bits - 8;
+            }
+            res.push_back(x);
+        }
+        return res;
+    } // decode()
+
+    //
+    // static packer access
+    //
+    /// Return a reference to a function-local static Bit_Packer instance.
+    static Bit_Packer const &
+    get_packer()
+    {
+        static Bit_Packer _packer;
+        return _packer;
+    }
+
+private:
+    /// Mask selecting the low `num_bits` bits of a 64-bit word; safe for
+    /// num_bits >= 64, where `1llu << num_bits` would be undefined.
+    static long long unsigned
+    low_bits_mask(unsigned num_bits)
+    {
+        return num_bits >= 64? ~0llu : (1llu << num_bits) - 1;
+    }
+}; // class Bit_Packer
+
+} // namespace fast5
+
+#endif
diff --git a/src/File_Packer.hpp b/src/File_Packer.hpp
new file mode 100644
index 0000000..990cc38
--- /dev/null
+++ b/src/File_Packer.hpp
@@ -0,0 +1,982 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#ifndef __FILE_PACKER_HPP
+#define __FILE_PACKER_HPP
+
+#include <cmath>
+#include <set>
+#include <string>
+
+#include "fast5.hpp"
+#include "logger.hpp"
+
+// Defines a static function `_id()` returning a reference to a function-local
+// static object of type `_type`, named `_<_id>` and initialized with `_init`.
+#define STATIC_MEMBER_WRAPPER(_type, _id, _init) \
+    static _type & _id() { static _type _ ## _id = _init; return _ ## _id; }
+
+namespace fast5
+{
+
+class File_Packer
+{
+public:
+    /// Totals accumulated by the pack_*() methods; every field starts at zero.
+    struct Counts
+    {
+        // raw samples (updated by pack_rw)
+        size_t rs_count = 0;
+        size_t rs_bits = 0;
+        // eventdetection events (updated by pack_ed)
+        size_t ed_count = 0;
+        size_t ed_skip_bits = 0;
+        size_t ed_len_bits = 0;
+        // basecall fastq (updated by pack_fq)
+        size_t fq_count = 0;
+        size_t bp_seq_count = 0;
+        size_t fq_bp_bits = 0;
+        size_t fq_qv_bits = 0;
+        // basecall events (updated by pack_ev)
+        size_t ev_count = 0;
+        size_t ev_rel_skip_bits = 0;
+        size_t ev_skip_bits = 0;
+        size_t ev_len_bits = 0;
+        size_t ev_move_bits = 0;
+        size_t ev_p_model_state_bits = 0;
+        // basecall alignments (updated by pack_al)
+        size_t al_count = 0;
+        size_t al_template_step_bits = 0;
+        size_t al_complement_step_bits = 0;
+        size_t al_move_bits = 0;
+        // durations (set by pack_rw and pack_fq respectively)
+        double rs_total_duration = 0.0;
+        double rs_called_duration = 0.0;
+
+        Counts() {}
+
+        /// Add every field of `rhs` to the matching field of this object.
+        Counts & operator += (Counts const & rhs)
+        {
+            // raw samples
+            rs_count += rhs.rs_count;
+            rs_bits += rhs.rs_bits;
+            // eventdetection events
+            ed_count += rhs.ed_count;
+            ed_skip_bits += rhs.ed_skip_bits;
+            ed_len_bits += rhs.ed_len_bits;
+            // basecall fastq
+            fq_count += rhs.fq_count;
+            bp_seq_count += rhs.bp_seq_count;
+            fq_bp_bits += rhs.fq_bp_bits;
+            fq_qv_bits += rhs.fq_qv_bits;
+            // basecall events
+            ev_count += rhs.ev_count;
+            ev_rel_skip_bits += rhs.ev_rel_skip_bits;
+            ev_skip_bits += rhs.ev_skip_bits;
+            ev_len_bits += rhs.ev_len_bits;
+            ev_move_bits += rhs.ev_move_bits;
+            ev_p_model_state_bits += rhs.ev_p_model_state_bits;
+            // basecall alignments
+            al_count += rhs.al_count;
+            al_template_step_bits += rhs.al_template_step_bits;
+            al_complement_step_bits += rhs.al_complement_step_bits;
+            al_move_bits += rhs.al_move_bits;
+            // durations
+            rs_total_duration += rhs.rs_total_duration;
+            rs_called_duration += rhs.rs_called_duration;
+            return *this;
+        }
+    };
+
+    /// Default packer: policy 1 for every data category.
+    File_Packer()
+        : File_Packer(1)
+    {}
+
+    /// Use the same policy for all five data categories.
+    File_Packer(int _policy)
+        : File_Packer(_policy, _policy, _policy, _policy, _policy)
+    {}
+
+    /// Set one policy per data category (raw samples, eventdetection events,
+    /// basecall fastq, basecall events, basecall alignments). Checking is
+    /// enabled, forcing is disabled, and the bit settings take their defaults.
+    File_Packer(int _rw_policy, int _ed_policy, int _fq_policy, int _ev_policy, int _al_policy)
+        : rw_policy{_rw_policy}
+        , ed_policy{_ed_policy}
+        , fq_policy{_fq_policy}
+        , ev_policy{_ev_policy}
+        , al_policy{_al_policy}
+        , check{true}
+        , force{false}
+        , qv_bits{max_qv_bits()}
+        , p_model_state_bits{default_p_model_state_bits()}
+    {}
+
+    /// Enable or disable the read-back verification done while packing.
+    void set_check(bool _check)
+    {
+        check = _check;
+    }
+    /// Set the flag forwarded to File::create() for the output file.
+    void set_force(bool _force)
+    {
+        force = _force;
+    }
+    /// Set the number of quality-value bits passed to File::pack_fq().
+    void set_qv_bits(unsigned _qv_bits)
+    {
+        qv_bits = _qv_bits;
+    }
+    /// Set the number of p_model_state bits passed to File::pack_ev().
+    void set_p_model_state_bits(unsigned _p_model_state_bits)
+    {
+        p_model_state_bits = _p_model_state_bits;
+    }
+
+    // Static constants exposed as functions (see STATIC_MEMBER_WRAPPER):
+    //   max_qv_bits()                == 5
+    //   max_qv_mask()                == (1 << max_qv_bits()) - 1 == 31
+    //   default_p_model_state_bits() == 2
+    STATIC_MEMBER_WRAPPER(unsigned const, max_qv_bits, 5)
+    STATIC_MEMBER_WRAPPER(unsigned const, max_qv_mask, ((unsigned)1 << max_qv_bits()) - 1)
+    STATIC_MEMBER_WRAPPER(unsigned const, default_p_model_state_bits, 2)
+
+    /// Convert the fast5 file `ifn` into a newly created file `ofn`.
+    ///
+    /// Attributes under "" and "/UniqueGlobalKey" are copied first. Each data
+    /// category is then handled according to its policy: 1 = pack,
+    /// 2 = unpack, 3 = copy, any other value = not written. Finally the
+    /// basecall params of every basecall group collected along the way are
+    /// copied. On success the counts gathered by this call are added to the
+    /// running totals.
+    ///
+    /// An hdf5_tools::Exception is rethrown as std::runtime_error with the
+    /// message "<ifn>: HDF5 error: <what>".
+    void
+    run(std::string const & ifn, std::string const & ofn) const
+    {
+        File src_f;
+        File dst_f;
+        Counts cnt;
+        try
+        {
+            // open files
+            src_f.open(ifn);
+            dst_f.create(ofn, force);
+            assert(src_f.is_open());
+            assert(dst_f.is_open());
+            assert(dst_f.is_rw());
+            // copy attributes under / and /UniqueGlobalKey
+            copy_attributes(src_f, dst_f, "", false);
+            copy_attributes(src_f, dst_f, "/UniqueGlobalKey", true);
+            // basecall groups for which something was written
+            std::set< std::string > bc_gr_s;
+            // process raw samples
+            switch (rw_policy)
+            {
+            case 1: pack_rw(src_f, dst_f, cnt); break;
+            case 2: unpack_rw(src_f, dst_f); break;
+            case 3: copy_rw(src_f, dst_f); break;
+            default: break;
+            }
+            // process eventdetection events
+            switch (ed_policy)
+            {
+            case 1: pack_ed(src_f, dst_f, cnt); break;
+            case 2: unpack_ed(src_f, dst_f); break;
+            case 3: copy_ed(src_f, dst_f); break;
+            default: break;
+            }
+            // process basecall fastq
+            switch (fq_policy)
+            {
+            case 1: pack_fq(src_f, dst_f, bc_gr_s, cnt); break;
+            case 2: unpack_fq(src_f, dst_f, bc_gr_s); break;
+            case 3: copy_fq(src_f, dst_f, bc_gr_s); break;
+            default: break;
+            }
+            // process basecall events
+            switch (ev_policy)
+            {
+            case 1: pack_ev(src_f, dst_f, bc_gr_s, cnt); break;
+            case 2: unpack_ev(src_f, dst_f, bc_gr_s); break;
+            case 3: copy_ev(src_f, dst_f, bc_gr_s); break;
+            default: break;
+            }
+            // process basecall alignments
+            switch (al_policy)
+            {
+            case 1: pack_al(src_f, dst_f, bc_gr_s, cnt); break;
+            case 2: unpack_al(src_f, dst_f, bc_gr_s); break;
+            case 3: copy_al(src_f, dst_f, bc_gr_s); break;
+            default: break;
+            }
+            // copy basecall params
+            copy_basecall_params(src_f, dst_f, bc_gr_s);
+            // close files
+            src_f.close();
+            dst_f.close();
+        }
+        catch (hdf5_tools::Exception & e)
+        {
+            std::ostringstream msg;
+            msg << ifn << ": HDF5 error: " << e.what();
+            throw std::runtime_error(msg.str());
+        }
+        counts += cnt;
+    } // run()
+
+    /// Reset the running totals to zero (allowed on a const packer because
+    /// `counts` is mutable).
+    void reset_counts() const
+    {
+        counts = Counts{};
+    }
+
+    /// Read-only access to the totals accumulated by run().
+    Counts const & get_counts() const
+    {
+        Counts const & totals = counts;
+        return totals;
+    }
+private:
+    // Per-category policies as interpreted by run():
+    // 1 = pack, 2 = unpack, 3 = copy, anything else = category not written.
+    int rw_policy; // raw samples
+    int ed_policy; // eventdetection events
+    int fq_policy; // basecall fastq
+    int ev_policy; // basecall events
+    int al_policy; // basecall alignments
+    bool check; // when set, pack_*() read the packed data back and compare
+    bool force; // forwarded to File::create() for the output file
+    unsigned qv_bits; // passed to File::pack_fq()
+    unsigned p_model_state_bits; // passed to File::pack_ev()
+    mutable Counts counts; // running totals; mutable so the const run() can update them
+
+    /// Write the raw samples of every read of `src_f` to `dst_f` in packed form.
+    ///
+    /// Reads already packed in the source are transferred unchanged. Reads
+    /// stored unpacked are packed; when `check` is set the result is read back
+    /// from `dst_f` and compared with the original (a mismatch raises via
+    /// LOG_THROW). Sizes of newly packed reads are added to `cnt`.
+    void
+    pack_rw(File const & src_f, File & dst_f, Counts & cnt) const
+    {
+        auto read_names = src_f.get_raw_samples_read_name_list();
+        for (auto const & rn : read_names)
+        {
+            if (src_f.have_raw_samples_pack(rn))
+            {
+                // already packed: transfer as is
+                auto packed = src_f.get_raw_samples_pack(rn);
+                dst_f.add_raw_samples(rn, packed);
+                continue;
+            }
+            if (not src_f.have_raw_samples_unpack(rn))
+            {
+                continue;
+            }
+            auto orig_ds = src_f.get_raw_int_samples_dataset(rn);
+            auto & orig = orig_ds.first;
+            auto & orig_params = orig_ds.second;
+            auto packed = src_f.pack_rw(orig_ds);
+            dst_f.add_raw_samples(rn, packed);
+            if (check)
+            {
+                // read back what was just written and compare with the source
+                auto back_ds = dst_f.get_raw_int_samples_dataset(rn);
+                auto & back = back_ds.first;
+                auto & back_params = back_ds.second;
+                if (not (back_params == orig_params))
+                {
+                    LOG_THROW
+                        << "check failed: rs_params_unpack!=rs_params";
+                }
+                if (back.size() != orig.size())
+                {
+                    LOG_THROW
+                        << "check failed: rs_unpack.size=" << back.size()
+                        << " rs_orig.size=" << orig.size();
+                }
+                for (unsigned i = 0; i < back.size(); ++i)
+                {
+                    if (back[i] != orig[i])
+                    {
+                        LOG_THROW
+                            << "check failed: i=" << i
+                            << " rs_unpack=" << back[i]
+                            << " rs_orig=" << orig[i];
+                    }
+                }
+            }
+            cnt.rs_count += orig.size();
+            cnt.rs_bits += packed.signal.size() * sizeof(packed.signal[0]) * 8;
+            if (cnt.rs_total_duration == 0.0)
+            {
+                // only recorded while the total is still zero
+                auto cid_params = src_f.get_channel_id_params();
+                cnt.rs_total_duration = src_f.time_to_float(orig_params.duration, cid_params);
+            }
+            LOG(info)
+                << "rn=" << rn
+                << " rs_size=" << orig.size()
+                << " signal_bits=" << packed.signal_params.at("avg_bits")
+                << std::endl;
+        }
+    } // pack_rw()
+
+    /// Write the raw samples of every read of `src_f` to `dst_f` as an
+    /// integer samples dataset.
+    void
+    unpack_rw(File const & src_f, File & dst_f) const
+    {
+        auto read_names = src_f.get_raw_samples_read_name_list();
+        for (auto const & read_name : read_names)
+        {
+            auto samples_ds = src_f.get_raw_int_samples_dataset(read_name);
+            dst_f.add_raw_samples_dataset(read_name, samples_ds);
+        }
+    } // unpack_rw()
+
+    /// Transfer raw samples from `src_f` to `dst_f` in whichever form
+    /// (unpacked is tested first, then packed) the source stores them.
+    void
+    copy_rw(File const & src_f, File & dst_f) const
+    {
+        auto read_names = src_f.get_raw_samples_read_name_list();
+        for (auto const & read_name : read_names)
+        {
+            if (src_f.have_raw_samples_unpack(read_name))
+            {
+                auto samples_ds = src_f.get_raw_int_samples_dataset(read_name);
+                dst_f.add_raw_samples_dataset(read_name, samples_ds);
+                continue;
+            }
+            if (src_f.have_raw_samples_pack(read_name))
+            {
+                auto samples_pack = src_f.get_raw_samples_pack(read_name);
+                dst_f.add_raw_samples(read_name, samples_pack);
+            }
+        }
+    } // copy_rw()
+
+    /// Write the eventdetection events of `src_f` to `dst_f` in packed form.
+    ///
+    /// For every (group, read name) pair the group params are copied, then
+    /// the events are either transferred (already packed) or packed. When
+    /// `check` is set, the packed events are read back from `dst_f` and
+    /// compared with the originals: start/length must match exactly, mean and
+    /// stdv within 0.1; the last event is not compared. A mismatch raises via
+    /// LOG_THROW. Sizes of newly packed data are added to `cnt`.
+    void
+    pack_ed(File const & src_f, File & dst_f, Counts & cnt) const
+    {
+        auto gr_l = src_f.get_eventdetection_group_list();
+        for (auto const & gr : gr_l)
+        {
+            auto rn_l = src_f.get_eventdetection_read_name_list(gr);
+            for (auto const & rn : rn_l)
+            {
+                auto ed_params = src_f.get_eventdetection_params(gr);
+                dst_f.add_eventdetection_params(gr, ed_params);
+                if (src_f.have_eventdetection_events_pack(gr, rn))
+                {
+                    auto ede_pack = src_f.get_eventdetection_events_pack(gr, rn);
+                    dst_f.add_eventdetection_events(gr, rn, ede_pack);
+                }
+                else if (src_f.have_eventdetection_events(gr, rn))
+                {
+                    auto ede_ds = src_f.get_eventdetection_events_dataset(gr, rn);
+                    auto & ede = ede_ds.first;
+                    auto & ede_params = ede_ds.second;
+                    auto ede_pack = src_f.pack_ed(ede_ds);
+                    dst_f.add_eventdetection_events(gr, rn, ede_pack);
+                    if (check)
+                    {
+                        decltype(ede_ds) ede_ds_unpack;
+                        try
+                        {
+                            ede_ds_unpack = dst_f.get_eventdetection_events_dataset(gr, rn);
+                        }
+                        catch (std::logic_error & e)
+                        {
+                            LOG_THROW
+                                << "check failed: " << e.what();
+                        }
+                        auto & ede_unpack = ede_ds_unpack.first;
+                        auto & ede_params_unpack = ede_ds_unpack.second;
+                        if (not (ede_params_unpack == ede_params))
+                        {
+                            LOG_THROW
+                                << "check failed: ede_params_unpack!=ede_params";
+                        }
+                        if (ede_unpack.size() != ede.size())
+                        {
+                            LOG_THROW
+                                << "check failed: gr=" << gr
+                                << " ede_unpack.size=" << ede_unpack.size()
+                                << " ede_orig.size=" << ede.size();
+                        }
+                        for (unsigned i = 0; i + 1 < ede_unpack.size(); ++i) // skip last event
+                        {
+                            LOG(debug1)
+                                << "gr=" << gr
+                                << " i=" << i
+                                << " ede_unpack=(" << ede_unpack[i].start
+                                << "," << ede_unpack[i].length
+                                << "," << ede_unpack[i].mean
+                                << "," << ede_unpack[i].stdv
+                                << ") ed_orig=(" << ede[i].start
+                                << "," << ede[i].length
+                                << "," << ede[i].mean
+                                << "," << ede[i].stdv
+                                << ")" << std::endl;
+                            // std::abs: the unqualified abs() may resolve to
+                            // int abs(int) and truncate the difference to 0
+                            if (ede_unpack[i].start != ede[i].start
+                                or ede_unpack[i].length != ede[i].length
+                                or std::abs(ede_unpack[i].mean - ede[i].mean) > .1
+                                or std::abs(ede_unpack[i].stdv - ede[i].stdv) > .1)
+                            {
+                                LOG_THROW
+                                    << "check failed: gr=" << gr
+                                    << " i=" << i
+                                    << " ede_unpack=(" << ede_unpack[i].start
+                                    << "," << ede_unpack[i].length
+                                    << "," << ede_unpack[i].mean
+                                    << "," << ede_unpack[i].stdv
+                                    << ") ed_orig=(" << ede[i].start
+                                    << "," << ede[i].length
+                                    << "," << ede[i].mean
+                                    << "," << ede[i].stdv
+                                    << ")";
+                            }
+                        }
+                    } // if check
+                    cnt.ed_count += ede.size();
+                    cnt.ed_skip_bits += ede_pack.skip.size() * sizeof(ede_pack.skip[0]) * 8;
+                    cnt.ed_len_bits += ede_pack.len.size() * sizeof(ede_pack.len[0]) * 8;
+                    LOG(info)
+                        << "gr=" << gr
+                        << " rn=" << rn
+                        << " ed_size=" << ede.size()
+                        << " skip_bits=" << ede_pack.skip_params.at("avg_bits")
+                        << " len_bits=" << ede_pack.len_params.at("avg_bits")
+                        << std::endl;
+                }
+            } // for rn
+        } // for gr
+    } // pack_ed()
+
+    /// Write the eventdetection params and events of every (group, read
+    /// name) pair of `src_f` to `dst_f` as an events dataset.
+    void
+    unpack_ed(File const & src_f, File & dst_f) const
+    {
+        auto groups = src_f.get_eventdetection_group_list();
+        for (auto const & group : groups)
+        {
+            auto read_names = src_f.get_eventdetection_read_name_list(group);
+            for (auto const & read_name : read_names)
+            {
+                auto group_params = src_f.get_eventdetection_params(group);
+                dst_f.add_eventdetection_params(group, group_params);
+                auto events_ds = src_f.get_eventdetection_events_dataset(group, read_name);
+                dst_f.add_eventdetection_events_dataset(group, read_name, events_ds);
+            }
+        }
+    } // unpack_ed()
+
+    /// Transfer eventdetection params and events from `src_f` to `dst_f` in
+    /// whichever form (unpacked is tested first, then packed) the source has.
+    void
+    copy_ed(File const & src_f, File & dst_f) const
+    {
+        auto groups = src_f.get_eventdetection_group_list();
+        for (auto const & group : groups)
+        {
+            auto read_names = src_f.get_eventdetection_read_name_list(group);
+            for (auto const & read_name : read_names)
+            {
+                auto group_params = src_f.get_eventdetection_params(group);
+                dst_f.add_eventdetection_params(group, group_params);
+                if (src_f.have_eventdetection_events_unpack(group, read_name))
+                {
+                    auto events_ds = src_f.get_eventdetection_events_dataset(group, read_name);
+                    dst_f.add_eventdetection_events_dataset(group, read_name, events_ds);
+                    continue;
+                }
+                if (src_f.have_eventdetection_events_pack(group, read_name))
+                {
+                    auto events_pack = src_f.get_eventdetection_events_pack(group, read_name);
+                    dst_f.add_eventdetection_events(group, read_name, events_pack);
+                }
+            }
+        }
+    } // copy_ed()
+
+    /// Write the basecall fastq of `src_f` to `dst_f` in packed form.
+    ///
+    /// Strands 0..2 are scanned; every group for which fastq is written is
+    /// added to `bc_gr_s`. Already-packed fastq is transferred unchanged;
+    /// unpacked fastq is packed with `qv_bits` quality bits. When `check` is
+    /// set the packed record is read back from `dst_f`: name and bases must
+    /// match exactly, and quality values must match after capping at
+    /// max_qv_mask() and keeping only the retained high bits. A mismatch
+    /// raises via LOG_THROW. If anything was newly packed, the length of a
+    /// strand-0 sequence and the called duration are recorded in `cnt`.
+    void
+    pack_fq(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s, Counts & cnt) const
+    {
+        bool compute_bp_seq_count = false;
+        for (unsigned st = 0; st < 3; ++st)
+        {
+            auto gr_l = src_f.get_basecall_strand_group_list(st);
+            for (auto const & gr : gr_l)
+            {
+                if (src_f.have_basecall_fastq_pack(st, gr))
+                {
+                    bc_gr_s.insert(gr);
+                    auto fq_pack = src_f.get_basecall_fastq_pack(st, gr);
+                    dst_f.add_basecall_fastq(st, gr, fq_pack);
+                }
+                else if (src_f.have_basecall_fastq_unpack(st, gr))
+                {
+                    compute_bp_seq_count = true;
+                    bc_gr_s.insert(gr);
+                    auto fq = src_f.get_basecall_fastq(st, gr);
+                    auto fqa = src_f.split_fq(fq);
+                    auto fq_pack = src_f.pack_fq(fq, qv_bits);
+                    dst_f.add_basecall_fastq(st, gr, fq_pack);
+                    if (check)
+                    {
+                        auto fq_unpack = dst_f.get_basecall_fastq(st, gr);
+                        auto fqa_unpack = src_f.split_fq(fq_unpack);
+                        if (fqa_unpack[0] != fqa[0])
+                        {
+                            LOG_THROW
+                                << "check failed: st=" << st
+                                << " gr=" << gr
+                                << " fq_unpack_name=" << fqa_unpack[0]
+                                << " fq_orig_name=" << fqa[0];
+                        }
+                        if (fqa_unpack[1] != fqa[1])
+                        {
+                            LOG_THROW
+                                << "check failed: st=" << st
+                                << " gr=" << gr
+                                << " fq_unpack_bp=" << fqa_unpack[1]
+                                << " fq_orig_bp=" << fqa[1];
+                        }
+                        if (fqa_unpack[3].size() != fqa[3].size())
+                        {
+                            LOG_THROW
+                                << "check failed: st=" << st
+                                << " gr=" << gr
+                                << " fq_unpack_qv_size=" << fqa_unpack[3].size()
+                                << " fq_orig_qv_size=" << fqa[3].size();
+                        }
+                        // clamp so that the unsigned subtraction below cannot
+                        // wrap around and produce an out-of-range shift count
+                        auto kept_qv_bits = std::min<unsigned>(qv_bits, max_qv_bits());
+                        auto qv_mask = max_qv_mask() & (max_qv_mask() << (max_qv_bits() - kept_qv_bits));
+                        for (unsigned i = 0; i < fqa_unpack[3].size(); ++i)
+                        {
+                            // compare only the bits that packing retains
+                            if ((std::min<unsigned>(fqa_unpack[3][i] - 33, max_qv_mask()) & qv_mask) !=
+                                (std::min<unsigned>(fqa[3][i] - 33, max_qv_mask()) & qv_mask))
+                            {
+                                LOG_THROW
+                                    << "check failed: st=" << st
+                                    << " gr=" << gr
+                                    << " i=" << i
+                                    << " fq_unpack_qv=" << fqa_unpack[3][i]
+                                    << " fq_orig_qv=" << fqa[3][i];
+                            }
+                        }
+                    }
+                    cnt.fq_count += fqa[1].size();
+                    cnt.fq_bp_bits += fq_pack.bp.size() * sizeof(fq_pack.bp[0]) * 8;
+                    cnt.fq_qv_bits += fq_pack.qv.size() * sizeof(fq_pack.qv[0]) * 8;
+                    LOG(info)
+                        << "gr=" << gr
+                        << " st=" << st
+                        << " bp_size=" << fqa[1].size()
+                        << " fq_bp_bits=" << fq_pack.bp_params.at("avg_bits")
+                        << " fq_qv_bits=" << fq_pack.qv_params.at("avg_bits")
+                        << std::endl;
+                }
+            }
+        }
+        if (compute_bp_seq_count)
+        {
+            std::string sq;
+            auto gr_l = src_f.get_basecall_group_list();
+            for (auto const & gr : gr_l)
+            {
+                // first group having both a strand-0 sequence and events
+                if (src_f.have_basecall_seq(0, gr) and src_f.have_basecall_events(0, gr))
+                {
+                    sq = src_f.get_basecall_seq(0, gr);
+                    auto bce = src_f.get_basecall_events(0, gr);
+                    if (not bce.empty())
+                    {
+                        // back()/front() are undefined on an empty vector
+                        cnt.rs_called_duration = bce.back().start + bce.back().length - bce.front().start;
+                    }
+                    break;
+                }
+            }
+            if (sq.empty() and src_f.have_basecall_seq(0))
+            {
+                sq = src_f.get_basecall_seq(0);
+            }
+            cnt.bp_seq_count += sq.size();
+        }
+    } // pack_fq()
+
+    /// Write the basecall fastq of strands 0..2 of `src_f` to `dst_f` as
+    /// plain fastq, recording each affected group in `bc_gr_s`.
+    void
+    unpack_fq(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        for (unsigned strand = 0; strand < 3; ++strand)
+        {
+            auto groups = src_f.get_basecall_strand_group_list(strand);
+            for (auto const & group : groups)
+            {
+                if (not src_f.have_basecall_fastq(strand, group))
+                {
+                    continue;
+                }
+                bc_gr_s.insert(group);
+                auto fastq = src_f.get_basecall_fastq(strand, group);
+                dst_f.add_basecall_fastq(strand, group, fastq);
+            }
+        }
+    } // unpack_fq()
+
+    /// Transfer the basecall fastq of strands 0..2 from `src_f` to `dst_f`
+    /// in whichever form (unpacked is tested first, then packed) the source
+    /// has, recording each affected group in `bc_gr_s`.
+    void
+    copy_fq(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        for (unsigned strand = 0; strand < 3; ++strand)
+        {
+            auto groups = src_f.get_basecall_strand_group_list(strand);
+            for (auto const & group : groups)
+            {
+                if (src_f.have_basecall_fastq_unpack(strand, group))
+                {
+                    bc_gr_s.insert(group);
+                    auto fastq = src_f.get_basecall_fastq(strand, group);
+                    dst_f.add_basecall_fastq(strand, group, fastq);
+                    continue;
+                }
+                if (src_f.have_basecall_fastq_pack(strand, group))
+                {
+                    bc_gr_s.insert(group);
+                    auto fastq_pack = src_f.get_basecall_fastq_pack(strand, group);
+                    dst_f.add_basecall_fastq(strand, group, fastq_pack);
+                }
+            }
+        }
+    } // copy_fq()
+
+    /// Write the basecall events of strands 0 and 1 of `src_f` to `dst_f` in
+    /// packed form, recording each affected group in `bc_gr_s`.
+    ///
+    /// Already-packed events are transferred unchanged. Unpacked events are
+    /// packed only when the group description name is "metrichor"; other
+    /// groups are dropped with a warning. Packing requires the fastq of the
+    /// same strand/group (raises via LOG_THROW when missing). When `check` is
+    /// set, the packed events are read back from `dst_f` and compared with
+    /// the originals (start/length within 1e-3, mean within 1e-1, identical
+    /// model_state; a stdv mismatch is tolerated, with a warning, when the
+    /// unpacked stdv equals the pack's median_sd_temp). Sizes of newly packed
+    /// data are added to `cnt`.
+    void
+    pack_ev(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s, Counts & cnt) const
+    {
+        for (unsigned st = 0; st < 2; ++st)
+        {
+            auto gr_l = src_f.get_basecall_strand_group_list(st);
+            for (auto const & gr : gr_l)
+            {
+                if (src_f.have_basecall_events_pack(st, gr))
+                {
+                    bc_gr_s.insert(gr);
+                    auto ev_pack = src_f.get_basecall_events_pack(st, gr);
+                    dst_f.add_basecall_events(st, gr, ev_pack);
+                }
+                else if (src_f.have_basecall_events_unpack(st, gr))
+                {
+                    // bc group description
+                    // NOTE(review): bc_params is not used below; the call is
+                    // kept in case it is relied upon to fail on bad groups.
+                    auto bc_params = src_f.get_basecall_params(gr);
+                    auto bc_desc = src_f.get_basecall_group_description(gr);
+                    if (bc_desc.name != "metrichor")
+                    {
+                        LOG(warning)
+                            << "dropping basecall events group written by "
+                            << bc_desc.name << ":" << bc_desc.version
+                            << ": st=" << st << " gr=" << gr << "\n";
+                        continue;
+                    }
+                    bc_gr_s.insert(gr);
+                    auto ev_ds = src_f.get_basecall_events_dataset(st, gr);
+                    auto & ev = ev_ds.first;
+                    auto & ev_params = ev_ds.second;
+                    // sampling rate
+                    auto cid_params = src_f.get_channel_id_params();
+                    // basecall fq
+                    if (not src_f.have_basecall_fastq(st, gr))
+                    {
+                        LOG_THROW
+                            << "missing fastq required to pack basecall events: st=" << st << " gr=" << gr;
+                    }
+                    auto sq = src_f.get_basecall_seq(st, gr);
+                    // ed group
+                    auto ed_gr = src_f.get_basecall_eventdetection_group(gr);
+                    std::vector< EventDetection_Event > ed;
+                    if (not ed_gr.empty())
+                    {
+                        ed = src_f.get_eventdetection_events(ed_gr);
+                    }
+                    // try to find median_sd_temp
+                    auto median_sd_temp = src_f.get_basecall_median_sd_temp(gr);
+                    auto ev_pack = src_f.pack_ev(ev_ds, bc_desc, sq, ed, ed_gr,
+                                                 cid_params, median_sd_temp, p_model_state_bits);
+                    dst_f.add_basecall_events(st, gr, ev_pack);
+                    if (check)
+                    {
+                        decltype(ev_ds) ev_ds_unpack;
+                        try
+                        {
+                            ev_ds_unpack = dst_f.get_basecall_events_dataset(st, gr);
+                        }
+                        catch (std::logic_error & e)
+                        {
+                            LOG_THROW
+                                << "check failed: " << e.what();
+                        }
+                        auto & ev_unpack = ev_ds_unpack.first;
+                        auto & ev_params_unpack = ev_ds_unpack.second;
+                        if (not (ev_params_unpack == ev_params))
+                        {
+                            LOG_THROW
+                                << "check failed: ev_params_unpack!=ev_params";
+                        }
+                        if (ev_unpack.size() != ev.size())
+                        {
+                            LOG_THROW
+                                << "check failed: st=" << st
+                                << " gr=" << gr
+                                << " ev_unpack.size=" << ev_unpack.size()
+                                << " ev_orig.size=" << ev.size();
+                        }
+                        for (unsigned i = 0; i < ev_unpack.size(); ++i)
+                        {
+                            // std::abs: the unqualified abs() may resolve to
+                            // int abs(int) and truncate the difference to 0
+                            bool const stdv_differs = std::abs(ev_unpack[i].stdv - ev[i].stdv) > 1e-1;
+                            bool const stdv_is_median = ev_unpack[i].stdv == ev_pack.median_sd_temp;
+                            if (std::abs(ev_unpack[i].start - ev[i].start) > 1e-3
+                                or std::abs(ev_unpack[i].length - ev[i].length) > 1e-3
+                                or std::abs(ev_unpack[i].mean - ev[i].mean) > 1e-1
+                                // workaround: allow for unexpected stdv when expected value is small
+                                or (stdv_differs and not stdv_is_median)
+                                // workaround: allow for invalid moves:
+                                //or ev_unpack[i].move != ev[i].move
+                                or ev_unpack[i].model_state != ev[i].model_state)
+                            {
+                                LOG_THROW
+                                    << "check failed: st=" << st
+                                    << " gr=" << gr
+                                    << " i=" << i
+                                    << " ev_unpack=(" << ev_unpack[i].start
+                                    << "," << ev_unpack[i].length
+                                    << "," << ev_unpack[i].mean
+                                    << "," << ev_unpack[i].stdv
+                                    << "," << ev_unpack[i].move
+                                    << "," << ev_unpack[i].get_model_state()
+                                    << ") ev_orig=(" << ev[i].start
+                                    << "," << ev[i].length
+                                    << "," << ev[i].mean
+                                    << "," << ev[i].stdv
+                                    << "," << ev[i].move
+                                    << "," << ev[i].get_model_state()
+                                    << ")";
+                            }
+                            if (stdv_differs and stdv_is_median)
+                            {
+                                LOG(warning)
+                                    << "unexpected stdv: st=" << st
+                                    << " gr=" << gr
+                                    << " i=" << i
+                                    << " ev_unpack=(" << ev_unpack[i].start
+                                    << "," << ev_unpack[i].length
+                                    << "," << ev_unpack[i].mean
+                                    << "," << ev_unpack[i].stdv
+                                    << "," << ev_unpack[i].move
+                                    << "," << ev_unpack[i].get_model_state()
+                                    << ") ev_orig=(" << ev[i].start
+                                    << "," << ev[i].length
+                                    << "," << ev[i].mean
+                                    << "," << ev[i].stdv
+                                    << "," << ev[i].move
+                                    << "," << ev[i].get_model_state()
+                                    << ")\n";
+                            }
+                        }
+                    }
+                    cnt.ev_count += ev.size();
+                    cnt.ev_rel_skip_bits += ev_pack.rel_skip.size() * sizeof(ev_pack.rel_skip[0]) * 8;
+                    cnt.ev_skip_bits += ev_pack.skip.size() * sizeof(ev_pack.skip[0]) * 8;
+                    cnt.ev_len_bits += ev_pack.len.size() * sizeof(ev_pack.len[0]) * 8;
+                    cnt.ev_move_bits += ev_pack.move.size() * sizeof(ev_pack.move[0]) * 8;
+                    cnt.ev_p_model_state_bits += ev_pack.p_model_state.size() * sizeof(ev_pack.p_model_state[0]) * 8;
+                    // report rel_skip when present, otherwise skip and len
+                    std::ostringstream oss;
+                    if (not ev_pack.rel_skip.empty())
+                    {
+                        oss
+                            << "rel_skip_bits=" << ev_pack.rel_skip_params.at("avg_bits");
+                    }
+                    else
+                    {
+                        oss
+                            << "skip_bits=" << ev_pack.skip_params.at("avg_bits")
+                            << " len_bits=" << ev_pack.len_params.at("avg_bits");
+                    }
+                    LOG(info)
+                        << "gr=" << gr
+                        << " st=" << st
+                        << " ev_size=" << ev.size()
+                        << " " << oss.str()
+                        << " move_bits=" << ev_pack.move_params.at("avg_bits")
+                        << " p_model_state_bits=" << ev_pack.p_model_state_params.at("num_bits")
+                        << std::endl;
+                }
+            }
+        }
+    } // pack_ev()
+
+    /// Write the basecall events of strands 0 and 1 of `src_f` to `dst_f` as
+    /// events datasets, recording each affected group in `bc_gr_s`.
+    void
+    unpack_ev(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        for (unsigned strand = 0; strand < 2; ++strand)
+        {
+            auto groups = src_f.get_basecall_strand_group_list(strand);
+            for (auto const & group : groups)
+            {
+                if (not src_f.have_basecall_events(strand, group))
+                {
+                    continue;
+                }
+                bc_gr_s.insert(group);
+                auto events_ds = src_f.get_basecall_events_dataset(strand, group);
+                dst_f.add_basecall_events_dataset(strand, group, events_ds);
+            }
+        }
+    } // unpack_ev()
+
+    /// Transfer the basecall events of strands 0 and 1 from `src_f` to
+    /// `dst_f` in whichever form (unpacked is tested first, then packed) the
+    /// source has, recording each affected group in `bc_gr_s`.
+    void
+    copy_ev(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        for (unsigned strand = 0; strand < 2; ++strand)
+        {
+            auto groups = src_f.get_basecall_strand_group_list(strand);
+            for (auto const & group : groups)
+            {
+                if (src_f.have_basecall_events_unpack(strand, group))
+                {
+                    bc_gr_s.insert(group);
+                    auto events_ds = src_f.get_basecall_events_dataset(strand, group);
+                    dst_f.add_basecall_events_dataset(strand, group, events_ds);
+                    continue;
+                }
+                if (src_f.have_basecall_events_pack(strand, group))
+                {
+                    bc_gr_s.insert(group);
+                    auto events_pack = src_f.get_basecall_events_pack(strand, group);
+                    dst_f.add_basecall_events(strand, group, events_pack);
+                }
+            }
+        }
+    } // copy_ev()
+
+    /// Write the basecall alignments (strand index 2) of `src_f` to `dst_f`
+    /// in packed form, recording each affected group in `bc_gr_s`.
+    ///
+    /// Already-packed alignments are transferred unchanged. Unpacked ones are
+    /// packed only when the group description name is "metrichor"; other
+    /// groups are dropped with a warning. Packing requires the strand-2
+    /// sequence (raises via LOG_THROW when missing). When `check` is set the
+    /// alignment is read back from `dst_f` and compared entry by entry.
+    /// Sizes of newly packed data are added to `cnt`.
+    void
+    pack_al(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s, Counts & cnt) const
+    {
+        auto groups = src_f.get_basecall_strand_group_list(2);
+        for (auto const & gr : groups)
+        {
+            if (src_f.have_basecall_alignment_pack(gr))
+            {
+                bc_gr_s.insert(gr);
+                auto packed = src_f.get_basecall_alignment_pack(gr);
+                dst_f.add_basecall_alignment(gr, packed);
+                continue;
+            }
+            if (not src_f.have_basecall_alignment_unpack(gr))
+            {
+                continue;
+            }
+            // bc group description
+            auto bc_params = src_f.get_basecall_params(gr);
+            auto desc = src_f.get_basecall_group_description(gr);
+            if (desc.name != "metrichor")
+            {
+                LOG(warning)
+                    << "dropping basecall alignment written by "
+                    << desc.name << ":" << desc.version
+                    << ": gr=" << gr << "\n";
+                continue;
+            }
+            bc_gr_s.insert(gr);
+            auto orig = src_f.get_basecall_alignment(gr);
+            // basecall seq
+            if (not src_f.have_basecall_seq(2, gr))
+            {
+                LOG_THROW
+                    << "missing fastq required to pack basecall alignment: gr=" << gr;
+            }
+            auto seq = src_f.get_basecall_seq(2, gr);
+            auto packed = src_f.pack_al(orig, seq);
+            dst_f.add_basecall_alignment(gr, packed);
+            if (check)
+            {
+                // read back what was just written and compare with the source
+                auto back = dst_f.get_basecall_alignment(gr);
+                if (back.size() != orig.size())
+                {
+                    LOG_THROW
+                        << "check failed: gr=" << gr
+                        << " al_unpack.size=" << back.size()
+                        << " al_orig.size=" << orig.size();
+                }
+                for (unsigned i = 0; i < orig.size(); ++i)
+                {
+                    bool const same =
+                        back[i].template_index == orig[i].template_index
+                        and back[i].complement_index == orig[i].complement_index
+                        and back[i].get_kmer() == orig[i].get_kmer();
+                    if (not same)
+                    {
+                        LOG_THROW
+                            << "check failed: gr=" << gr
+                            << " i=" << i
+                            << " al_unpack=(" << back[i].template_index
+                            << "," << back[i].complement_index
+                            << "," << back[i].get_kmer()
+                            << ") al_orig=(" << orig[i].template_index
+                            << "," << orig[i].complement_index
+                            << "," << orig[i].get_kmer()
+                            << ")";
+                    }
+                }
+            }
+            cnt.al_count += orig.size();
+            cnt.al_template_step_bits += packed.template_step.size() * sizeof(packed.template_step[0]) * 8;
+            cnt.al_complement_step_bits += packed.complement_step.size() * sizeof(packed.complement_step[0]) * 8;
+            cnt.al_move_bits += packed.move.size() * sizeof(packed.move[0]) * 8;
+            LOG(info)
+                << "gr=" << gr
+                << " al_size=" << orig.size()
+                << " template_step_bits=" << packed.template_step_params.at("num_bits")
+                << " complement_step_bits=" << packed.complement_step_params.at("num_bits")
+                << " move_bits=" << packed.move_params.at("avg_bits")
+                << std::endl;
+        }
+    } // pack_al()
+
+    /// Write the basecall alignments (strand index 2) of `src_f` to `dst_f`
+    /// in unpacked form, recording each affected group in `bc_gr_s`.
+    void
+    unpack_al(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        auto groups = src_f.get_basecall_strand_group_list(2);
+        for (auto const & group : groups)
+        {
+            if (not src_f.have_basecall_alignment(group))
+            {
+                continue;
+            }
+            bc_gr_s.insert(group);
+            auto alignment = src_f.get_basecall_alignment(group);
+            dst_f.add_basecall_alignment(group, alignment);
+        }
+    } // unpack_al()
+
+    /// Transfer the basecall alignments (strand index 2) from `src_f` to
+    /// `dst_f` in whichever form (unpacked is tested first, then packed) the
+    /// source has, recording each affected group in `bc_gr_s`.
+    void
+    copy_al(File const & src_f, File & dst_f, std::set< std::string > & bc_gr_s) const
+    {
+        auto groups = src_f.get_basecall_strand_group_list(2);
+        for (auto const & group : groups)
+        {
+            if (src_f.have_basecall_alignment_unpack(group))
+            {
+                bc_gr_s.insert(group);
+                auto alignment = src_f.get_basecall_alignment(group);
+                dst_f.add_basecall_alignment(group, alignment);
+                continue;
+            }
+            if (src_f.have_basecall_alignment_pack(group))
+            {
+                bc_gr_s.insert(group);
+                auto alignment_pack = src_f.get_basecall_alignment_pack(group);
+                dst_f.add_basecall_alignment(group, alignment_pack);
+            }
+        }
+    } // copy_al()
+
+    /// Copy the basecall params of every group named in `bc_gr_s` from
+    /// `src_f` to `dst_f`.
+    void
+    copy_basecall_params(File const & src_f, File & dst_f, std::set< std::string > const & bc_gr_s) const
+    {
+        for (auto it = bc_gr_s.begin(); it != bc_gr_s.end(); ++it)
+        {
+            auto const & group = *it;
+            auto group_params = src_f.get_basecall_params(group);
+            dst_f.add_basecall_params(group, group_params);
+        }
+    } // copy_basecall_params()
+
+    // Forward to File::Base::copy_attributes() to copy the attributes found
+    // at path `p` from `src_f` to `dst_f`; `recurse` is passed through
+    // unchanged (presumably it enables descending into sub-groups -- TODO
+    // confirm against hdf5_tools).
+    void
+    copy_attributes(File const & src_f, File const & dst_f, std::string const & p, bool recurse = false) const
+    {
+        File::Base::copy_attributes(src_f, dst_f, p, recurse);
+    } // copy_attributes()
+}; // class File_Packer
+
+} // namespace fast5
+
+#endif
diff --git a/src/Huffman_Packer.hpp b/src/Huffman_Packer.hpp
new file mode 100644
index 0000000..f17ac4a
--- /dev/null
+++ b/src/Huffman_Packer.hpp
@@ -0,0 +1,357 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#ifndef __HUFFMAN_PACKER_HPP
+#define __HUFFMAN_PACKER_HPP
+
+#include <bitset>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "logger.hpp"
+
+namespace fast5
+{
+
+//
+// Huffman coder for integer sequences, driven by a named codeword map.
+//
+// Stream layout produced by encode() and consumed by decode():
+//   - a run starts with one value stored verbatim, little-endian, in
+//     sizeof(Int_Type) bytes;
+//   - each following value is written as a codeword, least-significant bit
+//     first; the coded quantity is the value itself or, when "code_diff" is
+//     "1", its difference from the previous value;
+//   - a quantity without a codeword ends the run: the break codeword is
+//     written, the stream is zero-padded to a byte boundary, and the value
+//     starts a new run;
+//   - a non-empty input always ends with a break codeword plus padding.
+//
+// Errors are reported through LOG_THROW; lookups of missing parameters or of
+// a missing break codeword throw std::out_of_range from std::map::at().
+//
+class Huffman_Packer
+{
+public:
+    typedef std::vector< std::uint8_t > Code_Type;
+    typedef std::map< std::string, std::string > Code_Params_Type;
+
+    // movable, not copyable
+    Huffman_Packer() = default;
+    Huffman_Packer(Huffman_Packer const &) = delete;
+    Huffman_Packer(Huffman_Packer &&) = default;
+    Huffman_Packer & operator = (Huffman_Packer const &) = delete;
+    Huffman_Packer & operator = (Huffman_Packer &&) = default;
+    // construct from a stream of whitespace-separated (value, codeword) tokens
+    Huffman_Packer(std::istream & is, std::string const & cwm_name)
+    {
+        load_codeword_map(is, cwm_name);
+    }
+    // construct from a flat list: value, codeword, value, codeword, ...
+    Huffman_Packer(std::vector< std::string > const & v, std::string const & cwm_name)
+    {
+        load_codeword_map(v.begin(), v.end(), cwm_name);
+    }
+    // construct from an iterator range over a flat (value, codeword) list
+    template < typename Iterator >
+    Huffman_Packer(Iterator it_begin, Iterator it_end, std::string const & cwm_name)
+    {
+        load_codeword_map(it_begin, it_end, cwm_name);
+    }
+
+    //
+    // Load (value, codeword) pairs from a stream until extraction fails.
+    // The value token "." denotes the break codeword.
+    //
+    void load_codeword_map(std::istream & is, std::string const & cwm_name)
+    {
+        _cwm_name = cwm_name;
+        std::string v_s;
+        std::string cw_s;
+        while (is >> v_s >> cw_s)
+        {
+            add_codeword(v_s, cw_s);
+        }
+    }
+    //
+    // Load (value, codeword) pairs from a flat range of strings.
+    // A trailing unpaired element is ignored. Only forward iteration is
+    // required of Iterator.
+    //
+    template < typename Iterator >
+    void load_codeword_map(Iterator it_begin, Iterator it_end, std::string const & cwm_name)
+    {
+        _cwm_name = cwm_name;
+        auto it = it_begin;
+        while (it != it_end)
+        {
+            auto it_cw = std::next(it);
+            if (it_cw == it_end) break;
+            add_codeword(*it, *it_cw);
+            it = std::next(it_cw);
+        }
+    }
+
+    //
+    // Encode a sequence of integers.
+    //
+    // v:            values to encode
+    // encode_diff:  if true, code the difference from the previous value
+    //               instead of the value itself
+    //
+    // Returns the byte stream together with its parameters: the coder id
+    // ("packer", "format_version", "codeword_map_name"), "code_diff", "size"
+    // (number of input values) and "avg_bits" (output bits per input value,
+    // two decimals; "0.00" for empty input).
+    //
+    template < typename Int_Type >
+    std::pair< Code_Type, Code_Params_Type >
+    encode(std::vector< Int_Type > const & v, bool encode_diff = false) const
+    {
+        Code_Type res;
+        Code_Params_Type res_params = id();
+        res_params["code_diff"] = encode_diff? "1" : "0";
+        std::ostringstream oss;
+        oss << v.size();
+        res_params["size"] = oss.str();
+        std::uint64_t buff = 0;     // pending output bits, LSB first
+        std::uint8_t buff_len = 0;  // number of valid bits in buff
+        bool reset = true;          // next value must be stored verbatim
+        Int_Type last = 0;
+        std::size_t i = 0;
+        long long int val = 0;
+        long long int x = 0;
+        while (true)
+        {
+            assert(buff_len <= 64);
+            // flush complete bytes
+            while (buff_len >= 8)
+            {
+                res.push_back(buff & 0xFF);
+                buff >>= 8;
+                buff_len -= 8;
+            }
+            assert(buff_len < 8);
+            if (reset)
+            {
+                // runs start byte-aligned: the break codeword is always padded
+                assert(buff_len == 0);
+                if (i == v.size()) break;
+                // store the value verbatim, little-endian
+                for (unsigned j = 0; j < sizeof(Int_Type); ++j)
+                {
+                    std::uint8_t y = (v[i] >> (8 * j)) & 0xFF;
+                    res.push_back(y);
+                }
+                reset = false;
+                last = v[i];
+                ++i;
+            }
+            else // not reset
+            {
+                if (i < v.size())
+                {
+                    val = v[i];
+                    x = encode_diff? val - last : val;
+                    // no codeword for x: close the run and restart from v[i]
+                    reset = _cwm.count(x) == 0;
+                }
+                else
+                {
+                    // end of input: close the last run
+                    reset = true;
+                }
+                auto const & p = (not reset? _cwm.at(x) : _cwm.at(break_cw()));
+                buff |= (p.first << buff_len);
+                buff_len += p.second;
+                if (not reset)
+                {
+                    last = v[i];
+                    ++i;
+                }
+                else if ((buff_len % 8) > 0) // and reset
+                {
+                    // pad with zero bits up to the next byte boundary
+                    buff_len += 8 - (buff_len % 8);
+                }
+            }
+        }
+        // guard against 0/0 so empty input reports "0.00" rather than NaN
+        double const avg_bits = v.empty()
+            ? 0.0
+            : static_cast< double >(res.size() * 8) / v.size();
+        oss.str("");
+        oss << std::fixed << std::setprecision(2) << avg_bits;
+        res_params["avg_bits"] = oss.str();
+        return std::make_pair(std::move(res), std::move(res_params));
+    }
+
+    //
+    // Decode a byte stream produced by encode().
+    //
+    // v:         encoded bytes
+    // v_params:  parameters returned by encode(); the coder id must match
+    //            this coder, and "code_diff" selects difference decoding
+    //
+    // Throws (via LOG_THROW) on id mismatch, on truncated input, on a bit
+    // pattern matching no codeword, and on a decoded value that does not
+    // fit Int_Type.
+    //
+    template < typename Int_Type >
+    std::vector< Int_Type >
+    decode(Code_Type const & v, Code_Params_Type const & v_params) const
+    {
+        check_params(v_params);
+        bool decode_diff = v_params.at("code_diff") == "1";
+        std::vector< Int_Type > res;
+        std::uint64_t buff = 0;     // pending input bits, LSB first
+        std::uint8_t buff_len = 0;  // number of valid bits in buff
+        bool reset = true;          // next value is stored verbatim
+        Int_Type last = 0;
+        std::size_t i = 0;
+        while (i < v.size() or buff_len > 0)
+        {
+            assert(buff_len <= 64);
+            // fill buffer
+            while (i < v.size() and buff_len <= 56)
+            {
+                std::uint64_t y = v[i];
+                buff |= (y << buff_len);
+                buff_len += 8;
+                ++i;
+            }
+            assert(buff_len <= 64);
+            if (reset)
+            {
+                // byte alignment is an internal invariant; available length
+                // depends on the input, so it is checked at runtime
+                assert((buff_len % 8) == 0);
+                if (buff_len / 8 < sizeof(Int_Type))
+                {
+                    LOG_THROW
+                        << "truncated input: incomplete absolute value";
+                }
+                Int_Type x = 0;
+                for (unsigned j = 0; j < sizeof(Int_Type); ++j)
+                {
+                    std::uint64_t y = (buff & 0xFF);
+                    x |= (y << (8 * j));
+                    buff >>= 8;
+                    buff_len -= 8;
+                }
+                res.push_back(x);
+                last = x;
+                reset = false;
+            }
+            else // not reset
+            {
+                // TODO: faster decoding
+                // currently, try all codewords one by one
+                auto it = _cwm.begin();
+                while (it != _cwm.end())
+                {
+                    if ((buff & ((1llu << it->second.second) - 1)) == it->second.first)
+                    {
+                        break;
+                    }
+                    ++it;
+                }
+                if (it == _cwm.end())
+                {
+                    LOG_THROW
+                        << "codeword not found: buff=" << std::bitset<64>(buff);
+                }
+                long long int x = it->first;
+                auto const & p = it->second;
+                // the match may rest on bits past the end of the input
+                if (buff_len < p.second)
+                {
+                    LOG_THROW
+                        << "truncated input: incomplete codeword";
+                }
+                buff >>= p.second;
+                buff_len -= p.second;
+                if (x != break_cw())
+                {
+                    if (decode_diff) x += last;
+                    if (sizeof(Int_Type) < 8
+                        and (x < (long long)std::numeric_limits< Int_Type >::min()
+                             or x > (long long)std::numeric_limits< Int_Type >::max()))
+                    {
+                        LOG_THROW
+                            << "overflow";
+                    }
+                    res.push_back(x);
+                    last = x;
+                }
+                else
+                {
+                    reset = true;
+                    // drop the padding that follows a break codeword
+                    if ((buff_len % 8) > 0)
+                    {
+                        buff >>= (buff_len % 8);
+                        buff_len -= (buff_len % 8);
+                    }
+                }
+            }
+        }
+        return res;
+    }
+
+    //
+    // static coder access
+    //
+    // Returns the built-in coder registered under cwm_name; throws (via
+    // LOG_THROW) if there is none.
+    //
+    static Huffman_Packer const &
+    get_coder(std::string const & cwm_name)
+    {
+        static_init();
+        if (cwm_m().count(cwm_name) == 0)
+        {
+            LOG_THROW
+                << "missing codeword map: " + cwm_name;
+        }
+        return cwm_m().at(cwm_name);
+    }
+
+private:
+    // value -> (codeword bits, codeword length); bit 0 is the first bit emitted
+    std::map< long long int, std::pair< std::uint64_t, std::uint8_t > > _cwm;
+    std::string _cwm_name;
+    // map key reserved for the break codeword (written as "." in codeword maps)
+    static long long int break_cw()
+    {
+        static long long int const _break_cw = std::numeric_limits< long long int >::min();
+        return _break_cw;
+    }
+    // parameters identifying this coder and stream format
+    Code_Params_Type id() const
+    {
+        Code_Params_Type res;
+        res["packer"] = "huffman_packer";
+        res["format_version"] = "2";
+        res["codeword_map_name"] = _cwm_name;
+        return res;
+    }
+    // throw unless params carry the same coder id as id()
+    void check_params(Code_Params_Type const & params) const
+    {
+        auto _id = id();
+        if (params.at("packer") != _id.at("packer")
+            or params.at("format_version") != _id.at("format_version")
+            or params.at("codeword_map_name") != _id.at("codeword_map_name"))
+        {
+            LOG_THROW
+                << "decode id mismatch";
+        }
+    }
+    //
+    // Register one codeword.
+    //
+    // v_s:   decimal value, or "." for the break codeword
+    // cw_s:  codeword as a string of bits in emission order; a character
+    //        other than '1' counts as a 0 bit
+    //
+    // Throws (via LOG_THROW) if v_s is not a number, or if cw_s is empty or
+    // longer than 57 bits.
+    //
+    void add_codeword(std::string const & v_s, std::string const & cw_s)
+    {
+        long long int v = 0;
+        if (v_s != ".")
+        {
+            std::istringstream iss(v_s);
+            if (not (iss >> v))
+            {
+                LOG_THROW
+                    << "invalid codeword value: " + v_s + " " + cw_s;
+            }
+        }
+        else
+        {
+            v = break_cw();
+        }
+        // a zero-length codeword would match every input in decode()
+        // without consuming any bits
+        if (cw_s.empty())
+        {
+            LOG_THROW
+                << "empty codeword: " + v_s;
+        }
+        // encode() shifts a codeword left by up to 7 pending bits inside a
+        // 64-bit buffer, so 57 bits is the longest length that always fits
+        if (cw_s.size() > 57)
+        {
+            LOG_THROW
+                << "codeword too long: " + v_s + " " + cw_s;
+        }
+        std::uint64_t cw = 0;
+        std::uint8_t cw_l = cw_s.size();
+        // walk the string backwards so its first character lands in bit 0
+        for (auto it = cw_s.rbegin(); it != cw_s.rend(); ++it)
+        {
+            cw <<= 1;
+            cw |= (*it == '1');
+        }
+        _cwm[v] = std::make_pair(cw, cw_l);
+    }
+
+    // registry of built-in coders, keyed by codeword map name
+    static std::map< std::string, Huffman_Packer > & cwm_m()
+    {
+        static std::map< std::string, Huffman_Packer > _cwm_m;
+        return _cwm_m;
+    }
+    //
+    // Populate the registry on first use.
+    // The initializer of a function-local static runs exactly once, even
+    // with concurrent callers; if it throws, the next call tries again.
+    //
+    static void static_init()
+    {
+        static bool const inited = load_builtin_codeword_maps();
+        (void)inited;
+    } // static_init()
+    //
+    // Build one coder per built-in codeword map.
+    // Each .inl file is a braced list: the map name, then (value, codeword) pairs.
+    //
+    static bool load_builtin_codeword_maps()
+    {
+        std::deque< std::deque< std::string > > dd;
+        dd.push_back(
+#include "cwmap.fast5_rw_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_ed_skip_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_ed_len_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_fq_bp_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_fq_qv_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_ev_rel_skip_1.inl"
+            );
+        dd.push_back(
+#include "cwmap.fast5_ev_move_1.inl"
+            );
+        cwm_m().clear();
+        for (auto & d : dd)
+        {
+            auto cwm_name = d.front();
+            Huffman_Packer hc(d.begin() + 1, d.end(), cwm_name);
+            cwm_m()[cwm_name] = std::move(hc);
+        }
+        return true;
+    } // load_builtin_codeword_maps()
+}; // class Huffman_Packer
+
+} // namespace fast5
+
+#endif
diff --git a/src/Makefile b/src/Makefile
index 03b93d6..0d64a1e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,19 +1,35 @@
+#
+# Part of: https://github.com/mateidavid/fast5
+#
+# (c) 2017: Matei David, Ontario Institute for Cancer Research
+# MIT License
+#
+
.SUFFIXES:
MAKEFLAGS += -r
SHELL := /bin/bash
.DELETE_ON_ERROR:
.PHONY: all help list clean check_hdf5
-HDF5_DIR = /usr/local
-HDF5_INCLUDE_DIR = ${HDF5_DIR}/include
-HDF5_LIB_DIR = ${HDF5_DIR}/lib
-HDF5_LIB = hdf5
-TCLAP_DIR = tclap
-HPPTOOLS_DIR = hpptools
+HDF5_DIR ?= /usr/local
+HDF5_INCLUDE_DIR ?= ${HDF5_DIR}/include
+HDF5_LIB_DIR ?= ${HDF5_DIR}/lib
+HDF5_LIB ?= hdf5
+TCLAP_DIR ?= tclap
+HPPTOOLS_DIR ?= hpptools
TARGETS = f5ls f5ls-full hdf5-mod f5-mod
+EXTRA_TARGETS = f5dump f5pack
+HPP_FILES := fast5.hpp hdf5_tools.hpp Huffman_Packer.hpp Bit_Packer.hpp
+
+CXXFLAGS := -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic
+CPPFLAGS := -isystem ${HDF5_INCLUDE_DIR}
+EXTRA_CPPFLAGS := -isystem ${TCLAP_DIR}/include -I ${HPPTOOLS_DIR}/include
+LDFLAGS := -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+
+default: ${TARGETS}
-all: ${TARGETS}
+all: default ${EXTRA_TARGETS}
print-%:
@echo '$*=$($*)'
@@ -25,7 +41,7 @@ list:
@echo "TARGETS=${TARGETS}"
clean:
- rm -rf ${TARGETS}
+ rm -rf ${TARGETS} ${EXTRA_TARGETS}
check_hdf5:
@[ -f "${HDF5_INCLUDE_DIR}/H5pubconf.h" ] || { echo "HDF5 headers not found" >&2; exit 1; }
@@ -37,9 +53,11 @@ check_tclap:
check_hpptools:
@[ -f "${HPPTOOLS_DIR}/include/alg.hpp" ] || { echo "HPPTOOLS not found; get it from https://github.com/mateidavid/hpptools.git" >&2; exit 1; }
-%: %.cpp fast5.hpp hdf5_tools.hpp | check_hdf5
- ${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+%: %.cpp ${HPP_FILES} | check_hdf5
+ ${CXX} ${CXXFLAGS} ${CPPFLAGS} -o $@ $< ${LDFLAGS}
-f5dump: f5dump.cpp fast5.hpp hdf5_tools.hpp | check_hdf5 check_tclap check_hpptools
- ${CXX} -std=c++11 -O0 -g3 -ggdb -fno-eliminate-unused-debug-types -Wall -Wextra -Wpedantic -isystem ${HDF5_INCLUDE_DIR} -isystem ${TCLAP_DIR}/include -I ${HPPTOOLS_DIR}/include -o $@ $< -L${HDF5_LIB_DIR} -Wl,--rpath=${HDF5_LIB_DIR} -l${HDF5_LIB} -lpthread -lz -ldl
+f5dump: f5dump.cpp ${HPP_FILES} | check_hdf5 check_tclap check_hpptools
+ ${CXX} ${CXXFLAGS} ${CPPFLAGS} ${EXTRA_CPPFLAGS} -o $@ $< ${LDFLAGS}
+f5pack: f5pack.cpp ${HPP_FILES} File_Packer.hpp | check_hdf5 check_tclap check_hpptools
+ ${CXX} ${CXXFLAGS} ${CPPFLAGS} ${EXTRA_CPPFLAGS} -o $@ $< ${LDFLAGS}
diff --git a/src/cwmap.fast5_ed_len_1.inl b/src/cwmap.fast5_ed_len_1.inl
new file mode 100644
index 0000000..98952ef
--- /dev/null
+++ b/src/cwmap.fast5_ed_len_1.inl
@@ -0,0 +1,103 @@
+{ "fast5_ed_len_1",
+"8", "001",
+"7", "010",
+"6", "100",
+"5", "111",
+"12", "0001",
+"11", "1010",
+"10", "1100",
+"9", "1101",
+"16", "00001",
+"4", "01101",
+"15", "01110",
+"14", "10110",
+"13", "10111",
+"19", "000001",
+"18", "011001",
+"17", "011111",
+"22", "0110000",
+"21", "0111100",
+"20", "0111101",
+"25", "00000010",
+"24", "01100010",
+"23", "01100011",
+"28", "000000001",
+"27", "000000110",
+"26", "000000111",
+"31", "0000000000",
+"30", "0000000100",
+"29", "0000000111",
+"32", "00000001010",
+"34", "000000000100",
+"35", "000000000110",
+"33", "000000011011",
+"41", "0000000001011",
+"40", "0000000001110",
+"36", "0000000101100",
+"37", "0000000110100",
+"38", "00000000010101",
+"42", "00000000011110",
+"39", "00000001101011",
+"99", "000000000101000",
+"100", "000000000101001",
+"43", "000000000111110",
+"44", "000000000111111",
+"46", "000000010110100",
+"53", "000000010110101",
+".", "0000000101101100",
+"1", "0000000101101101",
+"2", "0000000101101110",
+"3", "0000000101101111",
+"45", "0000000101110000",
+"47", "0000000101110001",
+"48", "0000000101110010",
+"49", "0000000101110011",
+"50", "0000000101110100",
+"51", "0000000101110101",
+"52", "0000000101110110",
+"54", "0000000101110111",
+"55", "0000000101111000",
+"56", "0000000101111001",
+"57", "0000000101111010",
+"58", "0000000101111011",
+"59", "0000000101111100",
+"60", "0000000101111101",
+"61", "0000000101111110",
+"62", "0000000101111111",
+"63", "0000000110000000",
+"64", "0000000110000001",
+"65", "0000000110000010",
+"66", "0000000110000011",
+"67", "0000000110000100",
+"68", "0000000110000101",
+"69", "0000000110000110",
+"70", "0000000110000111",
+"71", "0000000110001000",
+"72", "0000000110001001",
+"73", "0000000110001010",
+"74", "0000000110001011",
+"75", "0000000110001100",
+"76", "0000000110001101",
+"77", "0000000110001110",
+"78", "0000000110001111",
+"79", "0000000110010000",
+"80", "0000000110010001",
+"81", "0000000110010010",
+"82", "0000000110010011",
+"83", "0000000110010100",
+"84", "0000000110010101",
+"85", "0000000110010110",
+"86", "0000000110010111",
+"87", "0000000110011000",
+"88", "0000000110011001",
+"89", "0000000110011010",
+"90", "0000000110011011",
+"91", "0000000110011100",
+"92", "0000000110011101",
+"93", "0000000110011110",
+"94", "0000000110011111",
+"95", "0000000110101000",
+"96", "0000000110101001",
+"97", "0000000110101010",
+"98", "0000000110101011",
+}
diff --git a/src/cwmap.fast5_ed_skip_1.inl b/src/cwmap.fast5_ed_skip_1.inl
new file mode 100644
index 0000000..849f803
--- /dev/null
+++ b/src/cwmap.fast5_ed_skip_1.inl
@@ -0,0 +1,4 @@
+{ "fast5_ed_skip_1",
+"0", "0",
+".", "1",
+}
diff --git a/src/cwmap.fast5_ev_move_1.inl b/src/cwmap.fast5_ev_move_1.inl
new file mode 100644
index 0000000..1a695fc
--- /dev/null
+++ b/src/cwmap.fast5_ev_move_1.inl
@@ -0,0 +1,6 @@
+{ "fast5_ev_move_1",
+"0", "0",
+"1", "10",
+"2", "110",
+".", "111",
+}
diff --git a/src/cwmap.fast5_ev_rel_skip_1.inl b/src/cwmap.fast5_ev_rel_skip_1.inl
new file mode 100644
index 0000000..d315aca
--- /dev/null
+++ b/src/cwmap.fast5_ev_rel_skip_1.inl
@@ -0,0 +1,4 @@
+{ "fast5_ev_rel_skip_1",
+"0", "0",
+".", "1",
+}
diff --git a/src/cwmap.fast5_fq_bp_1.inl b/src/cwmap.fast5_fq_bp_1.inl
new file mode 100644
index 0000000..a42e4e2
--- /dev/null
+++ b/src/cwmap.fast5_fq_bp_1.inl
@@ -0,0 +1,7 @@
+{ "fast5_fq_bp_1",
+"65", "00",
+"67", "01",
+"71", "10",
+"84", "110",
+".", "111",
+}
diff --git a/src/cwmap.fast5_fq_qv_1.inl b/src/cwmap.fast5_fq_qv_1.inl
new file mode 100644
index 0000000..986f778
--- /dev/null
+++ b/src/cwmap.fast5_fq_qv_1.inl
@@ -0,0 +1,35 @@
+{ "fast5_fq_qv_1",
+"10", "10",
+"16", "00000",
+"17", "00001",
+"18", "00010",
+"19", "00011",
+"20", "00100",
+"21", "00101",
+"22", "00110",
+"23", "00111",
+"24", "01000",
+"25", "01001",
+"26", "01010",
+"27", "01011",
+"28", "01100",
+"29", "01101",
+"30", "01110",
+"31", "01111",
+".", "110000",
+"0", "110001",
+"1", "110010",
+"2", "110011",
+"3", "110100",
+"4", "110101",
+"5", "110110",
+"6", "110111",
+"7", "111000",
+"8", "111001",
+"9", "111010",
+"11", "111011",
+"12", "111100",
+"13", "111101",
+"14", "111110",
+"15", "111111",
+}
diff --git a/src/cwmap.fast5_rw_1.inl b/src/cwmap.fast5_rw_1.inl
new file mode 100644
index 0000000..23ad6e5
--- /dev/null
+++ b/src/cwmap.fast5_rw_1.inl
@@ -0,0 +1,204 @@
+{ "fast5_rw_1",
+"8", "00001",
+"-7", "00011",
+"7", "00100",
+"-6", "00110",
+"6", "01000",
+"-5", "01001",
+"5", "01010",
+"-4", "01100",
+"4", "01101",
+"-3", "01110",
+"3", "01111",
+"-2", "10001",
+"2", "10010",
+"-1", "10011",
+"0", "10100",
+"1", "10101",
+"-16", "000001",
+"16", "000100",
+"-15", "001011",
+"15", "001110",
+"-14", "010111",
+"14", "100000",
+"-13", "101101",
+"13", "101111",
+"-12", "110010",
+"12", "110011",
+"-11", "110101",
+"11", "110111",
+"-10", "111001",
+"10", "111010",
+"-9", "111100",
+"9", "111110",
+"-8", "111111",
+"22", "0001010",
+"-22", "0001011",
+"21", "0011111",
+"-21", "0101100",
+"-20", "1000011",
+"20", "1011000",
+".", "1011101",
+"19", "1100001",
+"-19", "1100010",
+"-18", "1101001",
+"18", "1101101",
+"-17", "1110110",
+"17", "1111010",
+"-29", "00000001",
+"28", "00000010",
+"-28", "00101010",
+"27", "00111101",
+"-27", "01011010",
+"26", "10110010",
+"-26", "10110011",
+"25", "11000001",
+"-25", "11000111",
+"24", "11011001",
+"-24", "11100001",
+"-23", "11101111",
+"23", "11110110",
+"36", "000000000",
+"35", "000000111",
+"-36", "001010001",
+"34", "001010011",
+"-35", "001111000",
+"-34", "010110110",
+"33", "100001000",
+"-33", "101110010",
+"32", "101110011",
+"-32", "110100000",
+"31", "110100010",
+"-31", "110110000",
+"30", "111000100",
+"-30", "111000111",
+"29", "111011101",
+"-49", "0000000011",
+"45", "0010100000",
+"-48", "0010100001",
+"-46", "0010101100",
+"44", "0010101110",
+"43", "0010101111",
+"-47", "0011110010",
+"42", "1000010010",
+"-45", "1000010011",
+"-44", "1000010111",
+"-43", "1011100001",
+"41", "1011100011",
+"-42", "1100000010",
+"-41", "1100011000",
+"40", "1100011010",
+"39", "1101000011",
+"-40", "1101100010",
+"38", "1110000001",
+"-39", "1110001010",
+"37", "1110001100",
+"-38", "1110111001",
+"-37", "1111011111",
+"-70", "00000000100",
+"61", "00000011000",
+"-65", "00000011011",
+"60", "00101001001",
+"-67", "00101001011",
+"58", "00101011010",
+"-66", "00101011011",
+"-64", "00111100110",
+"59", "00111100111",
+"-62", "01011011100",
+"57", "01011011101",
+"56", "01011011110",
+"-63", "01011011111",
+"-60", "10000101010",
+"-61", "10111000000",
+"55", "10111000100",
+"54", "10111000101",
+"52", "11000000000",
+"53", "11000000001",
+"-57", "11000000110",
+"-58", "11000000111",
+"-59", "11000110011",
+"51", "11000110110",
+"-56", "11010000101",
+"-55", "11010001110",
+"49", "11011000110",
+"-54", "11011000111",
+"50", "11100000001",
+"-52", "11100000101",
+"-51", "11100010111",
+"-53", "11101110000",
+"48", "11101110001",
+"46", "11110111001",
+"47", "11110111011",
+"-50", "11110111100",
+"-90", "000000001010",
+"-88", "000000001011",
+"84", "000000110010",
+"-89", "000000110011",
+"82", "000000110100",
+"81", "000000110101",
+"83", "001010010000",
+"-87", "001010010001",
+"80", "001010010100",
+"79", "001010010101",
+"-84", "100001010000",
+"76", "100001010001",
+"-85", "100001010010",
+"75", "100001010011",
+"74", "100001010110",
+"-86", "100001011000",
+"-82", "100001011001",
+"77", "100001011010",
+"-83", "100001011011",
+"78", "101110000010",
+"-81", "101110000011",
+"-80", "110000000100",
+"73", "110000000110",
+"-78", "110000000111",
+"72", "110001100100",
+"69", "110001101110",
+"-77", "110001101111",
+"-79", "110100001000",
+"71", "110100011000",
+"68", "110100011001",
+"-76", "110100011010",
+"-72", "110100011111",
+"70", "111000000000",
+"-74", "111000001001",
+"66", "111000001100",
+"-75", "111000001110",
+"-73", "111000001111",
+"67", "111000101101",
+"65", "111000110100",
+"63", "111000110110",
+"-71", "111101110000",
+"62", "111101110001",
+"-69", "111101110100",
+"-68", "111101110101",
+"64", "111101111010",
+"-98", "1000010101110",
+"97", "1000010101111",
+"95", "1100000001010",
+"-100", "1100000001011",
+"100", "1100011001010",
+"99", "1100011001011",
+"98", "1101000010010",
+"-99", "1101000010011",
+"-95", "1101000110110",
+"-94", "1101000110111",
+"94", "1101000111100",
+"92", "1101000111101",
+"-97", "1110000000010",
+"90", "1110000000011",
+"96", "1110000010000",
+"86", "1110000010001",
+"93", "1110000011010",
+"-96", "1110000011011",
+"91", "1110001011000",
+"-92", "1110001011001",
+"87", "1110001101010",
+"-93", "1110001101011",
+"88", "1110001101110",
+"89", "1110001101111",
+"-91", "1111011110110",
+"85", "1111011110111",
+}
diff --git a/src/f5-mod.cpp b/src/f5-mod.cpp
index 278ba57..5d510ae 100644
--- a/src/f5-mod.cpp
+++ b/src/f5-mod.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <iostream>
#include <string>
@@ -62,18 +69,17 @@ int main(int argc, char* argv[])
//
// add basecall events
//
- vector< fast5::Event_Entry > ev(3, {55.0, 1.0, 0.05, 0.01, .5, .5, .7, .1, .1, .1, 0,
- array< char, 8 >{"ACGTA"}, array< char, 8 >{"CGTAC"}});
+ vector< fast5::Basecall_Event > ev(3, {55.0, 1.0, 0.05, 0.01, .5, 0, array< char, 8 >{"ACGTA"}});
f.add_basecall_events(0, test_bc_grp_prefix + test_bc_grp_suffix, ev);
//
// add basecall pore model
//
- vector< fast5::Model_Entry > mod(3, {0, 56.0, 1.0, 42.0, 1.0, 5.0, array< char, 8 >{"ACGTA"}});
+ vector< fast5::Basecall_Model_State > mod(3, {56.0, 1.0, 42.0, 1.0, array< char, 8 >{"ACGTA"}});
f.add_basecall_model(0, test_bc_grp_prefix + test_bc_grp_suffix, mod);
//
// add basecall pore model params
//
- fast5::Model_Parameters params{1.0, 0.0, 0.0, 1.0, .9, .9};
+ fast5::Basecall_Model_Params params{1.0, 0.0, 0.0, 1.0, .9, .9};
f.add_basecall_model_params(0, test_bc_grp_prefix + test_bc_grp_suffix, params);
//
// add basecall model file
diff --git a/src/f5dump.cpp b/src/f5dump.cpp
index 1dd8984..3d18351 100644
--- a/src/f5dump.cpp
+++ b/src/f5dump.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <iostream>
#include <iomanip>
@@ -10,7 +17,6 @@
using namespace std;
-
namespace opts
{
using namespace TCLAP;
@@ -26,8 +32,9 @@ namespace opts
ValueArg< unsigned > st("", "st", "Strand.", false, 0, "0|1|2", cmd_parser);
ValueArg< string > gr("", "gr", "Group name suffix.", false, "", "000|RNN_001|...", cmd_parser);
//
- SwitchArg fq("", "fq", "Dump basecall fastq data.", cmd_parser);
+ SwitchArg al("", "al", "Dump basecall 2d alignment data.", cmd_parser);
SwitchArg ev("", "ev", "Dump basecall event data.", cmd_parser);
+ SwitchArg fq("", "fq", "Dump basecall fastq data.", cmd_parser);
SwitchArg ed("", "ed", "Dump event detection data.", cmd_parser);
SwitchArg rw("", "rw", "Dump raw samples data.", cmd_parser);
SwitchArg id("", "id", "Dump channel/tracking id data.", cmd_parser);
@@ -44,11 +51,6 @@ void print_map(ostream& os, const map< U, V >& m, const string& prefix)
}
}
-unsigned time_int(double f, fast5::Channel_Id_Parameters const & channel_id_params)
-{
- return f * channel_id_params.sampling_rate;
-}
-
void real_main()
{
fast5::File f;
@@ -56,7 +58,7 @@ void real_main()
{
// open file
f.open(opts::input_fn);
- auto channel_id_params = f.get_channel_id_params();
+ auto cid_params = f.get_channel_id_params();
//
// list
//
@@ -90,7 +92,7 @@ void real_main()
int have_events = (st == 2
? f.have_basecall_events(0, gr) and f.have_basecall_events(1, gr)
: f.have_basecall_events(st, gr));
- string link = (st == 2? f.get_basecall_group_1d(gr) : f.get_basecall_eventdetection_group(gr));
+ string link = (st == 2? f.get_basecall_1d_group(gr) : f.get_basecall_eventdetection_group(gr));
cout
<< (st == 2? "bc2d" : "bc1d") << "\t"
<< gr << "\t"
@@ -109,11 +111,11 @@ void real_main()
if (opts::id)
{
cout
- << "channel_id/channel_number=" << channel_id_params.channel_number << endl
- << "channel_id/digitisation=" << channel_id_params.digitisation << endl
- << "channel_id/offset=" << channel_id_params.offset << endl
- << "channel_id/range=" << channel_id_params.range << endl
- << "channel_id/sampling_rate=" << channel_id_params.sampling_rate << endl
+ << "channel_id/channel_number=" << cid_params.channel_number << endl
+ << "channel_id/digitisation=" << cid_params.digitisation << endl
+ << "channel_id/offset=" << cid_params.offset << endl
+ << "channel_id/range=" << cid_params.range << endl
+ << "channel_id/sampling_rate=" << cid_params.sampling_rate << endl
;
if (f.have_tracking_id_params())
{
@@ -153,7 +155,7 @@ void real_main()
}
else
{
- auto rs_int = f.get_raw_samples_int(opts::rn);
+ auto rs_int = f.get_raw_int_samples(opts::rn);
if (opts::rw_time)
{
cout << "start\t";
@@ -174,7 +176,7 @@ void real_main()
//
if (opts::ed and f.have_eventdetection_events(opts::gr, opts::rn))
{
- auto ede_params = f.get_eventdetection_event_params(opts::gr, opts::rn);
+ auto ede_params = f.get_eventdetection_events_params(opts::gr, opts::rn);
cout
<< "#read_id=" << ede_params.read_id << endl
<< "#read_number=" << ede_params.read_number << endl
@@ -186,7 +188,7 @@ void real_main()
auto ede = f.get_eventdetection_events(opts::gr, opts::rn);
cout
<< "start\tlength\tmean\tstdv" << endl
- << alg::os_join(ede, "\n", [] (fast5::EventDetection_Event_Entry const & e) {
+ << alg::os_join(ede, "\n", [] (fast5::EventDetection_Event const & e) {
ostringstream oss;
oss.precision(opts::float_prec);
oss << e.start << "\t" << e.length << "\t" << e.mean << "\t" << e.stdv;
@@ -195,13 +197,36 @@ void real_main()
<< endl;
} // if opts::ed
//
+ // basecall fastq
+ //
+ if (opts::fq and f.have_basecall_fastq(opts::st, opts::gr))
+ {
+ auto fq = f.get_basecall_fastq(opts::st, opts::gr);
+ cout << fq;
+ if (fq.size() > 0 and fq[fq.size() - 1] != '\n') cout << endl;
+ } // if opts::fq
+ //
// basecall events
//
if (opts::ev and f.have_basecall_events(opts::st, opts::gr))
{
+ auto bce_params = f.get_basecall_events_params(opts::st, opts::gr);
+ if (not opts::time_int)
+ {
+ cout
+ << "#start_time=" << bce_params.start_time << endl
+ << "#duration=" << bce_params.duration << endl;
+ }
+ else
+ {
+ cout
+ << "#start_time=" << f.time_to_int(bce_params.start_time, cid_params) << endl
+ << "#duration=" << f.time_to_int(bce_params.duration, cid_params) << endl;
+ }
auto bce = f.get_basecall_events(opts::st, opts::gr);
cout
- << alg::os_join(bce, "\n", [&channel_id_params] (fast5::Event_Entry const & e) {
+ << "start\tlength\tmean\tstdv\tstate\tmove\tp_model_state" << endl
+ << alg::os_join(bce, "\n", [&] (fast5::Basecall_Event const & e) {
ostringstream oss;
oss.precision(opts::float_prec);
if (not opts::time_int)
@@ -211,26 +236,33 @@ void real_main()
else
{
oss
- << time_int(e.start, channel_id_params) << "\t"
- << time_int(e.length, channel_id_params) << "\t";
+ << f.time_to_int(e.start, cid_params) << "\t"
+ << f.time_to_int(e.length, cid_params) << "\t";
}
oss
<< e.mean << "\t"
<< e.stdv << "\t"
- << string(e.model_state.begin(), e.model_state.end()).data() << "\t"
- << e.move;
+ << e.get_model_state() << "\t"
+ << e.move << "\t"
+ << e.p_model_state;
return oss.str();
})
<< endl;
} // if opts::ev
- //
- // basecall fastq
- //
- if (opts::fq and f.have_basecall_fastq(opts::st, opts::gr))
+ if (opts::al and f.have_basecall_alignment(opts::gr))
{
- auto fq = f.get_basecall_fastq(opts::st, opts::gr);
- cout << fq << endl;
- } // if opts::fq
+ auto aln = f.get_basecall_alignment(opts::gr);
+ cout
+ << "template\tcomplement\tkmer" << endl
+ << alg::os_join(aln, "\n", [&] (fast5::Basecall_Alignment_Entry const & a) {
+ ostringstream oss;
+ oss << a.template_index << "\t"
+ << a.complement_index << "\t"
+ << a.get_kmer();
+ return oss.str();
+ })
+ << endl;
+ } // if opts::al
}
catch (hdf5_tools::Exception& e)
{
@@ -247,13 +279,13 @@ int main(int argc, char * argv[])
// << "program: " << opts::cmd_parser.getProgramName() << endl
// << "version: " << opts::cmd_parser.getVersion() << endl
// << "args: " << opts::cmd_parser.getOrigArgv() << endl;
- if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq == 0)
+ if (opts::ls + opts::id + opts::rw + opts::ed + opts::fq + opts::ev + opts::al == 0)
{
opts::ls.set(true);
}
- else if (opts::ls + opts::id + opts::rw + opts::ed + opts::ev + opts::fq > 1)
+ else if (opts::ls + opts::id + opts::rw + opts::ed + opts::fq + opts::ev + opts::al > 1)
{
- cerr << "at most one of --ls/--id/--rw/--ed/--ev/--fq must be given" << endl;
+ cerr << "at most one of --ls/--id/--rw/--ed/--fq/--ev/--al must be given" << endl;
exit(EXIT_FAILURE);
}
cout.precision(opts::float_prec);
diff --git a/src/f5ls-full.cpp b/src/f5ls-full.cpp
index d29000a..48dd47a 100644
--- a/src/f5ls-full.cpp
+++ b/src/f5ls-full.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <iostream>
#include <string>
@@ -121,9 +128,9 @@ int main(int argc, char* argv[])
//
bool have_eventdetection_events = f.have_eventdetection_events();
cout << "have_eventdetection_events=" << have_eventdetection_events << endl;
- bool have_eventdetection_groups = f.have_eventdetection_groups();
- cout << "have_eventdetection_groups=" << have_eventdetection_groups << endl;
- if (have_eventdetection_groups)
+ bool have_eventdetection_group = f.have_eventdetection_group();
+ cout << "have_eventdetection_group=" << have_eventdetection_group << endl;
+ if (have_eventdetection_group)
{
auto ed_gr_list = f.get_eventdetection_group_list();
cout << "eventdetection_group_list=";
@@ -143,7 +150,7 @@ int main(int argc, char* argv[])
{
std::ostringstream tmp;
tmp << "eventdetection/" << ed_gr << "/" << rn;
- auto ed_ev_params = f.get_eventdetection_event_params(ed_gr, rn);
+ auto ed_ev_params = f.get_eventdetection_events_params(ed_gr, rn);
auto ed_ev = f.get_eventdetection_events(ed_gr, rn);
cout << tmp.str() << "/abasic_found=" << ed_ev_params.abasic_found << endl
<< tmp.str() << "/duration=" << ed_ev_params.duration << endl
@@ -168,9 +175,9 @@ int main(int argc, char* argv[])
//
// inspect basecall groups
//
- bool have_basecall_groups = f.have_basecall_groups();
- cout << "have_basecall_groups=" << have_basecall_groups << endl;
- if (have_basecall_groups)
+ bool have_basecall_group = f.have_basecall_group();
+ cout << "have_basecall_group=" << have_basecall_group << endl;
+ if (have_basecall_group)
{
auto bc_gr_list = f.get_basecall_group_list();
cout << "basecall_group_list=";
@@ -192,9 +199,13 @@ int main(int argc, char* argv[])
print_map(cout, bc_params, tmp.str());
// check if basecall log exists
cout << "basecall/" << bc_gr << "/have_log=" << f.have_basecall_log(bc_gr) << endl;
+ // check if eventdetection link exists
+ auto bc_ed_gr = f.get_basecall_eventdetection_group(bc_gr);
+ cout << "basecall/" << bc_gr << "/eventdetection_group=" << bc_ed_gr << endl;
}
for (unsigned st = 0; st < 3; ++st)
{
+ auto gr_l = f.get_basecall_strand_group_list(st);
bool have_seq = f.have_basecall_seq(st);
cout << "basecall(" << st << ")/have_seq=" << have_seq << endl;
if (have_seq)
@@ -243,12 +254,12 @@ int main(int argc, char* argv[])
}
if (st == 2)
{
- bool have_event_alignment = f.have_basecall_event_alignment();
- cout << "basecall(2)/have_event_alignment=" << have_event_alignment << endl;
- if (have_event_alignment)
+ bool have_alignment = f.have_basecall_alignment();
+ cout << "basecall(2)/have_alignment=" << have_alignment << endl;
+ if (have_alignment)
{
- auto al = f.get_basecall_event_alignment();
- cout << "basecall(2)/event_alignment/size=" << al.size() << endl;
+ auto al = f.get_basecall_alignment();
+ cout << "basecall(2)/alignment/size=" << al.size() << endl;
for (const auto& e : al)
{
cout << " (template_index=" << e.template_index
diff --git a/src/f5ls.cpp b/src/f5ls.cpp
index af655dd..c485ab7 100644
--- a/src/f5ls.cpp
+++ b/src/f5ls.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <iostream>
#include <string>
@@ -111,7 +118,7 @@ int main(int argc, char* argv[])
{
auto ed_params = f.get_eventdetection_params();
print_map(cout, ed_params, "eventdetection/");
- auto ed_ev_params = f.get_eventdetection_event_params();
+ auto ed_ev_params = f.get_eventdetection_events_params();
auto ed_ev = f.get_eventdetection_events();
cout << "eventdetection/events/abasic_found=" << ed_ev_params.abasic_found << endl
<< "eventdetection/events/duration=" << ed_ev_params.duration << endl
@@ -173,11 +180,11 @@ int main(int argc, char* argv[])
<< ", p_model_state=" << e.p_model_state
<< ", move=" << e.move << ")" << endl;
}
- // basecall event alignment
- if (st == 2 and f.have_basecall_event_alignment())
+ // basecall alignment
+ if (st == 2 and f.have_basecall_alignment())
{
- auto al = f.get_basecall_event_alignment();
- cout << "basecall(2)/event_alignment/size=" << al.size() << endl;
+ auto al = f.get_basecall_alignment();
+ cout << "basecall(2)/alignment/size=" << al.size() << endl;
const auto& e = al.front();
cout << " (template_index=" << e.template_index
<< ", complement_index=" << e.complement_index
diff --git a/src/f5pack.cpp b/src/f5pack.cpp
new file mode 100644
index 0000000..9afd30f
--- /dev/null
+++ b/src/f5pack.cpp
@@ -0,0 +1,185 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#include <cassert>
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+#include <tclap/CmdLine.h>
+#include "logger.hpp"
+
+#include "fast5.hpp"
+#include "File_Packer.hpp"
+
+using namespace std;
+
+namespace opts
+{
+    using namespace TCLAP;
+    string description = "Pack an ONT fast5 file.";
+    CmdLine cmd_parser(description);
+    // Command-line interface of f5pack; every argument registers itself with cmd_parser. First: logging.
+    MultiArg< string > log_level("", "log", "Log level. (default: info)", false, "string", cmd_parser);
+    MultiSwitchArg extra_verbosity("v", "", "Increase verbosity", cmd_parser);
+    // basecall alignment ("al") policy; main() rejects more than one switch per family
+    SwitchArg al_drop("", "al-drop", "Drop basecall alignment data.", cmd_parser);
+    SwitchArg al_copy("", "al-copy", "Copy basecall alignment data.", cmd_parser);
+    SwitchArg al_unpack("", "al-unpack", "Unpack basecall alignment data.", cmd_parser);
+    SwitchArg al_pack("", "al-pack", "Pack basecall alignment data.", cmd_parser);
+    // basecall event ("ev") policy
+    SwitchArg ev_drop("", "ev-drop", "Drop basecall event data.", cmd_parser);
+    SwitchArg ev_copy("", "ev-copy", "Copy basecall event data.", cmd_parser);
+    SwitchArg ev_unpack("", "ev-unpack", "Unpack basecall event data.", cmd_parser);
+    SwitchArg ev_pack("", "ev-pack", "Pack basecall event data.", cmd_parser);
+    // basecall fastq ("fq") policy
+    SwitchArg fq_drop("", "fq-drop", "Drop basecall fastq data.", cmd_parser);
+    SwitchArg fq_copy("", "fq-copy", "Copy basecall fastq data.", cmd_parser);
+    SwitchArg fq_unpack("", "fq-unpack", "Unpack basecall fastq data.", cmd_parser);
+    SwitchArg fq_pack("", "fq-pack", "Pack basecall fastq data.", cmd_parser);
+    // event detection ("ed") policy
+    SwitchArg ed_drop("", "ed-drop", "Drop event detection data.", cmd_parser);
+    SwitchArg ed_copy("", "ed-copy", "Copy event detection data.", cmd_parser);
+    SwitchArg ed_unpack("", "ed-unpack", "Unpack event detection data.", cmd_parser);
+    SwitchArg ed_pack("", "ed-pack", "Pack event detection data.", cmd_parser);
+    // raw samples ("rw") policy
+    SwitchArg rw_drop("", "rw-drop", "Drop raw samples data.", cmd_parser);
+    SwitchArg rw_copy("", "rw-copy", "Copy raw samples data.", cmd_parser);
+    SwitchArg rw_unpack("", "rw-unpack", "Unpack raw samples data.", cmd_parser);
+    SwitchArg rw_pack("", "rw-pack", "Pack raw samples data.", cmd_parser);
+    // bits kept when packing, plus the check and overwrite switches
+    ValueArg< unsigned > p_model_state_bits("", "p-model-state-bits", "P_Model_State bits to keep.", false, fast5::File_Packer::default_p_model_state_bits(), "int", cmd_parser);
+    ValueArg< unsigned > qv_bits("", "qv-bits", "QV bits to keep.", false, fast5::File_Packer::max_qv_bits(), "int", cmd_parser);
+    SwitchArg no_check("n", "no-check", "Don't check packing.", cmd_parser);
+    SwitchArg force("f", "force", "Overwrite output file if it exists.", cmd_parser);
+    // presets that main() expands into per-family policies; at most one may be given
+    SwitchArg fastq("", "fastq", "Pack fastq data, drop rest.", cmd_parser);
+    SwitchArg archive("", "archive", "Pack raw samples data, drop rest.", cmd_parser);
+    SwitchArg unpack("u", "unpack", "Unpack files.", cmd_parser);
+    SwitchArg pack("p", "pack", "Pack files (default, if no other pack/unpack/copy options).", cmd_parser);
+    // required positional arguments: input file first, output file second
+    UnlabeledValueArg< string > input_fn("input", "Input fast5 file.", true, "", "file", cmd_parser);
+    UnlabeledValueArg< string > output_fn("output", "Output fast5 file.", true, "", "file", cmd_parser);
+} // opts
+
+
+// Entry point of f5pack: parse the command line, settle on one policy
+// (pack/unpack/copy/drop) per dataset family, run File_Packer on the
+// input/output pair, then print the packing statistics to stdout.
+int main(int argc, char * argv[])
+{
+    opts::cmd_parser.parse(argc, argv);
+    // set log levels
+    auto default_level = static_cast< int >(logger::level::info) + opts::extra_verbosity.getValue();
+    logger::Logger::set_default_level(default_level);
+    logger::Logger::set_levels_from_options(opts::log_level, &clog);
+    // print options
+    LOG(info) << "program: " << opts::cmd_parser.getProgramName() << endl;
+    LOG(info) << "version: " << opts::cmd_parser.getVersion() << endl;
+    LOG(info) << "args: " << opts::cmd_parser.getOrigArgv() << endl;
+    // what to pack/unpack
+    if (opts::pack + opts::unpack + opts::archive + opts::fastq > 1)
+    {
+        LOG_EXIT << "at most one of --pack/--unpack/--archive/--fastq may be given" << endl;
+    }
+    if (opts::rw_pack + opts::rw_unpack + opts::rw_copy + opts::rw_drop > 1)
+    {
+        LOG_EXIT << "at most one of --rw-pack/--rw-unpack/--rw-copy/--rw-drop may be given" << endl;
+    }
+    if (opts::ed_pack + opts::ed_unpack + opts::ed_copy + opts::ed_drop > 1)
+    {
+        LOG_EXIT << "at most one of --ed-pack/--ed-unpack/--ed-copy/--ed-drop may be given" << endl;
+    }
+    if (opts::fq_pack + opts::fq_unpack + opts::fq_copy + opts::fq_drop > 1)
+    {
+        LOG_EXIT << "at most one of --fq-pack/--fq-unpack/--fq-copy/--fq-drop may be given" << endl;
+    }
+    if (opts::ev_pack + opts::ev_unpack + opts::ev_copy + opts::ev_drop > 1)
+    {
+        LOG_EXIT << "at most one of --ev-pack/--ev-unpack/--ev-copy/--ev-drop may be given" << endl;
+    }
+    if (opts::al_pack + opts::al_unpack + opts::al_copy + opts::al_drop > 1)
+    {
+        LOG_EXIT << "at most one of --al-pack/--al-unpack/--al-copy/--al-drop may be given" << endl;
+    }
+    if (opts::pack + opts::unpack + opts::archive + opts::fastq
+        + opts::rw_pack + opts::rw_unpack + opts::rw_copy + opts::rw_drop
+        + opts::ed_pack + opts::ed_unpack + opts::ed_copy + opts::ed_drop
+        + opts::fq_pack + opts::fq_unpack + opts::fq_copy + opts::fq_drop
+        + opts::ev_pack + opts::ev_unpack + opts::ev_copy + opts::ev_drop
+        + opts::al_pack + opts::al_unpack + opts::al_copy + opts::al_drop
+        == 0)
+    {
+        opts::pack.set(true);
+    }
+    if (opts::pack)
+    {
+        opts::rw_pack.set(true);
+        opts::ed_pack.set(true);
+        opts::fq_pack.set(true);
+        opts::ev_pack.set(true);
+        opts::al_pack.set(true);
+    }
+    else if (opts::unpack)
+    {
+        opts::rw_unpack.set(true);
+        opts::ed_unpack.set(true);
+        opts::fq_unpack.set(true);
+        opts::ev_unpack.set(true);
+        opts::al_unpack.set(true);
+    }
+    // presets: --archive packs raw samples, --fastq packs fastq; any family
+    // that still has no policy after this is set to "drop" just below
+    if (opts::archive) opts::rw_pack.set(true);
+    if (opts::fastq) opts::fq_pack.set(true);
+    if (opts::rw_pack + opts::rw_unpack + opts::rw_copy + opts::rw_drop == 0) opts::rw_drop.set(true);
+    if (opts::ed_pack + opts::ed_unpack + opts::ed_copy + opts::ed_drop == 0) opts::ed_drop.set(true);
+    if (opts::fq_pack + opts::fq_unpack + opts::fq_copy + opts::fq_drop == 0) opts::fq_drop.set(true);
+    if (opts::ev_pack + opts::ev_unpack + opts::ev_copy + opts::ev_drop == 0) opts::ev_drop.set(true);
+    if (opts::al_pack + opts::al_unpack + opts::al_copy + opts::al_drop == 0) opts::al_drop.set(true);
+    LOG(info) << "rw: " << (opts::rw_pack? "pack" : opts::rw_unpack? "unpack" : opts::rw_copy? "copy" : "drop") << endl;
+    LOG(info) << "ed: " << (opts::ed_pack? "pack" : opts::ed_unpack? "unpack" : opts::ed_copy? "copy" : "drop") << endl;
+    LOG(info) << "fq: " << (opts::fq_pack? "pack" : opts::fq_unpack? "unpack" : opts::fq_copy? "copy" : "drop") << endl;
+    LOG(info) << "ev: " << (opts::ev_pack? "pack" : opts::ev_unpack? "unpack" : opts::ev_copy? "copy" : "drop") << endl;
+    LOG(info) << "al: " << (opts::al_pack? "pack" : opts::al_unpack? "unpack" : opts::al_copy? "copy" : "drop") << endl;
+    LOG(info) << "check: " << (not opts::no_check? "yes" : "no") << endl;
+    // set File_Packer options
+    int rw_policy = (opts::rw_pack? 1 : opts::rw_unpack? 2 : opts::rw_copy? 3 : 0);
+    int ed_policy = (opts::ed_pack? 1 : opts::ed_unpack? 2 : opts::ed_copy? 3 : 0);
+    int fq_policy = (opts::fq_pack? 1 : opts::fq_unpack? 2 : opts::fq_copy? 3 : 0);
+    int ev_policy = (opts::ev_pack? 1 : opts::ev_unpack? 2 : opts::ev_copy? 3 : 0);
+    int al_policy = (opts::al_pack? 1 : opts::al_unpack? 2 : opts::al_copy? 3 : 0);
+    fast5::File_Packer fp(rw_policy, ed_policy, fq_policy, ev_policy, al_policy);
+    fp.set_check(not opts::no_check);
+    fp.set_force(opts::force);
+    fp.set_qv_bits(opts::qv_bits);
+    fp.set_p_model_state_bits(opts::p_model_state_bits);
+    fp.run(opts::input_fn, opts::output_fn);
+    auto cnt = fp.get_counts();
+    // average bits per item; a family with zero items reports 0.00 instead of dividing by zero (nan)
+    auto avg = [] (double bits, double count) { return count > 0.0? bits / count : 0.0; };
+    cout << std::fixed << std::setprecision(2)
+        << "bp_seq_count\t" << cnt.bp_seq_count << "\n"
+        << "rs_count\t" << cnt.rs_count << "\n"
+        << "rs_bits\t" << avg(cnt.rs_bits, cnt.rs_count) << "\n"
+        << "ed_count\t" << cnt.ed_count << "\n"
+        << "ed_skip_bits\t" << avg(cnt.ed_skip_bits, cnt.ed_count) << "\n"
+        << "ed_len_bits\t" << avg(cnt.ed_len_bits, cnt.ed_count) << "\n"
+        << "fq_count\t" << cnt.fq_count << "\n"
+        << "fq_bp_bits\t" << avg(cnt.fq_bp_bits, cnt.fq_count) << "\n"
+        << "fq_qv_bits\t" << avg(cnt.fq_qv_bits, cnt.fq_count) << "\n"
+        << "ev_count\t" << cnt.ev_count << "\n"
+        << "ev_rel_skip_bits\t" << avg(cnt.ev_rel_skip_bits, cnt.ev_count) << "\n"
+        << "ev_skip_bits\t" << avg(cnt.ev_skip_bits, cnt.ev_count) << "\n"
+        << "ev_len_bits\t" << avg(cnt.ev_len_bits, cnt.ev_count) << "\n"
+        << "ev_move_bits\t" << avg(cnt.ev_move_bits, cnt.ev_count) << "\n"
+        << "ev_p_model_state_bits\t" << avg(cnt.ev_p_model_state_bits, cnt.ev_count) << "\n"
+        << "al_count\t" << cnt.al_count << "\n"
+        << "al_template_step_bits\t" << avg(cnt.al_template_step_bits, cnt.al_count) << "\n"
+        << "al_complement_step_bits\t" << avg(cnt.al_complement_step_bits, cnt.al_count) << "\n"
+        << "al_move_bits\t" << avg(cnt.al_move_bits, cnt.al_count) << "\n";
+}
diff --git a/src/fast5.hpp b/src/fast5.hpp
index 14ae3cd..db6eb1d 100644
--- a/src/fast5.hpp
+++ b/src/fast5.hpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#ifndef __FAST5_HPP
#define __FAST5_HPP
@@ -13,13 +20,19 @@
#include <array>
#include <set>
#include <map>
+#include <stdexcept>
+#include "logger.hpp"
+#include "fast5_version.hpp"
#include "hdf5_tools.hpp"
+#include "Huffman_Packer.hpp"
+#include "Bit_Packer.hpp"
+
#define MAX_K_LEN 8
namespace
{
- inline static std::string array_to_string(const std::array< char, MAX_K_LEN >& a)
+ inline static std::string array_to_string(std::array< char, MAX_K_LEN > const & a)
{
return std::string(a.begin(), std::find(a.begin(), a.end(), '\0'));
}
@@ -28,47 +41,147 @@ namespace
namespace fast5
{
-struct Channel_Id_Parameters
+typedef hdf5_tools::File::Attr_Map Attr_Map;
+
+struct Channel_Id_Params
{
std::string channel_number;
double digitisation;
double offset;
double range;
double sampling_rate;
-}; // struct Channel_Id_Parameters
+ Channel_Id_Params() // default state: empty channel_number, every numeric field 0.0
+ : channel_number(""),
+ digitisation(0.0),
+ offset(0.0),
+ range(0.0),
+ sampling_rate(0.0) {}
+ void read(hdf5_tools::File const & f, std::string const & p) // load all five fields from f; each is read from p + "/<field name>"
+ {
+ f.read(p + "/channel_number", channel_number);
+ f.read(p + "/digitisation", digitisation);
+ f.read(p + "/offset", offset);
+ f.read(p + "/range", range);
+ f.read(p + "/sampling_rate", sampling_rate);
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const // store all five fields in f as attributes named p + "/<field name>"
+ {
+ f.write_attribute(p + "/channel_number", channel_number);
+ f.write_attribute(p + "/digitisation", digitisation);
+ f.write_attribute(p + "/offset", offset);
+ f.write_attribute(p + "/range", range);
+ f.write_attribute(p + "/sampling_rate", sampling_rate);
+ }
+}; // struct Channel_Id_Params
-typedef std::map< std::string, std::string > Tracking_Id_Parameters;
+typedef Attr_Map Tracking_Id_Params;
-typedef std::map< std::string, std::string > Sequences_Parameters;
+typedef Attr_Map Sequences_Params;
-typedef float Raw_Samples_Entry;
-typedef int16_t Raw_Samples_Int_Entry;
+typedef float Raw_Sample;
+typedef int16_t Raw_Int_Sample;
-struct Raw_Samples_Parameters
+struct Raw_Samples_Params
{
std::string read_id;
long long read_number;
long long start_mux;
long long start_time;
long long duration;
-}; // struct Raw_Samples_Parameters
+ friend bool operator == (Raw_Samples_Params const & lhs, Raw_Samples_Params const & rhs) // member-wise equality over all five fields
+ {
+ return (lhs.read_id == rhs.read_id
+ and lhs.read_number == rhs.read_number
+ and lhs.start_mux == rhs.start_mux
+ and lhs.start_time == rhs.start_time
+ and lhs.duration == rhs.duration);
+ }
+ void read(hdf5_tools::File const & f, std::string const & p) // load all five fields from f; each is read from p + "/<field name>"
+ {
+ f.read(p + "/read_id", read_id);
+ f.read(p + "/read_number", read_number);
+ f.read(p + "/start_mux", start_mux);
+ f.read(p + "/start_time", start_time);
+ f.read(p + "/duration", duration);
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const // store all five fields in f as attributes named p + "/<field name>"
+ {
+ f.write_attribute(p + "/read_id", read_id);
+ f.write_attribute(p + "/read_number", read_number);
+ f.write_attribute(p + "/start_mux", start_mux);
+ f.write_attribute(p + "/start_time", start_time);
+ f.write_attribute(p + "/duration", duration);
+ }
+}; // struct Raw_Samples_Params
+
+typedef std::pair< std::vector< Raw_Int_Sample >, Raw_Samples_Params > Raw_Int_Samples_Dataset;
+typedef std::pair< std::vector< Raw_Sample >, Raw_Samples_Params > Raw_Samples_Dataset;
+
+struct Raw_Samples_Pack // file representation of packed raw samples: one coded dataset plus the read's params
+{
+ Huffman_Packer::Code_Type signal; // coded samples, stored as dataset p + "/Signal"
+ Attr_Map signal_params; // attribute map attached to the "Signal" dataset
+ // params of the read; stored under p + "/params" via Raw_Samples_Params::read/write
+ Raw_Samples_Params params;
+ // read() and write() use the same layout relative to the path p
+ void read(hdf5_tools::File const & f, std::string const & p)
+ {
+ f.read(p + "/Signal", signal);
+ signal_params = f.get_attr_map(p + "/Signal");
+ params.read(f, p + "/params");
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const
+ {
+ f.write_dataset(p + "/Signal", signal); // the dataset is written before its attribute map is added
+ f.add_attr_map(p + "/Signal", signal_params);
+ params.write(f, p + "/params");
+ }
+}; // struct Raw_Samples_Pack
-struct EventDetection_Event_Entry
+struct EventDetection_Event
{
double mean;
double stdv;
long long start;
long long length;
- friend bool operator == (const EventDetection_Event_Entry& lhs, const EventDetection_Event_Entry& rhs)
+ friend bool operator == (EventDetection_Event const & lhs, EventDetection_Event const & rhs)
{
return lhs.mean == rhs.mean
and lhs.stdv == rhs.stdv
and lhs.start == rhs.start
and lhs.length == rhs.length;
}
+ static hdf5_tools::Compound_Map const & compound_map() // table pairing each stored field name with the member that holds it
+ {
+ static hdf5_tools::Compound_Map m; // filled on the first call, then returned unchanged
+ static bool inited = false; // plain flag, tested below without any locking
+ if (not inited)
+ {
+ m.add_member("mean", &EventDetection_Event::mean);
+ m.add_member("start", &EventDetection_Event::start);
+ m.add_member("length", &EventDetection_Event::length);
+ m.add_member("stdv", &EventDetection_Event::stdv);
+ inited = true;
+ }
+ return m;
+ }
+ static hdf5_tools::Compound_Map const & alt_compound_map() // alternative table: the stdv member is registered under the name "variance"
+ {
+ static hdf5_tools::Compound_Map m; // filled on the first call, then returned unchanged
+ static bool inited = false; // plain flag, tested below without any locking
+ if (not inited)
+ {
+ m.add_member("mean", &EventDetection_Event::mean);
+ m.add_member("start", &EventDetection_Event::start);
+ m.add_member("length", &EventDetection_Event::length);
+ m.add_member("variance", &EventDetection_Event::stdv); // only the name differs; the value still lands in stdv as-is
+ inited = true;
+ }
+ return m;
+ }
}; // struct EventDetection_Event
-struct EventDetection_Event_Parameters
+struct EventDetection_Events_Params
{
std::string read_id;
long long read_number;
@@ -78,42 +191,136 @@ struct EventDetection_Event_Parameters
long long duration;
double median_before;
unsigned abasic_found;
-}; // struct EventDetection_Event_Parameters
+ friend bool operator == (EventDetection_Events_Params const & lhs, EventDetection_Events_Params const & rhs) // member-wise equality; two NaN median_before values count as equal
+ {
+ return (lhs.read_id == rhs.read_id
+ and lhs.read_number == rhs.read_number
+ and lhs.scaling_used == rhs.scaling_used
+ and lhs.start_mux == rhs.start_mux
+ and lhs.start_time == rhs.start_time
+ and lhs.duration == rhs.duration
+ and ((std::isnan(lhs.median_before) and std::isnan(rhs.median_before)) // NaN is what read() stores when the attribute is missing
+ or lhs.median_before == rhs.median_before)
+ and lhs.abasic_found == rhs.abasic_found);
+ }
+    // Load the event-detection params stored as attributes of p.
+    // read_number, scaling_used, start_mux, start_time and duration are
+    // read unconditionally; read_id, median_before and abasic_found are
+    // optional and fall back to the "absent" markers that write() tests.
+    void read(hdf5_tools::File const & f, std::string const & p)
+    {
+        auto a_v = f.get_attr_list(p);
+        std::set< std::string > a_s(a_v.begin(), a_v.end());
+        f.read(p + "/read_number", read_number);
+        f.read(p + "/scaling_used", scaling_used);
+        f.read(p + "/start_mux", start_mux);
+        f.read(p + "/start_time", start_time);
+        f.read(p + "/duration", duration);
+        // optional fields: every else-branch resets the member, so reading
+        // into a reused object cannot keep a value from an earlier read
+        // read_id: the empty string means "absent"
+        if (a_s.count("read_id"))
+            f.read(p + "/read_id", read_id);
+        else
+            read_id.clear();
+        // median_before: NaN means "absent"
+        if (a_s.count("median_before"))
+            f.read(p + "/median_before", median_before);
+        else
+            median_before = std::nan("");
+        // abasic_found: 2 means "absent" (write() only stores values < 2)
+        if (a_s.count("abasic_found"))
+            f.read(p + "/abasic_found", abasic_found);
+        else
+            abasic_found = 2;
+    }
+ void write(hdf5_tools::File const & f, std::string const & p) const // store the params as attributes named p + "/<field name>"; optional ones only when set
+ {
+ f.write_attribute(p + "/read_number", read_number);
+ f.write_attribute(p + "/scaling_used", scaling_used);
+ f.write_attribute(p + "/start_mux", start_mux);
+ f.write_attribute(p + "/start_time", start_time);
+ f.write_attribute(p + "/duration", duration);
+ if (not read_id.empty()) f.write_attribute(p + "/read_id", read_id); // an empty read_id is not written
+ if (not std::isnan(median_before)) f.write_attribute(p + "/median_before", median_before); // NaN is not written
+ if (abasic_found < 2) f.write_attribute(p + "/abasic_found", abasic_found); // values >= 2 are not written
+ }
+}; // struct EventDetection_Events_Params
+
+typedef std::pair< std::vector< EventDetection_Event >, EventDetection_Events_Params > EventDetection_Events_Dataset;
+
+struct EventDetection_Events_Pack // file representation of packed event-detection events: two coded datasets plus params
+{
+ Huffman_Packer::Code_Type skip; // stored as dataset p + "/Skip"
+ Attr_Map skip_params; // attribute map attached to "Skip"
+ Huffman_Packer::Code_Type len; // stored as dataset p + "/Len"
+ Attr_Map len_params; // attribute map attached to "Len"
+ // params of the events; stored under p + "/params" via EventDetection_Events_Params::read/write
+ EventDetection_Events_Params params;
+ // read() and write() use the same layout relative to the path p
+ void read(hdf5_tools::File const & f, std::string const & p)
+ {
+ f.read(p + "/Skip", skip);
+ skip_params = f.get_attr_map(p + "/Skip");
+ f.read(p + "/Len", len);
+ len_params = f.get_attr_map(p + "/Len");
+ params.read(f, p + "/params");
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const
+ {
+ f.write_dataset(p + "/Skip", skip); // each dataset is written before its attribute map is added
+ f.add_attr_map(p + "/Skip", skip_params);
+ f.write_dataset(p + "/Len", len);
+ f.add_attr_map(p + "/Len", len_params);
+ params.write(f, p + "/params");
+ }
+}; // struct EventDetection_Events_Pack
//
// This struct represents the expected signal measured
// given the kmer sequence that is in the pore when the
// the observations are made. A pore model consists
// of 1024 of these entries (one per 5-mer) and global
-// shift/scaling parameters.
+// shift/scaling params.
//
-struct Model_Entry
+struct Basecall_Model_State
{
- long long variant;
double level_mean;
double level_stdv;
double sd_mean;
double sd_stdv;
- double weight;
std::array< char, MAX_K_LEN > kmer;
std::string get_kmer() const { return array_to_string(kmer); }
- friend bool operator == (const Model_Entry& lhs, const Model_Entry& rhs)
- {
- return lhs.variant == rhs.variant
- and lhs.level_mean == rhs.level_mean
- and lhs.level_stdv == rhs.level_stdv
- and lhs.sd_mean == rhs.sd_mean
- and lhs.sd_stdv == rhs.sd_stdv
- and lhs.weight == rhs.weight
- and lhs.kmer == rhs.kmer;
+ friend bool operator == (Basecall_Model_State const & lhs, Basecall_Model_State const & rhs)
+ {
+ return (lhs.level_mean == rhs.level_mean
+ and lhs.level_stdv == rhs.level_stdv
+ and lhs.sd_mean == rhs.sd_mean
+ and lhs.sd_stdv == rhs.sd_stdv
+ and lhs.kmer == rhs.kmer);
}
-}; // struct Model_Entry
+ static hdf5_tools::Compound_Map const & compound_map() // table pairing each stored field name with the member that holds it
+ {
+ static hdf5_tools::Compound_Map m; // filled on the first call, then returned unchanged
+ static bool inited = false; // plain flag, tested below without any locking
+ if (not inited)
+ {
+ m.add_member("level_mean", &Basecall_Model_State::level_mean);
+ m.add_member("level_stdv", &Basecall_Model_State::level_stdv);
+ m.add_member("sd_mean", &Basecall_Model_State::sd_mean);
+ m.add_member("sd_stdv", &Basecall_Model_State::sd_stdv);
+ m.add_member("kmer", &Basecall_Model_State::kmer);
+ inited = true;
+ }
+ return m;
+ }
+}; // struct Basecall_Model_State
//
// This struct represents the global transformations
-// that must be applied to each Model_Entry
+// that must be applied to each Basecall_Model_State
//
-struct Model_Parameters
+struct Basecall_Model_Params
{
double scale;
double shift;
@@ -121,66 +328,303 @@ struct Model_Parameters
double var;
double scale_sd;
double var_sd;
-}; // struct Model_Parameters
+ void read(hdf5_tools::File const & f, std::string const & p) // load the six values below from f; each is read from p + "/<field name>"
+ {
+ f.read(p + "/scale", scale);
+ f.read(p + "/shift", shift);
+ f.read(p + "/drift", drift);
+ f.read(p + "/var", var);
+ f.read(p + "/scale_sd", scale_sd);
+ f.read(p + "/var_sd", var_sd);
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const // store the six values below in f as attributes named p + "/<field name>"
+ {
+ f.write_attribute(p + "/scale", scale);
+ f.write_attribute(p + "/shift", shift);
+ f.write_attribute(p + "/drift", drift);
+ f.write_attribute(p + "/var", var);
+ f.write_attribute(p + "/scale_sd", scale_sd);
+ f.write_attribute(p + "/var_sd", var_sd);
+ }
+}; // struct Basecall_Model_Params
+
+struct Basecall_Fastq_Pack // file representation of a packed basecall fastq record
+{
+ Huffman_Packer::Code_Type bp; // stored as dataset p + "/BP"
+ Attr_Map bp_params; // attribute map attached to "BP"
+ Huffman_Packer::Code_Type qv; // stored as dataset p + "/QV"
+ Attr_Map qv_params; // attribute map attached to "QV"
+ std::string read_name; // stored as attribute p + "/read_name"
+ std::uint8_t qv_bits; // stored as attribute p + "/qv_bits"
+ // read() and write() use the same layout relative to the path p
+ void read(hdf5_tools::File const & f, std::string const & p)
+ {
+ f.read(p + "/BP", bp);
+ bp_params = f.get_attr_map(p + "/BP");
+ f.read(p + "/QV", qv);
+ qv_params = f.get_attr_map(p + "/QV");
+ f.read(p + "/read_name", read_name);
+ f.read(p + "/qv_bits", qv_bits);
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const
+ {
+ f.write_dataset(p + "/BP", bp); // each dataset is written before its attribute map is added
+ f.add_attr_map(p + "/BP", bp_params);
+ f.write_dataset(p + "/QV", qv);
+ f.add_attr_map(p + "/QV", qv_params);
+ f.write_attribute(p + "/read_name", read_name);
+ f.write_attribute(p + "/qv_bits", qv_bits);
+ }
+}; // struct Basecall_Fastq_Pack
//
// This struct represents an observed event.
// The members of the struct are the same as
// the fields encoded in the FAST5 file.
//
-struct Event_Entry
+struct Basecall_Event
{
double mean;
double stdv;
double start;
double length;
double p_model_state;
- double p_mp_state;
- double p_A;
- double p_C;
- double p_G;
- double p_T;
long long move;
std::array< char, MAX_K_LEN > model_state;
- std::array< char, MAX_K_LEN > mp_state;
std::string get_model_state() const { return array_to_string(model_state); }
- std::string get_mp_state() const { return array_to_string(mp_state); }
- friend bool operator == (const Event_Entry& lhs, const Event_Entry& rhs)
+ friend bool operator == (Basecall_Event const & lhs, Basecall_Event const & rhs)
{
- return lhs.mean == rhs.mean
- and lhs.stdv == rhs.stdv
- and lhs.start == rhs.start
- and lhs.length == rhs.length
- and lhs.p_model_state == rhs.p_model_state
- and lhs.p_mp_state == rhs.p_mp_state
- and lhs.p_A == rhs.p_A
- and lhs.p_C == rhs.p_C
- and lhs.p_G == rhs.p_G
- and lhs.p_T == rhs.p_T
- and lhs.move == rhs.move
- and lhs.model_state == rhs.model_state
- and lhs.mp_state == rhs.mp_state;
- }
-}; // struct Event_Entry
+ return (lhs.mean == rhs.mean
+ and lhs.stdv == rhs.stdv
+ and lhs.start == rhs.start
+ and lhs.length == rhs.length
+ and lhs.p_model_state == rhs.p_model_state
+ and lhs.move == rhs.move
+ and lhs.model_state == rhs.model_state);
+ }
+ static hdf5_tools::Compound_Map const & compound_map() // table pairing each stored field name with the member that holds it
+ {
+ static hdf5_tools::Compound_Map m; // filled on the first call, then returned unchanged
+ static bool inited = false; // plain flag, tested below without any locking
+ if (not inited)
+ {
+ m.add_member("mean", &Basecall_Event::mean);
+ m.add_member("stdv", &Basecall_Event::stdv);
+ m.add_member("start", &Basecall_Event::start);
+ m.add_member("length", &Basecall_Event::length);
+ m.add_member("p_model_state", &Basecall_Event::p_model_state);
+ m.add_member("move", &Basecall_Event::move);
+ m.add_member("model_state", &Basecall_Event::model_state);
+ inited = true;
+ }
+ return m;
+ }
+}; // struct Basecall_Event
+
+struct Basecall_Events_Params // two optional attributes of a basecall events dataset; 0.0 stands for "not present"
+{
+ double start_time;
+ double duration;
+ friend bool operator == (Basecall_Events_Params const & lhs, Basecall_Events_Params const & rhs) // exact comparison of both fields
+ {
+ return (lhs.start_time == rhs.start_time
+ and lhs.duration == rhs.duration);
+ }
+ void read(hdf5_tools::File const & f, std::string const & p) // load both fields; a missing attribute yields 0.0
+ {
+ if (f.attribute_exists(p + "/start_time"))
+ {
+ f.read(p + "/start_time", start_time);
+ }
+ else
+ {
+ start_time = 0.0;
+ }
+ if (f.attribute_exists(p + "/duration"))
+ {
+ f.read(p + "/duration", duration);
+ }
+ else
+ {
+ duration = 0.0;
+ }
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const // store each field as an attribute, but only when it is > 0.0
+ {
+ if (start_time > 0.0) f.write_attribute(p + "/start_time", start_time); // zero and negative values are not written
+ if (duration > 0.0) f.write_attribute(p + "/duration", duration);
+ }
+};
+
+typedef std::pair< std::vector< Basecall_Event >, Basecall_Events_Params > Basecall_Events_Dataset;
+
+struct Basecall_Events_Pack // file representation of packed basecall events: coded datasets, scalar attributes, params
+{
+ Huffman_Packer::Code_Type rel_skip; // dataset "Rel_Skip"; used instead of the "Skip"/"Len" pair when present
+ Attr_Map rel_skip_params;
+ Huffman_Packer::Code_Type skip; // dataset "Skip"; only read/written when there is no "Rel_Skip"
+ Attr_Map skip_params;
+ Huffman_Packer::Code_Type len; // dataset "Len"; only read/written when there is no "Rel_Skip"
+ Attr_Map len_params;
+ Huffman_Packer::Code_Type move; // dataset "Move"
+ Attr_Map move_params;
+ Bit_Packer::Code_Type p_model_state; // dataset "P_Model_State"
+ Attr_Map p_model_state_params;
+ // scalar values, each stored as an attribute named p + "/<field name>"
+ std::string name;
+ std::string version;
+ std::string ed_gr;
+ long long start_time;
+ unsigned state_size;
+ double median_sd_temp;
+ unsigned p_model_state_bits;
+ // optional start_time/duration pair; stored under p + "/params"
+ Basecall_Events_Params params;
+ // read() and write() use the same layout relative to the path p
+ void read(hdf5_tools::File const & f, std::string const & p)
+ {
+ if (f.dataset_exists(p + "/Rel_Skip")) // NOTE(review): the members of the branch not taken are left untouched, not cleared
+ {
+ f.read(p + "/Rel_Skip", rel_skip);
+ rel_skip_params = f.get_attr_map(p + "/Rel_Skip");
+ }
+ else
+ {
+ f.read(p + "/Skip", skip);
+ skip_params = f.get_attr_map(p + "/Skip");
+ f.read(p + "/Len", len);
+ len_params = f.get_attr_map(p + "/Len");
+ }
+ f.read(p + "/Move", move);
+ move_params = f.get_attr_map(p + "/Move");
+ f.read(p + "/P_Model_State", p_model_state);
+ p_model_state_params = f.get_attr_map(p + "/P_Model_State");
+ f.read(p + "/name", name);
+ f.read(p + "/version", version);
+ f.read(p + "/ed_gr", ed_gr);
+ f.read(p + "/start_time", start_time);
+ f.read(p + "/state_size", state_size);
+ f.read(p + "/median_sd_temp", median_sd_temp);
+ f.read(p + "/p_model_state_bits", p_model_state_bits);
+ params.read(f, p + "/params");
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const
+ {
+ if (not rel_skip.empty()) // a non-empty rel_skip selects the "Rel_Skip" layout; otherwise "Skip" and "Len" are written
+ {
+ f.write_dataset(p + "/Rel_Skip", rel_skip);
+ f.add_attr_map(p + "/Rel_Skip", rel_skip_params);
+ }
+ else
+ {
+ f.write_dataset(p + "/Skip", skip);
+ f.add_attr_map(p + "/Skip", skip_params);
+ f.write_dataset(p + "/Len", len);
+ f.add_attr_map(p + "/Len", len_params);
+ }
+ f.write_dataset(p + "/Move", move); // each dataset is written before its attribute map is added
+ f.add_attr_map(p + "/Move", move_params);
+ f.write_dataset(p + "/P_Model_State", p_model_state);
+ f.add_attr_map(p + "/P_Model_State", p_model_state_params);
+ f.write_attribute(p + "/name", name);
+ f.write_attribute(p + "/version", version);
+ f.write_attribute(p + "/ed_gr", ed_gr);
+ f.write_attribute(p + "/start_time", start_time);
+ f.write_attribute(p + "/state_size", state_size);
+ f.write_attribute(p + "/median_sd_temp", median_sd_temp);
+ f.write_attribute(p + "/p_model_state_bits", p_model_state_bits);
+ params.write(f, p + "/params");
+ }
+}; // struct Basecall_Events_Pack
//
// This struct represents a template-to-complement
// match that is emitted by ONT's 2D basecaller
//
-struct Event_Alignment_Entry
+struct Basecall_Alignment_Entry
{
long long template_index;
long long complement_index;
std::array< char, MAX_K_LEN > kmer;
std::string get_kmer() const { return array_to_string(kmer); }
- friend bool operator == (const Event_Alignment_Entry& lhs, const Event_Alignment_Entry& rhs)
+ friend bool operator == (Basecall_Alignment_Entry const & lhs, Basecall_Alignment_Entry const & rhs)
{
return lhs.template_index == rhs.template_index
and lhs.complement_index == rhs.complement_index
and lhs.kmer == rhs.kmer;
}
-}; // struct Event_Alignment_Entry
+ static hdf5_tools::Compound_Map const & compound_map() // table pairing each stored field name with the member that holds it
+ {
+ static hdf5_tools::Compound_Map m; // filled on the first call, then returned unchanged
+ static bool inited = false; // plain flag, tested below without any locking
+ if (not inited)
+ {
+ m.add_member("template", &Basecall_Alignment_Entry::template_index); // stored name has no "_index" suffix
+ m.add_member("complement", &Basecall_Alignment_Entry::complement_index); // stored name has no "_index" suffix
+ m.add_member("kmer", &Basecall_Alignment_Entry::kmer);
+ inited = true;
+ }
+ return m;
+ }
+}; // struct Basecall_Alignment_Entry
+struct Basecall_Alignment_Pack // file representation of a packed basecall alignment: three coded datasets plus three scalars
+{
+ Bit_Packer::Code_Type template_step; // stored as dataset p + "/Template_Step"
+ Bit_Packer::Code_Params_Type template_step_params; // filled from the attribute map of "Template_Step"
+ Bit_Packer::Code_Type complement_step; // stored as dataset p + "/Complement_Step"
+ Bit_Packer::Code_Params_Type complement_step_params; // filled from the attribute map of "Complement_Step"
+ Huffman_Packer::Code_Type move; // stored as dataset p + "/Move"
+ Huffman_Packer::Code_Params_Type move_params; // filled from the attribute map of "Move"
+ unsigned template_index_start; // stored as attribute p + "/template_index_start"
+ unsigned complement_index_start; // stored as attribute p + "/complement_index_start"
+ unsigned kmer_size; // stored as attribute p + "/kmer_size"
+ // read() and write() use the same layout relative to the path p
+ void read(hdf5_tools::File const & f, std::string const & p)
+ {
+ f.read(p + "/Template_Step", template_step);
+ template_step_params = f.get_attr_map(p + "/Template_Step");
+ f.read(p + "/Complement_Step", complement_step);
+ complement_step_params = f.get_attr_map(p + "/Complement_Step");
+ f.read(p + "/Move", move);
+ move_params = f.get_attr_map(p + "/Move");
+ f.read(p + "/template_index_start", template_index_start);
+ f.read(p + "/complement_index_start", complement_index_start);
+ f.read(p + "/kmer_size", kmer_size);
+ }
+ void write(hdf5_tools::File const & f, std::string const & p) const
+ {
+ f.write_dataset(p + "/Template_Step", template_step); // each dataset is written before its attribute map is added
+ f.add_attr_map(p + "/Template_Step", template_step_params);
+ f.write_dataset(p + "/Complement_Step", complement_step);
+ f.add_attr_map(p + "/Complement_Step", complement_step_params);
+ f.write_dataset(p + "/Move", move);
+ f.add_attr_map(p + "/Move", move_params);
+ f.write_attribute(p + "/template_index_start", template_index_start);
+ f.write_attribute(p + "/complement_index_start", complement_index_start);
+ f.write_attribute(p + "/kmer_size", kmer_size);
+ }
+};
+
+struct Basecall_Group_Description // summary of one basecall group: identifying strings plus "what is present" flags
+{
+ std::string name;
+ std::string version;
+ std::string ed_gr; // presumably the linked eventdetection group (cf. get_basecall_eventdetection_group) - TODO confirm
+ std::string bc_1d_gr; // presumably the related 1D basecall group - TODO confirm
+ bool have_subgroup[3]; // the 3-element arrays are presumably indexed by strand (the f5ls tools loop st = 0..2) - TODO confirm
+ bool have_fastq[3];
+ bool have_events[3];
+ bool have_model[2]; // only two entries, unlike the arrays above
+ bool have_alignment;
+ Basecall_Group_Description() : // every flag starts out false; the strings start out empty
+ have_subgroup{false, false, false},
+ have_fastq{false, false, false},
+ have_events{false, false, false},
+ have_model{false, false},
+ have_alignment{false}
+ {}
+}; // struct Basecall_Group_Description
class File
: private hdf5_tools::File
@@ -188,654 +632,731 @@ class File
private:
typedef hdf5_tools::File Base;
public:
- //using Base::is_open;
- //using Base::is_rw;
- //using Base::file_name;
- //using Base::create;
- //using Base::close;
- using Base::get_object_count;
- using Base::is_valid_file;
- //using Base::write;
-
+ //
+ // Constructors
+ //
File() = default;
- File(const std::string& file_name, bool rw = false) { open(file_name, rw); }
+ File(std::string const & file_name, bool rw = false) { open(file_name, rw); }
- bool is_open() const { return static_cast< const Base* >(this)->is_open(); }
- bool is_rw() const { return static_cast< const Base* >(this)->is_rw(); }
- const std::string& file_name() const { return static_cast< const Base* >(this)->file_name(); }
- void create(const std::string& file_name, bool truncate = false) { static_cast< Base* >(this)->create(file_name, truncate); }
- void close() { static_cast< Base* >(this)->close(); }
+ //
+ // Base methods
+ //
+ using Base::is_open;
+ using Base::is_rw;
+ using Base::file_name;
+ using Base::create;
+ using Base::close;
+ using Base::get_object_count;
+ using Base::is_valid_file;
- void open(const std::string& file_name, bool rw = false)
+ //
+ // Base method wrappers
+ //
+ void
+ open(std::string const & file_name, bool rw = false)
{
Base::open(file_name, rw);
- if (is_open())
- {
- // detect raw samples read name
- detect_raw_samples_read_name_list();
- // detect eventdetection groups
- detect_eventdetection_group_list();
- // detect basecall groups
- detect_basecall_group_list();
- }
+ reload();
}
- /**
- * Extract "/file_version" attribute. This must exist.
- */
- std::string file_version() const
+ //
+ // Access /file_version
+ //
+ std::string
+ file_version() const
{
std::string res;
- assert(Base::exists(file_version_path()));
Base::read(file_version_path(), res);
return res;
}
- /**
- * Check if "/UniqueGlobalKey/channel_id" attributes exist.
- */
- bool have_channel_id_params() const
- {
- return Base::group_exists(channel_id_path());
- }
- /**
- * Extract "/UniqueGlobalKey/channel_id" attributes.
- */
- Channel_Id_Parameters get_channel_id_params() const
- {
- Channel_Id_Parameters res;
- Base::read(channel_id_path() + "/channel_number", res.channel_number);
- Base::read(channel_id_path() + "/digitisation", res.digitisation);
- Base::read(channel_id_path() + "/offset", res.offset);
- Base::read(channel_id_path() + "/range", res.range);
- Base::read(channel_id_path() + "/sampling_rate", res.sampling_rate);
- return res;
+ //
+ // Access /UniqueGlobalKey/channel_id
+ //
+ bool
+ have_channel_id_params() const
+ {
+ return _channel_id_params.sampling_rate > 0.0;
}
- /**
- * Check if sampling rate exists.
- */
- bool have_sampling_rate() const
+ Channel_Id_Params
+ get_channel_id_params() const
{
- return have_channel_id_params();
+ return _channel_id_params;
}
- /**
- * Get sampling rate.
- */
- double get_sampling_rate() const
+ void
+ add_channel_id_params(Channel_Id_Params const & channel_id_params)
{
- auto channel_id_params = get_channel_id_params();
- return channel_id_params.sampling_rate;
+ _channel_id_params = channel_id_params;
+ _channel_id_params.write(*this, channel_id_path());
}
+ bool
+ have_sampling_rate() const { return have_channel_id_params(); }
+ double
+ get_sampling_rate() const { return _channel_id_params.sampling_rate; }
- /**
- * Check if "/UniqueGlobalKey/tracking_id" attributes exist.
- */
- bool have_tracking_id_params() const
+ //
+ // Access /UniqueGlobalKey/tracking_id
+ //
+ bool
+ have_tracking_id_params() const
{
return Base::group_exists(tracking_id_path());
}
- /**
- * Extract "/UniqueGlobalKey/tracking_id" attributes.
- */
- Tracking_Id_Parameters get_tracking_id_params() const
+ Tracking_Id_Params
+ get_tracking_id_params() const
{
return get_attr_map(tracking_id_path());
}
+ void
+ add_tracking_id_params(Tracking_Id_Params const & tracking_id_params) const
+ {
+ add_attr_map(tracking_id_path(), tracking_id_params);
+ }
- /**
- * Check if sequences attributes exists.
- */
- bool have_sequences_params() const
+ //
+ // Access /Sequences
+ //
+ bool
+ have_sequences_params() const
{
return Base::group_exists(sequences_path());
}
- /**
- * Get sequences attributes.
- */
- Sequences_Parameters get_sequences_params() const
+ Sequences_Params
+ get_sequences_params() const
{
return get_attr_map(sequences_path());
}
+ void
+ add_sequences_params(Sequences_Params const & sequences_params) const
+ {
+ add_attr_map(sequences_path(), sequences_params);
+ }
- /**
- * Get list of raw samples read names.
- */
- const std::vector< std::string >& get_raw_samples_read_name_list() const
+ //
+ // Access Raw Samples
+ //
+ std::vector< std::string > const &
+ get_raw_samples_read_name_list() const
{
- return _raw_samples_read_name_list;
+ return _raw_samples_read_names;
}
- /**
- * Check if raw samples exist.
- * If _rn non-empty, check if raw samples exist for given read.
- */
- bool have_raw_samples(const std::string& _rn = std::string()) const
+ bool
+ have_raw_samples(std::string const & rn = std::string()) const
{
- if (not have_channel_id_params())
- {
- return false;
- }
- auto rn_l = get_raw_samples_read_name_list();
- if (_rn.empty())
+ auto && rn_l = get_raw_samples_read_name_list();
+ return (rn.empty()
+ ? not rn_l.empty()
+ : std::find(rn_l.begin(), rn_l.end(), rn) != rn_l.end());
+ }
+ bool
+ have_raw_samples_unpack(std::string const & rn) const
+ {
+ return Base::dataset_exists(raw_samples_path(rn));
+ }
+ bool
+ have_raw_samples_pack(std::string const & rn) const
+ {
+ return Base::group_exists(raw_samples_pack_path(rn));
+ }
+ Raw_Samples_Params
+ get_raw_samples_params(std::string const & rn = std::string()) const
+ {
+ Raw_Samples_Params res;
+ auto && _rn = fill_raw_samples_read_name(rn);
+ if (have_raw_samples_unpack(_rn))
{
- return not rn_l.empty();
+ res.read(*this, raw_samples_params_path(_rn));
}
else
{
- std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
- return rn_d.count(_rn) > 0;
+ res.read(*this, raw_samples_params_pack_path(_rn));
}
+ return res;
}
- /**
- * Get raw samples attributes for given read name (default: first read name).
- */
- Raw_Samples_Parameters get_raw_samples_params(const std::string& _rn = std::string()) const
+ void
+ add_raw_samples_params(std::string const & rn, Raw_Samples_Params const & params) const
{
- Raw_Samples_Parameters res;
- const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
std::string p = raw_samples_params_path(rn);
- Base::read(p + "/read_id", res.read_id);
- Base::read(p + "/read_number", res.read_number);
- Base::read(p + "/start_mux", res.start_mux);
- Base::read(p + "/start_time", res.start_time);
- Base::read(p + "/duration", res.duration);
- return res;
+ params.write(*this, p);
}
- /**
- * Get raw samples for given read name as ints (default: first read name).
- */
- std::vector< Raw_Samples_Int_Entry > get_raw_samples_int(const std::string& _rn = std::string()) const
+ std::vector< Raw_Int_Sample >
+ get_raw_int_samples(std::string const & rn = std::string()) const
{
- // get raw samples
- std::vector< Raw_Samples_Int_Entry > res;
- const std::string& rn = not _rn.empty()? _rn : get_raw_samples_read_name_list().front();
- Base::read(raw_samples_path(rn), res);
+ std::vector< Raw_Int_Sample > res;
+ auto && _rn = fill_raw_samples_read_name(rn);
+ if (have_raw_samples_unpack(_rn))
+ {
+ Base::read(raw_samples_path(_rn), res);
+ }
+ else if (have_raw_samples_pack(_rn))
+ {
+ auto rs_pack = get_raw_samples_pack(_rn);
+ res = unpack_rw(rs_pack).first;
+ }
return res;
}
- /**
- * Get raw samples for given read name (default: first read name).
- */
- std::vector< Raw_Samples_Entry > get_raw_samples(const std::string& _rn = std::string()) const
+ void
+ add_raw_samples(std::string const & rn, std::vector< Raw_Int_Sample > const & rsi)
+ {
+ Base::write_dataset(raw_samples_path(rn), rsi);
+ reload();
+ }
+ std::vector< Raw_Sample >
+ get_raw_samples(std::string const & rn = std::string()) const
{
// get raw samples
- auto raw_samples_int = get_raw_samples_int(_rn);
- // get scaling parameters
- auto channel_id_params = get_channel_id_params();
+ auto rsi = get_raw_int_samples(rn);
// decode levels
- std::vector< Raw_Samples_Entry > res;
- res.reserve(raw_samples_int.size());
- for (auto int_level : raw_samples_int)
+ std::vector< Raw_Sample > res;
+ res.reserve(rsi.size());
+ for (auto int_level : rsi)
{
- res.push_back((static_cast< float >(int_level) + channel_id_params.offset)
- * channel_id_params.range / channel_id_params.digitisation);
+ res.push_back(raw_sample_to_float(int_level, _channel_id_params));
}
return res;
}
- /**
- * Get list of EventDetection groups.
- */
- const std::vector< std::string >& get_eventdetection_group_list() const
+ //
+ // Access EventDetection groups
+ //
+ std::vector< std::string > const &
+ get_eventdetection_group_list() const
{
- return _eventdetection_group_list;
+ return _eventdetection_groups;
}
- /**
- * Check if any EventDetection groups exist.
- */
- bool have_eventdetection_groups() const
+ bool
+ have_eventdetection_group(std::string const & gr = std::string()) const
{
- return not get_eventdetection_group_list().empty();
+ return (gr.empty()
+ ? not _eventdetection_groups.empty()
+ : _eventdetection_read_names.count(gr));
}
- /**
- * Get list of reads for given EventDetection group (default: first EventDetection group).
- */
- std::vector< std::string > get_eventdetection_read_name_list(const std::string& _ed_gr = std::string()) const
+ std::vector< std::string > const &
+ get_eventdetection_read_name_list(std::string const & gr = std::string()) const
{
- const std::string& ed_gr = not _ed_gr.empty()? _ed_gr : get_eventdetection_group_list().front();
- return detect_eventdetection_read_name_list(ed_gr);
+ static const std::vector< std::string > _empty;
+ auto && _gr = fill_eventdetection_group(gr);
+ return (_eventdetection_read_names.count(_gr)
+ ? _eventdetection_read_names.at(_gr)
+ : _empty);
}
- /**
- * Check if EventDetection events exist.
- * If _ed_gr given: check if events exist for given group; else: check first EventDetection group.
- * If _rn given: check if events exist for given group and read name.
- */
- bool have_eventdetection_events(
- const std::string& _ed_gr = std::string(),
- const std::string& _rn = std::string()) const
+ Attr_Map
+ get_eventdetection_params(std::string const & gr = std::string()) const
{
- std::string ed_gr;
- if (_ed_gr.empty())
- {
- auto ed_gr_l = get_eventdetection_group_list();
- if (ed_gr_l.empty()) return false;
- ed_gr = ed_gr_l.front();
- }
- else
- {
- ed_gr = _ed_gr;
- }
- auto rn_l = get_eventdetection_read_name_list(ed_gr);
- if (_rn.empty())
+ auto && _gr = fill_eventdetection_group(gr);
+ return get_attr_map(eventdetection_group_path(_gr));
+ }
+ void
+ add_eventdetection_params(std::string const & gr, Attr_Map const & am) const
+ {
+ add_attr_map(eventdetection_group_path(gr), am);
+ }
+
+ //
+ // Access EventDetection events
+ //
+ bool
+ have_eventdetection_events(
+ std::string const & gr = std::string(), std::string const & rn = std::string()) const
+ {
+ auto && _gr = fill_eventdetection_group(gr);
+ auto && _rn = fill_eventdetection_read_name(_gr, rn);
+ return (_eventdetection_read_names.count(_gr)
+ and std::find(
+ _eventdetection_read_names.at(_gr).begin(),
+ _eventdetection_read_names.at(_gr).end(),
+ _rn)
+ != _eventdetection_read_names.at(_gr).end());
+ }
+ bool
+ have_eventdetection_events_unpack(std::string const & gr, std::string const & rn) const
+ {
+ return Base::dataset_exists(eventdetection_events_path(gr, rn));
+ }
+ bool
+ have_eventdetection_events_pack(std::string const & gr, std::string const & rn) const
+ {
+ return Base::group_exists(eventdetection_events_pack_path(gr, rn));
+ }
+ EventDetection_Events_Params
+ get_eventdetection_events_params(
+ std::string const & gr = std::string(), std::string const & rn = std::string()) const
+ {
+ EventDetection_Events_Params res;
+ auto && _gr = fill_eventdetection_group(gr);
+ auto && _rn = fill_eventdetection_read_name(_gr, rn);
+ if (have_eventdetection_events_unpack(_gr, _rn))
{
- return not rn_l.empty();
+ res.read(*this, eventdetection_events_params_path(_gr, _rn));
}
- else
+ else if (have_eventdetection_events_pack(_gr, _rn))
{
- std::set< std::string > rn_d(rn_l.begin(), rn_l.end());
- return rn_d.count(_rn) > 0;
+ res.read(*this, eventdetection_events_params_pack_path(_gr, _rn));
}
+ return res;
}
- /**
- * Get EventDetection params for given EventDetection group (default: first EventDetection group).
- */
- std::map< std::string, std::string > get_eventdetection_params(const std::string& _ed_gr = std::string()) const
+ void
+ add_eventdetection_events_params(
+ std::string const & gr, std::string const & rn,
+ EventDetection_Events_Params const & ede_params) const
{
- const std::string& ed_gr = not _ed_gr.empty()? _ed_gr : get_eventdetection_group_list().front();
- return get_attr_map(eventdetection_params_path(ed_gr));
+ auto p = eventdetection_events_params_path(gr, rn);
+ ede_params.write(*this, p);
}
- /**
- * Get EventDetection event params for given EventDetection group, and given read name
- * (default: first EventDetection group, and first read name in it).
- */
- EventDetection_Event_Parameters get_eventdetection_event_params(
- const std::string& _ed_gr = std::string(), const std::string& _rn = std::string()) const
+ std::vector< EventDetection_Event >
+ get_eventdetection_events(
+ std::string const & gr = std::string(), std::string const & rn = std::string()) const
{
- EventDetection_Event_Parameters res;
- const std::string& ed_gr = not _ed_gr.empty()? _ed_gr : get_eventdetection_group_list().front();
- const std::string rn = not _rn.empty()? _rn : get_eventdetection_read_name_list(ed_gr).front();
- auto p = eventdetection_event_params_path(ed_gr, rn);
- auto a_v = Base::get_attr_list(p);
- std::set< std::string > a_s(a_v.begin(), a_v.end());
- Base::read(p + "/read_number", res.read_number);
- Base::read(p + "/scaling_used", res.scaling_used);
- Base::read(p + "/start_mux", res.start_mux);
- Base::read(p + "/start_time", res.start_time);
- Base::read(p + "/duration", res.duration);
- // optional fields
- if (a_s.count("read_id"))
+ std::vector< EventDetection_Event > ede;
+ auto && _gr = fill_eventdetection_group(gr);
+ auto && _rn = fill_eventdetection_read_name(_gr, rn);
+ if (have_eventdetection_events_unpack(_gr, _rn))
{
- Base::read(p + "/read_id", res.read_id);
- }
- if (a_s.count("median_before"))
- {
- Base::read(p + "/median_before", res.median_before);
- }
- else
- {
- res.median_before = -1;
- }
- if (a_s.count("abasic_found"))
- {
- Base::read(p + "/abasic_found", res.abasic_found);
+ auto p = eventdetection_events_path(_gr, _rn);
+ // accept either stdv or variance
+ auto meml = get_struct_members(p);
+ std::set< std::string > mems(meml.begin(), meml.end());
+ if (mems.count("stdv"))
+ {
+ Base::read(p, ede, EventDetection_Event::compound_map());
+ }
+ else if (mems.count("variance"))
+ {
+ Base::read(p, ede, EventDetection_Event::alt_compound_map());
+ for (auto & e : ede)
+ {
+ e.stdv = std::sqrt(e.stdv);
+ }
+ }
+ else
+ {
+ LOG_THROW
+ << "neither stdv nor variance found for ed_gr=" << gr;
+ }
}
- else
+ else if (have_eventdetection_events_pack(_gr, _rn))
{
- res.abasic_found = 0;
+ auto ede_pack = get_eventdetection_events_pack(_gr, _rn);
+ if (not have_raw_samples(_rn))
+ {
+ LOG_THROW_(std::logic_error)
+ << "missing raw samples required to unpack eventdetection events: gr=" << _gr
+ << " rn=" << _rn;
+ }
+ auto rs_ds = get_raw_samples_dataset(_rn);
+ ede = unpack_ed(ede_pack, rs_ds).first;
}
- return res;
+ return ede;
+ } // get_eventdetection_events()
+ void
+ add_eventdetection_events(
+ std::string const & gr, std::string const & rn,
+ std::vector< EventDetection_Event > const & ede)
+ {
+ Base::write_dataset(eventdetection_events_path(gr, rn), ede, EventDetection_Event::compound_map());
+ reload();
}
- /**
- * Get EventDetection events for given EventDetection group, and given read name.
- */
- std::vector< EventDetection_Event_Entry > get_eventdetection_events(
- const std::string& _ed_gr = std::string(), const std::string& _rn = std::string()) const
+
+ //
+ // Access Basecall groups
+ //
+ std::vector< std::string > const &
+ get_basecall_group_list() const
{
- std::vector< EventDetection_Event_Entry > res;
- const std::string& ed_gr = not _ed_gr.empty()? _ed_gr : get_eventdetection_group_list().front();
- const std::string rn = not _rn.empty()? _rn : get_eventdetection_read_name_list(ed_gr).front();
- auto p = eventdetection_events_path(ed_gr, rn);
- auto struct_member_names = Base::get_struct_members(p);
- assert(struct_member_names.size() >= 4);
- bool have_stdv = false;
- bool have_variance = false;
- for (const auto& s : struct_member_names)
- {
- if (s == "stdv") have_stdv = true;
- else if (s == "variance") have_variance = true;
- }
- hdf5_tools::Compound_Map m;
- m.add_member("mean", &EventDetection_Event_Entry::mean);
- m.add_member("start", &EventDetection_Event_Entry::start);
- m.add_member("length", &EventDetection_Event_Entry::length);
- if (have_stdv)
+ return _basecall_groups;
+ }
+ bool
+ have_basecall_group(std::string const & gr = std::string()) const
+ {
+ auto && gr_l = get_basecall_group_list();
+ return (gr.empty()
+ ? not gr_l.empty()
+ : std::find(gr_l.begin(), gr_l.end(), gr) != gr_l.end());
+ }
+ std::vector< std::string > const &
+ get_basecall_strand_group_list(unsigned st) const
+ {
+ return _basecall_strand_groups.at(st);
+ }
+ bool
+ have_basecall_strand_group(unsigned st, std::string const & gr = std::string()) const
+ {
+ auto && gr_l = get_basecall_strand_group_list(st);
+ if (gr.empty())
{
- m.add_member("stdv", &EventDetection_Event_Entry::stdv);
+ return not gr_l.empty();
}
- else if (have_variance)
+ if (not _basecall_group_descriptions.count(gr))
{
- m.add_member("variance", &EventDetection_Event_Entry::stdv);
+ return false;
}
else
{
- // must have stdv or variance
- abort();
- }
- Base::read(p, res, m);
- if (not have_stdv)
- {
- // have read variances
- for (auto& e : res)
- {
- e.stdv = std::sqrt(e.stdv);
- }
+ return _basecall_group_descriptions.at(gr).have_subgroup[st];
}
- return res;
- } // get_eventdetection_events()
-
- /**
- * Get list of all Basecall groups.
- */
- const std::vector< std::string >& get_basecall_group_list() const
+ }
+ Basecall_Group_Description const &
+ get_basecall_group_description(std::string const & gr) const
{
- return _basecall_group_list;
+ return _basecall_group_descriptions.at(gr);
}
- /**
- * Check if any Basecall groups exist.
- */
- bool have_basecall_groups() const
+ std::string const &
+ get_basecall_1d_group(std::string const & gr) const
{
- return not get_basecall_group_list().empty();
+ static std::string const empty;
+ return (_basecall_group_descriptions.count(gr)
+ ? _basecall_group_descriptions.at(gr).bc_1d_gr
+ : empty);
}
- /**
- * Get list of Basecall groups for given strand.
- */
- const std::vector< std::string >& get_basecall_strand_group_list(unsigned st) const
+ std::string const &
+ get_basecall_eventdetection_group(std::string const & gr) const
{
- return _basecall_strand_group_list[st];
+ static std::string const empty;
+ return (_basecall_group_descriptions.count(gr)
+ ? _basecall_group_descriptions.at(gr).ed_gr
+ : empty);
}
- /**
- * Check if any Basecall groups exist for given strand.
- */
- bool have_basecall_strand_groups(unsigned st) const
+
+ //
+ // Access Basecall group params
+ //
+ Attr_Map
+ get_basecall_params(std::string const & gr) const
{
- return not get_basecall_strand_group_list(st).empty();
+ return get_attr_map(basecall_group_path(gr));
}
- /**
- * Get Basecall group params for given Basecall group.
- */
- std::map< std::string, std::string > get_basecall_params(const std::string& bc_gr) const
+ void
+ add_basecall_params(std::string const & gr, Attr_Map const & am) const
{
- return get_attr_map(basecall_root_path() + "/" + basecall_group_prefix() + bc_gr);
+ add_attr_map(basecall_group_path(gr), am);
}
- /**
- * Check if Basecall log exists for given Basecall group.
- */
- bool have_basecall_log(const std::string& bc_gr) const
+ //
+ // Access Basecall group log
+ //
+ bool
+ have_basecall_log(std::string const & gr) const
{
- std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/Log";
- return Base::exists(path);
+ return Base::exists(basecall_log_path(gr));
}
- /**
- * Get Basecall log for given Basecall group.
- */
- std::string get_basecall_log(const std::string& bc_gr) const
+ std::string
+ get_basecall_log(std::string const & gr) const
{
std::string res;
- std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/Log";
- Base::read(path, res);
+ Base::read(basecall_log_path(gr), res);
return res;
}
- /**
- * Check if Basecall fastq exists for given Basecall group and given strand.
- */
- bool have_basecall_fastq(unsigned st, const std::string& _bc_gr = std::string()) const
+ Attr_Map
+ get_basecall_config(std::string const & gr) const
{
- if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- return Base::dataset_exists(basecall_fastq_path(bc_gr, st));
+ Attr_Map res;
+ if (Base::group_exists(basecall_config_path(gr)))
+ {
+ res = get_attr_map(basecall_config_path(gr), true);
+ }
+ return res;
}
- /**
- * Get Basecall fastq for given Basecall group and given strand.
- */
- std::string get_basecall_fastq(unsigned st, const std::string& _bc_gr = std::string()) const
+ Attr_Map
+ get_basecall_summary(std::string const & gr) const
+ {
+ Attr_Map res;
+ if (Base::group_exists(basecall_summary_path(gr)))
+ {
+ res = get_attr_map(basecall_summary_path(gr), true);
+ }
+ return res;
+ }
+
+ //
+ // Access Basecall fastq
+ //
+ bool
+ have_basecall_fastq(unsigned st, std::string const & gr = std::string()) const
+ {
+ auto && _gr = fill_basecall_group(st, gr);
+ return (_basecall_group_descriptions.count(_gr)
+ and _basecall_group_descriptions.at(_gr).have_fastq[st]);
+ }
+ bool
+ have_basecall_fastq_unpack(unsigned st, std::string const & gr) const
+ {
+ return Base::dataset_exists(basecall_fastq_path(gr, st));
+ }
+ bool
+ have_basecall_fastq_pack(unsigned st, std::string const & gr) const
+ {
+ return Base::group_exists(basecall_fastq_pack_path(gr, st));
+ }
+ std::string
+ get_basecall_fastq(unsigned st, std::string const & gr = std::string()) const
{
std::string res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- Base::read(basecall_fastq_path(bc_gr, st), res);
+ auto && _gr = fill_basecall_group(st, gr);
+ if (have_basecall_fastq_unpack(st, _gr))
+ {
+ Base::read(basecall_fastq_path(_gr, st), res);
+ }
+ else if (have_basecall_fastq_pack(st, _gr))
+ {
+ auto fq_pack = get_basecall_fastq_pack(st, _gr);
+ res = unpack_fq(fq_pack);
+ }
return res;
}
- /**
- * Add Basecall fastq
- */
- void add_basecall_fastq(unsigned st, const std::string& bc_gr, const std::string& fq) const
+ void
+ add_basecall_fastq(unsigned st, std::string const & gr, std::string const & fq)
{
- Base::write(basecall_fastq_path(bc_gr, st), true, fq);
+ Base::write(basecall_fastq_path(gr, st), true, fq);
+ reload();
}
- /**
- * Check if Basecall seq exists for given Basecall group and given strand.
- */
- bool have_basecall_seq(unsigned st, const std::string& _bc_gr = std::string()) const
+ bool
+ have_basecall_seq(unsigned st, std::string const & _gr = std::string()) const
{
- return have_basecall_fastq(st, _bc_gr);
+ return have_basecall_fastq(st, _gr);
}
- /**
- * Get Basecall sequence for given Basecall group and given strand.
- */
- std::string get_basecall_seq(unsigned st, const std::string& _bc_gr = std::string()) const
+ std::string
+ get_basecall_seq(unsigned st, std::string const & _gr = std::string()) const
{
- return fq2seq(get_basecall_fastq(st, _bc_gr));
+ return fq2seq(get_basecall_fastq(st, _gr));
}
- /**
- * Add Basecall seq
- */
- void add_basecall_seq(unsigned st, const std::string& bc_gr,
- const std::string& name, const std::string& seq, int default_qual = 33) const
+ void
+ add_basecall_seq(unsigned st, std::string const & gr,
+ std::string const & name, std::string const & seq, int default_qual = 33)
{
std::ostringstream oss;
- oss << '@' << name << std::endl
- << seq << std::endl
- << '+' << std::endl
- << std::string(seq.size(), static_cast< char >(default_qual));
- add_basecall_fastq(st, bc_gr, oss.str());
- }
- /**
- * Check if Basecall model exist for given Basecall group and given strand.
- */
- bool have_basecall_model(unsigned st, const std::string& _bc_gr = std::string()) const
- {
- if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- return Base::dataset_exists(basecall_model_path(bc_gr_1d, st));
- }
- /**
- * Get Basecall model file name for given Basecall group and given strand.
- */
- std::string get_basecall_model_file(unsigned st, const std::string& _bc_gr = std::string()) const
+ oss << "@" << name << "\n"
+ << seq << "\n"
+ << "+\n"
+ << std::string(seq.size(), (char)default_qual);
+ add_basecall_fastq(st, gr, oss.str());
+ reload();
+ }
+
+ //
+ // Access Basecall model
+ //
+ bool
+ have_basecall_model(unsigned st, std::string const & gr = std::string()) const
+ {
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ return (_basecall_group_descriptions.count(gr_1d)
+ and _basecall_group_descriptions.at(gr_1d).have_model[st]);
+ }
+ std::string
+ get_basecall_model_file(unsigned st, std::string const & gr = std::string()) const
{
std::string res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- assert(Base::exists(basecall_model_file_path(bc_gr_1d, st)));
- Base::read(basecall_model_file_path(bc_gr_1d, st), res);
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ Base::read(basecall_model_file_path(gr_1d, st), res);
return res;
}
- void add_basecall_model_file(unsigned st, const std::string& bc_gr, const std::string& file_name) const
- {
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- std::string path = basecall_model_file_path(bc_gr_1d, st);
- Base::write(path, false, file_name);
- }
- /**
- * Get Basecall model parameters for given Basecall group and given strand.
- */
- Model_Parameters get_basecall_model_params(unsigned st, const std::string& _bc_gr = std::string()) const
- {
- Model_Parameters res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- std::string path = basecall_model_path(bc_gr_1d, st);
- Base::read(path + "/scale", res.scale);
- Base::read(path + "/shift", res.shift);
- Base::read(path + "/drift", res.drift);
- Base::read(path + "/var", res.var);
- Base::read(path + "/scale_sd", res.scale_sd);
- Base::read(path + "/var_sd", res.var_sd);
- return res;
+ void
+ add_basecall_model_file(unsigned st, std::string const & gr, std::string const & file_name) const
+ {
+ Base::write_attribute(basecall_model_file_path(gr, st), file_name);
}
- template < typename T >
- void add_basecall_model_params(unsigned st, const std::string& bc_gr, const T& params) const
- {
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- std::string path = basecall_model_path(bc_gr_1d, st);
- Base::write(path + "/scale", false, params.scale);
- Base::write(path + "/shift", false, params.shift);
- Base::write(path + "/drift", false, params.drift);
- Base::write(path + "/var", false, params.var);
- Base::write(path + "/scale_sd", false, params.scale_sd);
- Base::write(path + "/var_sd", false, params.var_sd);
- }
- /**
- * Get Basecall model for given Basecall group and given strand.
- */
- std::vector< Model_Entry > get_basecall_model(unsigned st, const std::string& _bc_gr = std::string()) const
- {
- std::vector< Model_Entry > res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- hdf5_tools::Compound_Map m;
- m.add_member("kmer", &Model_Entry::kmer);
- m.add_member("level_mean", &Model_Entry::level_mean);
- m.add_member("level_stdv", &Model_Entry::level_stdv);
- m.add_member("sd_mean", &Model_Entry::sd_mean);
- m.add_member("sd_stdv", &Model_Entry::sd_stdv);
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- Base::read(basecall_model_path(bc_gr_1d, st), res, m);
- return res;
+ Basecall_Model_Params
+ get_basecall_model_params(unsigned st, std::string const & gr = std::string()) const
+ {
+ Basecall_Model_Params params;
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ std::string path = basecall_model_path(gr_1d, st);
+ params.read(*this, path);
+ return params;
}
- /**
- * Add Basecall model
- */
- template < typename T >
- void add_basecall_model(unsigned st, const std::string& bc_gr, const std::vector< T >& m) const
- {
- hdf5_tools::Compound_Map cm;
- cm.add_member("kmer", &T::kmer);
- cm.add_member("level_mean", &T::level_mean);
- cm.add_member("level_stdv", &T::level_stdv);
- cm.add_member("sd_mean", &T::sd_mean);
- cm.add_member("sd_stdv", &T::sd_stdv);
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- Base::write(basecall_model_path(bc_gr_1d, st), true, m, cm);
- }
- /**
- * Check if Basecall events exist for given Basecall group and given strand.
- */
- bool have_basecall_events(unsigned st, const std::string& _bc_gr = std::string()) const
- {
- if (_bc_gr.empty() and get_basecall_strand_group_list(st).empty()) return false;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- return Base::dataset_exists(basecall_events_path(bc_gr_1d, st));
- }
- /**
- * Get Basecall events for given Basecall group and given strand.
- */
- std::vector< Event_Entry > get_basecall_events(unsigned st, const std::string& _bc_gr = std::string()) const
- {
- std::vector< Event_Entry > res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(st).front();
- hdf5_tools::Compound_Map m;
- m.add_member("mean", &Event_Entry::mean);
- m.add_member("start", &Event_Entry::start);
- m.add_member("stdv", &Event_Entry::stdv);
- m.add_member("length", &Event_Entry::length);
- m.add_member("p_model_state", &Event_Entry::p_model_state);
- m.add_member("model_state", &Event_Entry::model_state);
- m.add_member("move", &Event_Entry::move);
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- Base::read(basecall_events_path(bc_gr_1d, st), res, m);
- return res;
+ void
+ add_basecall_model_params(unsigned st, std::string const & gr, Basecall_Model_Params const & params) const
+ {
+ std::string path = basecall_model_path(gr, st);
+ params.write(*this, path);
+ }
+ std::vector< Basecall_Model_State >
+ get_basecall_model(unsigned st, std::string const & gr = std::string()) const
+ {
+ std::vector< Basecall_Model_State > mod;
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ Base::read(basecall_model_path(gr_1d, st), mod, Basecall_Model_State::compound_map());
+ return mod;
}
- /**
- * Add Basecall events
- */
template < typename T >
- void add_basecall_events(unsigned st, const std::string& bc_gr, const std::vector< T >& ev) const
- {
- hdf5_tools::Compound_Map cm;
- cm.add_member("mean", &T::mean);
- cm.add_member("start", &T::start);
- cm.add_member("stdv", &T::stdv);
- cm.add_member("length", &T::length);
- cm.add_member("p_model_state", &T::p_model_state);
- cm.add_member("model_state", &T::model_state);
- cm.add_member("move", &T::move);
- auto bc_gr_1d = get_basecall_group_1d(bc_gr);
- Base::write(basecall_events_path(bc_gr_1d, st), true, ev, cm);
- }
- /**
- * Check if Basecall event alignment exist for given Basecall group.
- */
- bool have_basecall_event_alignment(const std::string& _bc_gr = std::string()) const
- {
- if (_bc_gr.empty() and get_basecall_strand_group_list(2).empty()) return false;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(2).front();
- return Base::dataset_exists(basecall_event_alignment_path(bc_gr));
- }
- /**
- * Get Basecall events for given Basecall group.
- */
- std::vector< Event_Alignment_Entry > get_basecall_event_alignment(const std::string& _bc_gr = std::string()) const
- {
- std::vector< Event_Alignment_Entry > res;
- const std::string& bc_gr = not _bc_gr.empty()? _bc_gr : get_basecall_strand_group_list(2).front();
- hdf5_tools::Compound_Map m;
- m.add_member("template", &Event_Alignment_Entry::template_index);
- m.add_member("complement", &Event_Alignment_Entry::complement_index);
- m.add_member("kmer", &Event_Alignment_Entry::kmer);
- Base::read(basecall_event_alignment_path(bc_gr), res, m);
- return res;
+ void add_basecall_model(unsigned st, std::string const & gr, std::vector< T > const & mod) // Write model states `mod` for strand `st`, then refresh the caches.
+ {
+ auto && gr_1d = get_basecall_1d_group(gr);
+ Base::write_dataset(basecall_model_path(gr_1d, st), mod, Basecall_Model_State::compound_map()); // NOTE(review): map comes from Basecall_Model_State although elements are of type T (add_basecall_events uses T::compound_map()) - confirm this is intended
+ reload(); // cached group descriptions must pick up the new dataset
}
- /**
- * Get basecall group holding 1d calls.
- */
- std::string get_basecall_group_1d(const std::string& bc_gr) const
+ //
+ // Access Basecall events
+ //
+ bool // Report whether basecall events exist for strand `st`; answered from the cached group descriptions.
+ have_basecall_events(unsigned st, std::string const & gr = std::string()) const
{
- std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/basecall_1d";
- if (Base::attribute_exists(path))
+ auto && gr_1d = fill_basecall_1d_group(st, gr); // fill in a default for an empty `gr`, then map to its 1d group
+ return (_basecall_group_descriptions.count(gr_1d) // unknown group yields false before .at() is reached
+ and _basecall_group_descriptions.at(gr_1d).have_events[st]);
+ }
+ bool // True if the events dataset for (`gr`, `st`) exists in the file (checked directly, not via the cache).
+ have_basecall_events_unpack(unsigned st, std::string const & gr) const
+ {
+ return Base::dataset_exists(basecall_events_path(gr, st));
+ }
+ bool // True if the packed-events group for (`gr`, `st`) exists in the file (checked directly, not via the cache).
+ have_basecall_events_pack(unsigned st, std::string const & gr) const
+ {
+ return Base::group_exists(basecall_events_pack_path(gr, st));
+ }
+ Basecall_Events_Params // Read the events params for strand `st` from the unpacked dataset if present, else from the packed group.
+ get_basecall_events_params(unsigned st, std::string const & gr = std::string()) const
+ {
+ Basecall_Events_Params bce_params; // returned as constructed when neither form exists
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ if (have_basecall_events_unpack(st, gr_1d))
{
- std::string tmp;
- Base::read(path, tmp);
- auto tmp1 = tmp.substr(0, 18);
- auto tmp2 = tmp.substr(18);
- if (tmp1 == "Analyses/Basecall_"
- and Base::group_exists(basecall_root_path() + "/" + basecall_group_prefix() + tmp2))
- {
- return tmp2;
- }
+ bce_params.read(*this, basecall_events_path(gr_1d, st));
+ }
+ else if (have_basecall_events_pack(st, gr_1d))
+ {
+ bce_params.read(*this, basecall_events_params_pack_path(gr_1d, st)); // packed files keep the params under a separate path
}
- return bc_gr;
+ return bce_params;
}
- /**
- * Get EventDetection group for given Basecall group, if available.
- */
- std::string get_basecall_eventdetection_group(const std::string& bc_gr) const
+ void // Write `bce_params` at the events dataset path of (`gr`, `st`); throws if that dataset does not exist yet.
+ add_basecall_events_params(unsigned st, std::string const & gr,
+ Basecall_Events_Params const & bce_params) const
{
- std::string path = basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/event_detection";
- if (Base::attribute_exists(path))
+ auto path = basecall_events_path(gr, st);
+ if (not Base::dataset_exists(path)) // params are written at the dataset's own path, so the dataset must come first
{
- std::string tmp;
- Base::read(path, tmp);
- auto pos = tmp.find(eventdetection_group_prefix());
- if (pos != std::string::npos)
+ LOG_THROW
+ << "basecall events must be added before their params";
+ }
+ bce_params.write(*this, path);
+ }
+ std::vector< Basecall_Event > // Return basecall events for strand `st`, reading them directly or reconstructing them from packed form.
+ get_basecall_events(unsigned st, std::string const & gr = std::string()) const
+ {
+ std::vector< Basecall_Event > res; // stays empty when neither unpacked nor packed events exist
+ auto && gr_1d = fill_basecall_1d_group(st, gr);
+ if (have_basecall_events_unpack(st, gr_1d))
+ {
+ Base::read(basecall_events_path(gr_1d, st), res, Basecall_Event::compound_map());
+ }
+ else if (have_basecall_events_pack(st, gr_1d))
+ {
+ auto ev_pack = get_basecall_events_pack(st, gr_1d);
+ if (not have_basecall_seq(st, gr_1d)) // unpacking needs the base sequence; throws std::logic_error if absent
+ {
+ LOG_THROW_(std::logic_error)
+ << "missing fastq required to unpack basecall events: st=" << st
+ << " gr=" << gr_1d;
+ }
+ auto sq = get_basecall_seq(st, gr_1d);
+ if (not ev_pack.ed_gr.empty()) // packed relative to the eventdetection events of group ed_gr
{
- pos += eventdetection_group_prefix().size();
- auto end_pos = tmp.find("/", pos);
- if (end_pos == std::string::npos)
+ if (not have_eventdetection_events(ev_pack.ed_gr))
{
- end_pos = tmp.size();
+ LOG_THROW_(std::logic_error)
+ << "missing eventdetection events required to unpack basecall events: st=" << st
+ << " gr=" << gr_1d
+ << " ed_gr=" << ev_pack.ed_gr;
}
- return tmp.substr(pos, end_pos - pos);
+ auto ed = get_eventdetection_events(ev_pack.ed_gr);
+ res = unpack_ev(ev_pack, sq, ed, _channel_id_params).first; // only the events (.first) are returned
}
+ else // ed_gr == "": packed relative to raw samples
+ {
+ if (not have_raw_samples())
+ {
+ LOG_THROW_(std::logic_error)
+ << "missing raw samples required to unpack basecall events: st=" << st
+ << " gr=" << gr_1d;
+ }
+ auto rs_ds = get_raw_samples_dataset();
+ auto ed = unpack_implicit_ed(ev_pack, rs_ds); // rebuild ed-style events from the pack plus raw samples
+ res = unpack_ev(ev_pack, sq, ed, _channel_id_params).first;
+ }
+ }
+ return res;
+ }
+ template < typename T > // T must provide a static compound_map()
+ void // Write events `ev` as the events dataset of (`gr`, `st`), then refresh the caches.
+ add_basecall_events(unsigned st, std::string const & gr, std::vector< T > const & ev)
+ {
+ Base::write_dataset(basecall_events_path(gr, st), ev, T::compound_map());
+ reload();
+ }
+
+ //
+ // Access Basecall alignment
+ //
+ bool // Report whether a basecall alignment exists; answered from the cached group descriptions.
+ have_basecall_alignment(std::string const & gr = std::string()) const
+ {
+ auto && _gr = fill_basecall_group(2, gr); // an empty `gr` defaults to the first group listed for strand index 2
+ return (_basecall_group_descriptions.count(_gr) // unknown group yields false before .at() is reached
+ and _basecall_group_descriptions.at(_gr).have_alignment);
+ }
+ bool // True if the alignment dataset of group `gr` exists in the file.
+ have_basecall_alignment_unpack(std::string const & gr) const
+ {
+ return Base::dataset_exists(basecall_alignment_path(gr));
+ }
+ bool // True if the packed-alignment group of group `gr` exists in the file.
+ have_basecall_alignment_pack(std::string const & gr) const
+ {
+ return Base::group_exists(basecall_alignment_pack_path(gr));
+ }
+ std::vector< Basecall_Alignment_Entry > // Return the basecall alignment, read directly or reconstructed from packed form.
+ get_basecall_alignment(std::string const & gr = std::string()) const
+ {
+ std::vector< Basecall_Alignment_Entry > al; // stays empty if nothing usable is found
+ auto && _gr = fill_basecall_group(2, gr);
+ if (have_basecall_alignment_unpack(_gr))
+ {
+ Base::read(basecall_alignment_path(_gr), al, Basecall_Alignment_Entry::compound_map());
+ }
+ else if (have_basecall_alignment_pack(_gr)
+ and have_basecall_seq(2, _gr)) // unlike get_basecall_events, a missing sequence yields an empty result rather than a throw
+ {
+ auto al_pack = get_basecall_alignment_pack(_gr);
+ auto seq = get_basecall_seq(2, _gr);
+ al = unpack_al(al_pack, seq);
}
- return std::string();
+ return al;
+ }
+ void // Write alignment `al` as the alignment dataset of group `gr`, then refresh the caches.
+ add_basecall_alignment(std::string const & gr, std::vector< Basecall_Alignment_Entry > const & al)
+ {
+ Base::write_dataset(basecall_alignment_path(gr), al, Basecall_Alignment_Entry::compound_map());
+ reload();
}
- static std::string fq2seq(const std::string& fq)
+ //
+ // Static helpers
+ //
+ static inline long long // Convert a floating-point time to an integer by scaling with the sampling rate.
+ time_to_int(double tf, Channel_Id_Params const & cid_params)
+ {
+ return tf * cid_params.sampling_rate; // implicit conversion to long long drops the fractional part
+ }
+ static inline double // Convert an integer time back to floating point by dividing by the sampling rate.
+ time_to_float(long long ti, Channel_Id_Params const & cid_params)
+ {
+ return ((long double)ti + .5) / cid_params.sampling_rate; // adding .5 means truncation in time_to_int maps the result back to `ti` - presumably the intent; confirm
+ }
+ static inline float // Convert an integer raw sample to float: (si + offset) * range / digitisation.
+ raw_sample_to_float(int si, Channel_Id_Params const & cid_params)
+ {
+ return ((float)si + cid_params.offset)
+ * cid_params.range / cid_params.digitisation;
+ }
+ static std::string // Return element 1 of split_fq(fq) (used elsewhere in this file as the base sequence).
+ fq2seq(std::string const & fq)
{
return split_fq(fq)[1];
}
- static std::array< std::string, 4 > split_fq(const std::string& fq)
+ static std::array< std::string, 4 >
+ split_fq(std::string const & fq)
{
std::array< std::string, 4 > res = {{"", "", "", ""}};
size_t i = 0;
@@ -861,193 +1382,1108 @@ public:
}
private:
- void detect_raw_samples_read_name_list()
+ friend struct File_Packer;
+
+ //
+ // Cached file data
+ //
+ Channel_Id_Params _channel_id_params;
+ std::vector< std::string > _raw_samples_read_names;
+ std::vector< std::string > _eventdetection_groups;
+ std::map< std::string, std::vector< std::string > > _eventdetection_read_names;
+ std::vector< std::string > _basecall_groups;
+ std::map< std::string, Basecall_Group_Description > _basecall_group_descriptions;
+ std::array< std::vector< std::string >, 3 > _basecall_strand_groups;
+
+ //
+ // Cache updaters
+ //
+ void // Rebuild all cached file data; the add_* writers call this after modifying the file.
+ reload()
+ {
+ load_channel_id_params();
+ load_raw_samples_read_names();
+ load_eventdetection_groups();
+ load_basecall_groups();
+ }
+ void // Read the channel id params into the cache when the channel_id group exists.
+ load_channel_id_params()
+ {
+ if (not Base::group_exists(channel_id_path())) return; // cached value is left untouched in this case (not reset)
+ _channel_id_params.read(*this, channel_id_path());
+ }
+ void // Rebuild the cached list of read names that have raw samples, unpacked or packed.
+ load_raw_samples_read_names()
{
+ _raw_samples_read_names.clear(); // start from scratch so repeated reloads do not accumulate entries
if (not Base::group_exists(raw_samples_root_path())) return;
- auto rn_list = Base::list_group(raw_samples_root_path());
- for (const auto& rn : rn_list)
+ auto rn_l = Base::list_group(raw_samples_root_path());
+ for (auto const & rn : rn_l)
{
- if (not Base::dataset_exists(raw_samples_path(rn))) continue;
- _raw_samples_read_name_list.push_back(rn);
+ if (have_raw_samples_unpack(rn)
+ or have_raw_samples_pack(rn))
+ {
+ _raw_samples_read_names.push_back(rn);
+ }
}
}
-
- void detect_eventdetection_group_list()
+ void // Rebuild the cached eventdetection group list and the per-group read name lists.
+ load_eventdetection_groups()
{
+ _eventdetection_groups.clear();
+ _eventdetection_read_names.clear();
if (not Base::group_exists(eventdetection_root_path())) return;
- auto g_list = Base::list_group(eventdetection_root_path());
- for (const auto& g : g_list)
+ auto ed_gr_prefix = eventdetection_group_prefix();
+ auto gr_l = Base::list_group(eventdetection_root_path());
+ for (auto const & g : gr_l)
{
- if (g.size() <= eventdetection_group_prefix().size()) continue;
- auto p = std::mismatch(eventdetection_group_prefix().begin(),
- eventdetection_group_prefix().end(),
- g.begin());
- if (p.first != eventdetection_group_prefix().end()) continue;
- _eventdetection_group_list.emplace_back(p.second, g.end());
+ if (g.substr(0, ed_gr_prefix.size()) != ed_gr_prefix) continue; // skip entries lacking the group prefix
+ std::string gr = g.substr(ed_gr_prefix.size()); // group name is what follows the prefix (may be empty if g equals the prefix)
+ _eventdetection_groups.push_back(gr);
+ _eventdetection_read_names[gr] = detect_eventdetection_read_names(gr);
}
}
-
- std::vector< std::string > detect_eventdetection_read_name_list(const std::string& ed_gr) const
+ std::vector< std::string > // List the reads under eventdetection group `gr` that have events, unpacked or packed.
+ detect_eventdetection_read_names(std::string const & gr) const
{
std::vector< std::string > res;
- std::string p = eventdetection_root_path() + "/" + eventdetection_group_prefix() + ed_gr + "/Reads";
+ std::string p = eventdetection_root_path() + "/" + eventdetection_group_prefix() + gr + "/Reads";
if (not Base::group_exists(p)) return res;
- auto rn_list = Base::list_group(p);
- for (const auto& rn : rn_list)
+ auto rn_l = Base::list_group(p);
+ for (auto const & rn : rn_l)
{
- if (not Base::dataset_exists(p + "/" + rn + "/Events")) continue;
- res.push_back(rn);
+ if (have_eventdetection_events_unpack(gr, rn)
+ or have_eventdetection_events_pack(gr, rn))
+ {
+ res.push_back(rn);
+ }
}
return res;
}
-
- void detect_basecall_group_list()
+ void // Rebuild the cached basecall group list, per-strand group lists and per-group descriptions.
+ load_basecall_groups()
{
+ _basecall_groups.clear();
+ _basecall_group_descriptions.clear();
+ std::for_each(
+ _basecall_strand_groups.begin(), _basecall_strand_groups.end(),
+ [] (decltype(_basecall_strand_groups)::value_type & v) {
+ v.clear();
+ });
if (not Base::group_exists(basecall_root_path())) return;
- auto g_list = Base::list_group(basecall_root_path());
- for (const auto& g : g_list)
- {
- if (g.size() <= basecall_group_prefix().size()) continue;
- auto p = std::mismatch(basecall_group_prefix().begin(),
- basecall_group_prefix().end(),
- g.begin());
- if (p.first != basecall_group_prefix().end()) continue;
- _basecall_group_list.emplace_back(p.second, g.end());
+ auto bc_gr_prefix = basecall_group_prefix();
+ auto gr_l = Base::list_group(basecall_root_path());
+ for (auto const & g : gr_l)
+ {
+ if (g.substr(0, bc_gr_prefix.size()) != bc_gr_prefix) continue;
+ // found basecall group
+ std::string gr = g.substr(bc_gr_prefix.size());
+ _basecall_groups.push_back(gr);
+ // name and version
+ _basecall_group_descriptions[gr] = detect_basecall_group_id(gr);
+ auto & bc_desc = _basecall_group_descriptions.at(gr); // remaining fields are filled in place below
+ // subgroups
for (unsigned st = 0; st < 3; ++st)
{
- if (Base::group_exists(basecall_root_path() + "/" + g + "/" + basecall_strand_subgroup(st)))
+ bc_desc.have_subgroup[st] =
+ Base::group_exists(basecall_strand_group_path(gr, st));
+ if (bc_desc.have_subgroup[st])
{
- _basecall_strand_group_list[st].emplace_back(p.second, g.end());
+ _basecall_strand_groups[st].push_back(gr);
+ // fastq
+ bc_desc.have_fastq[st] =
+ have_basecall_fastq_unpack(st, gr) or
+ have_basecall_fastq_pack(st, gr);
+ // events
+ bc_desc.have_events[st] =
+ have_basecall_events_unpack(st, gr) or
+ have_basecall_events_pack(st, gr);
+ if (st == 0) // ed_gr is only detected for groups that have the strand-0 subgroup
+ {
+ // ed_gr
+ bc_desc.ed_gr = detect_basecall_eventdetection_group(gr);
+ }
+ if (st == 2) // alignment is only looked for in groups that have the strand-2 subgroup
+ {
+ // alignment
+ bc_desc.have_alignment =
+ have_basecall_alignment_unpack(gr)
+ or have_basecall_alignment_pack(gr);
+ }
}
}
+ // bc_1d_gr
+ if (bc_desc.have_subgroup[0] or bc_desc.have_subgroup[1]) // group holds 1d calls itself
+ {
+ bc_desc.bc_1d_gr = gr;
+ }
+ else if (bc_desc.have_subgroup[2]) // otherwise follow the group's basecall_1d attribute
+ {
+ bc_desc.bc_1d_gr = detect_basecall_1d_group(gr);
+ }
+ // model
+ for (unsigned st = 0; st < 2; ++st) // models are only checked for strands 0 and 1
+ {
+ bc_desc.have_model[st] =
+ not bc_desc.bc_1d_gr.empty()
+ and Base::dataset_exists(basecall_model_path(bc_desc.bc_1d_gr, st));
+ }
}
}
-
- std::map< std::string, std::string > get_attr_map(const std::string& path) const
+ Basecall_Group_Description // Derive a short basecaller name and version string from the "name" entry of the group's basecall params.
+ detect_basecall_group_id(std::string const & gr) const
{
- std::map< std::string, std::string > res;
- auto a_list = Base::get_attr_list(path);
- for (const auto& a : a_list)
+ Basecall_Group_Description res;
+ res.name = "?"; // "?" marks an unrecognised name or version
+ res.version = "?";
+ auto am = get_basecall_params(gr);
+ if (am.count("name"))
+ {
+ if (am.at("name") == "ONT Sequencing Workflow")
+ {
+ res.name = "metrichor";
+ res.version = (am.count("chimaera version")? am.at("chimaera version") : "?") + "+" + // version is "<chimaera>+<dragonet>"
+ (am.count("dragonet version")? am.at("dragonet version") : "?");
+ }
+ else if (am.at("name") == "MinKNOW-Live-Basecalling")
+ {
+ res.name = "minknow";
+ res.version = (am.count("version")? am.at("version") : "?");
+ }
+ else if (am.at("name") == "ONT Albacore Sequencing Software")
+ {
+ res.name = "albacore";
+ res.version = (am.count("version")? am.at("version") : "?");
+ }
+ }
+ return res;
+ }
+ std::string // Resolve the group holding the 1d calls for `gr` via its "basecall_1d" attribute; falls back to `gr` itself.
+ detect_basecall_1d_group(std::string const & gr) const
+ {
+ std::string path = basecall_group_path(gr) + "/basecall_1d";
+ if (Base::attribute_exists(path))
{
std::string tmp;
- Base::read(path + "/" + a, tmp);
- res[a] = tmp;
+ Base::read(path, tmp);
+ auto pref = basecall_root_path().substr(1) + "/" + basecall_group_prefix(); // substr(1) drops the first character of the root path before comparing
+ if (tmp.size() >= pref.size()
+ and tmp.substr(0, pref.size()) == pref)
+ {
+ auto gr_1d = tmp.substr(pref.size());
+ if (have_basecall_group(gr_1d)) // only accept a target that is a known basecall group
+ {
+ return gr_1d;
+ }
+ }
}
+ return gr;
+ }
+ std::string // Find the eventdetection group associated with basecall group `gr`; returns "" if none is found.
+ detect_basecall_eventdetection_group(std::string const & gr) const
+ {
+ auto bc_params = get_basecall_params(gr);
+ if (bc_params.count("event_detection")) // first source: the "event_detection" basecall param
+ {
+ auto && tmp = bc_params.at("event_detection");
+ auto pref = eventdetection_root_path().substr(1) + "/" + eventdetection_group_prefix();
+ if (tmp.substr(0, pref.size()) == pref)
+ {
+ auto ed_gr = tmp.substr(pref.size());
+ if (have_eventdetection_group(ed_gr))
+ {
+ return ed_gr;
+ }
+ }
+ }
+ if (have_basecall_events_pack(0, gr)) // second source: ed_gr recorded in the packed strand-0 events
+ {
+ auto ev_pack = get_basecall_events_pack(0, gr);
+ auto ed_gr = ev_pack.ed_gr;
+ if (have_eventdetection_group(ed_gr))
+ {
+ return ed_gr;
+ }
+ }
+ return "";
+ }
+ double // Read median_sd_temp from the segmentation summary linked by group `gr`; returns 0.0 if any link in the chain is missing.
+ get_basecall_median_sd_temp(std::string const & gr) const
+ {
+ std::string segmentation_link_path = basecall_group_path(gr) + "/segmentation";
+ if (not Base::attribute_exists(segmentation_link_path)) return 0.0;
+ std::string segmentation_path;
+ Base::read(segmentation_link_path, segmentation_path); // attribute value is a path without a leading "/"
+ std::string median_sd_temp_path = "/" + segmentation_path + "/Summary/split_hairpin/median_sd_temp";
+ if (not Base::attribute_exists(median_sd_temp_path)) return 0.0;
+ double res;
+ Base::read(median_sd_temp_path, res);
return res;
}
- // list of read names for which we have raw samples
- std::vector< std::string > _raw_samples_read_name_list;
-
- // list of EventDetection groups
- std::vector< std::string > _eventdetection_group_list;
-
- // list of Basecall groups
- std::vector< std::string > _basecall_group_list;
-
- // list of per-strand Basecall groups; 0/1/2 = template/complement/2d
- std::array< std::vector< std::string >, 3 > _basecall_strand_group_list;
-
- // static paths
- static const std::string& file_version_path()
+ //
+ // Functions that fill in empty arguments with default values
+ //
+ std::string const & // Return `rn` unless it is empty and a cached read name exists, in which case return the first cached one.
+ fill_raw_samples_read_name(std::string const & rn) const
+ {
+ return (not rn.empty() or _raw_samples_read_names.empty() // result may alias the argument, so it must not outlive `rn`
+ ? rn
+ : _raw_samples_read_names.front());
+ }
+ std::string const & // Return `gr` unless it is empty and a cached eventdetection group exists, in which case return the first cached one.
+ fill_eventdetection_group(std::string const & gr) const
+ {
+ return (not gr.empty() or _eventdetection_groups.empty() // result may alias the argument, so it must not outlive `gr`
+ ? gr
+ : _eventdetection_groups.front());
+ }
+ std::string const & // Return `rn` unless it is empty and group `gr` has cached read names, in which case return the first of them.
+ fill_eventdetection_read_name(std::string const & gr, std::string const & rn) const
+ {
+ return (not rn.empty()
+ or _eventdetection_read_names.count(gr) == 0 // unknown group: keep `rn` so .at() below is never reached
+ or _eventdetection_read_names.at(gr).empty()
+ ? rn
+ : _eventdetection_read_names.at(gr).front());
+ }
+ std::string const & // Return `gr` unless it is empty and strand `st` has cached groups, in which case return the first of them.
+ fill_basecall_group(unsigned st, std::string const & gr) const
+ {
+ return (not gr.empty()
+ or _basecall_strand_groups.at(st).empty() // std::array::at throws std::out_of_range for st >= 3
+ ? gr
+ : _basecall_strand_groups.at(st).front());
+ }
+ std::string const & // Fill in a default basecall group for strand `st`, then map it to its 1d group.
+ fill_basecall_1d_group(unsigned st, std::string const & gr) const
{
- static const std::string _file_version_path = "/file_version";
- return _file_version_path;
+ auto && _gr = fill_basecall_group(st, gr);
+ return get_basecall_1d_group(_gr); // NOTE(review): returning by reference assumes get_basecall_1d_group returns a reference to long-lived storage - confirm
}
- static const std::string& channel_id_path()
+ //
+ // Packing interface
+ //
+ Raw_Samples_Pack // Read the packed raw samples of read `rn`.
+ get_raw_samples_pack(std::string const & rn) const
+ {
+ Raw_Samples_Pack rs_pack;
+ auto path = raw_samples_pack_path(rn);
+ rs_pack.read(*this, path);
+ return rs_pack;
+ }
+ void // Write packed raw samples for read `rn`, then refresh the caches.
+ add_raw_samples(std::string const & rn, Raw_Samples_Pack const & rs_pack)
+ {
+ auto path = raw_samples_pack_path(rn);
+ rs_pack.write(*this, path);
+ reload();
+ }
+ Raw_Int_Samples_Dataset // Return the integer raw samples (first) and their params (second) for read `rn`.
+ get_raw_int_samples_dataset(std::string const & rn = std::string()) const
+ {
+ Raw_Int_Samples_Dataset res;
+ auto && _rn = fill_raw_samples_read_name(rn); // an empty `rn` defaults to the first cached read name
+ res.first = get_raw_int_samples(_rn);
+ res.second = get_raw_samples_params(_rn);
+ return res;
+ }
+ Raw_Samples_Dataset // Return the raw samples (first) and their params (second) for read `rn`.
+ get_raw_samples_dataset(std::string const & rn = std::string()) const
+ {
+ Raw_Samples_Dataset res;
+ auto && _rn = fill_raw_samples_read_name(rn); // an empty `rn` defaults to the first cached read name
+ res.first = get_raw_samples(_rn);
+ res.second = get_raw_samples_params(_rn);
+ return res;
+ }
+ void // Write integer raw samples and then their params for read `rn`.
+ add_raw_samples_dataset(std::string const & rn, Raw_Int_Samples_Dataset const & rsi_ds)
+ {
+ add_raw_samples(rn, rsi_ds.first);
+ add_raw_samples_params(rn, rsi_ds.second);
+ }
+ EventDetection_Events_Pack // Read the packed eventdetection events of group `gr`, read `rn`.
+ get_eventdetection_events_pack(
+ std::string const & gr, std::string const & rn) const
+ {
+ EventDetection_Events_Pack ede_pack;
+ ede_pack.read(*this, eventdetection_events_pack_path(gr, rn));
+ return ede_pack;
+ }
+ void // Write packed eventdetection events for group `gr`, read `rn`, then refresh the caches.
+ add_eventdetection_events(
+ std::string const & gr, std::string const & rn,
+ EventDetection_Events_Pack const & ede_pack)
+ {
+ ede_pack.write(*this, eventdetection_events_pack_path(gr, rn));
+ reload();
+ }
+ EventDetection_Events_Dataset // Return the eventdetection events (first) and their params (second) for group `gr`, read `rn`.
+ get_eventdetection_events_dataset(
+ std::string const & gr, std::string const & rn) const
{
- static const std::string _channel_id_path = "/UniqueGlobalKey/channel_id";
- return _channel_id_path;
+ EventDetection_Events_Dataset ede_ds;
+ ede_ds.first = get_eventdetection_events(gr, rn);
+ ede_ds.second = get_eventdetection_events_params(gr, rn);
+ return ede_ds;
}
- static const std::string& tracking_id_path()
+ void // Write eventdetection events and then their params for group `gr`, read `rn`.
+ add_eventdetection_events_dataset(
+ std::string const & gr, std::string const & rn,
+ EventDetection_Events_Dataset const & ede_ds)
{
- static const std::string _tracking_id_path = "/UniqueGlobalKey/tracking_id";
- return _tracking_id_path;
+ add_eventdetection_events(gr, rn, ede_ds.first);
+ add_eventdetection_events_params(gr, rn, ede_ds.second);
}
- static const std::string& raw_samples_root_path()
+ //
+ Basecall_Fastq_Pack // Read the packed fastq of group `gr`, strand `st`.
+ get_basecall_fastq_pack(unsigned st, std::string const & gr) const
{
- static const std::string _raw_samples_root_path = "/Raw/Reads";
- return _raw_samples_root_path;
+ Basecall_Fastq_Pack fq_pack;
+ auto p = basecall_fastq_pack_path(gr, st);
+ fq_pack.read(*this, p);
+ return fq_pack;
}
- static std::string raw_samples_params_path(const std::string& rn)
+ void // Write a packed fastq for group `gr`, strand `st`, then refresh the caches.
+ add_basecall_fastq(unsigned st, std::string const & gr, Basecall_Fastq_Pack const & fq_pack)
+ {
+ auto p = basecall_fastq_pack_path(gr, st);
+ fq_pack.write(*this, p);
+ reload();
+ }
+ //
+ Basecall_Events_Pack // Read the packed basecall events of group `gr`, strand `st`.
+ get_basecall_events_pack(unsigned st, std::string const & gr) const
+ {
+ auto p = basecall_events_pack_path(gr, st);
+ Basecall_Events_Pack ev_pack;
+ ev_pack.read(*this, p);
+ return ev_pack;
+ }
+ void // Write packed basecall events for group `gr`, strand `st`, then refresh the caches.
+ add_basecall_events(unsigned st, std::string const & gr, Basecall_Events_Pack const & ev_pack)
+ {
+ auto p = basecall_events_pack_path(gr, st);
+ ev_pack.write(*this, p);
+ reload();
+ }
+ Basecall_Events_Dataset // Return the basecall events (first) and their params (second) for group `gr`, strand `st`.
+ get_basecall_events_dataset(unsigned st, std::string const & gr) const
+ {
+ Basecall_Events_Dataset bce_ds;
+ bce_ds.first = get_basecall_events(st, gr);
+ bce_ds.second = get_basecall_events_params(st, gr);
+ return bce_ds;
+ }
+ void // Write basecall events and then their params for group `gr`, strand `st`.
+ add_basecall_events_dataset(unsigned st, std::string const & gr, Basecall_Events_Dataset const & bce_ds)
+ {
+ add_basecall_events(st, gr, bce_ds.first); // events first: add_basecall_events_params throws if the dataset is missing
+ add_basecall_events_params(st, gr, bce_ds.second);
+ }
+ //
+ Basecall_Alignment_Pack // Read the packed basecall alignment of group `gr`.
+ get_basecall_alignment_pack(std::string const & gr) const
+ {
+ Basecall_Alignment_Pack al_pack;
+ auto p = basecall_alignment_pack_path(gr);
+ al_pack.read(*this, p);
+ return al_pack;
+ }
+ void // Write a packed basecall alignment for group `gr`, then refresh the caches.
+ add_basecall_alignment(std::string const & gr, Basecall_Alignment_Pack const & al_pack)
+ {
+ auto p = basecall_alignment_pack_path(gr);
+ al_pack.write(*this, p);
+ reload();
+ }
+
+ //
+ // Packers & Unpackers
+ //
+ static Raw_Samples_Pack // Pack integer raw samples: copy the params and encode the signal with rw_coder().
+ pack_rw(Raw_Int_Samples_Dataset const & rsi_ds)
+ {
+ Raw_Samples_Pack rsp;
+ rsp.params = rsi_ds.second;
+ std::tie(rsp.signal, rsp.signal_params) = rw_coder().encode(rsi_ds.first, true); // NOTE(review): meaning of the `true` flag is defined by the coder, not visible here
+ return rsp;
+ }
+ static Raw_Int_Samples_Dataset // Inverse of pack_rw: copy the params and decode the signal with rw_coder().
+ unpack_rw(Raw_Samples_Pack const & rs_pack)
+ {
+ Raw_Int_Samples_Dataset rsi_ds;
+ rsi_ds.second = rs_pack.params;
+ rsi_ds.first = rw_coder().decode< Raw_Int_Sample >(rs_pack.signal, rs_pack.signal_params);
+ return rsi_ds;
+ }
+ static std::pair< std::vector< long long >, std::vector< long long > > // Returns (skip, len): skip[i] is the gap between the end of event i-1 (or start_time for i == 0) and the start of event i.
+ pack_event_start_length(
+ unsigned num_events,
+ std::function< long long(unsigned) > get_start, // get_start(i): start of event i
+ std::function< long long(unsigned) > get_length, // get_length(i): length of event i
+ long long start_time) // reference point for the first skip
+ {
+ std::pair< std::vector< long long >, std::vector< long long > > res;
+ auto & skip = res.first;
+ auto & len = res.second;
+ for (unsigned i = 0; i < num_events; ++i)
+ {
+ auto si = get_start(i);
+ auto li = get_length(i);
+ skip.push_back(si - start_time);
+ len.push_back(li);
+ start_time = si + li; // next skip is measured from the end of this event
+ }
+ return res;
+ }
+ static void // Inverse of pack_event_start_length: rebuild each event's start and length and hand them to the setters.
+ unpack_event_start_length(
+ std::vector< long long > const & skip,
+ std::vector< long long > const & len, // indexed up to skip.size(); callers must ensure len is at least that long
+ std::function< void(unsigned, long long) > set_start,
+ std::function< void(unsigned, long long) > set_length,
+ long long start_time)
+ {
+ for (unsigned i = 0; i < skip.size(); ++i)
+ {
+ auto si = start_time + skip[i];
+ auto li = len[i];
+ set_start(i, si);
+ set_length(i, li);
+ start_time = si + li; // next skip is relative to the end of this event
+ }
+ }
+ static void // Recompute each event's mean and stdv from the raw samples it covers and pass them to the setters.
+ unpack_event_mean_stdv(
+ unsigned num_events,
+ std::function< long long(unsigned) > get_start,
+ std::function< long long(unsigned) > get_length,
+ std::function< void(unsigned, double) > set_mean,
+ std::function< void(unsigned, double) > set_stdv,
+ std::vector< Raw_Sample > const & rs,
+ long long rs_start_time, // subtracted from event starts to obtain indexes into `rs`
+ int offset) // extra shift added to every start index
+ {
+ for (unsigned i = 0; i < num_events; ++i)
+ {
+ long long rs_start_idx = get_start(i) - rs_start_time + offset;
+ long long rs_end_idx = rs_start_idx + get_length(i);
+ if (i == 0 and rs_start_idx < 0) rs_start_idx = 0; // only the first event may be clipped at the front
+ if (i == num_events - 1 and rs_end_idx > (long long)rs.size()) rs_end_idx = rs.size(); // only the last event may be clipped at the back
+ if (rs_start_idx < 0
+ or rs_end_idx <= rs_start_idx // empty or inverted ranges are rejected as well
+ or rs_end_idx > (long long)rs.size())
+ {
+ LOG_THROW
+ << "bad index: rs_start_idx=" << rs_start_idx
+ << " rs_end_idx=" << rs_end_idx
+ << " i=" << i
+ << " length(i)=" << get_length(i)
+ << " rs_size=" << rs.size()
+ << " offset=" << offset;
+ }
+ bool all_equal = true;
+ double s = 0.0; // sum of samples
+ double s2 = 0.0; // sum of squared samples
+ unsigned n = rs_end_idx - rs_start_idx;
+ for (unsigned j = 0; j < n; ++j)
+ {
+ double x = rs[rs_start_idx + j];
+ if (j > 0 and all_equal)
+ {
+ all_equal = rs[rs_start_idx + j] == rs[rs_start_idx];
+ }
+ s += x;
+ s2 += x * x;
+ }
+ set_mean(i, s / n);
+ if (n > 1 and not all_equal) // constant or single-sample events get stdv 0 without evaluating the formula
+ {
+ double x = (s2 - s*s/n)/n; // variance with divisor n
+ set_stdv(i, x > 1e-3? std::sqrt(x) : 0); // variances of at most 1e-3 are reported as stdv 0
+ }
+ else
+ {
+ set_stdv(i, 0);
+ }
+ }
+ }
+ static EventDetection_Events_Pack // Pack eventdetection events: only params, skips and lengths are stored (mean/stdv are not encoded here).
+ pack_ed(EventDetection_Events_Dataset const & ede_ds)
+ {
+ EventDetection_Events_Pack ede_pack;
+ auto & ede = ede_ds.first;
+ auto & ede_params = ede_ds.second;
+ ede_pack.params = ede_params;
+ std::vector< long long > skip;
+ std::vector< long long > len;
+ std::tie(skip, len) = pack_event_start_length(
+ ede.size(),
+ [&] (unsigned i) { return ede.at(i).start; },
+ [&] (unsigned i) { return ede.at(i).length; },
+ ede_params.start_time); // first skip is measured from the params' start_time
+ std::tie(ede_pack.skip, ede_pack.skip_params) = ed_skip_coder().encode(skip, false);
+ std::tie(ede_pack.len, ede_pack.len_params) = ed_len_coder().encode(len, false);
+ return ede_pack;
+ }
+ static EventDetection_Events_Dataset // Inverse of pack_ed: decode start/length, then recompute mean/stdv from the raw samples.
+ unpack_ed(EventDetection_Events_Pack const & ede_pack,
+ Raw_Samples_Dataset const & rs_ds)
+ {
+ EventDetection_Events_Dataset res;
+ auto & ede_params = ede_pack.params;
+ auto & rs = rs_ds.first;
+ auto & rs_params = rs_ds.second;
+ res.second = ede_params;
+ auto skip = ed_skip_coder().decode< long long >(ede_pack.skip, ede_pack.skip_params);
+ auto len = ed_len_coder().decode< long long >(ede_pack.len, ede_pack.len_params);
+ if (skip.size() != len.size()) // the two streams must describe the same number of events
+ {
+ LOG_THROW
+ << "wrong dataset size: skip_size=" << skip.size()
+ << " len_size=" << len.size();
+ }
+ auto & ede = res.first;
+ ede.resize(skip.size());
+ unpack_event_start_length(
+ skip,
+ len,
+ [&] (unsigned i, long long x) { return ede.at(i).start = x; },
+ [&] (unsigned i, long long x) { return ede.at(i).length = x; },
+ ede_params.start_time);
+ int offset = 0;
+ static bool warned = false;
+ if (offset != 0 and not warned) // never true as written: offset is fixed at 0 just above
+ {
+ LOG(warning) << "using workaround for old off-by-one ed events bug\n";
+ warned = true;
+ }
+ unpack_event_mean_stdv(
+ ede.size(),
+ [&] (unsigned i) { return ede.at(i).start; },
+ [&] (unsigned i) { return ede.at(i).length; },
+ [&] (unsigned i, double x) { return ede.at(i).mean = x; },
+ [&] (unsigned i, double x) { return ede.at(i).stdv = x; },
+ rs,
+ rs_params.start_time,
+ offset);
+ return res;
+ }
+ static Basecall_Fastq_Pack // Pack a fastq record: read name, encoded bases, and quality values reduced to `qv_bits` bits.
+ pack_fq(std::string const & fq, unsigned qv_bits = 5)
+ {
+ static unsigned const max_qv_bits = 5;
+ static std::uint8_t const max_qv = ((std::uint8_t)1 << max_qv_bits) - 1; // 31
+ Basecall_Fastq_Pack fq_pack;
+ auto fqa = split_fq(fq);
+ fq_pack.read_name = fqa[0];
+ std::vector< std::int8_t > bp(fqa[1].begin(), fqa[1].end());
+ qv_bits = std::min(qv_bits, max_qv_bits); // requests above 5 bits are capped
+ auto qv_mask = max_qv & (max_qv << (max_qv_bits - qv_bits)); // keeps the top qv_bits of the 5-bit value
+ fq_pack.qv_bits = qv_bits;
+ std::vector< std::uint8_t > qv;
+ for (auto c : fqa[3])
+ {
+ std::uint8_t val = (std::uint8_t)(c - 33); // quality characters are offset by 33
+ val = std::min(val, max_qv); // lossy: values above 31 are stored as 31
+ val &= qv_mask; // lossy when qv_bits < 5: low bits are cleared
+ qv.push_back(val);
+ }
+ std::tie(fq_pack.bp, fq_pack.bp_params) = fq_bp_coder().encode(bp, false);
+ std::tie(fq_pack.qv, fq_pack.qv_params) = fq_qv_coder().encode(qv, false);
+ return fq_pack;
+ }
+ static std::string // Rebuild a 4-line fastq record ("@name", bases, "+", qualities) from its packed form.
+ unpack_fq(Basecall_Fastq_Pack const & fq_pack)
+ {
+ std::string res;
+ res += "@";
+ res += fq_pack.read_name;
+ res += "\n";
+ auto bp = fq_bp_coder().decode< std::int8_t >(fq_pack.bp, fq_pack.bp_params);
+ for (auto c : bp) res += c;
+ res += "\n+\n"; // the separator line carries no read name
+ auto qv = fq_qv_coder().decode< std::uint8_t >(fq_pack.qv, fq_pack.qv_params);
+ for (auto c : qv) res += (char)33 + c; // undo the offset of 33 applied in pack_fq
+ res += "\n";
+ return res;
+ }
+ // Pack basecall events into a Basecall_Events_Pack.
+ //
+ //   ev_ds               events (first) and their params (second)
+ //   bc_desc             supplies the basecaller name and version stored in the pack
+ //   sq                  base sequence; every event's model state must occur in it at
+ //                       the position implied by the accumulated moves
+ //   ed, ed_gr           if ed_gr is non-empty, event starts are stored as skips
+ //                       relative to the eventdetection events `ed`; otherwise
+ //                       start/length are packed directly (as for ed events)
+ //   cid_params          used to convert event times to integers
+ //   median_sd_temp      stored in the pack unchanged
+ //   p_model_state_bits  stored in the pack; also the bit width used for p_model_state
+ //
+ // Throws (LOG_THROW) when: the event list is empty, an event has no matching ed
+ // event, a model state has an unexpected size, a move is invalid and cannot be
+ // repaired, or the moves do not consume the whole sequence.
+ static Basecall_Events_Pack
+ pack_ev(Basecall_Events_Dataset const & ev_ds,
+         Basecall_Group_Description const & bc_desc,
+         std::string const & sq,
+         std::vector< EventDetection_Event > const & ed,
+         std::string const & ed_gr,
+         Channel_Id_Params const & cid_params,
+         double median_sd_temp,
+         unsigned p_model_state_bits)
+ {
+     Basecall_Events_Pack ev_pack;
+     ev_pack.params = ev_ds.second;
+     auto & ev = ev_ds.first;
+     // start_time and state_size are taken from the first event, so there must be one
+     if (ev.empty())
+     {
+         LOG_THROW
+             << "cannot pack empty basecall events";
+     }
+     ev_pack.name = bc_desc.name;
+     ev_pack.version = bc_desc.version;
+     ev_pack.ed_gr = ed_gr;
+     ev_pack.start_time = time_to_int(ev[0].start, cid_params);
+     ev_pack.state_size = ev[0].get_model_state().size();
+     ev_pack.median_sd_temp = median_sd_temp;
+     ev_pack.p_model_state_bits = p_model_state_bits;
+     std::vector< long long > rel_skip;
+     std::vector< long long > skip;
+     std::vector< long long > len;
+     std::vector< std::uint8_t > mv;
+     std::vector< std::uint16_t > p_model_state;
+     // first pack start/duration
+     if (not ed_gr.empty())
+     {
+         // pack relative to ed events
+         long long j = -1;
+         for (unsigned i = 0; i < ev.size(); ++i)
+         {
+             auto ti = time_to_int(ev[i].start, cid_params);
+             auto last_j = j++;
+             // advance to the first ed event that does not start before this event
+             while (j < (long long)ed.size() and ed[j].start < ti) ++j;
+             if (j == (long long)ed.size())
+             {
+                 LOG_THROW
+                     << "no matching ed event: i=" << i
+                     << " ev[i]=(" << ti
+                     << "," << time_to_int(ev[i].length, cid_params)
+                     << "," << ev[i].mean
+                     << "," << ev[i].stdv
+                     << ")";
+             }
+             // number of ed events skipped since the previously matched one
+             rel_skip.push_back(j - last_j - 1);
+         }
+         std::tie(ev_pack.rel_skip, ev_pack.rel_skip_params) = ev_rel_skip_coder().encode(rel_skip, false);
+     }
+     else
+     {
+         // pack start&length as for ed events
+         std::tie(skip, len) = pack_event_start_length(
+             ev.size(),
+             [&] (unsigned i) { return time_to_int(ev.at(i).start, cid_params); },
+             [&] (unsigned i) { return time_to_int(ev.at(i).length, cid_params); },
+             ev_pack.start_time);
+         std::tie(ev_pack.skip, ev_pack.skip_params) = ed_skip_coder().encode(skip, false);
+         std::tie(ev_pack.len, ev_pack.len_params) = ed_len_coder().encode(len, false);
+     }
+     unsigned sq_pos = 0;
+     for (unsigned i = 0; i < ev.size(); ++i)
+     {
+         auto s = ev[i].get_model_state();
+         if (s.size() != ev_pack.state_size)
+         {
+             LOG_THROW
+                 << "unexpected state size: i=" << i
+                 << " s=" << s
+                 << " expected_size=" << ev_pack.state_size;
+         }
+         // check if move is valid
+         if (ev[i].move < 0 or ev[i].move > std::numeric_limits< uint8_t >::max())
+         {
+             LOG_THROW
+                 << "invalid move: i=" << i
+                 << " ev[i].move=" << ev[i].move;
+         }
+         int real_move = ev[i].move;
+         if (sq.substr(sq_pos + real_move, ev_pack.state_size) != s)
+         {
+             // move is not valid, compute alternative:
+             // allow move > state_size only if previous state is homopolymer
+             auto next_sq_pos = sq.find(s, sq_pos);
+             if (next_sq_pos != std::string::npos
+                 and (next_sq_pos <= sq_pos + ev_pack.state_size
+                      or sq.substr(sq_pos, ev_pack.state_size) == std::string(ev_pack.state_size, sq[sq_pos])))
+             {
+                 real_move = next_sq_pos - sq_pos;
+             }
+             else
+             {
+                 real_move = -1;
+             }
+             if (real_move >= 0)
+             {
+                 LOG(warning)
+                     << "using workaround for invalid move: i=" << i
+                     << " sq=" << sq.substr(sq_pos, 2 * ev_pack.state_size)
+                     << " move[i]=" << ev[i].move
+                     << " state[i]=" << s
+                     << " real_move=" << real_move << std::endl;
+             }
+             else
+             {
+                 LOG_THROW
+                     << "invalid move: i=" << i
+                     << " sq=" << sq.substr(sq_pos, 2 * ev_pack.state_size)
+                     << " move[i]=" << ev[i].move
+                     << " state[i]=" << s;
+             }
+         }
+         mv.push_back(real_move);
+         sq_pos += real_move;
+         // p_model_state: scale to p_model_state_bits bits and clamp to the largest value
+         // NOTE(review): the product is narrowed to uint16_t before the clamp, so the
+         // clamp cannot help when p_model_state_bits >= 16 - confirm callers stay below that
+         std::uint16_t p_model_state_val = ev[i].p_model_state * (1u << p_model_state_bits);
+         if (p_model_state_val >= (1u << p_model_state_bits)) p_model_state_val = (1u << p_model_state_bits) - 1;
+         p_model_state.push_back(p_model_state_val);
+     }
+     // the last state must end exactly at the end of the sequence
+     if (sq_pos + ev_pack.state_size != sq.size())
+     {
+         LOG_THROW
+             << "leftover base sequence: sq_size=" << sq.size()
+             << " sq_end_pos=" << sq_pos + ev_pack.state_size;
+     }
+     std::tie(ev_pack.move, ev_pack.move_params) = ev_move_coder().encode(mv, false);
+     std::tie(ev_pack.p_model_state, ev_pack.p_model_state_params) = bit_packer().encode(p_model_state, p_model_state_bits);
+     return ev_pack;
+ } // pack_ev()
+ // Rebuild the event-detection events that a basecall events pack stores
+ // implicitly. Start/length are decoded from the pack's skip/len streams;
+ // mean/stdv are filled in by unpack_event_mean_stdv() from the raw samples.
+ // A missing or mismatched skip/len stream is reported through LOG_THROW.
+ static std::vector< EventDetection_Event >
+ unpack_implicit_ed(Basecall_Events_Pack const & ev_pack,
+                    Raw_Samples_Dataset const & rs_ds)
+ {
+     std::vector< EventDetection_Event > events;
+     auto skip_v = ed_skip_coder().decode< long long >(ev_pack.skip, ev_pack.skip_params);
+     auto len_v = ed_len_coder().decode< long long >(ev_pack.len, ev_pack.len_params);
+     // both streams must be present and describe the same number of events
+     bool const sizes_ok = not skip_v.empty() and skip_v.size() == len_v.size();
+     if (not sizes_ok)
+     {
+         LOG_THROW
+             << "wrong dataset size: skip_size=" << skip_v.size()
+             << " len_size=" << len_v.size();
+     }
+     events.resize(skip_v.size());
+     // step 1: start and length of every event
+     unpack_event_start_length(
+         skip_v,
+         len_v,
+         [&] (unsigned idx, long long val) { return events.at(idx).start = val; },
+         [&] (unsigned idx, long long val) { return events.at(idx).length = val; },
+         ev_pack.start_time);
+     // no sample offset is applied here; the warning below only fires for a non-zero offset
+     int offset = 0;
+     static bool warned = false;
+     if (offset != 0 and not warned)
+     {
+         LOG(warning) << "using workaround for bug in "
+                      << ev_pack.name << ":" << ev_pack.version << "\n";
+         warned = true;
+     }
+     // step 2: mean and stdv of every event, computed over the raw samples
+     unpack_event_mean_stdv(
+         events.size(),
+         [&] (unsigned idx) { return events.at(idx).start; },
+         [&] (unsigned idx) { return events.at(idx).length; },
+         [&] (unsigned idx, double val) { return events.at(idx).mean = val; },
+         [&] (unsigned idx, double val) { return events.at(idx).stdv = val; },
+         rs_ds.first,
+         rs_ds.second.start_time,
+         offset);
+     return events;
+ }
+ // Rebuild the basecall events table from its packed form.
+ //   ev_pack    - packed events: move and p_model_state streams, optional rel_skip stream, params
+ //   sq         - basecalled sequence that the model states are cut from
+ //   ed         - event-detection events that supply start/length/mean/stdv
+ //   cid_params - channel parameters forwarded to time_to_float()
+ // Returns the events together with ev_pack.params. Inconsistent or truncated
+ // input is reported through LOG_THROW.
+ static Basecall_Events_Dataset
+ unpack_ev(Basecall_Events_Pack const & ev_pack,
+           std::string const & sq,
+           std::vector< EventDetection_Event > const & ed,
+           Channel_Id_Params const & cid_params)
+ {
+     Basecall_Events_Dataset ev_ds;
+     ev_ds.second = ev_pack.params;
+     auto & ev = ev_ds.first;
+     // rel_skip is only stored when the events were packed relative to ed events
+     std::vector< long long > rel_skip;
+     if (not ev_pack.rel_skip.empty())
+     {
+         rel_skip = ev_rel_skip_coder().decode< long long >(ev_pack.rel_skip, ev_pack.rel_skip_params);
+     }
+     auto mv = ev_move_coder().decode< std::uint8_t >(ev_pack.move, ev_pack.move_params);
+     auto p_model_state = bit_packer().decode< std::uint16_t >(ev_pack.p_model_state, ev_pack.p_model_state_params);
+     if ((not rel_skip.empty() and rel_skip.size() != mv.size()) or p_model_state.size() != mv.size())
+     {
+         LOG_THROW
+             << "wrong dataset size: rel_skip_size=" << rel_skip.size()
+             << " mv_size=" << mv.size()
+             << " p_model_state_size=" << p_model_state.size();
+     }
+     // model_state is terminated at index state_size only when state_size < MAX_K_LEN,
+     // so it presumably holds MAX_K_LEN chars; a larger state would overrun it
+     if (ev_pack.state_size > MAX_K_LEN)
+     {
+         LOG_THROW
+             << "unexpected state size: state_size=" << ev_pack.state_size
+             << " max_size=" << MAX_K_LEN;
+     }
+     ev.resize(mv.size());
+     long long j = -1;
+     std::string s;
+     unsigned sq_pos = 0;
+     unsigned p_model_state_bits = 0;
+     std::istringstream(ev_pack.p_model_state_params.at("num_bits")) >> p_model_state_bits;
+     long long unsigned max_p_model_state_int = 1llu << p_model_state_bits;
+     for (unsigned i = 0; i < ev.size(); ++i)
+     {
+         // locate the ed event backing this event
+         j += (not rel_skip.empty()? rel_skip[i] : 0) + 1;
+         if (j < 0 or j >= static_cast< long long >(ed.size()))
+         {
+             LOG_THROW
+                 << "no matching ed event: i=" << i
+                 << " j=" << j
+                 << " ed_size=" << ed.size();
+         }
+         ev[i].start = time_to_float(ed[j].start, cid_params);
+         ev[i].length = time_to_float(ed[j].length, cid_params);
+         ev[i].mean = ed[j].mean;
+         ev[i].stdv = ed[j].stdv;
+         if (ev[i].stdv == 0.0) ev[i].stdv = ev_pack.median_sd_temp;
+         ev[i].move = mv[i];
+         if (i > 0)
+         {
+             // apply move: drop the bases that slid out of the state window
+             if (mv[i] <= s.size())
+             {
+                 s.erase(0, mv[i]);
+             }
+             else
+             {
+                 // pack_ev() can emit a move longer than the state; the window is
+                 // then emptied and the bases in between are skipped
+                 // (s.substr(mv[i]) would throw std::out_of_range here)
+                 sq_pos += static_cast< unsigned >(mv[i] - s.size());
+                 s.clear();
+             }
+         }
+         // refill the window from the sequence
+         while (s.size() < ev_pack.state_size)
+         {
+             if (sq_pos >= sq.size())
+             {
+                 LOG_THROW
+                     << "base sequence too short: i=" << i
+                     << " sq_size=" << sq.size()
+                     << " sq_pos=" << sq_pos;
+             }
+             s += sq[sq_pos++];
+         }
+         std::copy(s.begin(), s.end(), ev[i].model_state.begin());
+         if (ev_pack.state_size < MAX_K_LEN) ev[i].model_state[ev_pack.state_size] = 0;
+         // map the fixed-point value back to a probability
+         ev[i].p_model_state = static_cast< double >(p_model_state[i]) / max_p_model_state_int;
+     }
+     return ev_ds;
+ } // unpack_ev()
+ static Basecall_Alignment_Pack // pack a 2D alignment table: per-strand step bits plus per-row kmer moves along sq; bad input is reported through LOG_THROW
+ pack_al(std::vector< Basecall_Alignment_Entry > const & al,
+ std::string const & sq)
+ {
+ Basecall_Alignment_Pack al_pack;
+ std::array< std::vector< uint8_t > , 2 > step_v; // step_v[k][i] is 1 when row i has an index for strand k (0=template, 1=complement), else 0
+ std::vector< int8_t > mv; // mv[i]: distance in sq from the previous row's kmer position to row i's
+ step_v[0].reserve(al.size());
+ step_v[1].reserve(al.size());
+ mv.reserve(al.size());
+ std::array< int, 2 > start_index = {{ -1, -1 }}; // first index seen per strand; -1 while none was seen
+ std::array< int, 2 > next_index = {{ -1, -1 }}; // index the next present row must carry, per strand
+ std::array< int, 2 > delta = {{ 1, -1 }}; // template indices step up by 1, complement indices step down by 1
+ auto get_idx = [&] (unsigned i, unsigned k) {
+ return k == 0? al[i].template_index : al[i].complement_index;
+ };
+ unsigned pos = 0; // position in sq of the previous row's kmer
+ for (unsigned i = 0; i < al.size(); ++i)
+ {
+ for (unsigned k = 0; k < 2; ++k)
+ {
+ auto idx = get_idx(i, k);
+ if (idx >= 0)
+ {
+ if (start_index[k] < 0)
+ {
+ start_index[k] = idx;
+ next_index[k] = idx;
+ }
+ if (idx != next_index[k]) // only a gap-free run can be rebuilt from a start index and step bits
+ {
+ LOG_THROW
+ << "bad index: idx=" << idx
+ << " next_index=" << next_index[k];
+ }
+ step_v[k].push_back(1);
+ next_index[k] += delta[k];
+ }
+ else // idx < 0: this row has no event on strand k
+ {
+ step_v[k].push_back(0);
+ }
+ }
+ // compute move: search for this row's kmer at or after the previous row's position
+ auto kmer = al[i].get_kmer();
+ size_t next_pos = sq.find(kmer, pos);
+ if (next_pos == std::string::npos)
+ {
+ LOG_THROW
+ << "missing kmer in 2d seq";
+ }
+ if (next_pos - pos > std::numeric_limits< int8_t >::max()) // the move is stored as int8_t
+ {
+ LOG_THROW
+ << "bad move: next_pos=" << next_pos
+ << " pos=" << pos;
+ }
+ mv.push_back(next_pos - pos);
+ pos = next_pos;
+ }
+ if (start_index[0] < 0)
+ {
+ LOG_THROW
+ << "no template events";
+ }
+ if (start_index[1] < 0)
+ {
+ LOG_THROW
+ << "no complement events";
+ }
+ al_pack.template_index_start = start_index[0];
+ al_pack.complement_index_start = start_index[1];
+ al_pack.kmer_size = al[0].get_kmer().size(); // al[0] exists here: an empty al fails the start_index checks above
+ std::tie(al_pack.template_step, al_pack.template_step_params) = bit_packer().encode(step_v[0], 1); // second argument is the bit width: one bit per row
+ std::tie(al_pack.complement_step, al_pack.complement_step_params) = bit_packer().encode(step_v[1], 1);
+ std::tie(al_pack.move, al_pack.move_params) = ev_move_coder().encode(mv, false);
+ return al_pack;
+ } // pack_al()
+ // Rebuild the 2D alignment table from its packed form.
+ //   al_pack - packed alignment: per-strand step bits, kmer moves, start indices, kmer size
+ //   sq      - 2D basecalled sequence that the kmers are cut from
+ // Each row gets a template/complement index (or -1 when its step bit is 0) and
+ // the kmer found at the accumulated move offset in sq. Inconsistent stream
+ // sizes, an oversized kmer, or a kmer position outside sq are reported through
+ // LOG_THROW.
+ static std::vector< Basecall_Alignment_Entry >
+ unpack_al(Basecall_Alignment_Pack const & al_pack,
+           std::string const & sq)
+ {
+     std::vector< Basecall_Alignment_Entry > al;
+     std::array< std::vector< uint8_t >, 2 > step_v =
+         {{ bit_packer().decode< uint8_t >(al_pack.template_step, al_pack.template_step_params),
+            bit_packer().decode< uint8_t >(al_pack.complement_step, al_pack.complement_step_params) }};
+     auto mv = ev_move_coder().decode< int8_t >(al_pack.move, al_pack.move_params);
+     if (step_v[1].size() != step_v[0].size()
+         or mv.size() != step_v[0].size())
+     {
+         LOG_THROW
+             << "wrong dataset size: step_v[0]_size=" << step_v[0].size()
+             << " step_v[1]_size=" << step_v[1].size()
+             << " mv_size=" << mv.size();
+     }
+     // kmer is terminated at index kmer_size only when kmer_size < MAX_K_LEN,
+     // so it presumably holds MAX_K_LEN chars; a larger kmer would overrun it
+     if (al_pack.kmer_size > MAX_K_LEN)
+     {
+         LOG_THROW
+             << "unexpected kmer size: kmer_size=" << al_pack.kmer_size
+             << " max_size=" << MAX_K_LEN;
+     }
+     al.resize(step_v[0].size());
+     std::array< unsigned, 2 > crt_index = {{ al_pack.template_index_start, al_pack.complement_index_start }};
+     // template indices step up by 1, complement indices step down by 1
+     std::array< int, 2 > delta = {{ 1, -1 }};
+     int pos = 0;
+     auto set_idx = [&] (unsigned i, unsigned k, int val) {
+         if (k == 0)
+         {
+             al[i].template_index = val;
+         }
+         else
+         {
+             al[i].complement_index = val;
+         }
+     };
+     for (unsigned i = 0; i < step_v[0].size(); ++i)
+     {
+         for (unsigned k = 0; k < 2; ++k)
+         {
+             if (step_v[k][i] > 0)
+             {
+                 set_idx(i, k, crt_index[k]);
+                 crt_index[k] += delta[k];
+             }
+             else
+             {
+                 set_idx(i, k, -1);
+             }
+         }
+         // set kmer: advance by the stored move, then copy kmer_size bases from sq
+         pos += mv[i];
+         if (pos < 0 or static_cast< size_t >(pos) + al_pack.kmer_size > sq.size())
+         {
+             LOG_THROW
+                 << "bad kmer position: i=" << i
+                 << " pos=" << pos
+                 << " kmer_size=" << al_pack.kmer_size
+                 << " sq_size=" << sq.size();
+         }
+         std::copy(sq.begin() + pos, sq.begin() + pos + al_pack.kmer_size, al[i].kmer.begin());
+         if (al_pack.kmer_size < MAX_K_LEN) al[i].kmer[al_pack.kmer_size] = 0;
+     }
+     return al;
+ } // unpack_al()
+
+ //
+ // Fast5 internal paths: builders for the path strings used inside the file; "_Pack" variants are the base path with "_Pack" appended
+ //
+ static std::string file_version_path() { return "/file_version"; }
+ static std::string channel_id_path() { return "/UniqueGlobalKey/channel_id"; }
+ static std::string tracking_id_path() { return "/UniqueGlobalKey/tracking_id"; }
+ static std::string sequences_path() { return "/Sequences/Meta"; }
+ static std::string raw_samples_root_path() { return "/Raw/Reads"; } // parent of the per-read entries built below
+ static std::string raw_samples_params_path(std::string const & rn) // rn: read name; yields <raw root>/<rn>
{
return raw_samples_root_path() + "/" + rn;
}
- static std::string raw_samples_path(const std::string& rn)
+ static std::string raw_samples_path(std::string const & rn) // yields <raw root>/<rn>/Signal
{
return raw_samples_root_path() + "/" + rn + "/Signal";
}
- static const std::string& sequences_path()
+ static std::string raw_samples_pack_path(std::string const & rn) // packed counterpart of raw_samples_path(rn)
{
- static const std::string _sequences_path = "/Sequences/Meta";
- return _sequences_path;
+ return raw_samples_path(rn) + "_Pack";
}
- static const std::string& eventdetection_root_path()
+ static std::string raw_samples_params_pack_path(std::string const & rn) // "params" entry below the packed raw-samples path
{
- static const std::string _eventdetection_root_path = "/Analyses";
- return _eventdetection_root_path;
+ return raw_samples_pack_path(rn) + "/params";
}
- static const std::string& eventdetection_group_prefix()
+ static std::string eventdetection_root_path() { return "/Analyses"; } // same root string as basecall_root_path()
+ static std::string eventdetection_group_prefix() { return "EventDetection_"; }
+ static std::string eventdetection_group_path(std::string const & gr) // gr: group suffix appended to the prefix
{
- static const std::string _eventdetection_group_prefix = "EventDetection_";
- return _eventdetection_group_prefix;
+ return eventdetection_root_path() + "/" + eventdetection_group_prefix() + gr;
}
- static std::string eventdetection_params_path(const std::string& ed_gr)
+ static std::string eventdetection_events_params_path(std::string const & gr, std::string const & rn) // per-read entry: <group>/Reads/<rn>
{
- return eventdetection_root_path() + "/" + eventdetection_group_prefix() + ed_gr;
+ return eventdetection_group_path(gr) + "/Reads/" + rn;
}
- static std::string eventdetection_event_params_path(const std::string& ed_gr, const std::string& rn)
+ static std::string eventdetection_events_path(std::string const & gr, std::string const & rn) // <group>/Reads/<rn>/Events
{
- return eventdetection_root_path() + "/" + eventdetection_group_prefix() + ed_gr + "/Reads/" + rn;
+ return eventdetection_group_path(gr) + "/Reads/" + rn + "/Events";
}
- static std::string eventdetection_events_path(const std::string& ed_gr, const std::string& rn)
+ static std::string eventdetection_events_pack_path(std::string const & gr, std::string const & rn) // packed counterpart of eventdetection_events_path(gr, rn)
{
- return eventdetection_root_path() + "/" + eventdetection_group_prefix() + ed_gr + "/Reads/" + rn + "/Events";
+ return eventdetection_events_path(gr, rn) + "_Pack";
}
-
- static const std::string& basecall_root_path()
+ static std::string eventdetection_events_params_pack_path(std::string const & gr, std::string const & rn) // "params" entry below the packed events path
+ {
+ return eventdetection_events_pack_path(gr, rn) + "/params";
+ }
+ static std::string basecall_root_path() { return "/Analyses"; }
+ static std::string basecall_group_prefix() { return "Basecall_"; }
+ static std::string strand_name(unsigned st) // st: 0=template, 1=complement, 2=2D
+ {
+ static const std::array< std::string, 3 > _strand_name =
+ {{ "template", "complement", "2D" }};
+ return _strand_name.at(st); // at() throws std::out_of_range for st > 2
+ }
+ static std::string basecall_strand_subgroup(unsigned st) // "BaseCalled_" + strand name
+ {
+ return std::string("BaseCalled_") + strand_name(st);
+ }
+ static std::string basecall_group_path(std::string const & gr) // gr: group suffix appended to the prefix
+ {
+ return basecall_root_path() + "/" + basecall_group_prefix() + gr;
+ }
+ static std::string basecall_strand_group_path(std::string const & gr, unsigned st) // <group>/BaseCalled_<strand>
+ {
+ return basecall_group_path(gr) + "/" + basecall_strand_subgroup(st);
+ }
+ static std::string basecall_log_path(std::string const & gr)
+ {
+ return basecall_group_path(gr) + "/Log";
+ }
+ static std::string basecall_fastq_path(std::string const & gr, unsigned st)
+ {
+ return basecall_strand_group_path(gr, st) + "/Fastq";
+ }
+ static std::string basecall_fastq_pack_path(std::string const & gr, unsigned st) // packed counterpart of basecall_fastq_path(gr, st)
+ {
+ return basecall_fastq_path(gr, st) + "_Pack";
+ }
+ static std::string basecall_model_path(std::string const & gr, unsigned st)
+ {
+ return basecall_strand_group_path(gr, st) + "/Model";
+ }
+ static std::string basecall_model_file_path(std::string const & gr, unsigned st) // lives under <group>/Summary, not under the strand group
{
- static const std::string _basecall_root_path = "/Analyses";
- return _basecall_root_path;
+ return basecall_group_path(gr) + "/Summary/basecall_1d_" + strand_name(st) + "/model_file"; // st is no longer asserted < 2: st == 2 yields "basecall_1d_2D"
}
- static const std::string& basecall_group_prefix()
+ static std::string basecall_events_path(std::string const & gr, unsigned st)
{
- static const std::string _basecall_group_prefix = "Basecall_";
- return _basecall_group_prefix;
+ return basecall_strand_group_path(gr, st) + "/Events";
}
- static const std::string& basecall_strand_subgroup(unsigned st)
+ static std::string basecall_events_pack_path(std::string const & gr, unsigned st) // packed counterpart of basecall_events_path(gr, st)
{
- static const std::array< std::string, 3 > _basecall_strand_subgroup =
- {{ "BaseCalled_template", "BaseCalled_complement", "BaseCalled_2D" }};
- return _basecall_strand_subgroup[st];
+ return basecall_events_path(gr, st) + "_Pack";
}
- static std::string basecall_fastq_path(const std::string& bc_gr, unsigned st)
+ static std::string basecall_events_params_pack_path(std::string const & gr, unsigned st) // "params" entry below the packed events path
{
- return basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/"
- + basecall_strand_subgroup(st) + "/Fastq";
+ return basecall_events_pack_path(gr, st) + "/params";
}
- static std::string basecall_model_path(const std::string& bc_gr, unsigned st)
+ static std::string basecall_alignment_path(std::string const & gr)
{
- return basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/"
- + basecall_strand_subgroup(st) + "/Model";
+ return basecall_strand_group_path(gr, 2) + "/Alignment"; // strand 2 is "2D" (see strand_name)
}
- static std::string basecall_model_file_path(const std::string& bc_gr, unsigned st)
+ static std::string basecall_alignment_pack_path(std::string const & gr) // packed counterpart of basecall_alignment_path(gr)
{
- assert(st < 2);
- return basecall_root_path() + "/" + basecall_group_prefix() + bc_gr
- + "/Summary/basecall_1d_" + (st == 0? "template" : "complement") + "/model_file";
+ return basecall_alignment_path(gr) + "_Pack";
}
- static std::string basecall_events_path(const std::string& bc_gr, unsigned st)
+ static std::string basecall_config_path(std::string const & gr)
{
- return basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/"
- + basecall_strand_subgroup(st) + "/Events";
+ return basecall_group_path(gr) + "/Configuration";
}
- static std::string basecall_event_alignment_path(const std::string& bc_gr)
+ static std::string basecall_summary_path(std::string const & gr)
{
- return basecall_root_path() + "/" + basecall_group_prefix() + bc_gr + "/"
- + basecall_strand_subgroup(2) + "/Alignment";
+ return basecall_group_path(gr) + "/Summary";
}
+ //
+ // Packers: accessors for the shared coder objects, each looked up by name through get_coder()/get_packer()
+ //
+ static Huffman_Packer const & rw_coder() { return Huffman_Packer::get_coder("fast5_rw_1"); } // presumably for raw samples; not used in this part of the file
+ static Huffman_Packer const & ed_skip_coder() { return Huffman_Packer::get_coder("fast5_ed_skip_1"); } // event skip values
+ static Huffman_Packer const & ed_len_coder() { return Huffman_Packer::get_coder("fast5_ed_len_1"); } // event length values
+ static Huffman_Packer const & fq_bp_coder() { return Huffman_Packer::get_coder("fast5_fq_bp_1"); } // presumably fastq bases; TODO confirm against caller
+ static Huffman_Packer const & fq_qv_coder() { return Huffman_Packer::get_coder("fast5_fq_qv_1"); } // presumably fastq quality values; TODO confirm against caller
+ static Huffman_Packer const & ev_rel_skip_coder() { return Huffman_Packer::get_coder("fast5_ev_rel_skip_1"); } // basecall event skips relative to ed events
+ static Huffman_Packer const & ev_move_coder() { return Huffman_Packer::get_coder("fast5_ev_move_1"); } // basecall event moves and alignment moves
+ static Bit_Packer const & bit_packer() { return Bit_Packer::get_packer(); } // fixed-width packing: p_model_state values and alignment step bits
}; // class File
} // namespace fast5
diff --git a/src/fast5_version.hpp b/src/fast5_version.hpp
new file mode 100644
index 0000000..5bb54e1
--- /dev/null
+++ b/src/fast5_version.hpp
@@ -0,0 +1,16 @@
+#ifndef __FAST5_VERSION_HPP
+#define __FAST5_VERSION_HPP
+
+namespace fast5
+{
+
+namespace // unnamed: the constant has internal linkage, so every including translation unit gets its own copy
+{
+
+static char const * const version = "0.6.2"; // version string of this fast5 release
+
+} // unnamed namespace
+
+} // namespace fast5
+
+#endif // __FAST5_VERSION_HPP
diff --git a/src/hdf5-mod.cpp b/src/hdf5-mod.cpp
index 314bdb3..818d36d 100644
--- a/src/hdf5-mod.cpp
+++ b/src/hdf5-mod.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <iostream>
#include <string>
@@ -226,6 +233,10 @@ int main(int argc, char* argv[])
clog << a << endl;
}
+ // reopen for reading
+ f.close();
+ f.open(file_name);
+
//
// test reading compound
//
@@ -295,6 +306,7 @@ int main(int argc, char* argv[])
}
}
+ f.close();
}
catch (hdf5_tools::Exception& e)
{
diff --git a/src/hdf5_tools.hpp b/src/hdf5_tools.hpp
index 252b1a2..6fa9124 100644
--- a/src/hdf5_tools.hpp
+++ b/src/hdf5_tools.hpp
@@ -1,7 +1,8 @@
//
-// The MIT License (MIT)
+// Part of: https://github.com/mateidavid/fast5
//
-// Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
//
#ifndef __HDF5_TOOLS_HPP
@@ -20,6 +21,7 @@
#include <deque>
#include <set>
#include <map>
+#include <queue>
#include <limits>
#include <type_traits>
@@ -413,6 +415,11 @@ struct Util
[] (void * vp) { return *reinterpret_cast< H5T_class_t * >(vp) != H5T_NO_CLASS; }
}
},
+ { (void(*)())&H5Tget_cset,
+ { "H5Tget_cset",
+ [] (void * vp) { return *reinterpret_cast< H5T_cset_t * >(vp) != H5T_CSET_ERROR; }
+ }
+ },
{ (void(*)())&H5Tget_member_index,
{ "H5Tget_member_index",
[] (void * vp) { return *reinterpret_cast< int * >(vp) >= 0; }
@@ -453,6 +460,11 @@ struct Util
[] (void * vp) { return *reinterpret_cast< htri_t * >(vp) >= 0; }
}
},
+ { (void(*)())&H5Tset_cset,
+ { "H5Tset_cset",
+ [] (void * vp) { return *reinterpret_cast< herr_t * >(vp) >= 0; }
+ }
+ },
{ (void(*)())&H5Tset_size,
{ "H5Tset_size",
[] (void * vp) { return *reinterpret_cast< herr_t * >(vp) >= 0; }
@@ -831,6 +843,7 @@ struct Reader_Base
{
file_dtype_is_vlen_str = false;
}
+ // datatype size
file_dtype_size = Util::wrap(H5Tget_size, file_dtype_id_holder.id);
}
HDF_Object_Holder obj_id_holder;
@@ -877,10 +890,13 @@ struct String_reader
HDF_Object_Holder mem_dtype_id_holder;
if (file_stype_class == H5T_STRING) // stored as a string
{
+ auto file_stype_cset = Util::wrap(H5Tget_cset, file_stype_id);
if (Util::wrap(H5Tis_variable_str, file_stype_id)) // stored as a varlen string
{
// compute mem_type
- mem_dtype_id_holder = mem_type_wrapper(Util::make_str_type(-1));
+ auto mem_stype_id_holder = Util::make_str_type(-1);
+ Util::wrap(H5Tset_cset, mem_stype_id_holder.id, file_stype_cset);
+ mem_dtype_id_holder = mem_type_wrapper(std::move(mem_stype_id_holder));
// prepare buffer to receive data
std::vector< char * > charptr_buff(res.size(), nullptr);
// perform the read
@@ -899,7 +915,9 @@ struct String_reader
{
// compute mem_type
size_t file_stype_size = Util::wrap(H5Tget_size, file_stype_id);
- mem_dtype_id_holder = mem_type_wrapper(Util::make_str_type(file_stype_size + 1));
+ auto mem_stype_id_holder = Util::make_str_type(file_stype_size + 1);
+ Util::wrap(H5Tset_cset, mem_stype_id_holder.id, file_stype_cset);
+ mem_dtype_id_holder = mem_type_wrapper(std::move(mem_stype_id_holder));
// prepare buffer to receieve data
std::vector< char > char_buff(res.size() * (file_stype_size + 1), '\0');
// perform the read
@@ -999,6 +1017,8 @@ struct Reader_helper< 2, Data_Type >
and not reader_base.file_dtype_is_vlen_str)
{
HDF_Object_Holder mem_dtype_id_holder(Util::make_str_type(sizeof(Data_Type)));
+ auto file_dtype_cset = Util::wrap(H5Tget_cset, reader_base.file_dtype_id_holder.id);
+ Util::wrap(H5Tset_cset, mem_dtype_id_holder.id, file_dtype_cset);
reader_base.reader(mem_dtype_id_holder.id, out);
}
else // conversion needed
@@ -1409,6 +1429,8 @@ struct Writer< std::vector< In_Data_Type > >
class File
{
public:
+ typedef std::map< std::string, std::string > Attr_Map;
+
File() : _file_id(0) {}
File(const std::string& file_name, bool rw = false) : _file_id(0) { open(file_name, rw); }
File(const File&) = delete;
@@ -1421,7 +1443,7 @@ public:
void create(const std::string& file_name, bool truncate = false)
{
- assert(not is_open());
+ if (is_open()) close();
_file_name = file_name;
_rw = true;
_file_id = H5Fcreate(file_name.c_str(), truncate? H5F_ACC_TRUNC : H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
@@ -1429,7 +1451,7 @@ public:
}
void open(const std::string& file_name, bool rw = false)
{
- assert(not is_open());
+ if (is_open()) close();
_file_name = file_name;
_rw = rw;
_file_id = H5Fopen(file_name.c_str(), not rw? H5F_ACC_RDONLY : H5F_ACC_RDWR, H5P_DEFAULT);
@@ -1437,8 +1459,8 @@ public:
}
void close()
{
- assert(is_open());
- assert(H5Fget_obj_count(_file_id, H5F_OBJ_ALL | H5F_OBJ_LOCAL) == 1);
+ if (not is_open()) return;
+ if (H5Fget_obj_count(_file_id, H5F_OBJ_ALL | H5F_OBJ_LOCAL) != 1) throw Exception(_file_name + ": HDF5 memory leak");
int status = H5Fclose(_file_id);
if (status < 0) throw Exception(_file_name + ": error in H5Fclose");
_file_id = 0;
@@ -1466,49 +1488,55 @@ public:
}
/// Check if a group exists
- bool group_exists(const std::string& loc_full_name) const
+ bool group_exists(std::string const & loc_full_name) const // loc_full_name must be absolute (start with '/'); the file must be open
{
assert(is_open());
- assert(not loc_full_name.empty() and loc_full_name[0] == '/');
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
+ assert(not loc_full_name.empty() and loc_full_name.front() == '/');
+ if (loc_full_name == "/") return true; // the root is reported as an existing group without querying the file
+ auto && loc = split_full_name(loc_full_name); // loc.first: part left of the last '/', loc.second: name right of it
// check all path elements exist, except for what is to the right of the last '/'
// sets active path
- if (not path_exists(loc_path)) return false;
- return check_object_type(loc_full_name, H5O_TYPE_GROUP);
+ return path_exists(loc.first) and check_object_type(loc_full_name, H5O_TYPE_GROUP); // the type check only runs when the parent path exists
}
/// Check if a dataset exists
- bool dataset_exists(const std::string& loc_full_name) const
+ bool dataset_exists(std::string const & loc_full_name) const // loc_full_name must be absolute (start with '/'); the file must be open
{
assert(is_open());
- assert(not loc_full_name.empty() and loc_full_name[0] == '/');
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
+ assert(not loc_full_name.empty() and loc_full_name.front() == '/');
+ if (loc_full_name == "/") return false; // the root is treated as a group, never as a dataset
+ auto && loc = split_full_name(loc_full_name); // loc.first: part left of the last '/', loc.second: name right of it
// check all path elements exist, except for what is to the right of the last '/'
// sets active path
- if (not path_exists(loc_path)) return false;
- return check_object_type(loc_full_name, H5O_TYPE_DATASET);
+ return path_exists(loc.first) and check_object_type(loc_full_name, H5O_TYPE_DATASET); // the type check only runs when the parent path exists
+ }
+ bool group_or_dataset_exists(std::string const & loc_full_name) const
+ {
+     // True when loc_full_name (absolute, file open) names an existing dataset or group.
+     assert(is_open());
+     assert(not loc_full_name.empty() and loc_full_name.front() == '/');
+     // the root is reported as existing without querying the file
+     if (loc_full_name == "/") return true;
+     auto const loc = split_full_name(loc_full_name);
+     // check all path elements exist, except for what is to the right of the last '/'
+     // sets active path
+     if (not path_exists(loc.first)) return false;
+     if (check_object_type(loc_full_name, H5O_TYPE_DATASET)) return true;
+     return check_object_type(loc_full_name, H5O_TYPE_GROUP);
}
/// Check if attribute exists
- bool attribute_exists(const std::string& loc_full_name) const
+ bool attribute_exists(std::string const & loc_full_name) const // loc_full_name: <owner path>/<attribute name>; throws Exception if the HDF5 query fails
{
assert(is_open());
- assert(not loc_full_name.empty() and loc_full_name[0] == '/');
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
- int status;
+ assert(not loc_full_name.empty() and loc_full_name.front() == '/');
+ if (loc_full_name == "/") return false; // the root path has no attribute-name part
+ auto && loc = split_full_name(loc_full_name); // loc.first: owning object path, loc.second: attribute name
// check all path elements exist, except for what is to the right of the last '/'
// sets active path
- if (not path_exists(loc_path)) return false;
+ if (not group_or_dataset_exists(loc.first)) return false; // the owner must itself be an existing group or dataset
// check if target is an attribute
- status = H5Aexists_by_name(_file_id, loc_path.c_str(), loc_name.c_str(), H5P_DEFAULT);
+ int status = H5Aexists_by_name(_file_id, loc.first.c_str(), loc.second.c_str(), H5P_DEFAULT); // >0: present, 0: absent, <0: error
if (status < 0) throw Exception("error in H5Aexists_by_name");
return status > 0;
}
- bool exists(const std::string& loc_full_name) const
+ bool exists(std::string const & loc_full_name) const // true for an attribute or a dataset; a group alone does not count
{
return attribute_exists(loc_full_name) or dataset_exists(loc_full_name);
}
@@ -1519,14 +1547,12 @@ public:
{
assert(is_open());
assert(not loc_full_name.empty() and loc_full_name[0] == '/');
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
+ auto && loc = split_full_name(loc_full_name);
Exception::active_path() = loc_full_name;
detail::HDF_Object_Holder grp_id_holder(
- detail::Util::wrap(H5Oopen, _file_id, loc_path.c_str(), H5P_DEFAULT),
+ detail::Util::wrap(H5Oopen, _file_id, loc.first.c_str(), H5P_DEFAULT),
detail::Util::wrapped_closer(H5Oclose));
- detail::Reader< Data_Storage >()(grp_id_holder.id, loc_name,
+ detail::Reader< Data_Storage >()(grp_id_holder.id, loc.second,
out, std::forward< Args >(args)...);
}
/// Write attribute or dataset
@@ -1537,16 +1563,13 @@ public:
assert(is_rw());
assert(not loc_full_name.empty() and loc_full_name[0] == '/');
assert(not exists(loc_full_name));
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
+ auto && loc = split_full_name(loc_full_name);
Exception::active_path() = loc_full_name;
detail::HDF_Object_Holder grp_id_holder;
- std::string grp_path = loc_path != "/"? loc_path.substr(0, loc_path.size() - 1) : "/";
- if (group_exists(grp_path) or dataset_exists(grp_path))
+ if (group_or_dataset_exists(loc.first))
{
grp_id_holder.load(
- detail::Util::wrap(H5Oopen, _file_id, grp_path.c_str(), H5P_DEFAULT),
+ detail::Util::wrap(H5Oopen, _file_id, loc.first.c_str(), H5P_DEFAULT),
detail::Util::wrapped_closer(H5Oclose));
}
else
@@ -1556,10 +1579,10 @@ public:
detail::Util::wrapped_closer(H5Pclose));
detail::Util::wrap(H5Pset_create_intermediate_group, lcpl_id_holder.id, 1);
grp_id_holder.load(
- detail::Util::wrap(H5Gcreate2, _file_id, grp_path.c_str(), lcpl_id_holder.id, H5P_DEFAULT, H5P_DEFAULT),
+ detail::Util::wrap(H5Gcreate2, _file_id, loc.first.c_str(), lcpl_id_holder.id, H5P_DEFAULT, H5P_DEFAULT),
detail::Util::wrapped_closer(H5Gclose));
}
- detail::Writer< In_Data_Storage >()(grp_id_holder.id, loc_name, as_ds, in, std::forward< Args >(args)...);
+ detail::Writer< In_Data_Storage >()(grp_id_holder.id, loc.second, as_ds, in, std::forward< Args >(args)...);
}
template < typename In_Data_Storage, typename ...Args >
void write_dataset(const std::string& loc_full_name, const In_Data_Storage& in, Args&& ...args) const
@@ -1601,7 +1624,7 @@ public:
{
std::vector< std::string > res;
Exception::active_path() = loc_full_name;
- assert(group_exists(loc_full_name) or dataset_exists(loc_full_name));
+ assert(group_or_dataset_exists(loc_full_name));
detail::HDF_Object_Holder id_holder(
detail::Util::wrap(H5Oopen, _file_id, loc_full_name.c_str(), H5P_DEFAULT),
detail::Util::wrapped_closer(H5Oclose));
@@ -1619,6 +1642,43 @@ public:
}
return res;
} // get_attr_list
+ // Collect the attributes of the object at `path` into a name -> value map,
+ // reading every value as a string. With `recurse` set, subgroups are visited
+ // breadth-first and their attributes are keyed by their path relative to
+ // `path` (e.g. "sub/group/attr").
+ Attr_Map
+ get_attr_map(std::string const & path, bool recurse = false) const
+ {
+     Attr_Map attr_m;
+     std::queue< std::string > pending;
+     // the empty relative path stands for `path` itself
+     pending.push(std::string());
+     while (not pending.empty())
+     {
+         std::string const rel = pending.front();
+         pending.pop();
+         // prefix a child name with the relative path of the object being visited
+         auto rel_name = [&rel] (std::string const & leaf) {
+             return rel.empty()? leaf : rel + "/" + leaf;
+         };
+         std::string const abs_path = rel.empty()? path : path + "/" + rel;
+         for (auto const & attr_name : get_attr_list(abs_path))
+         {
+             std::string value;
+             read(abs_path + "/" + attr_name, value);
+             attr_m[rel_name(attr_name)] = value;
+         }
+         // only groups have children to descend into
+         if (not recurse or not group_exists(abs_path)) continue;
+         for (auto const & child : list_group(abs_path))
+         {
+             pending.push(rel_name(child));
+         }
+     }
+     return attr_m;
+ } // get_attr_map()
+ // Write every (name, value) entry of attr_m as the attribute path + "/" + name.
+ void
+ add_attr_map(std::string const & path, Attr_Map const & attr_m) const
+ {
+     for (auto it = attr_m.begin(); it != attr_m.end(); ++it)
+     {
+         std::string const attr_path = path + "/" + it->first;
+         write_attribute(attr_path, it->second);
+     }
+ } // add_attr_map()
/// Return a list of struct field names in the given dataset/attribute
std::vector< std::string > get_struct_members(const std::string& loc_full_name) const
{
@@ -1630,11 +1690,9 @@ public:
detail::HDF_Object_Holder type_id_holder;
if (attribute_exists(loc_full_name))
{
- std::string loc_path;
- std::string loc_name;
- std::tie(loc_path, loc_name) = split_full_name(loc_full_name);
+ auto && loc = split_full_name(loc_full_name);
attr_id_holder.load(
- detail::Util::wrap(H5Aopen_by_name, _file_id, loc_path.c_str(), loc_name.c_str(),
+ detail::Util::wrap(H5Aopen_by_name, _file_id, loc.first.c_str(), loc.second.c_str(),
H5P_DEFAULT, H5P_DEFAULT),
detail::Util::wrapped_closer(H5Aclose));
type_id_holder.load(
@@ -1664,35 +1722,173 @@ public:
return res;
} // get_struct_members
+ /*
+ static void copy(File & src_f, File & dst_f, std::string const & path, bool shallow = false)
+ {
+ assert(src_f.is_open());
+ assert(dst_f.is_open());
+ assert(dst_f.is_rw());
+ assert(src_f.group_exists(path) or src_f.dataset_exists(path));
+ assert(not (dst_f.group_exists(path) or dst_f.dataset_exists(path)));
+ detail::HDF_Object_Holder ocpypl_id_holder(
+ detail::Util::wrap(H5Pcreate, H5P_OBJECT_COPY),
+ detail::Util::wrapped_closer(H5Pclose));
+ auto status = hdf5::H5Pset_copy_object(ocpypl_id_holder.id, H5O_COPY_MERGE_COMMITTED_DTYPE_FLAG);
+ if (status < 0) throw Exception("error in H5Pset_copy_object");
+ if (shallow)
+ {
+ status = hdf5::H5Pset_copy_object(ocpypl_id_holder.id, H5O_COPY_SHALLOW_HIERARCHY_FLAG);
+ if (status < 0) throw Exception("error in H5Pset_copy_object");
+ }
+ auto res = hdf5::H5Ocopy(src_f._file_id, path.c_str(),
+ dst_f._file_id, path.c_str(),
+ ocpypl_id_holder.id, H5P_DEFAULT);
+ if (res < 0) throw Exception("error in H5Ocopy");
+ } // copy
+ */
+
+ static void copy_attribute(File const & src_f, File const & dst_f,
+ std::string const & src_full_path, std::string const & _dst_full_path = std::string())
+ {
+ if (not src_f.is_open()) throw Exception("source file not open");
+ if (not dst_f.is_open()) throw Exception("destination file not open");
+ if (not dst_f.is_rw()) throw Exception("destination file not writeable");
+ std::string const & dst_full_path = (_dst_full_path.empty()? src_full_path : _dst_full_path);
+ if (not src_f.attribute_exists(src_full_path)) throw Exception("source attribute missing");
+ if (dst_f.group_or_dataset_exists(dst_full_path) or
+ dst_f.attribute_exists(dst_full_path)) throw Exception("destination path exists");
+ // compute paths
+ auto && src_path = split_full_name(src_full_path);
+ auto && dst_path = split_full_name(dst_full_path);
+ // open source attribute
+ detail::HDF_Object_Holder src_attr_id_holder(
+ detail::Util::wrap(H5Aopen_by_name, src_f._file_id, src_path.first.c_str(), src_path.second.c_str(),
+ H5P_DEFAULT, H5P_DEFAULT),
+ detail::Util::wrapped_closer(H5Aclose));
+ // open source attribute datatype
+ detail::HDF_Object_Holder src_attr_dtype_id_holder(
+ detail::Util::wrap(H5Aget_type, src_attr_id_holder.id),
+ detail::Util::wrapped_closer(H5Tclose));
+ if (hdf5::H5Tget_class(src_attr_dtype_id_holder.id) == H5T_INTEGER)
+ {
+ if (hdf5::H5Tget_sign(src_attr_dtype_id_holder.id) == H5T_SGN_NONE)
+ {
+ unsigned long long tmp;
+ src_f.read(src_full_path, tmp);
+ dst_f.write_attribute(dst_full_path, tmp, src_attr_dtype_id_holder.id);
+ }
+ else if (hdf5::H5Tget_sign(src_attr_dtype_id_holder.id) == H5T_SGN_2)
+ {
+ long long tmp;
+ src_f.read(src_full_path, tmp);
+ dst_f.write_attribute(dst_full_path, tmp, src_attr_dtype_id_holder.id);
+ }
+ else
+ {
+ throw Exception("error in H5Tget_sign");
+ }
+ }
+ else if (hdf5::H5Tget_class(src_attr_dtype_id_holder.id) == H5T_FLOAT)
+ {
+ long double tmp;
+ src_f.read(src_full_path, tmp);
+ dst_f.write_attribute(dst_full_path, tmp, src_attr_dtype_id_holder.id);
+ }
+ else if (hdf5::H5Tget_class(src_attr_dtype_id_holder.id) == H5T_STRING)
+ {
+ std::string tmp;
+ src_f.read(src_full_path, tmp);
+ auto is_varlen = hdf5::H5Tis_variable_str(src_attr_dtype_id_holder.id);
+ if (is_varlen < 0) throw Exception("error in H5Tis_variable_str");
+ if (is_varlen)
+ {
+ dst_f.write_attribute(dst_full_path, tmp, -1);
+ }
+ else
+ {
+ // not varlen; now deal with array-of-size-1 chars
+ int sz = hdf5::H5Tget_size(src_attr_dtype_id_holder.id);
+ if (sz == 0) throw Exception("error in H5Tget_size");
+ detail::HDF_Object_Holder src_attr_dspace_id_holder(
+ detail::Util::wrap(H5Aget_space, src_attr_id_holder.id),
+ detail::Util::wrapped_closer(H5Sclose));
+ auto dspace_type = hdf5::H5Sget_simple_extent_type(src_attr_dspace_id_holder.id);
+ if (dspace_type == H5S_SCALAR)
+ {
+ dst_f.write_attribute(dst_full_path, tmp, 0);
+ }
+ else if (dspace_type == H5S_SIMPLE)
+ {
+ if (sz != 1) throw Exception("unsupported attribute type for copying: extent of string of size > 1");
+ std::vector< char[1] > tmp_v(tmp.size());
+ for (unsigned i = 0; i < tmp.size(); ++i)
+ {
+ tmp_v[i][0] = tmp[i];
+ }
+ dst_f.write_attribute(dst_full_path, tmp_v);
+ }
+ else
+ {
+ throw Exception("error in H5Sget_simple_extent_type");
+ }
+ }
+ }
+ else
+ {
+ throw Exception("unsupported attribute type for copying");
+ }
+ } // copy_attribute
+
+ static void
+ copy_attributes(File const & src_f, File const & dst_f, std::string const & path, bool recurse = false)
+ {
+ auto a_l = src_f.get_attr_list(not path.empty()? path : std::string("/"));
+ for (auto const & a : a_l)
+ {
+ copy_attribute(src_f, dst_f, path + "/" + a);
+ }
+ if (not recurse) return;
+ auto sg_l = src_f.list_group(not path.empty()? path : std::string("/"));
+ for (auto const & sg : sg_l)
+ {
+ if (src_f.group_exists(path + "/" + sg))
+ {
+ copy_attributes(src_f, dst_f, path + "/" + sg, true);
+ }
+ }
+ } // copy_attributes()
private:
std::string _file_name;
hid_t _file_id;
bool _rw;
/// Split a full name into path and name
+ /// full_name must begin with '/', and not end with '/' unless it equals "/"
static std::pair< std::string, std::string > split_full_name(const std::string& full_name)
{
- auto last_slash_pos = full_name.find_last_of('/');
- std::string path = last_slash_pos != std::string::npos? full_name.substr(0, last_slash_pos + 1) : std::string();
- std::string name = last_slash_pos != std::string::npos? full_name.substr(last_slash_pos + 1) : full_name;
- return std::make_pair(path, name);
+ assert(not full_name.empty() and
+ full_name.front() == '/' and
+ (full_name.size() == 1 or full_name.back() != '/'));
+ if (full_name == "/") return std::make_pair(std::string("/"), std::string());
+ auto pos = full_name.find_last_of('/');
+ return (pos != std::string::npos
+ ? std::make_pair(full_name.substr(0, pos > 0? pos : 1), full_name.substr(pos + 1))
+ : std::make_pair(std::string(), std::string()));
} // split_full_name
/// Determine if a path to an element exists
- bool path_exists(const std::string& full_path_name) const
+ bool path_exists(std::string const & full_path_name) const
{
assert(is_open());
- assert(not full_path_name.empty()
- and full_path_name[0] == '/'
- and full_path_name[full_path_name.size() - 1] == '/');
+ assert(not full_path_name.empty() and full_path_name.front() == '/');
+ if (full_path_name == "/") return true;
Exception::active_path() = full_path_name;
// check all path elements exist, except for what is to the right of the last '/'
size_t pos = 0;
- while (true)
+ while (pos != std::string::npos)
{
++pos;
pos = full_path_name.find('/', pos);
- if (pos == std::string::npos) break;
std::string tmp = full_path_name.substr(0, pos);
// check link exists
if (not detail::Util::wrap(H5Lexists, _file_id, tmp.c_str(), H5P_DEFAULT)) return false;
@@ -1711,7 +1907,7 @@ private:
} // path_exists()
/// Check if a group exists
- bool check_object_type(const std::string& loc_full_name, H5O_type_t type_id) const
+ bool check_object_type(std::string const & loc_full_name, H5O_type_t type_id) const
{
// check link exists
if (loc_full_name != "/"
diff --git a/src/huffman-decode.cpp b/src/huffman-decode.cpp
new file mode 100644
index 0000000..80af4a7
--- /dev/null
+++ b/src/huffman-decode.cpp
@@ -0,0 +1,55 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "fast5_pack.hpp"
+#include "logger.hpp"
+
+using namespace std;
+
+int main(int argc, char * argv[])
+{
+ logger::Logger::set_default_level(logger::level::debug);
+ if (argc != 2)
+ {
+ cerr << "use: " << argv[0] << " <codeword_file>" << endl;
+ exit(EXIT_FAILURE);
+ }
+ string cw_fn = argv[1];
+ ifstream cw_f(cw_fn);
+ fast5_pack::Huffman_Diff_Coder hc(cw_f, cw_fn);
+ string l;
+ map< string, string > cw_v_id;
+ vector< uint8_t > cw_v;
+ while (getline(cin, l))
+ {
+ if (l[0] == '#')
+ {
+ istringstream iss(l.substr(1));
+ string tmp0;
+ string tmp1;
+ getline(iss, tmp0, '=');
+ iss >> tmp1;
+ cw_v_id[tmp0] = tmp1;
+ }
+ else
+ {
+ unsigned x;
+ istringstream(l) >> x;
+ cw_v.push_back(x);
+ }
+ }
+ auto val_v = hc.decode<int16_t>(cw_v, cw_v_id);
+ for (auto x : val_v)
+ {
+ cout << x << endl;
+ }
+}
diff --git a/src/huffman-encode.cpp b/src/huffman-encode.cpp
new file mode 100644
index 0000000..20e4c33
--- /dev/null
+++ b/src/huffman-encode.cpp
@@ -0,0 +1,44 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "fast5_pack.hpp"
+#include "logger.hpp"
+
+using namespace std;
+
+int main(int argc, char * argv[])
+{
+ logger::Logger::set_default_level(logger::level::debug);
+ if (argc != 2)
+ {
+ cerr << "use: " << argv[0] << " <codeword_file>" << endl;
+ exit(EXIT_FAILURE);
+ }
+ string cw_fn = argv[1];
+ ifstream cw_f(cw_fn);
+ fast5_pack::Huffman_Diff_Coder hc(cw_f, cw_fn);
+ int16_t x;
+ std::vector< int16_t > val_v;
+ while (cin >> x)
+ {
+ val_v.push_back(x);
+ }
+ auto p = hc.encode(val_v);
+ for (auto const & p2 : p.second)
+ {
+ cout << "#" << p2.first << "=" << p2.second << endl;
+ }
+ for (auto y : p.first)
+ {
+ cout << (int)y << endl;
+ }
+}
diff --git a/src/hufftk b/src/hufftk
new file mode 100755
index 0000000..9706b2e
--- /dev/null
+++ b/src/hufftk
@@ -0,0 +1,171 @@
+#!/usr/bin/env python2
+
+import argparse
+import bisect
+import collections
+import logging
+import os
+
+import fast5
+
+import signal
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+def construct_codewords(val_d, cw_d, n, s, rg):
+ if n < rg[1] + 1:
+ cw_d[n] = s
+ else:
+ construct_codewords(val_d, cw_d, val_d[n][1][0], s + "0", rg)
+ construct_codewords(val_d, cw_d, val_d[n][1][1], s + "1", rg)
+
+def codeword_sort(s0, s1):
+ if len(s0) == len(s1):
+ return [-1, 1][s0 > s1]
+ else:
+ return len(s0) - len(s1)
+
+def load_rw(args):
+ # construct histogram
+ val_d = {val: [1, None] for val in range(args.rw_range[0] - 1, args.rw_range[1] + 1)}
+ #val_d[args.rw_range[0] - 1] = [1, None]
+ for fn in args.input:
+ f = fast5.File(fn)
+ if not f.have_raw_samples():
+ continue
+ a = f.get_raw_int_samples()
+ for i in range(1, len(a)):
+ val = a[i] - a[i-1]
+ if args.rw_range[0] <= val and val <= args.rw_range[1]:
+ val_idx = val
+ else:
+ val_idx = args.rw_range[0] - 1
+ val_d[val_idx][0] += 1
+ return val_d
+
+def load_ed_len(args):
+ val_d = {val: [1, None] for val in range(args.ed_len_range[0] - 1, args.ed_len_range[1] + 1)}
+ #val_d[args.ed_len_range[0] - 1] = [1, None]
+ val_sum = 0
+ val_cnt = 0
+ for fn in args.input:
+ f = fast5.File(fn)
+ if not f.have_eventdetection_events(args.gr):
+ continue
+ d = f.get_eventdetection_events(args.gr)
+ for e in d:
+ val = e.length
+ val_sum += val
+ val_cnt += 1
+ if args.ed_len_range[0] <= val and val <= args.ed_len_range[1]:
+ val_idx = val
+ else:
+ val_idx = args.ed_len_range[0] - 1
+ val_d[val_idx][0] += 1
+ logger.debug("mean val: " + str(float(val_sum)/val_cnt))
+ return val_d
+
+def load_fq_qv(args):
+ val_d = {val: [1, None] for val in range(args.fq_qv_range[0] - 1, args.fq_qv_range[1] + 1)}
+ val_sum = 0
+ val_cnt = 0
+ for fn in args.input:
+ f = fast5.File(fn)
+ if not f.have_basecall_fastq(0, args.gr):
+ continue
+ fq = f.get_basecall_fastq(0, args.gr)
+ qv = fq.split()[3]
+ for c in qv:
+ val = ord(c) - 33
+ val_sum += val
+ val_cnt += 1
+ if args.fq_qv_range[0] <= val and val <= args.fq_qv_range[1]:
+ val_idx = val
+ else:
+ val_idx = args.fq_qv_range[0] - 1
+ val_d[val_idx][0] += 1
+ logger.debug("mean val: " + str(float(val_sum)/val_cnt))
+ return val_d
+
+def run_build_tree(val_d, rg):
+ # dump histogram
+ #for val in range(-args.range_width, args.range_width + 1):
+ # print('%s\t%s\t%s' % (val, val_d[val], float(val_d[val])/val_count))
+ # initialize codes
+ kw_l = [(val_d[val][0], val) for val in range(rg[0] - 1, rg[1] + 1)]
+ kw_l.sort()
+ logger.debug("smallest frequency: " + str(kw_l[:10]))
+ logger.debug("highest frequency: " + str(kw_l[-10:]))
+ next_node = rg[1] + 1
+ # main loop
+ while len(kw_l) > 1:
+ e = list()
+ for i in range(2):
+ e.append(kw_l[0])
+ del kw_l[0]
+ logger.debug('e=' + str(e))
+ val_d[next_node] = [e[0][0] + e[1][0], [e[0][1], e[1][1]]]
+ logger.debug('next_node=' + str(next_node) + ' val=' + str(val_d[next_node]))
+ bisect.insort(kw_l, (val_d[next_node][0], next_node))
+ next_node += 1
+ # construct codewords
+ assert kw_l[0][1] > rg[1]
+ cw_d = dict()
+ construct_codewords(val_d, cw_d, kw_l[0][1], "", rg)
+ cw_key_l = cw_d.keys()
+ cw_key_l.sort(lambda w0, w1: codeword_sort(cw_d[w0], cw_d[w1]))
+ print("{")
+ for cw in cw_key_l:
+ print('"%s", "%s",' % (['.', cw][cw != rg[0] - 1], cw_d[cw]))
+ print("}")
+
+if __name__ == "__main__":
+ description = """
+ Toolkit for constructing Huffman codes for encoding fast5 files.
+ """
+ parser = argparse.ArgumentParser(description=description, epilog="")
+ parser.add_argument("--log-level", default="info",
+ help="log level")
+ parser.add_argument("--gr", type=str, default="",
+ help="Group")
+ parser.add_argument("--rw-range", default=[-100,100],
+ help="Encoding range for raw sample differences.")
+ parser.add_argument("--ed-skip-range", default=[0,1],
+ help="Encoding range for ed skip values.")
+ parser.add_argument("--ed-len-range", default=[1,100],
+ help="Encoding range for ed length values.")
+ parser.add_argument("--fq-qv-range", default=[0,31],
+ help="Encoding range for fq qv values.")
+ parser.add_argument("command", choices=["rw", "ed-skip", "ed-len", "fq-qv"])
+ parser.add_argument("input", nargs="*",
+ help="Fast5 file")
+ args = parser.parse_args()
+
+ numeric_log_level = getattr(logging, args.log_level.upper(), None)
+ if not isinstance(numeric_log_level, int):
+ raise ValueError("Invalid log level: '%s'" % args.log_level)
+ logging.basicConfig(level=numeric_log_level,
+ format="%(asctime)s %(name)s.%(levelname)s %(message)s",
+ datefmt="%Y/%m/%d %H:%M:%S")
+ logger = logging.getLogger(os.path.basename(__file__))
+ logger.debug("args: " + str(args))
+
+ if type(args.rw_range) != list:
+ args.rw_range = list((int(i) for i in args.rw_range.split(',')))[:2]
+ assert args.rw_range[0] < args.rw_range[1]
+ if type(args.ed_skip_range) != list:
+ args.ed_skip_range = list((int(i) for i in args.ed_skip_range.split(',')))[:2]
+ assert args.ed_skip_range[0] < args.ed_skip_range[1]
+ if type(args.ed_len_range) != list:
+ args.ed_len_range = list((int(i) for i in args.ed_len_range.split(',')))[:2]
+ assert args.ed_len_range[0] < args.ed_len_range[1]
+ logger.debug("args: " + str(args))
+
+ if args.command == "rw":
+ d = load_rw(args)
+ run_build_tree(d, args.rw_range)
+ elif args.command == "ed-len":
+ d = load_ed_len(args)
+ run_build_tree(d, args.ed_len_range)
+ elif args.command == "fq-qv":
+ d = load_fq_qv(args)
+ run_build_tree(d, args.fq_qv_range)
diff --git a/src/logger.hpp b/src/logger.hpp
new file mode 100644
index 0000000..2c69592
--- /dev/null
+++ b/src/logger.hpp
@@ -0,0 +1,378 @@
+/// Part of: https://github.com/mateidavid/hpptools
+/// Commit: 5a6f39c
+
+/// @author Matei David, Ontario Institute for Cancer Research
+/// @version 1.0
+/// @date 2013-2017
+/// @copyright MIT Public License
+///
+/// Logger mechanism.
+///
+/// Properties:
+/// - thread-safe, non-garbled output (uses c++11's thread_local)
+/// - customizable ostream sink. by default, uses std::clog
+///
+/// Exports:
+/// - macro: LOG (takes 1, 2, or 3 arguments, see below)
+/// - macros: LOG_EXIT_, LOG_ABORT, LOG_EXIT, LOG_THROW_, and LOG_THROW
+/// - namespace logger
+/// - enum logger::level
+/// - class logger::Logger
+///
+/// To use:
+/// - In source code, use:
+///
+/// LOG(info) << "hello" << endl;
+/// // or
+/// LOG("main", info) << "hello" << endl;
+/// // or
+/// LOG("main", info, sink_os) << "hello" << endl;
+///
+/// Here, "main" is the facility (a string) and info is the message level.
+/// Note that LOG is a macro which knows how to look up the name info
+/// inside the logger namespace. The macro introduces C++ code equivalent to:
+///
+/// if (...message should be ignored...) then; else sink_os
+///
+/// NOTE: As with assert(), the code in the output stream following the
+/// LOG() macro will ***not be executed*** if the level of the message is
+/// numerically greater (i.e. more verbose) than the log level of the facility.
+///
+/// - To set the default log level (for unspecified facilities):
+///
+/// logger::Logger::set_default_level(s);
+///
+/// - To set the log level for the "main" facility:
+///
+/// logger::Logger::set_facility_level("main", s);
+///
+/// - To parse a log facility level setting in the form "[<facility>:]<level>":
+///
+/// logger::Logger::set_level_from_option("alt:debug1", &cerr);
+///
+/// - By using these functions, one can set log levels using command-line
+/// parameters and achieve dynamic log level settings without recompiling.
+///
+/// - The macros LOG_EXIT_, LOG_ABORT, LOG_EXIT, LOG_THROW_, and LOG_THROW
+/// provide a way to specify what to do after logging the message.
+///
+/// LOG_EXIT_(exit_code) << "print this message to std::cerr, call std::exit(exit_code)";
+/// LOG_ABORT << "print this message to std::cerr, call std::abort()";
+/// LOG_EXIT << "print this message to std::cerr, call std::exit(EXIT_FAILURE)";
+/// LOG_THROW_(Exception) << "throw Exception with this message";
+/// LOG_THROW << "throw std::runtime_error with this message";
+
+#ifndef __LOGGER_HPP
+#define __LOGGER_HPP
+
+#include <string>
+#include <vector>
+#include <map>
+#include <sstream>
+#include <iostream>
+#include <mutex>
+#include <stdexcept>
+
+namespace logger
+{
+
+// log levels
+enum level
+{
+ error = 0,
+ warning,
+ info,
+ debug,
+ debug1,
+ debug2
+};
+
+class Logger
+{
+public:
+ // Constructor: initialize buffer.
+ Logger(std::string const & facility, level msg_level,
+ std::string const & file_name, unsigned line_num, std::string const & func_name,
+ std::ostream & os = std::clog)
+ : _os_p(&os)
+ {
+ _oss << "= " << facility << "." << int(msg_level)
+ << " " << file_name << ":" << line_num << " " << func_name << " ";
+ _on_destruct = [&] () {
+ _os_p->write(_oss.str().c_str(), _oss.str().size());
+ };
+ }
+ // Constructor for exiting
+ Logger(int exit_code,
+ std::string const & file_name, unsigned line_num, std::string const & func_name,
+ std::ostream & os = std::cerr)
+ : _os_p(&os), _exit_code(exit_code)
+ {
+ _oss << file_name << ":" << line_num << " " << func_name << " ";
+ _on_destruct = [&] () {
+ _os_p->write(_oss.str().c_str(), _oss.str().size());
+ if (_exit_code < 0)
+ {
+ std::abort();
+ }
+ else
+ {
+ std::exit(_exit_code);
+ }
+ };
+ }
+ // Constructor for throwing exceptions
+ // first argument is only used to deduce the template argument type
+ template <typename Exception>
+ Logger(Exception const &,
+ std::string const & file_name, unsigned line_num, std::string const & func_name,
+ typename std::enable_if<std::is_base_of<std::exception, Exception>::value>::type * = 0)
+ {
+ _oss << file_name << ":" << line_num << " " << func_name << " ";
+ _on_destruct = [&] () {
+ throw Exception(_oss.str());
+ };
+ }
+ // Destructor: dump buffer to output.
+ ~Logger() noexcept(false)
+ {
+ _on_destruct();
+ }
+ // Produce l-value for output chaining.
+ std::ostream & l_value() { return _oss; }
+
+ // static methods for setting and getting facility log levels.
+ static level get_default_level()
+ {
+ return default_level();
+ }
+ static void set_default_level(level l)
+ {
+ static std::mutex m;
+ std::lock_guard<std::mutex> lg(m);
+ default_level() = l;
+ }
+ static void set_default_level(int l)
+ {
+ set_default_level(get_level(l));
+ }
+ static void set_default_level(std::string const & s)
+ {
+ set_default_level(get_level(s));
+ }
+ static level get_facility_level(std::string const & facility)
+ {
+ return (facility_level_map().count(facility) > 0?
+ facility_level_map().at(facility) : get_default_level());
+ }
+ static void set_facility_level(std::string const & facility, level l)
+ {
+ static std::mutex m;
+ std::lock_guard<std::mutex> lg(m);
+ facility_level_map()[facility] = l;
+ }
+ static void set_facility_level(std::string const & facility, int l)
+ {
+ set_facility_level(facility, get_level(l));
+ }
+ static void set_facility_level(std::string const & facility, std::string const & s)
+ {
+ set_facility_level(facility, get_level(s));
+ }
+ // static methods for setting log levels from command-line options
+ static void set_level_from_option(std::string const & l, std::ostream * os_p = nullptr)
+ {
+ size_t i = l.find(':');
+ if (i == std::string::npos)
+ {
+ set_default_level(l);
+ if (os_p)
+ {
+ (*os_p) << "set default log level to: "
+ << static_cast<int>(Logger::get_default_level()) << std::endl;
+ }
+ }
+ else
+ {
+ set_facility_level(l.substr(0, i), l.substr(i + 1));
+ if (os_p)
+ {
+ (*os_p) << "set log level of '" << l.substr(0, i) << "' to: "
+ << static_cast<int>(Logger::get_facility_level(l.substr(0, i))) << std::endl;
+ }
+ }
+ }
+ static void set_levels_from_options(std::vector<std::string> const & v, std::ostream * os_p = nullptr)
+ {
+ for (auto const & l : v)
+ {
+ set_level_from_option(l, os_p);
+ }
+ }
+ // public static utility functions (used by LOG macro)
+ static level get_level(level l) { return l; }
+ static level get_level(int i) { return static_cast<level>(i); }
+ static level get_level(std::string const & s) { return level_from_string(s); }
+ // public static member (used by LOG macro)
+ static level& thread_local_last_level()
+ {
+ static thread_local level _last_level = error;
+ return _last_level;
+ }
+private:
+ std::ostringstream _oss;
+ std::function<void()> _on_destruct;
+ std::ostream * _os_p;
+ int _exit_code;
+
+ // private static data members
+ static level & default_level()
+ {
+ static level _default_level = error;
+ return _default_level;
+ }
+ static std::map<std::string, level> & facility_level_map()
+ {
+ static std::map<std::string, level> _facility_level_map;
+ return _facility_level_map;
+ }
+ // private static utility functions
+ static level level_from_string(std::string const & s)
+ {
+ std::istringstream iss(s + "\n");
+ int tmp_int = -1;
+ iss >> tmp_int;
+ if (iss.good())
+ {
+ return level(tmp_int);
+ }
+ else
+ {
+ if (s == "error") return logger::error;
+ else if (s == "warning") return logger::warning;
+ else if (s == "info") return logger::info;
+ else if (s == "debug") return logger::debug;
+ else if (s == "debug1") return logger::debug1;
+ else if (s == "debug2") return logger::debug2;
+ else
+ {
+ std::ostringstream oss;
+ oss << "could not parse log level: " << s;
+ throw std::invalid_argument(oss.str());
+ }
+ }
+ }
+}; // class Logger
+
+} //namespace logger
+
+#define __FILENAME__ (std::string(__FILE__).find('/') != std::string::npos? std::string(__FILE__).substr(std::string(__FILE__).rfind('/') + 1) : std::string(__FILE__))
+
+/**
+ * LOG macro
+ *
+ * Synopsis:
+ * LOG(facility, level_spec, sink) << message
+ * LOG(facility, level_spec) << message
+ * LOG(level_spec) << message
+ *
+ * `facility` : string
+ * `level_spec` : integer, string, or logger level
+ * `sink` : sink ostream
+ *
+ * Log to `facility` at logger level `level_spec` and dump output to `sink`.
+ * If sink is omitted, it defaults to std::clog.
+ * If `facility` is omitted (LOG has a single argument), the macro LOG_FACILITY
+ * is used instead, defaulting to "main".
+ */
+
+#define __LOG_3(facility, level_spec, sink) \
+ { using namespace logger; logger::Logger::thread_local_last_level() = logger::Logger::get_level(level_spec); } \
+ if (logger::Logger::thread_local_last_level() > logger::Logger::get_facility_level(facility)) ; \
+ else logger::Logger(facility, logger::Logger::thread_local_last_level(), __FILENAME__, __LINE__, __func__, sink).l_value()
+
+#define __LOG_2(facility, level_spec) \
+ { using namespace logger; logger::Logger::thread_local_last_level() = logger::Logger::get_level(level_spec); } \
+ if (logger::Logger::thread_local_last_level() > logger::Logger::get_facility_level(facility)) ; \
+ else logger::Logger(facility, logger::Logger::thread_local_last_level(), __FILENAME__, __LINE__, __func__).l_value()
+
+#define __LOG_1(level_spec) \
+ __LOG_2(LOG_FACILITY, level_spec)
+
+// we need 2-level indirection in order to trigger expansion after token pasting
+// http://stackoverflow.com/questions/1597007/creating-c-macro-with-and-line-token-concatenation-with-positioning-macr
+// http://stackoverflow.com/a/11763196/717706
+#ifdef WIN32
+#define __EXPAND(...) __VA_ARGS__
+#define __LOG_aux2(N, ...) __EXPAND(__LOG_ ## N (__VA_ARGS__))
+#else
+#define __LOG_aux2(N, ...) __LOG_ ## N (__VA_ARGS__)
+#endif
+
+#define __LOG_aux1(N, ...) __LOG_aux2(N, __VA_ARGS__)
+
+#define __NARGS_AUX(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, ...) _9
+
+#ifdef WIN32
+#define __NARGS(...) __EXPAND(__NARGS_AUX(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0))
+#else
+#define __NARGS(...) __NARGS_AUX(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0)
+#endif
+
+#ifndef LOG_FACILITY
+#define LOG_FACILITY "main"
+#endif
+
+#define LOG(...) __LOG_aux1(__NARGS(__VA_ARGS__), __VA_ARGS__)
+
+#define LOG_EXIT_(exit_code) logger::Logger((exit_code), __FILENAME__, __LINE__, __func__).l_value()
+#define LOG_ABORT LOG_EXIT_(-1)
+#define LOG_EXIT LOG_EXIT_(EXIT_FAILURE)
+
+#define LOG_THROW_(Exception) logger::Logger(Exception(""), __FILENAME__, __LINE__, __func__).l_value()
+#define LOG_THROW LOG_THROW_(std::runtime_error)
+
+#endif
+
+#ifdef SAMPLE_LOGGER
+
+/*
+
+Compile:
+
+g++ -std=c++11 -D SAMPLE_LOGGER -x c++ logger.hpp -o sample-logger
+
+Run:
+./sample-logger info
+./sample-logger info alt:debug1
+
+*/
+
+using namespace std;
+
+int main(int argc, char* argv[])
+{
+ if (argc < 2)
+ {
+ cerr << "Use: " << argv[0] << " <log_level_setting> ..." << endl
+ << "The program sends 5 log messages with decreasing priority (0=highest, 4=lowest)" << endl
+ << "to 2 facilities \"main\" and \"alt\". Command-line arguments are interpreted as" << endl
+ << "log facility level settings in the form [<facility>:]<level>." << endl;
+ return EXIT_FAILURE;
+ }
+ for (int i = 1; i < argc; ++i)
+ {
+ cerr << "processing argument [" << argv[i] << "]" << endl;
+ logger::Logger::set_level_from_option(argv[i], &cerr);
+ }
+ vector<string> const level_name{ "error", "warning", "info", "debug", "debug1", "debug2" };
+ for (int l = 0; l < 5; ++l)
+ {
+ LOG(level_name[l]) << "message at level " << l << " (" << level_name[l]
+ << ") for facility main" << endl;
+ LOG("alt", l) << "message at level " << l << " (" << level_name[l]
+ << ") for facility alt" << endl;
+ }
+}
+
+#endif
diff --git a/src/tmp.cpp b/src/tmp.cpp
index fb9a363..a767537 100644
--- a/src/tmp.cpp
+++ b/src/tmp.cpp
@@ -1,3 +1,10 @@
+//
+// Part of: https://github.com/mateidavid/fast5
+//
+// Copyright (c) 2015-2017 Matei David, Ontario Institute for Cancer Research
+// MIT License
+//
+
#include <cassert>
#include <exception>
#include <functional>
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fast5.git
More information about the debian-med-commit
mailing list