[med-svn] [deepnano] 01/02: Imported Upstream version 0.0+20110617
Andreas Tille
tille at debian.org
Tue Sep 13 12:31:35 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository deepnano.
commit 14a7251ddbb75f3b3b8c7d7059c49dbe4943acb0
Author: Andreas Tille <tille at debian.org>
Date: Tue Sep 13 14:23:39 2016 +0200
Imported Upstream version 0.0+20110617
---
LICENSE | 24 +++
README.md | 102 +++++++++++
align_2d.cc | 161 +++++++++++++++++
basecall.py | 184 ++++++++++++++++++++
basecall_no_metrichor.py | 276 ++++++++++++++++++++++++++++++
basecall_no_metrichor_devel.py | 371 ++++++++++++++++++++++++++++++++++++++++
helpers.py | 76 ++++++++
nets_data/map5-2d.npz | Bin 0 -> 5482026 bytes
nets_data/map5comp.npz | Bin 0 -> 1718506 bytes
nets_data/map5temp.npz | Bin 0 -> 1718506 bytes
nets_data/map6-2d-big.npz | Bin 0 -> 15126826 bytes
nets_data/map6-2d-no-metr.npz | Bin 0 -> 15126826 bytes
nets_data/map6-2d-no-metr10.npz | Bin 0 -> 15126826 bytes
nets_data/map6-2d-no-metr20.npz | Bin 0 -> 15126826 bytes
nets_data/map6-2d-no-metr23.npz | Bin 0 -> 15126826 bytes
nets_data/map6-2d.npz | Bin 0 -> 5482026 bytes
nets_data/map6comp.npz | Bin 0 -> 1718506 bytes
nets_data/map6temp.npz | Bin 0 -> 1718506 bytes
rnn_fin.py | 81 +++++++++
training/README | 28 +++
training/helpers.py | 31 ++++
training/prepare_dataset.py | 92 ++++++++++
training/realign.cc | 173 +++++++++++++++++++
training/rnn.py | 105 ++++++++++++
training/train.py | 166 ++++++++++++++++++
25 files changed, 1870 insertions(+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f1dee35
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2016, Vladimir Boza, Comenius University
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Comenius University nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL COMENIUS UNIVERSITY BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d819a45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+### DeepNano: alternative basecaller for MinION reads
+
+DeepNano is an alternative basecaller for Oxford Nanopore MinION reads,
+based on deep recurrent neural networks.
+
+Currently it works with the SQK-MAP-006 and SQK-MAP-005 chemistries, as a postprocessor for Metrichor.
+
+Here are our benchmarks for the SQK-MAP-006 chemistry, comparing mapping accuracy (we trained on reads aligning to one half of the
+E. coli genome and tested on the other half of E. coli and on Klebsiella):
+
+| | Ecoli Metrichor | Ecoli DeepNano | Klebsiella Metrichor | Klebsiella DeepNano |
+|------------------|-----------------|----------------|----------------------|---------------------|
+| Template reads | 71.3% | 77.9% | 68.1% | 76.3% |
+| Complement reads | 71.4% | 76.4% | 69.5% | 75.7% |
+| 2D reads | 86.8% | 88.5% | 84.8% | 86.7% |
+
+Links to datasets with reads:
+
+- http://www.ebi.ac.uk/ena/data/view/ERR1147230
+- https://www.ebi.ac.uk/ena/data/view/SAMEA3713789
+
+
+Requirements
+================
+
+We use Python 2.7.
+
+Here are the versions of the Python packages that we used:
+
+- Cython==0.23.4
+- numpy==1.10.2
+- h5py==2.5.0
+- Theano==0.7.0
+- python-dateutil==2.5.0
+
+Basic usage:
+================
+
+For SQK-MAP-006 chemistry just use:
+
+`OMP_NUM_THREADS=1 python basecall.py <list of fast5 files>`
+
+or
+
+`OMP_NUM_THREADS=1 python basecall.py --directory <directory with reads>`
+
+It outputs basecalls for template, complement, and 2D into a file named output.fasta.
+
+For SQK-MAP-005 chemistry use:
+
+`OMP_NUM_THREADS=1 python basecall.py --template_net nets_data/map5temp.npz --complement_net nets_data/map5comp.npz --big_net nets_data/map5-2d.npz <list of fast5 files>`
+
+Advanced arguments:
+=================
+
+- `-h` - prints help message
+- `--template_net PATH` - path to network which basecalls template (has reasonable default)
+- `--complement_net PATH` - path to network which basecalls complement (has reasonable default)
+- `--big_net PATH` - path to network which basecalls 2D (has reasonable default)
+- `--timing` - if set, display timing information for each read
+- `--type template/complement/2d/all` - type of basecalling output (defaults to all)
+- `--output FILENAME` - output filename
+- `--output_orig` - if set, outputs also Metrichor basecalls
+- `--directory DIRECTORY` - directory where read files are stored
+
+Usage without Metrichor:
+================
+
+The basecaller above takes the split into template and
+complement, the scaling parameters, and the alignment from Metrichor.
+
+The tool below does not require Metrichor, at some expense in accuracy.
+Currently the accuracy of the 1D basecall is almost the same as for the tool above
+(except for some reads where we badly misestimate the scaling).
+The accuracy of the 2D basecall is approximately 0.5% lower than Metrichor's (whereas
+the tool above was 2% better than Metrichor).
+
+To use this tool, first compile alignment code:
+`g++ -O2 -std=gnu++0x align_2d.cc -o align_2d`
+
+Then run:
+
+`OMP_NUM_THREADS=1 python basecall_no_metrichor.py <list of fast5 files>`
+
+Arguments:
+
+- `-h` - prints help message
+- `--template_net PATH` - path to network which basecalls template (has reasonable default)
+- `--complement_net PATH` - path to network which basecalls complement (has reasonable default)
+- `--big_net PATH` - path to network which basecalls 2D (has reasonable default)
+- `--type template/complement/2d/all` - type of basecalling output (defaults to all)
+- `--output FILENAME` - output filename
+- `--directory DIRECTORY` - directory where read files are stored
+
+If you want to watch a directory for new files, first install:
+
+- watchdog==0.8.3
+
+Then use the following (the output parameter has no effect; a separate fasta file is written for each fast5 file):
+
+`OMP_NUM_THREADS=1 python basecall_no_metrichor.py --watch <directory name>`
+
diff --git a/align_2d.cc b/align_2d.cc
new file mode 100644
index 0000000..2dbfd99
--- /dev/null
+++ b/align_2d.cc
@@ -0,0 +1,161 @@
+#include <cstdio>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cmath>
+
+using namespace std;
+
+struct Prob {
+ double p[4];
+ double n;
+};
+
+Prob LoadProb(int parity) {
+ Prob p;
+ scanf("%lf %lf %lf %lf %lf", &p.p[0], &p.p[1], &p.p[2], &p.p[3], &p.n);
+ for (int i = 0; i < 4; i++) {
+ p.p[i] = log(p.p[i]);
+ }
+ p.n = log(p.n);
+ return p;
+}
+
+Prob LoadProbC(int parity) {
+ Prob p;
+ scanf("%lf %lf %lf %lf %lf", &p.p[3], &p.p[2], &p.p[1], &p.p[0], &p.n);
+ for (int i = 0; i < 4; i++) {
+ p.p[i] = log(p.p[i]);
+ }
+ p.n = log(p.n);
+ return p;
+}
+
+int main() {
+ vector<Prob> temp, comp;
+ int tc, cc;
+ scanf("%d", &tc);
+ for (int i = 0; i < tc; i++) {
+ temp.push_back(LoadProb(i%2));
+ }
+ for (int i = 0; i < tc; i+=2) {
+ temp[i].n -= log(0.9) / 2;
+ }
+ scanf("%d", &cc);
+ for (int i = 0; i < cc; i++) {
+ comp.push_back(LoadProbC(i%2));
+ }
+ for (int i = 0; i < cc; i+=2) {
+ comp[i].n -= log(0.9) / 2;
+ }
+ reverse(comp.begin(), comp.end());
+ vector<vector<double>> probs(temp.size()+1, vector<double>(comp.size()+1));
+ // 0 - 3 ACGT
+ // 4 - Ncomp
+ // 5 - Ntemp
+ vector<vector<int>> prevs(temp.size()+1, vector<int>(comp.size()+1));
+
+ double minf = -1000000000000.0;
+ double pen = -0.5;
+ probs[0][0] = 0;
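+  // DP over prefix alignments: probs[i][j] is the best score for aligning
+  // the first i template events to the first j complement events; prevs
+  // records the move (0-3: both strands emit base ACGT, 4: consume a
+  // complement event only, 5: consume a template event only).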
+ for (int i = 0; i < probs.size(); i++) {
+ for (int j = 0; j < probs[i].size(); j++) {
+ if (i == 0 && j == 0) continue;
+ probs[i][j] = 0;
+// if (i == 0 && (j < 2000 && j < probs[i].size() / 4)) probs[i][j] = 0;
+// if (j == 0 && (i < 2000 && i < probs.size() / 4)) probs[i][j] = 0;
+ prevs[i][j] = 6;
+ if (j > 0) {
+ double np = probs[i][j-1] + comp[j-1].n - pen;
+ if (np > probs[i][j]) {
+ prevs[i][j] = 4;
+ probs[i][j] = np;
+ }
+ }
+ if (i > 0) {
+ double np = probs[i-1][j] + temp[i-1].n - pen;
+ if (np > probs[i][j]) {
+ prevs[i][j] = 5;
+ probs[i][j] = np;
+ }
+ }
+ if (i > 0 && j > 0) {
+ for (int k = 0; k < 4; k++) {
+ double np = probs[i-1][j-1] + (temp[i-1].p[k] + comp[j-1].p[k]) - 2*pen;
+ if (np > probs[i][j]) {
+ prevs[i][j] = k;
+ probs[i][j] = np;
+ }
+ }
+ }
+ }
+ }
+ fprintf(stderr, "%lf\n", probs.back().back());
+
+ char alph[] = "ACGT";
+ string seq;
+ int ipos = temp.size();
+ int jpos = comp.size();
+/* int margin = min(2000, (int)temp.size() / 4);
+ for (int i = temp.size(); i >= temp.size() - margin && i >= 0; i--) {
+ if (probs[i][comp.size()] > probs[ipos][jpos]) {
+ ipos = i;
+ jpos = comp.size();
+ }
+ }
+ margin = min(2000, (int)comp.size() / 4);
+ for (int j = comp.size(); j >= comp.size() - margin && j >= 0; j--) {
+ if (probs[temp.size()][j] > probs[ipos][jpos]) {
+ ipos = temp.size();
+ jpos = j;
+ }
+ }*/
+ for (int i = 0; i < temp.size(); i++) {
+ for (int j = 0; j < comp.size(); j++) {
+ if (probs[i][j] > probs[ipos][jpos]) {
+ ipos = i;
+ jpos = j;
+ }
+ }
+ }
+
+ vector<pair<int, int>> trace;
+ while (ipos > 0 && jpos > 0) {
+ if (prevs[ipos][jpos] == 6) {
+ break;
+ }
+ trace.push_back(make_pair(ipos, jpos));
+ if (prevs[ipos][jpos] == 4) {
+ jpos--;
+ } else if (prevs[ipos][jpos] == 5) {
+ ipos--;
+ } else {
+ seq += alph[prevs[ipos][jpos]];
+ ipos--;
+ jpos--;
+ }
+ }
+ reverse(trace.begin(), trace.end());
+  fprintf(stderr, "%d\n", (int)seq.size());
+ reverse(seq.begin(), seq.end());
+ printf("%s\n", seq.c_str());
+
+ int last_temp = -47;
+ int last_comp = -47;
+ for (int i = 10; i + 10 < trace.size(); i++) {
+ auto t = trace[i];
+ int temp_al = -1;
+ int comp_al = -1;
+ if (t.first != last_temp && t.first % 2 == 0) {
+ temp_al = (t.first - 1) / 2;
+ }
+ if (t.second != last_comp && t.second % 2 == 1) {
+ comp_al = comp.size() / 2 - 2 - t.second / 2;
+ }
+ if (temp_al != -1 || comp_al != -1) {
+ printf("%d %d\n", temp_al, comp_al);
+ }
+ last_temp = t.first;
+ last_comp = t.second;
+ }
+}
diff --git a/basecall.py b/basecall.py
new file mode 100644
index 0000000..0026f54
--- /dev/null
+++ b/basecall.py
@@ -0,0 +1,184 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+from helpers import *
+
+def load_read_data(read_file):
+ h5 = h5py.File(read_file, "r")
+ ret = {}
+
+ extract_timing(h5, ret)
+
+ base_loc = get_base_loc(h5)
+
+ try:
+ ret["called_template"] = h5[base_loc+"/BaseCalled_template/Fastq"][()].split('\n')[1]
+ ret["called_complement"] = h5[base_loc+"/BaseCalled_complement/Fastq"][()].split('\n')[1]
+ ret["called_2d"] = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Fastq"][()].split('\n')[1]
+ except Exception as e:
+ pass
+ try:
+ events = h5[base_loc+"/BaseCalled_template/Events"]
+ tscale, tscale_sd, tshift, tdrift = extract_scaling(h5, "template", base_loc)
+ ret["temp_events"] = extract_1d_event_data(
+ h5, "template", base_loc, tscale, tscale_sd, tshift, tdrift)
+ except:
+ pass
+
+ try:
+ cscale, cscale_sd, cshift, cdrift = extract_scaling(h5, "complement", base_loc)
+ ret["comp_events"] = extract_1d_event_data(
+ h5, "complement", base_loc, cscale, cscale_sd, cshift, cdrift)
+ except Exception as e:
+ pass
+
+ try:
+ al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+ temp_events = h5[base_loc+"/BaseCalled_template/Events"]
+ comp_events = h5[base_loc+"/BaseCalled_complement/Events"]
+ ret["2d_events"] = []
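+        # Each alignment row pairs a template event index with a complement
+        # event index (-1 = gap); each 2D feature vector is a presence flag
+        # plus four preprocessed event features per strand.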
+ for a in al:
+ ev = []
+ if a[0] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = temp_events[a[0]]
+                mean = (e["mean"] - tshift) / tscale
+ stdv = e["stdv"] / tscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ if a[1] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = comp_events[a[1]]
+ mean = (e["mean"] - cshift) / cscale
+ stdv = e["stdv"] / cscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ ret["2d_events"].append(ev)
+ ret["2d_events"] = np.array(ret["2d_events"], dtype=np.float32)
+ except Exception as e:
+ print e
+ pass
+
+ h5.close()
+ return ret
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-big.npz")
+parser.add_argument('reads', type=str, nargs='*')
+parser.add_argument('--timing', action='store_true', default=False)
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all; use a comma to separate multiple options, e.g. template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--output_orig', action='store_true', default=False)
+parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+ do_template = True
+if "all" in types or "complement" in types:
+ do_complement = True
+if "all" in types or "2d" in types:
+ do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+assert len(args.reads) != 0 or len(args.directory) != 0, "Nothing to basecall"
+
+if do_template:
+ print "loading template net"
+ temp_net = RnnPredictor(args.template_net)
+ print "done"
+if do_complement:
+ print "loading complement net"
+ comp_net = RnnPredictor(args.complement_net)
+ print "done"
+if do_2d:
+ print "loading 2D net"
+ big_net = RnnPredictor(args.big_net)
+ print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+fo = open(args.output, "w")
+
+total_bases = [0, 0, 0]
+
+files = args.reads
+if len(args.directory):
+ files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]
+
+for i, read in enumerate(files):
+ basename = os.path.basename(read)
+ try:
+ data = load_read_data(read)
+ except Exception as e:
+ print "error at file", read
+ print e
+ continue
+ if not data:
+ continue
+ print "\rcalling read %d/%d %s" % (i, len(files), read),
+ sys.stdout.flush()
+ if args.output_orig:
+ try:
+ if "called_template" in data:
+ print >>fo, ">%s_template" % basename
+ print >>fo, data["called_template"]
+ if "called_complement" in data:
+ print >>fo, ">%s_complement" % basename
+ print >>fo, data["called_complement"]
+ if "called_2d" in data:
+ print >>fo, ">%s_2d" % basename
+ print >>fo, data["called_2d"]
+ except:
+ pass
+
+ temp_start = datetime.datetime.now()
+ if do_template and "temp_events" in data:
+ predict_and_write(data["temp_events"], temp_net, fo, "%s_template_rnn" % basename)
+ temp_time = datetime.datetime.now() - temp_start
+
+ comp_start = datetime.datetime.now()
+ if do_complement and "comp_events" in data:
+ predict_and_write(data["comp_events"], comp_net, fo, "%s_complement_rnn" % basename)
+ comp_time = datetime.datetime.now() - comp_start
+
+ start_2d = datetime.datetime.now()
+ if do_2d and "2d_events" in data:
+ predict_and_write(data["2d_events"], big_net, fo, "%s_2d_rnn" % basename)
+ time_2d = datetime.datetime.now() - start_2d
+
+ if args.timing:
+ try:
+ print "Events: %d/%d" % (len(data["temp_events"]), len(data["comp_events"]))
+ print "Our times: %f/%f/%f" % (temp_time.total_seconds(), comp_time.total_seconds(),
+ time_2d.total_seconds())
+ print "Our times per base: %f/%f/%f" % (
+ temp_time.total_seconds() / len(data["temp_events"]),
+ comp_time.total_seconds() / len(data["comp_events"]),
+ time_2d.total_seconds() / (len(data["comp_events"]) + len(data["temp_events"])))
+ print "Their times: %f/%f/%f" % (data["temp_time"].total_seconds(), data["comp_time"].total_seconds(), data["2d_time"].total_seconds())
+ print "Their times per base: %f/%f/%f" % (
+ data["temp_time"].total_seconds() / len(data["temp_events"]),
+ data["comp_time"].total_seconds() / len(data["comp_events"]),
+ data["2d_time"].total_seconds() / (len(data["comp_events"]) + len(data["temp_events"])))
+ except:
+ # Don't let timing throw us out
+ pass
+ fo.flush()
+fo.close()
diff --git a/basecall_no_metrichor.py b/basecall_no_metrichor.py
new file mode 100644
index 0000000..6915843
--- /dev/null
+++ b/basecall_no_metrichor.py
@@ -0,0 +1,276 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+from helpers import *
+import subprocess
+import time
+
+def get_scaling_template(events, has_std):
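+    # Estimate scaling without Metrichor: map the 10th/90th percentiles of
+    # the observed event means onto reference pore levels (down/up); the
+    # shift expression is equivalent to our_up - up * scale.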
+ down = 48.4631279889
+ up = 65.7312554591
+ our_down = np.percentile(events["mean"], 10)
+ our_up = np.percentile(events["mean"], 90)
+ scale = (our_up - our_down) / (up - down)
+ shift = (our_up / scale - up) * scale
+
+ sd = 0.807981325017
+ if has_std:
+ return scale, np.percentile(events["stdv"], 50) / sd, shift
+ else:
+ return scale, np.sqrt(np.percentile(events["variance"], 50)) / sd, shift
+
+
+def get_scaling_complement(events, has_std):
+ down = 49.2638926877
+ up = 69.0192568072
+ our_down = np.percentile(events["mean"], 10)
+ our_up = np.percentile(events["mean"], 90)
+ scale = (our_up - our_down) / (up - down)
+ shift = (our_up / scale - up) * scale
+
+ sd = 1.04324844612
+ if has_std:
+ return scale, np.percentile(events["stdv"], 50) / sd, shift
+ else:
+ return scale, np.sqrt(np.percentile(events["variance"], 50)) / sd, shift
+
+def template_complement_loc(events):
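+    # Split template from complement without Metrichor: look for runs of
+    # abnormally high-current (abasic, hairpin) events near the middle of
+    # the read and cut the event stream around the best such run.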
+ abasic_level = np.percentile(events["mean"], 99) + 5
+ abasic_locs = (events["mean"] > abasic_level).nonzero()[0]
+ last = -47
+ run_len = 1
+ runs = []
+ for x in abasic_locs:
+ if x - last == 1:
+ run_len += 1
+ else:
+ if run_len >= 5:
+ if len(runs) and last - runs[-1][0] < 50:
+ run_len = last - runs[-1][0]
+ run_len += runs[-1][1]
+ runs[-1] = (last, run_len)
+ else:
+ runs.append((last, run_len))
+ run_len = 1
+ last = x
+ to_sort = []
+ mid = len(events) / 2
+ low_third = len(events) / 3
+ high_third = len(events) / 3 * 2
+ for r in runs:
+ if r[0] < low_third:
+ continue
+ if r[0] > high_third:
+ continue
+ to_sort.append((abs(r[0] - mid), r[0] - r[1], r[0]))
+ to_sort.sort()
+ if len(to_sort) == 0:
+ return None
+ trim_size = 10
+ return {"temp": (trim_size, to_sort[0][1] - trim_size),
+ "comp": (to_sort[0][2] + trim_size, len(events) - trim_size)}
+
+def load_read_data(read_file):
+ h5 = h5py.File(read_file, "r")
+ ret = {}
+
+ read_key = h5["Analyses/EventDetection_000/Reads"].keys()[0]
+ base_events = h5["Analyses/EventDetection_000/Reads"][read_key]["Events"]
+ temp_comp_loc = template_complement_loc(base_events)
+ sampling_rate = h5["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
+
+ if temp_comp_loc:
+ events = base_events[temp_comp_loc["temp"][0]:temp_comp_loc["temp"][1]]
+ else:
+ events = base_events
+ has_std = True
+ try:
+ std = events[0]["stdv"]
+ except:
+ has_std = False
+ tscale2, tscale_sd2, tshift2 = get_scaling_template(events, has_std)
+
+ index = 0.0
+ ret["temp_events2"] = []
+ for e in events:
+ mean = (e["mean"] - tshift2) / tscale2
+ if has_std:
+ stdv = e["stdv"] / tscale_sd2
+ else:
+ stdv = np.sqrt(e["variance"]) / tscale_sd2
+ length = e["length"] / sampling_rate
+ ret["temp_events2"].append(preproc_event(mean, stdv, length))
+
+ ret["temp_events2"] = np.array(ret["temp_events2"], dtype=np.float32)
+
+ if not temp_comp_loc:
+ return ret
+
+ events = base_events[temp_comp_loc["comp"][0]:temp_comp_loc["comp"][1]]
+ cscale2, cscale_sd2, cshift2 = get_scaling_complement(events, has_std)
+
+ index = 0.0
+ ret["comp_events2"] = []
+ for e in events:
+ mean = (e["mean"] - cshift2) / cscale2
+ if has_std:
+ stdv = e["stdv"] / cscale_sd2
+ else:
+ stdv = np.sqrt(e["variance"]) / cscale_sd2
+ length = e["length"] / sampling_rate
+ ret["comp_events2"].append(preproc_event(mean, stdv, length))
+
+ ret["comp_events2"] = np.array(ret["comp_events2"], dtype=np.float32)
+
+ return ret
+
+def basecall(read_file_name, fo):
+ basename = os.path.basename(read_file_name)
+ try:
+ data = load_read_data(read_file_name)
+ except Exception as e:
+ print e
+ print "error at file", read_file_name
+ return
+
+ if do_template or do_2d:
+ o1, o2 = predict_and_write(
+ data["temp_events2"], temp_net,
+ fo if do_template else None,
+ "%s_template_rnn" % basename)
+
+ if (do_complement or do_2d) and "comp_events2" in data:
+ o1c, o2c = predict_and_write(
+ data["comp_events2"], comp_net,
+ fo if do_complement else None,
+ "%s_complement_rnn" % basename)
+
+ if do_2d and "comp_events2" in data and\
+ len(data["comp_events2"]) <= args.max_2d_length and\
+ len(data["temp_events2"]) <= args.max_2d_length:
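+        # Stream per-event base probabilities for both strands to the
+        # external align_2d binary; it prints the simple 2D call followed
+        # by aligned (template, complement) event index pairs.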
+ p = subprocess.Popen("./align_2d", stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ f2d = p.stdin
+ print >>f2d, len(o1)+len(o2)
+ for a, b in zip(o1, o2):
+ print >>f2d, " ".join(map(str, a))
+ print >>f2d, " ".join(map(str, b))
+ print >>f2d, len(o1c)+len(o2c)
+ for a, b in zip(o1c, o2c):
+ print >>f2d, " ".join(map(str, a))
+ print >>f2d, " ".join(map(str, b))
+ f2do, f2de = p.communicate()
+ if p.returncode != 0:
+ return
+ lines = f2do.strip().split('\n')
+ print >>fo, ">%s_2d_rnn_simple" % basename
+ print >>fo, lines[0].strip()
+ events_2d = []
+ for l in lines[1:]:
+ temp_ind, comp_ind = map(int, l.strip().split())
+ e = []
+ if temp_ind == -1:
+ e += [0, 0, 0, 0, 0]
+ else:
+ e += [1] + list(data["temp_events2"][temp_ind])
+ if comp_ind == -1:
+ e += [0, 0, 0, 0, 0]
+ else:
+ e += [1] + list(data["comp_events2"][comp_ind])
+ events_2d.append(e)
+ events_2d = np.array(events_2d, dtype=np.float32)
+ predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-no-metr23.npz")
+parser.add_argument('--max_2d_length', type=int, default=10000, help='Max length for 2d basecall')
+parser.add_argument('reads', type=str, nargs='*')
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all; use a comma to separate multiple options, e.g. template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
+parser.add_argument('--watch', type=str, default='', help='Watched directory')
+
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+ do_template = True
+if "all" in types or "complement" in types:
+ do_complement = True
+if "all" in types or "2d" in types:
+ do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+assert len(args.reads) != 0 or len(args.directory) != 0 or len(args.watch) != 0, "Nothing to basecall"
+
+if do_template or do_2d:
+ print "loading template net"
+ temp_net = RnnPredictor(args.template_net)
+ print "done"
+if do_complement or do_2d:
+ print "loading complement net"
+ comp_net = RnnPredictor(args.complement_net)
+ print "done"
+if do_2d:
+ print "loading 2D net"
+ big_net = RnnPredictor(args.big_net)
+ print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+if len(args.reads) or len(args.directory) != 0:
+ fo = open(args.output, "w")
+
+ files = args.reads
+ if len(args.directory):
+ files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]
+
+ for i, read in enumerate(files):
+ basecall(read, fo)
+
+ fo.close()
+
+if len(args.watch) != 0:
+ try:
+ from watchdog.observers import Observer
+ from watchdog.events import PatternMatchingEventHandler
+ except:
+ print "Please install watchdog to watch directories"
+ sys.exit()
+
+ class Fast5Handler(PatternMatchingEventHandler):
+        """Class for handling creation of fast5 files"""
+ patterns = ["*.fast5"]
+ def on_created(self, event):
+ print "Calling", event
+ file_name = str(os.path.basename(event.src_path))
+ fasta_file_name = os.path.splitext(event.src_path)[0] + '.fasta'
+ with open(fasta_file_name, "w") as fo:
+ basecall(event.src_path, fo)
+ print('Watch dir: ' + args.watch)
+ observer = Observer()
+    print('Starting Observer')
+    # start watching the directory for new fast5 files
+    observer.schedule(Fast5Handler(), path=args.watch)
+    observer.start()
+ try:
+ while True:
+ time.sleep(1)
+ # quit script using ctrl+c
+ except KeyboardInterrupt:
+ observer.stop()
+
+ observer.join()
diff --git a/basecall_no_metrichor_devel.py b/basecall_no_metrichor_devel.py
new file mode 100644
index 0000000..4384be4
--- /dev/null
+++ b/basecall_no_metrichor_devel.py
@@ -0,0 +1,371 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+
+def preproc_event(mean, std, length):
+ mean = mean / 100.0 - 0.66
+ std = std - 1
+ return [mean, mean*mean, std, length]
+
+def get_scaling_template(events):
+ down = 48.4631279889
+ up = 65.7312554591
+ our_down = np.percentile(events["mean"], 10)
+ our_up = np.percentile(events["mean"], 90)
+ scale = (our_up - our_down) / (up - down)
+ shift = (our_up / scale - up) * scale
+
+ sd = 0.807981325017
+ return scale, np.percentile(events["stdv"], 50) / sd, shift
+
+def get_scaling_complement(events):
+ down = 49.2638926877
+ up = 69.0192568072
+ our_down = np.percentile(events["mean"], 10)
+ our_up = np.percentile(events["mean"], 90)
+ scale = (our_up - our_down) / (up - down)
+ shift = (our_up / scale - up) * scale
+
+ sd = 1.04324844612
+ return scale, np.percentile(events["stdv"], 50) / sd, shift
+
+def template_complement_loc(events):
+ abasic_level = np.percentile(events["mean"], 99) + 5
+ abasic_locs = (events["mean"] > abasic_level).nonzero()[0]
+ last = -47
+ run_len = 1
+ runs = []
+ for x in abasic_locs:
+ if x - last == 1:
+ run_len += 1
+ else:
+ if run_len >= 5:
+ if len(runs) and last - runs[-1][0] < 50:
+ run_len = last - runs[-1][0]
+ run_len += runs[-1][1]
+ runs[-1] = (last, run_len)
+ else:
+ runs.append((last, run_len))
+ run_len = 1
+ last = x
+ to_sort = []
+ mid = len(events) / 2
+ low_third = len(events) / 3
+ high_third = len(events) / 3 * 2
+ for r in runs:
+ if r[0] < low_third:
+ continue
+ if r[0] > high_third:
+ continue
+ to_sort.append((abs(r[0] - mid), r[0] - r[1], r[0]))
+ to_sort.sort()
+ if len(to_sort) == 0:
+ return None
+ trim_size = 10
+ return {"temp": (trim_size, to_sort[0][1] - trim_size),
+ "comp": (to_sort[0][2] + trim_size, len(events) - trim_size)}
+
+def load_read_data(read_file):
+ h5 = h5py.File(read_file, "r")
+ ret = {}
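+    # This devel script builds two feature sets per strand for comparison:
+    # *_events uses Metrichor's scale/shift/drift attributes, while
+    # *_events2 uses our percentile-based scaling from the raw events.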
+
+ read_key = h5["Analyses/EventDetection_000/Reads"].keys()[0]
+ base_events = h5["Analyses/EventDetection_000/Reads"][read_key]["Events"]
+ temp_comp_loc = template_complement_loc(base_events)
+ if not temp_comp_loc:
+ return None
+
+# print "temp_comp_loc", temp_comp_loc["temp"], temp_comp_loc["comp"]
+# print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["start_index_temp"],
+# print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["end_index_temp"],
+# print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["start_index_comp"],
+# print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["end_index_comp"]
+
+ sampling_rate = h5["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
+
+ try:
+ ret["called_template"] = h5["Analyses/Basecall_2D_000/BaseCalled_template/Fastq"][()].split('\n')[1]
+ ret["called_complement"] = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Fastq"][()].split('\n')[1]
+ ret["called_2d"] = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Fastq"][()].split('\n')[1]
+ except Exception as e:
+ print "wat", e
+ return None
+ events = base_events[temp_comp_loc["temp"][0]:temp_comp_loc["temp"][1]]
+ tscale2, tscale_sd2, tshift2 = get_scaling_template(events)
+
+ index = 0.0
+ ret["temp_events2"] = []
+ for e in events:
+ mean = (e["mean"] - tshift2) / tscale2
+ stdv = e["stdv"] / tscale_sd2
+ length = e["length"] / sampling_rate
+ ret["temp_events2"].append(preproc_event(mean, stdv, length))
+ events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+ tscale = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["scale"]
+ tscale_sd = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["scale_sd"]
+ tshift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["shift"]
+ tdrift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["drift"]
+ index = 0.0
+ ret["temp_events"] = []
+ for e in events:
+ mean = (e["mean"] - tshift - index * tdrift) / tscale
+ stdv = e["stdv"] / tscale_sd
+ length = e["length"]
+ ret["temp_events"].append(preproc_event(mean, stdv, length))
+ index += e["length"]
+
+ events = base_events[temp_comp_loc["comp"][0]:temp_comp_loc["comp"][1]]
+ cscale2, cscale_sd2, cshift2 = get_scaling_complement(events)
+
+ index = 0.0
+ ret["comp_events2"] = []
+ for e in events:
+ mean = (e["mean"] - cshift2) / cscale2
+ stdv = e["stdv"] / cscale_sd2
+ length = e["length"] / sampling_rate
+ ret["comp_events2"].append(preproc_event(mean, stdv, length))
+
+ events = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Events"]
+ cscale = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["scale"]
+ cscale_sd = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["scale_sd"]
+ cshift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["shift"]
+ cdrift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["drift"]
+ index = 0.0
+ ret["comp_events"] = []
+ for e in events:
+ mean = (e["mean"] - cshift - index * cdrift) / cscale
+ stdv = e["stdv"] / cscale_sd
+ length = e["length"]
+ ret["comp_events"].append(preproc_event(mean, stdv, length))
+ index += e["length"]
+
+ ret["temp_events2"] = np.array(ret["temp_events2"], dtype=np.float32)
+ ret["comp_events2"] = np.array(ret["comp_events2"], dtype=np.float32)
+ ret["temp_events"] = np.array(ret["temp_events"], dtype=np.float32)
+ ret["comp_events"] = np.array(ret["comp_events"], dtype=np.float32)
+
+ al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+ ret["al"] = al
+ temp_events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+ comp_events = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Events"]
+ ret["2d_events"] = []
+ for a in al:
+ ev = []
+ if a[0] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = temp_events[a[0]]
+            mean = (e["mean"] - tshift - index * tdrift) / tscale
+ stdv = e["stdv"] / tscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ if a[1] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = comp_events[a[1]]
+ mean = (e["mean"] - cshift - index * cdrift) / cscale
+ stdv = e["stdv"] / cscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ ret["2d_events"].append(ev)
+ ret["2d_events"] = np.array(ret["2d_events"], dtype=np.float32)
+ return ret
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-big.npz")
+parser.add_argument('reads', type=str, nargs='+')
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all; use a comma to separate multiple options, e.g. template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--output_orig', action='store_true', default=True)
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+ do_template = True
+if "all" in types or "complement" in types:
+ do_complement = True
+if "all" in types or "2d" in types:
+ do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+
+if do_template or do_2d:
+ print "loading template net"
+ temp_net = RnnPredictor(args.template_net)
+ print "done"
+if do_complement or do_2d:
+ print "loading complement net"
+ comp_net = RnnPredictor(args.complement_net)
+ print "done"
+if do_2d:
+ print "loading 2D net"
+ big_net = RnnPredictor(args.big_net)
+ big_net_orig = RnnPredictor("nets_data/map6-2d-big.npz")
+ print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+fo = open(args.output, "w")
+
+total_bases = [0, 0, 0]
+
+for i, read in enumerate(args.reads):
+ if True:
+ data = load_read_data(read)
+# except Exception as e:
+# print e
+# print "error at file", read
+# continue
+ if not data:
+ continue
+ if args.output_orig:
+ print >>fo, ">%d_template" % i
+ print >>fo, data["called_template"]
+ print >>fo, ">%d_complement" % i
+ print >>fo, data["called_complement"]
+ print >>fo, ">%d_2d" % i
+ print >>fo, data["called_2d"]
+
+ if do_template or do_2d:
+ o1, o2 = temp_net.predict(data["temp_events"])
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+ print >>fo, ">%d_temp_rnn" % i
+ for a, b in zip(o1m, o2m):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+ o1, o2 = temp_net.predict(data["temp_events2"])
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+ if do_template:
+ print >>fo, ">%d_temp_rnn2" % i
+ for a, b in zip(o1m, o2m):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+
+ if do_complement or do_2d:
+ o1c, o2c = comp_net.predict(data["comp_events"])
+ o1cm = (np.argmax(o1c, 1))
+ o2cm = (np.argmax(o2c, 1))
+ print >>fo, ">%d_comp_rnn" % i
+ for a, b in zip(o1cm, o2cm):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+ o1c, o2c = comp_net.predict(data["comp_events2"])
+ o1cm = (np.argmax(o1c, 1))
+ o2cm = (np.argmax(o2c, 1))
+ if do_complement:
+ print >>fo, ">%d_comp_rnn2" % i
+ for a, b in zip(o1cm, o2cm):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+
+ if do_2d:
+ f2d = open("2d.in", "w")
+ print >>f2d, len(o1)+len(o2)
+ for a, b in zip(o1, o2):
+ print >>f2d, " ".join(map(str, a))
+ print >>f2d, " ".join(map(str, b))
+ print >>f2d, len(o1c)+len(o2c)
+ for a, b in zip(o1c, o2c):
+ print >>f2d, " ".join(map(str, a))
+ print >>f2d, " ".join(map(str, b))
+ f2d.close()
+ os.system("./align_2d <2d.in >2d.out")
+ f2do = open("2d.out")
+ call2d = f2do.next().strip()
+ print >>fo, ">%d_2d_rnn_simple" % i
+ print >>fo, call2d
+
+ start_temp_ours = None
+ end_temp_ours = None
+ start_comp_ours = None
+ end_comp_ours = None
+ events_2d = []
+ for l in f2do:
+ temp_ind, comp_ind = map(int, l.strip().split())
+ e = []
+ if temp_ind == -1:
+ e += [0, 0, 0, 0, 0]
+ else:
+ e += [1] + list(data["temp_events2"][temp_ind])
+                if start_temp_ours is None:
+ start_temp_ours = temp_ind
+ end_temp_ours = temp_ind
+ if comp_ind == -1:
+ e += [0, 0, 0, 0, 0]
+ else:
+ e += [1] + list(data["comp_events2"][comp_ind])
+                if end_comp_ours is None:
+ end_comp_ours = comp_ind
+ start_comp_ours = comp_ind
+ events_2d.append(e)
+ events_2d = np.array(events_2d, dtype=np.float32)
+ o1c, o2c = big_net.predict(events_2d)
+ o1cm = (np.argmax(o1c, 1))
+ o2cm = (np.argmax(o2c, 1))
+ print >>fo, ">%d_2d_rnn2" % i
+ for a, b in zip(o1cm, o2cm):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+ o1c, o2c = big_net.predict(data["2d_events"])
+ o1cm = (np.argmax(o1c, 1))
+ o2cm = (np.argmax(o2c, 1))
+ print >>fo, ">%d_2d_rnn" % i
+ for a, b in zip(o1cm, o2cm):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+
+ start_temp_th = None
+ end_temp_th = None
+ start_comp_th = None
+ end_comp_th = None
+ for a in data["al"]:
+ if a[0] != -1:
+                if start_temp_th is None:
+ start_temp_th = a[0]
+ end_temp_th = a[0]
+ if a[1] != -1:
+                if end_comp_th is None:
+ end_comp_th = a[1]
+ start_comp_th = a[1]
+
+ print "Ours:",
+ print start_temp_ours, end_temp_ours, start_comp_ours, end_comp_ours,
+ print 1. * len(events_2d) / (end_temp_ours - start_temp_ours + end_comp_ours - start_comp_ours)
+ print "Their:",
+ print start_temp_th, end_temp_th, start_comp_th, end_comp_th,
+ print 1. * len(data["al"]) / (end_temp_th - start_temp_th + end_comp_th - start_comp_th)
+ print
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..6808562
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,76 @@
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+import argparse
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+def preproc_event(mean, std, length):
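+    # Normalize event features: centre the mean (raw means are roughly
+    # 66 pA, hence /100 - 0.66) and add mean^2 as a quadratic feature.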
+ mean = mean / 100.0 - 0.66
+ std = std - 1
+ return [mean, mean*mean, std, length]
+
+def predict_and_write(events, ntwk, fo, read_name):
+ o1, o2 = ntwk.predict(events)
+ if fo:
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+ print >>fo, ">%s" % read_name
+ for a, b in zip(o1m, o2m):
+ if a < 4:
+ fo.write(chars[a])
+ if b < 4:
+ fo.write(chars[b])
+ fo.write('\n')
+ return o1, o2
+
+def extract_timing(h5, ret):
+ try:
+ log = h5["Analyses/Basecall_2D_000/Log"][()]
+ temp_time = dateutil.parser.parse(re.search(r"(.*) Basecalling template.*", log).groups()[0])
+ comp_time = dateutil.parser.parse(re.search(r"(.*) Basecalling complement.*", log).groups()[0])
+ comp_end_time = dateutil.parser.parse(re.search(r"(.*) Aligning hairpin.*", log).groups()[0])
+
+ start_2d_time = dateutil.parser.parse(re.search(r"(.*) Performing full 2D.*", log).groups()[0])
+ end_2d_time = dateutil.parser.parse(re.search(r"(.*) Workflow completed.*", log).groups()[0])
+
+ ret["temp_time"] = comp_time - temp_time
+ ret["comp_time"] = comp_end_time - comp_time
+ ret["2d_time"] = end_2d_time - start_2d_time
+ except:
+ pass
+
+def get_base_loc(h5):
+ base_loc = "Analyses/Basecall_2D_000"
+ try:
+ events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+ except:
+ base_loc = "Analyses/Basecall_1D_000"
+ return base_loc
+
+def extract_scaling(h5, read_type, base_loc):
+ scale = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale"]
+ scale_sd = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale_sd"]
+ shift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["shift"]
+ drift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["drift"]
+ return scale, scale_sd, shift, drift
+
+def extract_1d_event_data(h5, read_type, base_loc, scale, scale_sd, shift, drift):
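+    # Apply Metrichor's linear correction per event: subtract the shift and
+    # the drift accumulated over time, then divide by the scale.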
+ events = h5[base_loc+"/BaseCalled_%s/Events" % read_type]
+ index = 0.0
+ data = []
+ for e in events:
+ mean = (e["mean"] - shift - index * drift) / scale
+ stdv = e["stdv"] / scale_sd
+ length = e["length"]
+ data.append(preproc_event(mean, stdv, length))
+ index += e["length"]
+ return np.array(data, dtype=np.float32)
+
diff --git a/nets_data/map5-2d.npz b/nets_data/map5-2d.npz
new file mode 100644
index 0000000..5d52795
Binary files /dev/null and b/nets_data/map5-2d.npz differ
diff --git a/nets_data/map5comp.npz b/nets_data/map5comp.npz
new file mode 100644
index 0000000..89b6d0e
Binary files /dev/null and b/nets_data/map5comp.npz differ
diff --git a/nets_data/map5temp.npz b/nets_data/map5temp.npz
new file mode 100644
index 0000000..b545b6d
Binary files /dev/null and b/nets_data/map5temp.npz differ
diff --git a/nets_data/map6-2d-big.npz b/nets_data/map6-2d-big.npz
new file mode 100644
index 0000000..39bd0d9
Binary files /dev/null and b/nets_data/map6-2d-big.npz differ
diff --git a/nets_data/map6-2d-no-metr.npz b/nets_data/map6-2d-no-metr.npz
new file mode 100644
index 0000000..bde1287
Binary files /dev/null and b/nets_data/map6-2d-no-metr.npz differ
diff --git a/nets_data/map6-2d-no-metr10.npz b/nets_data/map6-2d-no-metr10.npz
new file mode 100644
index 0000000..e6d9970
Binary files /dev/null and b/nets_data/map6-2d-no-metr10.npz differ
diff --git a/nets_data/map6-2d-no-metr20.npz b/nets_data/map6-2d-no-metr20.npz
new file mode 100644
index 0000000..6cdc011
Binary files /dev/null and b/nets_data/map6-2d-no-metr20.npz differ
diff --git a/nets_data/map6-2d-no-metr23.npz b/nets_data/map6-2d-no-metr23.npz
new file mode 100644
index 0000000..8a4c29e
Binary files /dev/null and b/nets_data/map6-2d-no-metr23.npz differ
diff --git a/nets_data/map6-2d.npz b/nets_data/map6-2d.npz
new file mode 100644
index 0000000..1e209df
Binary files /dev/null and b/nets_data/map6-2d.npz differ
diff --git a/nets_data/map6comp.npz b/nets_data/map6comp.npz
new file mode 100644
index 0000000..3840f2c
Binary files /dev/null and b/nets_data/map6comp.npz differ
diff --git a/nets_data/map6temp.npz b/nets_data/map6temp.npz
new file mode 100644
index 0000000..80107af
Binary files /dev/null and b/nets_data/map6temp.npz differ
diff --git a/rnn_fin.py b/rnn_fin.py
new file mode 100644
index 0000000..a1795e8
--- /dev/null
+++ b/rnn_fin.py
@@ -0,0 +1,81 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+import numpy as np
+import pickle
+
+def share(array, dtype=th.config.floatX, name=None):
+ return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class OutLayer:
+ def __init__(self, input, in_size, n_classes):
+ w = share(np.zeros((in_size, n_classes)))
+ b = share(np.zeros(n_classes))
+ eps = 0.0000001
+ self.output = T.clip(T.nnet.softmax(T.dot(input, w) + b), eps, 1-eps)
+ self.params = [w, b]
+
+class SimpleLayer:
+ def __init__(self, input, nin, nunits):
+ id = str(np.random.randint(0, 10000000))
+        wio = share(np.zeros((nin, nunits)), name="wio"+id) # input to candidate output
+        wir = share(np.zeros((nin, nunits)), name="wir"+id) # input to reset gate
+        wiu = share(np.zeros((nin, nunits)), name="wiu"+id) # input to update gate
+        woo = share(np.zeros((nunits, nunits)), name="woo"+id) # hidden to candidate output
+        wou = share(np.zeros((nunits, nunits)), name="wou"+id) # hidden to update gate
+        wor = share(np.zeros((nunits, nunits)), name="wor"+id) # hidden to reset gate
+ bo = share(np.zeros(nunits), name="bo"+id)
+ bu = share(np.zeros(nunits), name="bu"+id)
+ br = share(np.zeros(nunits), name="br"+id)
+ h0 = share(np.zeros(nunits), name="h0"+id)
+
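+        # step() is a GRU cell: the reset gate modulates the recurrent input
+        # to the candidate value, and the update gate interpolates between
+        # the previous hidden state and that candidate.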
+ def step(in_t, out_tm1):
+ update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+ reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+ new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+ return update_gate * out_tm1 + (1 - update_gate) * new_val
+
+ self.output, _ = th.scan(
+ step, sequences=[input],
+ outputs_info=[h0])
+
+ self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiSimpleLayer():
+ def __init__(self, input, nin, nunits):
+ fwd = SimpleLayer(input, nin, nunits)
+ bwd = SimpleLayer(input[::-1], nin, nunits)
+ self.params = fwd.params + bwd.params
+ self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=1)
+
+class RnnPredictor:
+ def __init__(self, filename):
+ package = np.load(filename)
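+        # Each bidirectional layer stores 20 parameter arrays (10 per
+        # direction); the two output layers add 4 more, hence % 20 == 4.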
+ assert(len(package.files) % 20 == 4)
+ n_layers = len(package.files) / 20
+
+ self.input = T.fmatrix()
+ last_output = self.input
+ last_size = package['arr_0'].shape[0]
+ hidden_size = package['arr_0'].shape[1]
+ par_index = 0
+ for i in range(n_layers):
+ layer = BiSimpleLayer(last_output, last_size, hidden_size)
+ for i in range(20):
+ layer.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+
+ last_output = layer.output
+ last_size = 2*hidden_size
+ out_layer1 = OutLayer(last_output, last_size, 5)
+ for i in range(2):
+ out_layer1.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+ out_layer2 = OutLayer(last_output, last_size, 5)
+ for i in range(2):
+ out_layer2.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+ output1 = out_layer1.output
+ output2 = out_layer2.output
+
+ self.predict = th.function(inputs=[self.input], outputs=[output1, output2])
diff --git a/training/README b/training/README
new file mode 100644
index 0000000..bbc76d3
--- /dev/null
+++ b/training/README
@@ -0,0 +1,28 @@
+Preparing training files
+=========================
+
+`python prepare_dataset.py <type> <list_of_files_and_references> <output_directory>`
+
+Where:
+Type is one of temp/comp/2d.
+List_of_files_and_references is a file containing the names of read files, each followed
+by its reference sequence (one pair per line), e.g.:
+
+file1.fast5 ACCGGGTTAACCC
+file2.fast5 CCCGGTTAAACCCGGGG
+file3.fast5 AAATTGGCCTTGGAAA
+
+The reference sequence should cover the whole read sequence and may extend a bit
+beyond it (about 20 bases on each side).
+
+Output directory must exist before calling the script.
+
+Training
+===============
+
+`OMP_NUM_THREADS=1 python train.py <starting network> <list_of_training_files>`
+
+Use a starting network from ../nets_data/ (map6temp/map6comp/map6-2d-big).
+
+This creates a directory and starts training. Every 20th epoch it writes the network into that
+directory as dumpx-%d.npz (where %d is the epoch number).
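+
+For example (hypothetical paths), training a template network on two prepared
+files could look like:
+
+`OMP_NUM_THREADS=1 python train.py ../nets_data/map6temp.npz prepared/0.txt prepared/1.txt`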
diff --git a/training/helpers.py b/training/helpers.py
new file mode 100644
index 0000000..45d32ab
--- /dev/null
+++ b/training/helpers.py
@@ -0,0 +1,31 @@
+import h5py
+import sys
+import numpy as np
+import os
+import re
+import dateutil.parser
+import datetime
+import argparse
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+def preproc_event(mean, std, length):
+ mean = mean / 100.0 - 0.66
+ std = std - 1
+ return [mean, mean*mean, std, length]
+
+def get_base_loc(h5):
+ base_loc = "Analyses/Basecall_2D_000"
+ try:
+ events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+ except:
+ base_loc = "Analyses/Basecall_1D_000"
+ return base_loc
+
+def extract_scaling(h5, read_type, base_loc):
+ scale = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale"]
+ scale_sd = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale_sd"]
+ shift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["shift"]
+ drift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["drift"]
+ return scale, scale_sd, shift, drift
diff --git a/training/prepare_dataset.py b/training/prepare_dataset.py
new file mode 100644
index 0000000..6c60779
--- /dev/null
+++ b/training/prepare_dataset.py
@@ -0,0 +1,92 @@
+import argparse
+import os
+import h5py
+from helpers import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('type', choices=['temp', 'comp', '2d'])
+parser.add_argument('source_file', type=str)
+parser.add_argument('output_directory', type=str)
+args = parser.parse_args()
+
+finput = open(args.source_file)
+
+for i, l in enumerate(finput):
+ parts = l.strip().split()
+ filename = ' '.join(parts[:-1])
+ ref = parts[-1]
+ h5 = h5py.File(filename, "r")
+
+ fo = open(os.path.join(args.output_directory, "%s.txt" % i), "w")
+ print >>fo, ref
+ base_loc = get_base_loc(h5)
+ if args.type == 'temp':
+ scale, scale_sd, shift, drift = extract_scaling(h5, "template", base_loc)
+ events = h5[base_loc+"/BaseCalled_%s/Events" % "template"]
+ index = 0.0
+ data = []
+ for e in events:
+ mean = (e["mean"] - shift) / scale
+ stdv = e["stdv"] / scale_sd
+ length = e["length"]
+ print >>fo, " ".join(map(str, preproc_event(mean, stdv, length))),
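+            # "move" is how many new bases this event adds to the k-mer:
+            # 0 -> none (label NN), 1 -> one new base, 2 -> two new bases,
+            # with the base labels taken from model_state.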
+ move = e["move"]
+ if move == 0:
+ print >>fo, "NN"
+ if move == 1:
+ print >>fo, "N%s" % e["model_state"][2]
+ if move == 2:
+ print >>fo, "%s%s" % (e["model_state"][1], e["model_state"][2])
+ if args.type == 'comp':
+ scale, scale_sd, shift, drift = extract_scaling(h5, "complement", base_loc)
+ events = h5[base_loc+"/BaseCalled_%s/Events" % "complement"]
+ index = 0.0
+ data = []
+ for e in events:
+ mean = (e["mean"] - shift) / scale
+ stdv = e["stdv"] / scale_sd
+ length = e["length"]
+ print >>fo, " ".join(map(str, preproc_event(mean, stdv, length))),
+ move = e["move"]
+ if move == 0:
+ print >>fo, "NN"
+ if move == 1:
+ print >>fo, "N%s" % e["model_state"][2]
+ if move == 2:
+ print >>fo, "%s%s" % (e["model_state"][1], e["model_state"][2])
+ if args.type == '2d':
+ tscale, tscale_sd, tshift, tdrift = extract_scaling(h5, "template", base_loc)
+ cscale, cscale_sd, cshift, cdrift = extract_scaling(h5, "complement", base_loc)
+ al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+ temp_events = h5[base_loc+"/BaseCalled_template/Events"]
+ comp_events = h5[base_loc+"/BaseCalled_complement/Events"]
+ prev = None
+ for a in al:
+ ev = []
+ if a[0] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = temp_events[a[0]]
+            mean = (e["mean"] - tshift) / tscale
+ stdv = e["stdv"] / tscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ if a[1] == -1:
+ ev += [0, 0, 0, 0, 0]
+ else:
+ e = comp_events[a[1]]
+ mean = (e["mean"] - cshift) / cscale
+ stdv = e["stdv"] / cscale_sd
+ length = e["length"]
+ ev += [1] + preproc_event(mean, stdv, length)
+ print >>fo, " ".join(map(str, ev)),
+ if prev == a[2]:
+ print >>fo, "NN"
+ elif not prev or a[2][:-1] == prev[1:]:
+ print >>fo, "N%c" % a[2][2]
+ else:
+ print >>fo, "%c%c" % (a[2][1], a[2][2])
+
+
+ fo.close()
+ h5.close()
diff --git a/training/realign.cc b/training/realign.cc
new file mode 100644
index 0000000..00286a3
--- /dev/null
+++ b/training/realign.cc
@@ -0,0 +1,173 @@
+#include <cstdio>
+#include <vector>
+#include <cmath>
+#include <map>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+class string2 {
+ public:
+ string2() {}
+ string2(const char*x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && x[i]; i++) {
+ x_[i] = x[i];
+ }
+ }
+
+ string2 operator=(const char*x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && x[i]; i++) {
+ x_[i] = x[i];
+ }
+ return *this;
+ }
+
+ string2 operator=(const string2 x) {
+ x_[0]= x.x_[0];
+ x_[1]= x.x_[1];
+ x_[2]= x.x_[2];
+ return *this;
+ }
+
+ string2 operator=(const string& x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && i < x.size(); i++) {
+ x_[i] = x[i];
+ }
+ return *this;
+ }
+
+ char& operator[](int i) {
+ return x_[i];
+ }
+
+ char x_[3];
+};
+
+int main() {
+ char ss[100000];
+ scanf("%s\n", ss);
+
+ string ref(ss);
+ int mapping[256];
+ mapping['A'] = 0;
+ mapping['C'] = 1;
+ mapping['G'] = 2;
+ mapping['T'] = 3;
+ int range = 4000;
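+  // Banded DP realignment: align pairs of per-event base probabilities
+  // (two bases per event) to the reference within a band of width `range`
+  // around the running best position; if no good path is found, the band
+  // is doubled and the whole alignment retried.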
+ vector<vector<double>> probs;
+ while (true) {
+ double a, c, g, t, n;
+ if (scanf("%lf %lf %lf %lf %lf", &a, &c, &g, &t, &n)<5) {
+ break;
+ }
+ probs.push_back(vector<double>({log(a), log(c), log(g), log(t), log(n)}));
+ }
+ while (true) {
+ vector<vector<double>> poses(probs.size()+1);
+ vector<vector<string2>> prevs(probs.size()+1);
+ for (int i = 0; i < poses.size(); i+=2) {
+ poses[i] = vector<double>(ref.size()+1, -1e30);
+ prevs[i] = vector<string2>(ref.size()+1, "");
+ }
+ poses[0][0] = 0;
+ for (int i = 0; i < poses[0].size() && i < poses[0].size() - 500; i++) {
+ poses[0][i] = 0;
+ }
+ int last_bp = 50;
+ for (int i = 2; i <= probs.size(); i+=2) {
+ for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+ // NN
+ if (i > 2) {
+ double np = poses[i-2][j] + probs[i-2][4] + probs[i-1][4];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = "NN";
+ }
+ }
+ // NX
+ double np = poses[i-2][j-1] + probs[i-2][4] + probs[i-1][mapping[ref[j-1]]];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = string("N") + ref[j-1];
+ }
+
+ //XX
+ if (j > 1) {
+ double np = poses[i-2][j-2] + probs[i-2][mapping[ref[j-2]]] + probs[i-1][mapping[ref[j-1]]];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = ref.substr(j-2, 2);
+ }
+ }
+ }
+ int cur_bp = max(1, last_bp - range);
+ for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+ if (poses[i][j] > poses[i][cur_bp]) {
+ cur_bp = j;
+ }
+ }
+ last_bp = cur_bp;
+ }
+// fprintf(stderr, "back size %d last_bp %d\n", poses.back().size(), last_bp);
+ int best_pos = poses.back().size()-40;
+ for (int i = min(500, (int)poses.back().size()-500); i < poses.back().size(); i++) {
+ if (poses.back()[i] > poses.back()[best_pos]) {
+ best_pos = i;
+ }
+ }
+ /*int total = 0;
+ int rel = 0;
+ int better = 0;
+ for (int i = 0; i < poses.size(); i += 2) {
+ for (int j = 0; j < poses[i].size(); j++) {
+ total += 1;
+ if (poses[i][j] > -1e29) {
+ rel += 1;
+ }
+ if (poses[i][j] > poses.back()[best_pos]) {
+ better += 1;
+ }
+ }
+ }
+ fprintf(stderr, "total %d rel %d better %d\n", total, rel, better);*/
+ if (poses.back()[best_pos] / probs.size() * 2 > -10) {
+ fprintf(stderr, "best pos %d %lf %d\n", best_pos, poses.back()[best_pos] / probs.size() * 2, range);
+ int ipos = poses.size()-1;
+ int jpos = best_pos;
+
+ vector<string2> out;
+ while (ipos > 0) {
+ auto back = prevs[ipos][jpos];
+ out.push_back(back);
+ if (back[0] == 'N' && back[1] == 'N') {
+ ipos -= 2;
+ }
+ else if (back[0] == 'N' && back[1] != 'N') {
+ ipos -= 2;
+ jpos -= 1;
+ }
+ else if (back[0] != 'N' && back[1] != 'N') {
+ ipos -= 2;
+ jpos -= 2;
+ }
+ }
+ reverse(out.begin(), out.end());
+ for (auto &o: out) {
+ printf("%s\n", o.x_);
+ }
+ fprintf(stderr, "start pos %d\n", jpos);
+ return 0;
+ }
+ range *= 2;
+ }
+}
diff --git a/training/rnn.py b/training/rnn.py
new file mode 100644
index 0000000..cafaf20
--- /dev/null
+++ b/training/rnn.py
@@ -0,0 +1,105 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+import numpy as np
+import pickle
+from theano_toolkit import updates
+
+def share(array, dtype=th.config.floatX, name=None):
+ return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class OutLayer:
+ def __init__(self, input, in_size, n_classes):
+ id = str(np.random.randint(0, 10000000))
+ w = share(np.zeros((in_size, n_classes)), name="wout"+id)
+ b = share(np.zeros(n_classes), name="bout"+id)
+ eps = 0.0000001
+ self.output = T.clip(T.nnet.softmax(T.dot(input, w) + b), eps, 1-eps)
+ self.params = [w, b]
+
+class SimpleLayer:
+ def __init__(self, input, nin, nunits):
+ id = str(np.random.randint(0, 10000000))
+        wio = share(np.zeros((nin, nunits)), name="wio"+id) # input to candidate output
+        wir = share(np.zeros((nin, nunits)), name="wir"+id) # input to reset gate
+        wiu = share(np.zeros((nin, nunits)), name="wiu"+id) # input to update gate
+        woo = share(np.zeros((nunits, nunits)), name="woo"+id) # hidden to candidate output
+        wou = share(np.zeros((nunits, nunits)), name="wou"+id) # hidden to update gate
+        wor = share(np.zeros((nunits, nunits)), name="wor"+id) # hidden to reset gate
+ bo = share(np.zeros(nunits), name="bo"+id)
+ bu = share(np.zeros(nunits), name="bu"+id)
+ br = share(np.zeros(nunits), name="br"+id)
+ h0 = share(np.zeros(nunits), name="h0"+id)
+
+ def step(in_t, out_tm1):
+ update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+ reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+ new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+ return update_gate * out_tm1 + (1 - update_gate) * new_val
+
+ self.output, _ = th.scan(
+ step, sequences=[input],
+ outputs_info=[h0])
+
+ self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiSimpleLayer():
+ def __init__(self, input, nin, nunits):
+ fwd = SimpleLayer(input, nin, nunits)
+ bwd = SimpleLayer(input[::-1], nin, nunits)
+ self.params = fwd.params + bwd.params
+ self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=1)
+
+class Rnn:
+ def __init__(self, filename):
+ package = np.load(filename)
+ assert(len(package.files) % 20 == 4)
+ n_layers = len(package.files) / 20
+
+ self.params = []
+ self.input = T.fmatrix()
+ last_output = self.input
+ last_size = package['arr_0'].shape[0]
+ hidden_size = package['arr_0'].shape[1]
+ par_index = 0
+ for i in range(n_layers):
+ layer = BiSimpleLayer(last_output, last_size, hidden_size)
+ self.params += layer.params
+ for i in range(20):
+ layer.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+
+ last_output = layer.output
+ last_size = 2*hidden_size
+ out_layer1 = OutLayer(last_output, last_size, 5)
+ for i in range(2):
+ out_layer1.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+ out_layer2 = OutLayer(last_output, last_size, 5)
+ for i in range(2):
+ out_layer2.params[i].set_value(package['arr_%d' % par_index])
+ par_index += 1
+ output1 = out_layer1.output
+ output2 = out_layer2.output
+ self.params += out_layer1.params
+ self.params += out_layer2.params
+
+ self.predict = th.function(inputs=[self.input], outputs=[output1, output2])
+ self.tester = th.function(inputs=[self.input], outputs=[output1, output2])
+
+ self.lr = T.fscalar()
+ self.targets = T.ivector()
+ self.targets2 = T.ivector()
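+        # Training cost: mean cross-entropy of the two softmax outputs
+        # against the two per-event target labels, minimized with momentum.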
+ self.cost = 0
+ self.cost = -T.mean(T.log(output1)[T.arange(self.targets.shape[0]), self.targets])
+ self.cost += -T.mean(T.log(output2)[T.arange(self.targets2.shape[0]), self.targets2])
+
+ self.trainer = th.function(
+ inputs=[self.input, self.targets, self.targets2, self.lr],
+ outputs=[self.cost, output1, output2],
+ updates=updates.momentum(self.params, (T.grad(self.cost, self.params)),
+ learning_rate=self.lr, mu=0.8))
+
+ def save(self, fn):
+ pp = [p.get_value() for p in self.params]
+ np.savez(fn, *pp)
diff --git a/training/train.py b/training/train.py
new file mode 100644
index 0000000..4b81d2f
--- /dev/null
+++ b/training/train.py
@@ -0,0 +1,166 @@
+from rnn import Rnn
+import pickle
+import sys
+import numpy as np
+import datetime
+from collections import defaultdict
+import os
+from sklearn.metrics import confusion_matrix
+import theano as th
+from multiprocessing import Pool
+
+def print_stats(o):
+ stats = defaultdict(int)
+ for x in o:
+ stats[x] += 1
+ print (stats)
+
+def flatten2(x):
+ return x.reshape((x.shape[0] * x.shape[1], -1))
+
+def realign(s):
+ ps = s
+ o1, o2 = ntwk.tester(data_x[ps])
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+ alph = "ACGTN"
+ f = open(base_dir+"tmpb-%s.in" % s, "w")
+ print >>f, refs[ps]
+ for a, b in zip(o1, o2):
+ print >>f, " ".join(map(str, a))
+ print >>f, " ".join(map(str, b))
+ f.close()
+
+ print "s", s
+ if os.system("./realign <%stmpb-%s.in >%stmpb-%s.out" % (base_dir, s, base_dir, s)) != 0:
+ print "watwat", s
+ sys.exit()
+
+ f = open(base_dir+"tmpb-%s.out" % s)
+ for i, l in enumerate(f):
+ data_y[ps][i] = mapping[l[0]]
+ data_y2[ps][i] = mapping[l[1]]
+
+ return data_y[ps], data_y2[ps]
+
+if __name__ == '__main__':
+ chars = "ACGT"
+ mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+ n_dims = 4
+ n_classes = 5
+
+ data_x = []
+ data_y = []
+ data_y2 = []
+ refs = []
+ names = []
+
+ for fn in sys.argv[2:]:
+ f = open(fn)
+ ref = f.readline()
+ if len(ref) > 30000:
+ print "out", len(ref)
+ continue
+ refs.append(ref.strip())
+ names.append(fn)
+ X = []
+ Y = []
+ Y2 = []
+ for l in f:
+ its = l.strip().split()
+ X.append(map(float, its[:-1]))
+ Y.append(mapping[its[-1][0]])
+ Y2.append(mapping[its[-1][1]])
+ data_x.append(np.array(X, dtype=np.float32))
+ data_y.append(np.array(Y, dtype=np.int32))
+ data_y2.append(np.array(Y2, dtype=np.int32))
+
+ print ("done", sum(len(x) for x in refs))
+ sys.stdout.flush()
+
+ ntwk = Rnn(sys.argv[1])
+
+ print ("net rdy")
+
+ s_arr = []
+ p_arr = []
+ subseq_size = 400
+ for s in range(len(data_x)):
+ s_arr += [s]
+ p_arr += [len(data_x[s]) - subseq_size]
+
+ sum_p = sum(p_arr)
+ for i in range(len(p_arr)):
+ p_arr[i] = 1.*p_arr[i] / sum_p
+
+ base_dir = str(datetime.datetime.now())
+ base_dir = base_dir.replace(' ', '_')
+
+ os.mkdir(base_dir)
+ base_dir += "/"
+ batch_size = 1
+ n_batches = len(data_x) / batch_size
+ print len(data_x), batch_size, n_batches, datetime.datetime.now()
+
+ for epoch in range(1000):
+ if (epoch % 20 == 0 and epoch > 0) or (epoch == 0):
+ p = Pool(8)
+ new_labels = p.map(realign, range(len(data_x)))
+ for i in range(len(new_labels)):
+ data_y[i] = new_labels[i][0]
+ data_y2[i] = new_labels[i][1]
+
+ taken_gc = []
+ out_gc = []
+ tc = 0
+ tc2 = 0
+ tc3 = 0
+ o1mm = []
+ y1mm = []
+ o2mm = []
+ y2mm = []
+ for s in range(len(data_x)):
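+        # Pick a training read with probability proportional to its usable
+        # length, then train on a random window of subseq_size events.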
+ s2 = np.random.choice(s_arr, p=p_arr)
+ r = np.random.randint(0, data_x[s2].shape[0] - subseq_size)
+ x = data_x[s2][r:r+subseq_size]
+ # x[:,0] += np.random.binomial(n=1, p=0.1, size=x.shape[0]) * np.random.normal(scale=0.01, size=x.shape[0])
+ y = data_y[s2][r:r+subseq_size]
+ y2 = data_y2[s2][r:r+subseq_size]
+
+ lr = 1e-2
+ # if epoch >= 3:
+ # lr = 2e-1
+ # if epoch >= 50:
+ # lr = 2e-1
+ if epoch >= 970:
+ lr = 1e-3
+ cost, o1, o2 = ntwk.trainer(x, y, y2, lr)
+ tc += cost
+
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+
+ o1mm += list(o1m)
+ o2mm += list(o2m)
+ y1mm += list(y)
+ y2mm += list(y2)
+
+ tc2 += np.sum(np.equal(o1m, y))
+ tc3 += np.sum(np.equal(o2m, y2))
+ sys.stdout.write('\r%d' % s)
+ sys.stdout.flush()
+
+ print
+
+ print epoch, tc / n_batches, 1.*tc2 / n_batches / batch_size, 1.*tc3 / n_batches / batch_size, datetime.datetime.now()
+ print_stats(o1mm)
+ print_stats(o2mm)
+ print confusion_matrix(y1mm, o1mm)
+ print confusion_matrix(y2mm, o2mm)
+ # print "out", np.min(out_gc), np.median(out_gc), np.max(out_gc), len(out_gc)
+ sys.stdout.flush()
+
+ if epoch % 20 == 2:
+ ntwk.save(base_dir+"dumpx-%d.npz" % epoch)
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/deepnano.git