[med-svn] [deepnano] 01/02: Imported Upstream version 0.0+20110617

Andreas Tille tille at debian.org
Tue Sep 13 12:31:35 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository deepnano.

commit 14a7251ddbb75f3b3b8c7d7059c49dbe4943acb0
Author: Andreas Tille <tille at debian.org>
Date:   Tue Sep 13 14:23:39 2016 +0200

    Imported Upstream version 0.0+20110617
---
 LICENSE                         |  24 +++
 README.md                       | 102 +++++++++++
 align_2d.cc                     | 161 +++++++++++++++++
 basecall.py                     | 184 ++++++++++++++++++++
 basecall_no_metrichor.py        | 276 ++++++++++++++++++++++++++++++
 basecall_no_metrichor_devel.py  | 371 ++++++++++++++++++++++++++++++++++++++++
 helpers.py                      |  76 ++++++++
 nets_data/map5-2d.npz           | Bin 0 -> 5482026 bytes
 nets_data/map5comp.npz          | Bin 0 -> 1718506 bytes
 nets_data/map5temp.npz          | Bin 0 -> 1718506 bytes
 nets_data/map6-2d-big.npz       | Bin 0 -> 15126826 bytes
 nets_data/map6-2d-no-metr.npz   | Bin 0 -> 15126826 bytes
 nets_data/map6-2d-no-metr10.npz | Bin 0 -> 15126826 bytes
 nets_data/map6-2d-no-metr20.npz | Bin 0 -> 15126826 bytes
 nets_data/map6-2d-no-metr23.npz | Bin 0 -> 15126826 bytes
 nets_data/map6-2d.npz           | Bin 0 -> 5482026 bytes
 nets_data/map6comp.npz          | Bin 0 -> 1718506 bytes
 nets_data/map6temp.npz          | Bin 0 -> 1718506 bytes
 rnn_fin.py                      |  81 +++++++++
 training/README                 |  28 +++
 training/helpers.py             |  31 ++++
 training/prepare_dataset.py     |  92 ++++++++++
 training/realign.cc             | 173 +++++++++++++++++++
 training/rnn.py                 | 105 ++++++++++++
 training/train.py               | 166 ++++++++++++++++++
 25 files changed, 1870 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f1dee35
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2016, Vladimir Boza, Comenius University
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Comenius University nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL COMENIUS UNIVERSITY BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d819a45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+### DeepNano: alternative basecaller for MinION reads
+
+DeepNano is an alternative basecaller for Oxford Nanopore MinION reads,
+based on deep recurrent neural networks.
+
+Currently it works with SQK-MAP-006 and SQK-MAP-005 chemistries, and as a postprocessor for Metrichor.
+
+Here are our benchmarks for SQK-MAP-006 chemistry, which compare mapping accuracy (we trained on reads aligning to one half of
+the E. coli genome and tested on the other half of E. coli and on Klebsiella):
+
+|                  | E. coli Metrichor | E. coli DeepNano | Klebsiella Metrichor | Klebsiella DeepNano |
+|------------------|-------------------|------------------|----------------------|---------------------|
+| Template reads   | 71.3%             | 77.9%            | 68.1%                | 76.3%               |
+| Complement reads | 71.4%             | 76.4%            | 69.5%                | 75.7%               |
+| 2D reads         | 86.8%             | 88.5%            | 84.8%                | 86.7%               |
+
+Links to datasets with reads:
+
+- http://www.ebi.ac.uk/ena/data/view/ERR1147230
+- https://www.ebi.ac.uk/ena/data/view/SAMEA3713789
+
+
+Requirements
+================
+
+We use Python 2.7.
+
+Here are the versions of the Python packages that we used:
+
+- Cython==0.23.4
+- numpy==1.10.2
+- h5py==2.5.0
+- Theano==0.7.0
+- python-dateutil==2.5.0
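+
+These exact versions can be installed with pip, e.g.:
+
+`pip install Cython==0.23.4 numpy==1.10.2 h5py==2.5.0 Theano==0.7.0 python-dateutil==2.5.0`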
+
+Basic usage
+================
+
+For SQK-MAP-006 chemistry just use:
+
+`OMP_NUM_THREADS=1 python basecall.py <list of fast5 files>`
+
+or 
+
+`OMP_NUM_THREADS=1 python basecall.py --directory <directory with reads>`
+
+It outputs basecalls for template, complement, and 2D reads into a file named output.fasta.
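+
+Sequence headers are derived from the read file name, e.g. `read.fast5_template_rnn`, `read.fast5_complement_rnn`, and `read.fast5_2d_rnn` (with `--output_orig`, the original Metrichor calls appear as `read.fast5_template`, etc.).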
+
+For SQK-MAP-005 chemistry use:
+
+`OMP_NUM_THREADS=1 python basecall.py --template_net nets_data/map5temp.npz --complement_net nets_data/map5comp.npz --big_net nets_data/map5-2d.npz <list of fast5 files>`
+
+Advanced arguments
+=================
+
+- `-h` - prints help message
+- `--template_net PATH` - path to the network that basecalls the template (has a reasonable default)
+- `--complement_net PATH` - path to the network that basecalls the complement (has a reasonable default)
+- `--big_net PATH` - path to the network that basecalls 2D (has a reasonable default)
+- `--timing` - if set, displays timing information for each read
+- `--type template/complement/2d/all` - type of basecalling output (defaults to all)
+- `--output FILENAME` - output filename
+- `--output_orig` - if set, also outputs the original Metrichor basecalls
+- `--directory DIRECTORY` - directory where read files are stored
+
+Usage without Metrichor
+================
+
+The basecaller above relies on Metrichor for the split into template and
+complement, for scaling parameters, and for the 2D alignment.
+
+The tool below does not require Metrichor, at some expense in accuracy.
+Currently the accuracy of the 1D basecalls is almost the same as for the tool above
+(except for some reads where we get the scaling badly wrong).
+The accuracy of the 2D basecalls is approximately 0.5% lower than Metrichor's (whereas
+the tool above was 2% better than Metrichor).
+
+To use this tool, first compile alignment code:
+`g++ -O2 -std=gnu++0x align_2d.cc -o align_2d`
+
+Then run:
+
+`OMP_NUM_THREADS=1 python basecall_no_metrichor.py <list of fast5 files>`
+
+Arguments:
+
+- `-h` - prints help message
+- `--template_net PATH` - path to the network that basecalls the template (has a reasonable default)
+- `--complement_net PATH` - path to the network that basecalls the complement (has a reasonable default)
+- `--big_net PATH` - path to the network that basecalls 2D (has a reasonable default)
+- `--type template/complement/2d/all` - type of basecalling output (defaults to all)
+- `--output FILENAME` - output filename
+- `--directory DIRECTORY` - directory where read files are stored
+
+If you want to watch a directory for new files, first install:
+
+- watchdog==0.8.3
+
+And then use (the output parameter has no effect; we output a separate FASTA file for each FAST5 file):
+
+`OMP_NUM_THREADS=1 python basecall_no_metrichor.py --watch <directory name>`
+
diff --git a/align_2d.cc b/align_2d.cc
new file mode 100644
index 0000000..2dbfd99
--- /dev/null
+++ b/align_2d.cc
@@ -0,0 +1,161 @@
+#include <cstdio>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cmath>
+
+using namespace std;
+
+struct Prob {
+  double p[4];
+  double n;
+};
+
+Prob LoadProb(int parity) {
+  Prob p;
+  scanf("%lf %lf %lf %lf %lf", &p.p[0], &p.p[1], &p.p[2], &p.p[3], &p.n);
+  for (int i = 0; i < 4; i++) {
+    p.p[i] = log(p.p[i]);
+  }
+  p.n = log(p.n); 
+  return p;
+}
+
+Prob LoadProbC(int parity) {
+  Prob p;
+  scanf("%lf %lf %lf %lf %lf", &p.p[3], &p.p[2], &p.p[1], &p.p[0], &p.n);
+  for (int i = 0; i < 4; i++) {
+    p.p[i] = log(p.p[i]);
+  }
+  p.n = log(p.n);
+  return p;
+}
+
+int main() {
+  vector<Prob> temp, comp;
+  int tc, cc;
+  scanf("%d", &tc);
+  for (int i = 0; i < tc; i++) {
+    temp.push_back(LoadProb(i%2));
+  }
+  for (int i = 0; i < tc; i+=2) {
+    temp[i].n -= log(0.9) / 2;
+  }
+  scanf("%d", &cc);
+  for (int i = 0; i < cc; i++) {
+    comp.push_back(LoadProbC(i%2));
+  }
+  for (int i = 0; i < cc; i+=2) {
+    comp[i].n -= log(0.9) / 2;
+  }
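+  // Reverse the complement events so both strands run in the same direction for the alignment.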
+  reverse(comp.begin(), comp.end());
+  vector<vector<double>> probs(temp.size()+1, vector<double>(comp.size()+1));
+  // 0 - 3 ACGT
+  // 4 - Ncomp
+  // 5 - Ntemp
+  vector<vector<int>> prevs(temp.size()+1, vector<int>(comp.size()+1));
+
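+  // probs[i][j] = best log-score for aligning the first i template events with
+  // the first j complement events; prevs[i][j] records the winning move
+  // (6 marks an unaligned start, see codes above).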
+  double minf = -1000000000000.0;
+  double pen = -0.5;
+  probs[0][0] = 0;
+  for (int i = 0; i < probs.size(); i++) {
+    for (int j = 0; j < probs[i].size(); j++) {
+      if (i == 0 && j == 0) continue;
+      probs[i][j] = 0;
+//      if (i == 0 && (j < 2000 && j < probs[i].size() / 4)) probs[i][j] = 0;
+//      if (j == 0 && (i < 2000 && i < probs.size() / 4)) probs[i][j] = 0;
+      prevs[i][j] = 6;
+      if (j > 0) {
+        double np = probs[i][j-1] + comp[j-1].n - pen;
+        if (np > probs[i][j]) {
+          prevs[i][j] = 4;
+          probs[i][j] = np;
+        }
+      }
+      if (i > 0) {
+        double np = probs[i-1][j] + temp[i-1].n - pen;
+        if (np > probs[i][j]) {
+          prevs[i][j] = 5;
+          probs[i][j] = np;
+        }
+      }
+      if (i > 0 && j > 0) {
+        for (int k = 0; k < 4; k++) {
+          double np = probs[i-1][j-1] + (temp[i-1].p[k] + comp[j-1].p[k]) - 2*pen;
+          if (np > probs[i][j]) {
+            prevs[i][j] = k;
+            probs[i][j] = np;
+          }
+        }
+      }
+    }
+  }
+  fprintf(stderr, "%lf\n", probs.back().back());
+
+  char alph[] = "ACGT";
+  string seq;
+  int ipos = temp.size();
+  int jpos = comp.size();
+/*  int margin = min(2000, (int)temp.size() / 4);
+  for (int i = temp.size(); i >= temp.size() - margin && i >= 0; i--) {
+    if (probs[i][comp.size()] > probs[ipos][jpos]) {
+      ipos = i;
+      jpos = comp.size();
+    }
+  }
+  margin = min(2000, (int)comp.size() / 4);
+  for (int j = comp.size(); j >= comp.size() - margin && j >= 0; j--) {
+    if (probs[temp.size()][j] > probs[ipos][jpos]) {
+      ipos = temp.size();
+      jpos = j;
+    }
+  }*/
+  for (int i = 0; i < temp.size(); i++) {
+    for (int j = 0; j < comp.size(); j++) {
+      if (probs[i][j] > probs[ipos][jpos]) {
+        ipos = i;
+        jpos = j;
+      }
+    }
+  }
+
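+  // Trace back from the best cell: each 0-3 move emits one base of the 2D
+  // call; moves 4/5 consume an event from only one strand.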
+  vector<pair<int, int>> trace;
+  while (ipos > 0 && jpos > 0) {
+    if (prevs[ipos][jpos] == 6) {
+      break;
+    }
+    trace.push_back(make_pair(ipos, jpos));
+    if (prevs[ipos][jpos] == 4) {
+      jpos--;
+    } else if (prevs[ipos][jpos] == 5) {
+      ipos--;
+    } else {
+      seq += alph[prevs[ipos][jpos]];
+      ipos--;
+      jpos--;
+    }
+  }
+  reverse(trace.begin(), trace.end());
+  fprintf(stderr, "%d\n", (int)seq.size());
+  reverse(seq.begin(), seq.end());
+  printf("%s\n", seq.c_str());
+  
+  int last_temp = -47;
+  int last_comp = -47;
+  for (int i = 10; i + 10 < trace.size(); i++) {
+    auto t = trace[i];
+    int temp_al = -1;
+    int comp_al = -1;
+    if (t.first != last_temp && t.first % 2 == 0) {
+      temp_al = (t.first - 1) / 2;
+    }
+    if (t.second != last_comp && t.second % 2 == 1) {
+      comp_al = comp.size() / 2 - 2 - t.second / 2;
+    }
+    if (temp_al != -1 || comp_al != -1) {
+      printf("%d %d\n", temp_al, comp_al);
+    }
+    last_temp = t.first;
+    last_comp = t.second;
+  }
+}
diff --git a/basecall.py b/basecall.py
new file mode 100644
index 0000000..0026f54
--- /dev/null
+++ b/basecall.py
@@ -0,0 +1,184 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+from helpers import *
+
+def load_read_data(read_file):
+  h5 = h5py.File(read_file, "r")
+  ret = {}
+
+  extract_timing(h5, ret)
+
+  base_loc = get_base_loc(h5)
+
+  try:
+    ret["called_template"] = h5[base_loc+"/BaseCalled_template/Fastq"][()].split('\n')[1]
+    ret["called_complement"] = h5[base_loc+"/BaseCalled_complement/Fastq"][()].split('\n')[1]
+    ret["called_2d"] = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Fastq"][()].split('\n')[1]
+  except Exception as e:
+    pass
+  try:
+    events = h5[base_loc+"/BaseCalled_template/Events"]
+    tscale, tscale_sd, tshift, tdrift = extract_scaling(h5, "template", base_loc)
+    ret["temp_events"] = extract_1d_event_data(
+        h5, "template", base_loc, tscale, tscale_sd, tshift, tdrift)
+  except:
+    pass
+
+  try:
+    cscale, cscale_sd, cshift, cdrift = extract_scaling(h5, "complement", base_loc)
+    ret["comp_events"] = extract_1d_event_data(
+        h5, "complement", base_loc, cscale, cscale_sd, cshift, cdrift)
+  except Exception as e:
+    pass
+
+  try:
+    al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+    temp_events = h5[base_loc+"/BaseCalled_template/Events"]
+    comp_events = h5[base_loc+"/BaseCalled_complement/Events"]
+    ret["2d_events"] = []
+    for a in al:
+      ev = []
+      if a[0] == -1:
+        ev += [0, 0, 0, 0, 0]
+      else:
+        e = temp_events[a[0]]
+        mean = (e["mean"] - tshift) / cscale
+        stdv = e["stdv"] / tscale_sd
+        length = e["length"]
+        ev += [1] + preproc_event(mean, stdv, length)
+      if a[1] == -1:
+        ev += [0, 0, 0, 0, 0]
+      else:
+        e = comp_events[a[1]]
+        mean = (e["mean"] - cshift) / cscale
+        stdv = e["stdv"] / cscale_sd
+        length = e["length"]
+        ev += [1] + preproc_event(mean, stdv, length)
+      ret["2d_events"].append(ev) 
+    ret["2d_events"] = np.array(ret["2d_events"], dtype=np.float32)
+  except Exception as e:
+    print e
+
+  h5.close()
+  return ret
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-big.npz")
+parser.add_argument('reads', type=str, nargs='*')
+parser.add_argument('--timing', action='store_true', default=False)
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all, use comma to separate multiple options, eg.: template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--output_orig', action='store_true', default=False)
+parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+  do_template = True
+if "all" in types or "complement" in types:
+  do_complement = True
+if "all" in types or "2d" in types:
+  do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+assert len(args.reads) != 0 or len(args.directory) != 0, "Nothing to basecall"
+
+if do_template:
+  print "loading template net"
+  temp_net = RnnPredictor(args.template_net)
+  print "done"
+if do_complement:
+  print "loading complement net"
+  comp_net = RnnPredictor(args.complement_net)
+  print "done"
+if do_2d:
+  print "loading 2D net"
+  big_net = RnnPredictor(args.big_net)
+  print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+fo = open(args.output, "w")
+
+total_bases = [0, 0, 0]
+
+files = args.reads
+if len(args.directory):
+  files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]  
+
+for i, read in enumerate(files):
+  basename = os.path.basename(read)
+  try:
+    data = load_read_data(read)
+  except Exception as e:
+    print "error at file", read
+    print e
+    continue
+  if not data:  
+    continue
+  print "\rcalling read %d/%d %s" % (i, len(files), read),
+  sys.stdout.flush()
+  if args.output_orig:
+    try:
+      if "called_template" in data:
+        print >>fo, ">%s_template" % basename
+        print >>fo, data["called_template"]
+      if "called_complement" in data:
+        print >>fo, ">%s_complement" % basename
+        print >>fo, data["called_complement"]
+      if "called_2d" in data:
+        print >>fo, ">%s_2d" % basename
+        print >>fo, data["called_2d"]
+    except:
+      pass
+
+  temp_start = datetime.datetime.now()
+  if do_template and "temp_events" in data:
+    predict_and_write(data["temp_events"], temp_net, fo, "%s_template_rnn" % basename)
+  temp_time = datetime.datetime.now() - temp_start
+
+  comp_start = datetime.datetime.now()
+  if do_complement and "comp_events" in data:
+    predict_and_write(data["comp_events"], comp_net, fo, "%s_complement_rnn" % basename)
+  comp_time = datetime.datetime.now() - comp_start
+
+  start_2d = datetime.datetime.now()
+  if do_2d and "2d_events" in data:
+    predict_and_write(data["2d_events"], big_net, fo, "%s_2d_rnn" % basename) 
+  time_2d = datetime.datetime.now() - start_2d
+
+  if args.timing:
+    try:
+      print "Events: %d/%d" % (len(data["temp_events"]), len(data["comp_events"]))
+      print "Our times: %f/%f/%f" % (temp_time.total_seconds(), comp_time.total_seconds(),
+         time_2d.total_seconds())
+      print "Our times per base: %f/%f/%f" % (
+        temp_time.total_seconds() / len(data["temp_events"]),
+        comp_time.total_seconds() / len(data["comp_events"]),
+        time_2d.total_seconds() / (len(data["comp_events"]) + len(data["temp_events"])))
+      print "Their times: %f/%f/%f" % (data["temp_time"].total_seconds(), data["comp_time"].total_seconds(), data["2d_time"].total_seconds())
+      print "Their times per base: %f/%f/%f" % (
+        data["temp_time"].total_seconds() / len(data["temp_events"]),
+        data["comp_time"].total_seconds() / len(data["comp_events"]),
+        data["2d_time"].total_seconds() / (len(data["comp_events"]) + len(data["temp_events"])))
+    except:
+      # Don't let timing throw us out
+      pass
+  fo.flush()
+fo.close()
diff --git a/basecall_no_metrichor.py b/basecall_no_metrichor.py
new file mode 100644
index 0000000..6915843
--- /dev/null
+++ b/basecall_no_metrichor.py
@@ -0,0 +1,276 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+from helpers import *
+import subprocess
+import time
+
+def get_scaling_template(events, has_std):
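+  # Estimate scale and shift for this read by matching its 10th/90th event-mean
+  # percentiles to reference levels (down/up); stdv is scaled by its median.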
+  down = 48.4631279889
+  up = 65.7312554591
+  our_down = np.percentile(events["mean"], 10)
+  our_up = np.percentile(events["mean"], 90)
+  scale = (our_up - our_down) / (up - down)
+  shift = (our_up / scale - up) * scale
+
+  sd = 0.807981325017
+  if has_std:
+    return scale, np.percentile(events["stdv"], 50) / sd, shift
+  else:
+    return scale, np.sqrt(np.percentile(events["variance"], 50)) / sd, shift
+    
+
+def get_scaling_complement(events, has_std):
+  down = 49.2638926877
+  up = 69.0192568072
+  our_down = np.percentile(events["mean"], 10)
+  our_up = np.percentile(events["mean"], 90)
+  scale = (our_up - our_down) / (up - down)
+  shift = (our_up / scale - up) * scale
+
+  sd = 1.04324844612
+  if has_std:
+    return scale, np.percentile(events["stdv"], 50) / sd, shift
+  else:
+    return scale, np.sqrt(np.percentile(events["variance"], 50)) / sd, shift
+
+def template_complement_loc(events):
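+  # Locate the hairpin without Metrichor: find runs of abasic events (current
+  # above the 99th percentile) in the middle third of the read and split it
+  # into template and complement ranges, trimming a few events at each edge.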
+  abasic_level = np.percentile(events["mean"], 99) + 5
+  abasic_locs = (events["mean"] > abasic_level).nonzero()[0]
+  last = -47
+  run_len = 1
+  runs = []
+  for x in abasic_locs:
+    if x - last == 1:
+      run_len += 1
+    else:
+      if run_len >= 5:
+        if len(runs) and last - runs[-1][0] < 50:
+          run_len = last - runs[-1][0]
+          run_len += runs[-1][1]
+          runs[-1] = (last, run_len)
+        else:
+          runs.append((last, run_len))
+      run_len = 1
+    last = x
+  to_sort = []
+  mid = len(events) / 2
+  low_third = len(events) / 3
+  high_third = len(events) / 3 * 2
+  for r in runs:
+    if r[0] < low_third:
+      continue
+    if r[0] > high_third:
+      continue
+    to_sort.append((abs(r[0] - mid), r[0] - r[1], r[0]))
+  to_sort.sort()
+  if len(to_sort) == 0:
+    return None
+  trim_size = 10
+  return {"temp": (trim_size, to_sort[0][1] - trim_size),
+          "comp": (to_sort[0][2] + trim_size, len(events) - trim_size)}
+
+def load_read_data(read_file):
+  h5 = h5py.File(read_file, "r")
+  ret = {}
+
+  read_key = h5["Analyses/EventDetection_000/Reads"].keys()[0]
+  base_events = h5["Analyses/EventDetection_000/Reads"][read_key]["Events"]
+  temp_comp_loc = template_complement_loc(base_events)
+  sampling_rate = h5["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
+
+  if temp_comp_loc:
+    events = base_events[temp_comp_loc["temp"][0]:temp_comp_loc["temp"][1]]
+  else:
+    events = base_events    
+  has_std = True
+  try:
+    std = events[0]["stdv"]
+  except:
+    has_std = False
+  tscale2, tscale_sd2, tshift2 = get_scaling_template(events, has_std)
+
+  index = 0.0
+  ret["temp_events2"] = []
+  for e in events:
+    mean = (e["mean"] - tshift2) / tscale2
+    if has_std:
+      stdv = e["stdv"] / tscale_sd2
+    else:
+      stdv = np.sqrt(e["variance"]) / tscale_sd2
+    length = e["length"] / sampling_rate
+    ret["temp_events2"].append(preproc_event(mean, stdv, length))
+
+  ret["temp_events2"] = np.array(ret["temp_events2"], dtype=np.float32)
+
+  if not temp_comp_loc:
+    return ret
+  
+  events = base_events[temp_comp_loc["comp"][0]:temp_comp_loc["comp"][1]]
+  cscale2, cscale_sd2, cshift2 = get_scaling_complement(events, has_std)
+
+  index = 0.0
+  ret["comp_events2"] = []
+  for e in events:
+    mean = (e["mean"] - cshift2) / cscale2
+    if has_std:
+      stdv = e["stdv"] / cscale_sd2
+    else:
+      stdv = np.sqrt(e["variance"]) / cscale_sd2
+    length = e["length"] / sampling_rate
+    ret["comp_events2"].append(preproc_event(mean, stdv, length))
+
+  ret["comp_events2"] = np.array(ret["comp_events2"], dtype=np.float32)
+
+  return ret
+
+def basecall(read_file_name, fo):
+  basename = os.path.basename(read_file_name)
+  try:
+    data = load_read_data(read_file_name)
+  except Exception as e:
+    print e
+    print "error at file", read_file_name
+    return
+
+  if do_template or do_2d:
+    o1, o2 = predict_and_write(
+        data["temp_events2"], temp_net, 
+        fo if do_template else None,
+        "%s_template_rnn" % basename)
+
+  if (do_complement or do_2d) and "comp_events2" in data:
+    o1c, o2c = predict_and_write(
+        data["comp_events2"], comp_net, 
+        fo if do_complement else None,
+        "%s_complement_rnn" % basename)
+
+  if do_2d and "comp_events2" in data and\
+     len(data["comp_events2"]) <= args.max_2d_length and\
+     len(data["temp_events2"]) <= args.max_2d_length:
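+    # Pipe the per-strand RNN outputs through the external align_2d binary,
+    # which pairs up template and complement events for the 2D call.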
+    p = subprocess.Popen("./align_2d", stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    f2d = p.stdin
+    print >>f2d, len(o1)+len(o2)
+    for a, b in zip(o1, o2):
+      print >>f2d, " ".join(map(str, a))
+      print >>f2d, " ".join(map(str, b))
+    print >>f2d, len(o1c)+len(o2c)
+    for a, b in zip(o1c, o2c):
+      print >>f2d, " ".join(map(str, a))
+      print >>f2d, " ".join(map(str, b))
+    f2do, f2de = p.communicate()
+    if p.returncode != 0:
+      return
+    lines = f2do.strip().split('\n')
+    print >>fo, ">%s_2d_rnn_simple" % basename
+    print >>fo, lines[0].strip()
+    events_2d = []
+    for l in lines[1:]:
+      temp_ind, comp_ind = map(int, l.strip().split())
+      e = []
+      if temp_ind == -1:
+        e += [0, 0, 0, 0, 0]
+      else: 
+        e += [1] + list(data["temp_events2"][temp_ind])
+      if comp_ind == -1:
+        e += [0, 0, 0, 0, 0]
+      else:
+        e += [1] + list(data["comp_events2"][comp_ind])
+      events_2d.append(e)
+    events_2d = np.array(events_2d, dtype=np.float32)
+    predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-no-metr23.npz")
+parser.add_argument('--max_2d_length', type=int, default=10000, help='Max length for 2d basecall')
+parser.add_argument('reads', type=str, nargs='*')
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all, use comma to separate multiple options, eg.: template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
+parser.add_argument('--watch', type=str, default='', help='Watched directory')
+
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+  do_template = True
+if "all" in types or "complement" in types:
+  do_complement = True
+if "all" in types or "2d" in types:
+  do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+assert len(args.reads) != 0 or len(args.directory) != 0 or len(args.watch) != 0, "Nothing to basecall"
+
+if do_template or do_2d:
+  print "loading template net"
+  temp_net = RnnPredictor(args.template_net)
+  print "done"
+if do_complement or do_2d:
+  print "loading complement net"
+  comp_net = RnnPredictor(args.complement_net)
+  print "done"
+if do_2d:
+  print "loading 2D net"
+  big_net = RnnPredictor(args.big_net)
+  print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+if len(args.reads) or len(args.directory) != 0:
+  fo = open(args.output, "w")
+
+  files = args.reads
+  if len(args.directory):
+    files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]  
+
+  for i, read in enumerate(files):
+    basecall(read, fo)
+
+  fo.close()
+
+if len(args.watch) != 0:
+  try:
+    from watchdog.observers import Observer
+    from watchdog.events import PatternMatchingEventHandler
+  except:
+    print "Please install watchdog to watch directories"
+    sys.exit()
+
+  class Fast5Handler(PatternMatchingEventHandler):
+    """Class for handling creation fo fast5-files"""
+    patterns = ["*.fast5"]
+    def on_created(self, event):
+      print "Calling", event
+      file_name = str(os.path.basename(event.src_path))
+      fasta_file_name = os.path.splitext(event.src_path)[0] + '.fasta'
+      with open(fasta_file_name, "w") as fo:
+        basecall(event.src_path, fo)
+  print('Watch dir: ' + args.watch)
+  observer = Observer()
+  print('Starting Observer')
+  # start watching the directory for new fast5 files
+  observer.schedule(Fast5Handler(), path=args.watch)
+  observer.start()
+  try:
+    while True:
+      time.sleep(1)
+  # quit script using ctrl+c
+  except KeyboardInterrupt:
+    observer.stop()
+
+  observer.join()
diff --git a/basecall_no_metrichor_devel.py b/basecall_no_metrichor_devel.py
new file mode 100644
index 0000000..4384be4
--- /dev/null
+++ b/basecall_no_metrichor_devel.py
@@ -0,0 +1,371 @@
+import argparse
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+
+def preproc_event(mean, std, length):
+  mean = mean / 100.0 - 0.66
+  std = std - 1
+  return [mean, mean*mean, std, length]
+
+def get_scaling_template(events):
+  down = 48.4631279889
+  up = 65.7312554591
+  our_down = np.percentile(events["mean"], 10)
+  our_up = np.percentile(events["mean"], 90)
+  scale = (our_up - our_down) / (up - down)
+  shift = (our_up / scale - up) * scale
+
+  sd = 0.807981325017
+  return scale, np.percentile(events["stdv"], 50) / sd, shift
+
+def get_scaling_complement(events):
+  down = 49.2638926877
+  up = 69.0192568072
+  our_down = np.percentile(events["mean"], 10)
+  our_up = np.percentile(events["mean"], 90)
+  scale = (our_up - our_down) / (up - down)
+  shift = (our_up / scale - up) * scale
+
+  sd = 1.04324844612
+  return scale, np.percentile(events["stdv"], 50) / sd, shift
+
+def template_complement_loc(events):
+  abasic_level = np.percentile(events["mean"], 99) + 5
+  abasic_locs = (events["mean"] > abasic_level).nonzero()[0]
+  last = -47
+  run_len = 1
+  runs = []
+  for x in abasic_locs:
+    if x - last == 1:
+      run_len += 1
+    else:
+      if run_len >= 5:
+        if len(runs) and last - runs[-1][0] < 50:
+          run_len = last - runs[-1][0]
+          run_len += runs[-1][1]
+          runs[-1] = (last, run_len)
+        else:
+          runs.append((last, run_len))
+      run_len = 1
+    last = x
+  to_sort = []
+  mid = len(events) / 2
+  low_third = len(events) / 3
+  high_third = len(events) / 3 * 2
+  for r in runs:
+    if r[0] < low_third:
+      continue
+    if r[0] > high_third:
+      continue
+    to_sort.append((abs(r[0] - mid), r[0] - r[1], r[0]))
+  to_sort.sort()
+  if len(to_sort) == 0:
+    return None
+  trim_size = 10
+  return {"temp": (trim_size, to_sort[0][1] - trim_size),
+          "comp": (to_sort[0][2] + trim_size, len(events) - trim_size)}
+
+def load_read_data(read_file):
+  h5 = h5py.File(read_file, "r")
+  ret = {}
+
+  read_key = h5["Analyses/EventDetection_000/Reads"].keys()[0]
+  base_events = h5["Analyses/EventDetection_000/Reads"][read_key]["Events"]
+  temp_comp_loc = template_complement_loc(base_events)
+  if not temp_comp_loc:
+    return None
+
+#  print "temp_comp_loc", temp_comp_loc["temp"], temp_comp_loc["comp"]
+#  print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["start_index_temp"],
+#  print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["end_index_temp"],
+#  print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["start_index_comp"],
+#  print h5["Analyses/Basecall_2D_000/Summary/split_hairpin"].attrs["end_index_comp"]
+
+  sampling_rate = h5["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
+
+  try:
+    ret["called_template"] = h5["Analyses/Basecall_2D_000/BaseCalled_template/Fastq"][()].split('\n')[1]
+    ret["called_complement"] = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Fastq"][()].split('\n')[1]
+    ret["called_2d"] = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Fastq"][()].split('\n')[1]
+  except Exception as e:
+    print "wat", e 
+    return None
+  events = base_events[temp_comp_loc["temp"][0]:temp_comp_loc["temp"][1]]
+  tscale2, tscale_sd2, tshift2 = get_scaling_template(events)
+
+  index = 0.0
+  ret["temp_events2"] = []
+  for e in events:
+    mean = (e["mean"] - tshift2) / tscale2
+    stdv = e["stdv"] / tscale_sd2
+    length = e["length"] / sampling_rate
+    ret["temp_events2"].append(preproc_event(mean, stdv, length))
+  events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+  tscale = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["scale"]
+  tscale_sd = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["scale_sd"]
+  tshift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["shift"]
+  tdrift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_template"].attrs["drift"]
+  index = 0.0
+  ret["temp_events"] = []
+  for e in events:
+    mean = (e["mean"] - tshift - index * tdrift) / tscale
+    stdv = e["stdv"] / tscale_sd
+    length = e["length"]
+    ret["temp_events"].append(preproc_event(mean, stdv, length))
+    index += e["length"]
+
+  events = base_events[temp_comp_loc["comp"][0]:temp_comp_loc["comp"][1]]
+  cscale2, cscale_sd2, cshift2 = get_scaling_complement(events)
+
+  index = 0.0
+  ret["comp_events2"] = []
+  for e in events:
+    mean = (e["mean"] - cshift2) / cscale2
+    stdv = e["stdv"] / cscale_sd2
+    length = e["length"] / sampling_rate
+    ret["comp_events2"].append(preproc_event(mean, stdv, length))
+
+  events = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Events"]
+  cscale = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["scale"]
+  cscale_sd = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["scale_sd"]
+  cshift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["shift"]
+  cdrift = h5["/Analyses/Basecall_2D_000/Summary/basecall_1d_complement"].attrs["drift"]
+  index = 0.0
+  ret["comp_events"] = []
+  for e in events:
+    mean = (e["mean"] - cshift - index * cdrift) / cscale
+    stdv = e["stdv"] / cscale_sd
+    length = e["length"]
+    ret["comp_events"].append(preproc_event(mean, stdv, length))
+    index += e["length"]
+
+  ret["temp_events2"] = np.array(ret["temp_events2"], dtype=np.float32)
+  ret["comp_events2"] = np.array(ret["comp_events2"], dtype=np.float32)
+  ret["temp_events"] = np.array(ret["temp_events"], dtype=np.float32)
+  ret["comp_events"] = np.array(ret["comp_events"], dtype=np.float32)
+
+  al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+  ret["al"] = al
+  temp_events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+  comp_events = h5["Analyses/Basecall_2D_000/BaseCalled_complement/Events"]
+  ret["2d_events"] = []
+  for a in al:
+    ev = []
+    if a[0] == -1:
+      ev += [0, 0, 0, 0, 0]
+    else:
+      e = temp_events[a[0]]
+      mean = (e["mean"] - tshift - index * tdrift) / cscale
+      stdv = e["stdv"] / tscale_sd
+      length = e["length"]
+      ev += [1] + preproc_event(mean, stdv, length)
+    if a[1] == -1:
+      ev += [0, 0, 0, 0, 0]
+    else:
+      e = comp_events[a[1]]
+      mean = (e["mean"] - cshift - index * cdrift) / cscale
+      stdv = e["stdv"] / cscale_sd
+      length = e["length"]
+      ev += [1] + preproc_event(mean, stdv, length)
+    ret["2d_events"].append(ev) 
+  ret["2d_events"] = np.array(ret["2d_events"], dtype=np.float32)
+  return ret
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
+parser.add_argument('--complement_net', type=str, default="nets_data/map6comp.npz")
+parser.add_argument('--big_net', type=str, default="nets_data/map6-2d-big.npz")
+parser.add_argument('reads', type=str, nargs='+')
+parser.add_argument('--type', type=str, default="all", help="One of: template, complement, 2d, all, use comma to separate multiple options, eg.: template,complement")
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--output_orig', action='store_true', default=True)
+
+args = parser.parse_args()
+types = args.type.split(',')
+do_template = False
+do_complement = False
+do_2d = False
+
+if "all" in types or "template" in types:
+  do_template = True
+if "all" in types or "complement" in types:
+  do_complement = True
+if "all" in types or "2d" in types:
+  do_2d = True
+
+assert do_template or do_complement or do_2d, "Nothing to do"
+
+if do_template or do_2d:
+  print "loading template net"
+  temp_net = RnnPredictor(args.template_net)
+  print "done"
+if do_complement or do_2d:
+  print "loading complement net"
+  comp_net = RnnPredictor(args.complement_net)
+  print "done"
+if do_2d:
+  print "loading 2D net"
+  big_net = RnnPredictor(args.big_net)
+  big_net_orig = RnnPredictor("nets_data/map6-2d-big.npz")
+  print "done"
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+fo = open(args.output, "w")
+
+total_bases = [0, 0, 0]
+
+for i, read in enumerate(args.reads):
+  if True:
+    data = load_read_data(read)
+#  except Exception as e:
+#    print e
+#    print "error at file", read
+#    continue
+  if not data:  
+    continue
+  if args.output_orig:
+    print >>fo, ">%d_template" % i
+    print >>fo, data["called_template"]
+    print >>fo, ">%d_complement" % i
+    print >>fo, data["called_complement"]
+    print >>fo, ">%d_2d" % i
+    print >>fo, data["called_2d"]
+
+  if do_template or do_2d:
+    o1, o2 = temp_net.predict(data["temp_events"]) 
+    o1m = (np.argmax(o1, 1))
+    o2m = (np.argmax(o2, 1))
+    print >>fo, ">%d_temp_rnn" % i
+    for a, b in zip(o1m, o2m):
+      if a < 4:
+        fo.write(chars[a])
+      if b < 4:
+        fo.write(chars[b])
+    fo.write('\n')
+    o1, o2 = temp_net.predict(data["temp_events2"]) 
+    o1m = (np.argmax(o1, 1))
+    o2m = (np.argmax(o2, 1))
+    if do_template:
+      print >>fo, ">%d_temp_rnn2" % i
+      for a, b in zip(o1m, o2m):
+        if a < 4:
+          fo.write(chars[a])
+        if b < 4:
+          fo.write(chars[b])
+      fo.write('\n')
+
+  if do_complement or do_2d:
+    o1c, o2c = comp_net.predict(data["comp_events"]) 
+    o1cm = (np.argmax(o1c, 1))
+    o2cm = (np.argmax(o2c, 1))
+    print >>fo, ">%d_comp_rnn" % i
+    for a, b in zip(o1cm, o2cm):
+      if a < 4:
+        fo.write(chars[a])
+      if b < 4:
+        fo.write(chars[b])
+    fo.write('\n')
+    o1c, o2c = comp_net.predict(data["comp_events2"]) 
+    o1cm = (np.argmax(o1c, 1))
+    o2cm = (np.argmax(o2c, 1))
+    if do_complement:
+      print >>fo, ">%d_comp_rnn2" % i
+      for a, b in zip(o1cm, o2cm):
+        if a < 4:
+          fo.write(chars[a])
+        if b < 4:
+          fo.write(chars[b])
+      fo.write('\n')
+
+  if do_2d:
+    f2d = open("2d.in", "w")
+    print >>f2d, len(o1)+len(o2)
+    for a, b in zip(o1, o2):
+      print >>f2d, " ".join(map(str, a))
+      print >>f2d, " ".join(map(str, b))
+    print >>f2d, len(o1c)+len(o2c)
+    for a, b in zip(o1c, o2c):
+      print >>f2d, " ".join(map(str, a))
+      print >>f2d, " ".join(map(str, b))
+    f2d.close()
+    os.system("./align_2d <2d.in >2d.out")
+    f2do = open("2d.out")
+    call2d = f2do.next().strip()
+    print >>fo, ">%d_2d_rnn_simple" % i
+    print >>fo, call2d
+
+    start_temp_ours = None
+    end_temp_ours = None
+    start_comp_ours = None
+    end_comp_ours = None
+    events_2d = []
+    for l in f2do:
+      temp_ind, comp_ind = map(int, l.strip().split())
+      e = []
+      if temp_ind == -1:
+        e += [0, 0, 0, 0, 0]
+      else: 
+        e += [1] + list(data["temp_events2"][temp_ind])
+        if start_temp_ours is None:
+          start_temp_ours = temp_ind
+        end_temp_ours = temp_ind
+      if comp_ind == -1:
+        e += [0, 0, 0, 0, 0]
+      else:
+        e += [1] + list(data["comp_events2"][comp_ind])
+        if end_comp_ours is None:
+          end_comp_ours = comp_ind
+        start_comp_ours = comp_ind
+      events_2d.append(e)
+    events_2d = np.array(events_2d, dtype=np.float32)
+    o1c, o2c = big_net.predict(events_2d) 
+    o1cm = (np.argmax(o1c, 1))
+    o2cm = (np.argmax(o2c, 1))
+    print >>fo, ">%d_2d_rnn2" % i
+    for a, b in zip(o1cm, o2cm):
+      if a < 4:
+        fo.write(chars[a])
+      if b < 4:
+        fo.write(chars[b])
+    fo.write('\n')
+    o1c, o2c = big_net.predict(data["2d_events"]) 
+    o1cm = (np.argmax(o1c, 1))
+    o2cm = (np.argmax(o2c, 1))
+    print >>fo, ">%d_2d_rnn" % i
+    for a, b in zip(o1cm, o2cm):
+      if a < 4:
+        fo.write(chars[a])
+      if b < 4:
+        fo.write(chars[b])
+    fo.write('\n')
+
+    start_temp_th = None
+    end_temp_th = None
+    start_comp_th = None
+    end_comp_th = None
+    for a in data["al"]: 
+      if a[0] != -1:
+        if start_temp_th is None:
+          start_temp_th = a[0]
+        end_temp_th = a[0]
+      if a[1] != -1:
+        if end_comp_th is None:
+          end_comp_th = a[1]
+        start_comp_th = a[1]
+
+    print "Ours:",
+    print start_temp_ours, end_temp_ours, start_comp_ours, end_comp_ours,
+    print 1. * len(events_2d) / (end_temp_ours - start_temp_ours + end_comp_ours - start_comp_ours) 
+    print "Their:",
+    print start_temp_th, end_temp_th, start_comp_th, end_comp_th,
+    print 1. * len(data["al"]) / (end_temp_th - start_temp_th + end_comp_th - start_comp_th) 
+    print
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..6808562
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,76 @@
+from rnn_fin import RnnPredictor
+import h5py
+import sys
+import numpy as np
+import theano as th
+import os
+import re
+import dateutil.parser
+import datetime
+import argparse
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+def preproc_event(mean, std, length):
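+  # Normalize event features: raw means sit near 66 (pA), so mean/100 - 0.66
+  # centers them; the squared mean gives the net a quadratic feature and the
+  # stdv is centered at 1.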
+  mean = mean / 100.0 - 0.66
+  std = std - 1
+  return [mean, mean*mean, std, length]
+
+def predict_and_write(events, ntwk, fo, read_name):
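+  # The network emits two per-event distributions over {A,C,G,T,N}; write the
+  # argmax of each, skipping N (index 4), so an event can yield 0-2 bases.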
+  o1, o2 = ntwk.predict(events) 
+  if fo:
+    o1m = (np.argmax(o1, 1))
+    o2m = (np.argmax(o2, 1))
+    print >>fo, ">%s" % read_name
+    for a, b in zip(o1m, o2m):
+      if a < 4:
+        fo.write(chars[a])
+      if b < 4:
+        fo.write(chars[b])
+    fo.write('\n')
+  return o1, o2
+
+def extract_timing(h5, ret):
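+  # Parse timestamps from the Metrichor log to estimate how long each basecalling stage took.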
+  try:
+    log = h5["Analyses/Basecall_2D_000/Log"][()]
+    temp_time = dateutil.parser.parse(re.search(r"(.*) Basecalling template.*", log).groups()[0])
+    comp_time = dateutil.parser.parse(re.search(r"(.*) Basecalling complement.*", log).groups()[0])
+    comp_end_time = dateutil.parser.parse(re.search(r"(.*) Aligning hairpin.*", log).groups()[0])
+
+    start_2d_time = dateutil.parser.parse(re.search(r"(.*) Performing full 2D.*", log).groups()[0])
+    end_2d_time = dateutil.parser.parse(re.search(r"(.*) Workflow completed.*", log).groups()[0])
+
+    ret["temp_time"] = comp_time - temp_time
+    ret["comp_time"] = comp_end_time - comp_time
+    ret["2d_time"] = end_2d_time - start_2d_time
+  except:
+    pass
+
+def get_base_loc(h5):
+  base_loc = "Analyses/Basecall_2D_000"
+  try:
+    events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+  except:
+    base_loc = "Analyses/Basecall_1D_000"
+  return base_loc
+
+def extract_scaling(h5, read_type, base_loc):
+  scale = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale"]
+  scale_sd = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale_sd"]
+  shift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["shift"]
+  drift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["drift"]
+  return scale, scale_sd, shift, drift
+
+def extract_1d_event_data(h5, read_type, base_loc, scale, scale_sd, shift, drift):
+  events = h5[base_loc+"/BaseCalled_%s/Events" % read_type]
+  index = 0.0
+  data = []
+  for e in events:
+    mean = (e["mean"] - shift - index * drift) / scale
+    stdv = e["stdv"] / scale_sd
+    length = e["length"]
+    data.append(preproc_event(mean, stdv, length))
+    index += e["length"]
+  return np.array(data, dtype=np.float32)
+
diff --git a/nets_data/map5-2d.npz b/nets_data/map5-2d.npz
new file mode 100644
index 0000000..5d52795
Binary files /dev/null and b/nets_data/map5-2d.npz differ
diff --git a/nets_data/map5comp.npz b/nets_data/map5comp.npz
new file mode 100644
index 0000000..89b6d0e
Binary files /dev/null and b/nets_data/map5comp.npz differ
diff --git a/nets_data/map5temp.npz b/nets_data/map5temp.npz
new file mode 100644
index 0000000..b545b6d
Binary files /dev/null and b/nets_data/map5temp.npz differ
diff --git a/nets_data/map6-2d-big.npz b/nets_data/map6-2d-big.npz
new file mode 100644
index 0000000..39bd0d9
Binary files /dev/null and b/nets_data/map6-2d-big.npz differ
diff --git a/nets_data/map6-2d-no-metr.npz b/nets_data/map6-2d-no-metr.npz
new file mode 100644
index 0000000..bde1287
Binary files /dev/null and b/nets_data/map6-2d-no-metr.npz differ
diff --git a/nets_data/map6-2d-no-metr10.npz b/nets_data/map6-2d-no-metr10.npz
new file mode 100644
index 0000000..e6d9970
Binary files /dev/null and b/nets_data/map6-2d-no-metr10.npz differ
diff --git a/nets_data/map6-2d-no-metr20.npz b/nets_data/map6-2d-no-metr20.npz
new file mode 100644
index 0000000..6cdc011
Binary files /dev/null and b/nets_data/map6-2d-no-metr20.npz differ
diff --git a/nets_data/map6-2d-no-metr23.npz b/nets_data/map6-2d-no-metr23.npz
new file mode 100644
index 0000000..8a4c29e
Binary files /dev/null and b/nets_data/map6-2d-no-metr23.npz differ
diff --git a/nets_data/map6-2d.npz b/nets_data/map6-2d.npz
new file mode 100644
index 0000000..1e209df
Binary files /dev/null and b/nets_data/map6-2d.npz differ
diff --git a/nets_data/map6comp.npz b/nets_data/map6comp.npz
new file mode 100644
index 0000000..3840f2c
Binary files /dev/null and b/nets_data/map6comp.npz differ
diff --git a/nets_data/map6temp.npz b/nets_data/map6temp.npz
new file mode 100644
index 0000000..80107af
Binary files /dev/null and b/nets_data/map6temp.npz differ
diff --git a/rnn_fin.py b/rnn_fin.py
new file mode 100644
index 0000000..a1795e8
--- /dev/null
+++ b/rnn_fin.py
@@ -0,0 +1,81 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+import numpy as np
+import pickle
+
+def share(array, dtype=th.config.floatX, name=None):
+  return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class OutLayer:
+  def __init__(self, input, in_size, n_classes):
+    w = share(np.zeros((in_size, n_classes)))
+    b = share(np.zeros(n_classes))
+    eps = 0.0000001
+    self.output = T.clip(T.nnet.softmax(T.dot(input, w) + b), eps, 1-eps)
+    self.params = [w, b]
+
+class SimpleLayer:
+  def __init__(self, input, nin, nunits):
+    id = str(np.random.randint(0, 10000000))
+    wio = share(np.zeros((nin, nunits)), name="wio"+id)  # input to candidate output
+    wir = share(np.zeros((nin, nunits)), name="wir"+id)  # input to reset gate
+    wiu = share(np.zeros((nin, nunits)), name="wiu"+id)  # input to update gate
+    woo = share(np.zeros((nunits, nunits)), name="woo"+id)  # output to candidate output
+    wou = share(np.zeros((nunits, nunits)), name="wou"+id)  # output to update gate
+    wor = share(np.zeros((nunits, nunits)), name="wor"+id)  # output to reset gate
+    bo = share(np.zeros(nunits), name="bo"+id)
+    bu = share(np.zeros(nunits), name="bu"+id)
+    br = share(np.zeros(nunits), name="br"+id)
+    h0 = share(np.zeros(nunits), name="h0"+id)
+
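+    # GRU-style recurrence: the update gate blends the previous output with a
+    # tanh candidate computed under the reset gate.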
+    def step(in_t, out_tm1):
+      update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+      reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+      new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+      return update_gate * out_tm1 + (1 - update_gate) * new_val
+    
+    self.output, _ = th.scan(
+      step, sequences=[input],
+      outputs_info=[h0])
+
+    self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiSimpleLayer():
+  def __init__(self, input, nin, nunits):
+    fwd = SimpleLayer(input, nin, nunits)
+    bwd = SimpleLayer(input[::-1], nin, nunits)
+    self.params = fwd.params + bwd.params
+    self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=1)
+
+class RnnPredictor:
+  def __init__(self, filename):
+    package = np.load(filename)
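+    # The npz stores parameters in order as arr_0, arr_1, ...: 20 arrays per
+    # bidirectional layer (10 per direction) plus 2 for each of the two output layers.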
+    assert(len(package.files) % 20 == 4)
+    n_layers = len(package.files) / 20
+
+    self.input = T.fmatrix()
+    last_output = self.input
+    last_size = package['arr_0'].shape[0]
+    hidden_size = package['arr_0'].shape[1]
+    par_index = 0
+    for i in range(n_layers):
+      layer = BiSimpleLayer(last_output, last_size, hidden_size)
+      for j in range(20):
+        layer.params[j].set_value(package['arr_%d' % par_index])
+        par_index += 1
+
+      last_output = layer.output
+      last_size = 2*hidden_size
+    out_layer1 = OutLayer(last_output, last_size, 5)
+    for i in range(2):
+      out_layer1.params[i].set_value(package['arr_%d' % par_index])
+      par_index += 1
+    out_layer2 = OutLayer(last_output, last_size, 5)
+    for i in range(2):
+      out_layer2.params[i].set_value(package['arr_%d' % par_index])
+      par_index += 1
+    output1 = out_layer1.output
+    output2 = out_layer2.output
+
+    self.predict = th.function(inputs=[self.input], outputs=[output1, output2])
diff --git a/training/README b/training/README
new file mode 100644
index 0000000..bbc76d3
--- /dev/null
+++ b/training/README
@@ -0,0 +1,28 @@
+Preparing training files
+=========================
+
+`python prepare_dataset.py <type> <list_of_files_and_references> <output_directory>`
+
+Where:
+Type is one of temp/comp/2d.
+List_of_files_and_references is a file which contains the names of read files, each followed
+by its reference sequence (one pair per line), e.g.:
+
+file1.fast5 ACCGGGTTAACCC
+file2.fast5 CCCGGTTAAACCCGGGG
+file3.fast5 AAATTGGCCTTGGAAA
+
+The reference sequence should cover the whole read sequence and may overextend a bit
+(about 20 bases on each side).
+
+The output directory must exist before calling the script.
+
+Training
+===============
+
+`OMP_NUM_THREADS=1 python train.py <starting network> <list_of_training_files>`
+
+Use a starting network from ../nets_data/ (map6temp/map6comp/map6-2d-big).
+
+This creates a directory and starts training. Every 20th iteration it writes the network into that
+directory as dump-%s.npz (where %s is the iteration ID).
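+
+The dumped networks can then be passed back to the basecallers, e.g. from the
+repository root (dump-100.npz is just an example iteration):
+
+`OMP_NUM_THREADS=1 python basecall.py --template_net <training directory>/dump-100.npz <list of fast5 files>`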
diff --git a/training/helpers.py b/training/helpers.py
new file mode 100644
index 0000000..45d32ab
--- /dev/null
+++ b/training/helpers.py
@@ -0,0 +1,31 @@
+import h5py
+import sys
+import numpy as np
+import os
+import re
+import dateutil.parser
+import datetime
+import argparse
+
+chars = "ACGT"
+mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+def preproc_event(mean, std, length):
+  mean = mean / 100.0 - 0.66
+  std = std - 1
+  return [mean, mean*mean, std, length]
+
+def get_base_loc(h5):
+  base_loc = "Analyses/Basecall_2D_000"
+  try:
+    events = h5["Analyses/Basecall_2D_000/BaseCalled_template/Events"]
+  except:
+    base_loc = "Analyses/Basecall_1D_000"
+  return base_loc
+
+def extract_scaling(h5, read_type, base_loc):
+  scale = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale"]
+  scale_sd = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["scale_sd"]
+  shift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["shift"]
+  drift = h5[base_loc+"/Summary/basecall_1d_"+read_type].attrs["drift"]
+  return scale, scale_sd, shift, drift
diff --git a/training/prepare_dataset.py b/training/prepare_dataset.py
new file mode 100644
index 0000000..6c60779
--- /dev/null
+++ b/training/prepare_dataset.py
@@ -0,0 +1,92 @@
+import argparse
+import os
+import h5py
+from helpers import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('type', choices=['temp', 'comp', '2d'])
+parser.add_argument('source_file', type=str)
+parser.add_argument('output_directory', type=str)
+args = parser.parse_args()
+
+finput = open(args.source_file)
+
+for i, l in enumerate(finput):
+  parts = l.strip().split()
+  filename = ' '.join(parts[:-1])
+  ref = parts[-1]
+  h5 = h5py.File(filename, "r")
+  
+  fo = open(os.path.join(args.output_directory, "%s.txt" % i), "w")
+  print >>fo, ref
+  base_loc = get_base_loc(h5)
+  if args.type == 'temp':
+    scale, scale_sd, shift, drift = extract_scaling(h5, "template", base_loc)
+    events = h5[base_loc+"/BaseCalled_%s/Events" % "template"]
+    index = 0.0
+    data = []
+    for e in events:
+      mean = (e["mean"] - shift) / scale
+      stdv = e["stdv"] / scale_sd
+      length = e["length"]
+      print >>fo, " ".join(map(str, preproc_event(mean, stdv, length))),
+      move = e["move"]
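+      # move = how many new bases this event introduced: 0 -> "NN",
+      # 1 -> one new base, 2 -> two new bases (taken from the model_state kmer).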
+      if move == 0:
+        print >>fo, "NN"
+      if move == 1:
+        print >>fo, "N%s" % e["model_state"][2]
+      if move == 2:
+        print >>fo, "%s%s" % (e["model_state"][1], e["model_state"][2])
+  if args.type == 'comp':
+    scale, scale_sd, shift, drift = extract_scaling(h5, "complement", base_loc)
+    events = h5[base_loc+"/BaseCalled_%s/Events" % "complement"]
+    index = 0.0
+    data = []
+    for e in events:
+      mean = (e["mean"] - shift) / scale
+      stdv = e["stdv"] / scale_sd
+      length = e["length"]
+      print >>fo, " ".join(map(str, preproc_event(mean, stdv, length))),
+      move = e["move"]
+      if move == 0:
+        print >>fo, "NN"
+      if move == 1:
+        print >>fo, "N%s" % e["model_state"][2]
+      if move == 2:
+        print >>fo, "%s%s" % (e["model_state"][1], e["model_state"][2])
+  if args.type == '2d':
+    tscale, tscale_sd, tshift, tdrift = extract_scaling(h5, "template", base_loc)
+    cscale, cscale_sd, cshift, cdrift = extract_scaling(h5, "complement", base_loc)
+    al = h5["Analyses/Basecall_2D_000/BaseCalled_2D/Alignment"]
+    temp_events = h5[base_loc+"/BaseCalled_template/Events"]
+    comp_events = h5[base_loc+"/BaseCalled_complement/Events"]
+    prev = None
+    for a in al:
+      ev = []
+      if a[0] == -1:
+        ev += [0, 0, 0, 0, 0]
+      else:
+        e = temp_events[a[0]]
+        mean = (e["mean"] - tshift) / cscale
+        stdv = e["stdv"] / tscale_sd
+        length = e["length"]
+        ev += [1] + preproc_event(mean, stdv, length)
+      if a[1] == -1:
+        ev += [0, 0, 0, 0, 0]
+      else:
+        e = comp_events[a[1]]
+        mean = (e["mean"] - cshift) / cscale
+        stdv = e["stdv"] / cscale_sd
+        length = e["length"]
+        ev += [1] + preproc_event(mean, stdv, length)
+      print >>fo, " ".join(map(str, ev)),
+      if prev == a[2]:
+        print >>fo, "NN"
+      elif not prev or a[2][:-1] == prev[1:]:
+        print >>fo, "N%c" % a[2][2]
+      else:
+        print >>fo, "%c%c" % (a[2][1], a[2][2])
+      prev = a[2]
+
+
+  fo.close()
+  h5.close()
diff --git a/training/realign.cc b/training/realign.cc
new file mode 100644
index 0000000..00286a3
--- /dev/null
+++ b/training/realign.cc
@@ -0,0 +1,173 @@
+#include <cstdio>
+#include <vector>
+#include <cmath>
+#include <map>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+class string2 {
+ public:
+  string2() {}
+  string2(const char*x) {
+    x_[0] = 0;
+    x_[1] = 0;
+    x_[2] = 0;
+    for (int i = 0; i < 2 && x[i]; i++) {
+      x_[i] = x[i];
+    }
+  }
+
+  string2 operator=(const char*x) {
+    x_[0] = 0;
+    x_[1] = 0;
+    x_[2] = 0;
+    for (int i = 0; i < 2 && x[i]; i++) {
+      x_[i] = x[i];
+    }
+    return *this;
+  }
+
+  string2 operator=(const string2 x) {
+    x_[0]= x.x_[0];
+    x_[1]= x.x_[1];
+    x_[2]= x.x_[2];
+    return *this;
+  }
+
+  string2 operator=(const string& x) {
+    x_[0] = 0;
+    x_[1] = 0;
+    x_[2] = 0;
+    for (int i = 0; i < 2 && i < x.size(); i++) {
+      x_[i] = x[i];
+    }
+    return *this;
+  }
+
+  char& operator[](int i) {
+    return x_[i];
+  }
+
+  char x_[3];
+};
+
+int main() {
+  char ss[100000];
+  scanf("%s\n", ss);
+
+  string ref(ss);
+  int mapping[256];
+  mapping['A'] = 0;
+  mapping['C'] = 1;
+  mapping['G'] = 2;
+  mapping['T'] = 3;
+  int range = 4000;
+  vector<vector<double>> probs;
+  while (true) {
+    double a, c, g, t, n;
+    if (scanf("%lf %lf %lf %lf %lf", &a, &c, &g, &t, &n)<5) {
+      break;
+    }
+    probs.push_back(vector<double>({log(a), log(c), log(g), log(t), log(n)}));
+  }
+  while (true) {
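+    // Banded DP: align successive event-pair predictions to the reference,
+    // scoring only reference positions within +-range of the previous row's
+    // best position; if the final score is too weak, double range and retry.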
+    vector<vector<double>> poses(probs.size()+1);
+    vector<vector<string2>> prevs(probs.size()+1);
+    for (int i = 0; i < poses.size(); i+=2) {
+      poses[i] = vector<double>(ref.size()+1, -1e30);
+      prevs[i] = vector<string2>(ref.size()+1, "");
+    }
+    poses[0][0] = 0;
+    for (int i = 0; i < poses[0].size() && i < poses[0].size() - 500; i++) {
+      poses[0][i] = 0;
+    }
+    int last_bp = 50;
+    for (int i = 2; i <= probs.size(); i+=2) {
+      for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+        // NN
+        if (i > 2) {
+          double np = poses[i-2][j] + probs[i-2][4] + probs[i-1][4];
+          if (np > poses[i][j]) {
+            poses[i][j] = np;
+            prevs[i][j] = "NN";
+          }
+        }
+        // NX
+        double np = poses[i-2][j-1] + probs[i-2][4] + probs[i-1][mapping[ref[j-1]]];
+        if (np > poses[i][j]) {
+          poses[i][j] = np;
+          prevs[i][j] = string("N") + ref[j-1];
+        }
+
+        //XX
+        if (j > 1) {
+          double np = poses[i-2][j-2] + probs[i-2][mapping[ref[j-2]]] + probs[i-1][mapping[ref[j-1]]];
+          if (np > poses[i][j]) {
+            poses[i][j] = np;
+            prevs[i][j] = ref.substr(j-2, 2);
+          }
+        }
+      }
+      int cur_bp = max(1, last_bp - range);
+      for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+        if (poses[i][j] > poses[i][cur_bp]) {
+          cur_bp = j;
+        }
+      }
+      last_bp = cur_bp;
+    }
+//    fprintf(stderr, "back size %d last_bp %d\n", poses.back().size(), last_bp);
+    int best_pos = poses.back().size()-40;
+    for (int i = min(500, (int)poses.back().size()-500); i < poses.back().size(); i++) {
+      if (poses.back()[i] > poses.back()[best_pos]) {
+        best_pos = i;
+      }
+    }
+    /*int total = 0;
+    int rel = 0;
+    int better = 0;
+    for (int i = 0; i < poses.size(); i += 2) {
+      for (int j = 0; j < poses[i].size(); j++) {
+        total += 1;
+        if (poses[i][j] > -1e29) {
+          rel += 1;
+        }
+        if (poses[i][j] > poses.back()[best_pos]) {
+          better += 1;
+        }
+      }
+    }
+    fprintf(stderr, "total %d rel %d better %d\n", total, rel, better);*/
+    if (poses.back()[best_pos] / probs.size() * 2 > -10) {
+      fprintf(stderr, "best pos %d %lf %d\n", best_pos, poses.back()[best_pos] / probs.size() * 2, range);
+      int ipos = poses.size()-1;
+      int jpos = best_pos;
+
+      vector<string2> out;
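+      // Trace back through the stored two-character moves to recover labels.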
+      while (ipos > 0) {
+        auto back = prevs[ipos][jpos];
+        out.push_back(back);
+        if (back[0] == 'N' && back[1] == 'N') {
+          ipos -= 2;
+        }
+        else if (back[0] == 'N' && back[1] != 'N') {
+          ipos -= 2;
+          jpos -= 1;
+        }
+        else if (back[0] != 'N' && back[1] != 'N') {
+          ipos -= 2;
+          jpos -= 2;
+        }
+      }
+      reverse(out.begin(), out.end());
+      for (auto &o: out) {
+        printf("%s\n", o.x_);
+      }
+      fprintf(stderr, "start pos %d\n", jpos);
+      return 0;
+    }
+    range *= 2;
+  }
+}
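
For context: realign reads the reference on its first line, then alternating rows of five probabilities (A C G T N) from the two network output heads, one pair of rows per event, and prints one two-character label per event pair on stdout. A hedged sketch of a driver for this protocol, mirroring realign() in training/train.py (the function and file names here are hypothetical):

    import os

    def run_realign(ref, probs1, probs2, prefix="example"):
        # Reference first, then interleaved probability rows for the
        # two output heads, exactly as training/train.py writes them.
        with open(prefix + ".in", "w") as f:
            f.write(ref + "\n")
            for a, b in zip(probs1, probs2):
                f.write(" ".join(map(str, a)) + "\n")
                f.write(" ".join(map(str, b)) + "\n")
        # As in train.py, a non-zero exit status is treated as failure.
        if os.system("./realign <%s.in >%s.out" % (prefix, prefix)) != 0:
            raise RuntimeError("realign failed")
        return [line.strip() for line in open(prefix + ".out")]
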
diff --git a/training/rnn.py b/training/rnn.py
new file mode 100644
index 0000000..cafaf20
--- /dev/null
+++ b/training/rnn.py
@@ -0,0 +1,110 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+import numpy as np
+from theano_toolkit import updates
+
+def share(array, dtype=th.config.floatX, name=None):
+  return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class OutLayer:
+  # Softmax output layer; probabilities are clipped away from 0 and 1 so
+  # that the log-likelihood cost stays finite.
+  def __init__(self, input, in_size, n_classes):
+    suffix = str(np.random.randint(0, 10000000))
+    w = share(np.zeros((in_size, n_classes)), name="wout" + suffix)
+    b = share(np.zeros(n_classes), name="bout" + suffix)
+    eps = 0.0000001
+    self.output = T.clip(T.nnet.softmax(T.dot(input, w) + b), eps, 1 - eps)
+    self.params = [w, b]
+
+class SimpleLayer:
+  # A gated recurrent (GRU-style) layer with update and reset gates.
+  def __init__(self, input, nin, nunits):
+    suffix = str(np.random.randint(0, 10000000))
+    wio = share(np.zeros((nin, nunits)), name="wio" + suffix)  # input to candidate output
+    wir = share(np.zeros((nin, nunits)), name="wir" + suffix)  # input to reset gate
+    wiu = share(np.zeros((nin, nunits)), name="wiu" + suffix)  # input to update gate
+    woo = share(np.zeros((nunits, nunits)), name="woo" + suffix)  # output to candidate output
+    wou = share(np.zeros((nunits, nunits)), name="wou" + suffix)  # output to update gate
+    wor = share(np.zeros((nunits, nunits)), name="wor" + suffix)  # output to reset gate
+    bo = share(np.zeros(nunits), name="bo" + suffix)
+    bu = share(np.zeros(nunits), name="bu" + suffix)
+    br = share(np.zeros(nunits), name="br" + suffix)
+    h0 = share(np.zeros(nunits), name="h0" + suffix)
+
+    def step(in_t, out_tm1):
+      # h_t = u_t * h_{t-1} + (1 - u_t) * tanh(W_io x_t + r_t * (W_oo h_{t-1}) + b_o)
+      update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+      reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+      new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+      return update_gate * out_tm1 + (1 - update_gate) * new_val
+
+    self.output, _ = th.scan(
+      step, sequences=[input],
+      outputs_info=[h0])
+
+    self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiSimpleLayer():
+  def __init__(self, input, nin, nunits):
+    fwd = SimpleLayer(input, nin, nunits)
+    bwd = SimpleLayer(input[::-1], nin, nunits)
+    self.params = fwd.params + bwd.params
+    self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=1)
+
+class Rnn:
+  def __init__(self, filename):
+    # Each bidirectional layer stores 20 arrays (10 per direction); the two
+    # output layers contribute the remaining 4.
+    package = np.load(filename)
+    assert len(package.files) % 20 == 4
+    n_layers = len(package.files) // 20
+
+    self.params = []
+    self.input = T.fmatrix()
+    last_output = self.input
+    last_size = package['arr_0'].shape[0]
+    hidden_size = package['arr_0'].shape[1]
+    par_index = 0
+    for i in range(n_layers):
+      layer = BiSimpleLayer(last_output, last_size, hidden_size)
+      self.params += layer.params
+      for j in range(20):
+        layer.params[j].set_value(package['arr_%d' % par_index])
+        par_index += 1
+
+      last_output = layer.output
+      last_size = 2*hidden_size
+    out_layer1 = OutLayer(last_output, last_size, 5)
+    for i in range(2):
+      out_layer1.params[i].set_value(package['arr_%d' % par_index])
+      par_index += 1
+    out_layer2 = OutLayer(last_output, last_size, 5)
+    for i in range(2):
+      out_layer2.params[i].set_value(package['arr_%d' % par_index])
+      par_index += 1
+    output1 = out_layer1.output
+    output2 = out_layer2.output
+    self.params += out_layer1.params
+    self.params += out_layer2.params
+
+    self.predict = th.function(inputs=[self.input], outputs=[output1, output2])
+    self.tester = self.predict  # same graph; kept as a separate name for callers
+
+    self.lr = T.fscalar()
+    self.targets = T.ivector()
+    self.targets2 = T.ivector()
+    # Sum of the two per-position negative log-likelihoods.
+    self.cost = -T.mean(T.log(output1)[T.arange(self.targets.shape[0]), self.targets])
+    self.cost += -T.mean(T.log(output2)[T.arange(self.targets2.shape[0]), self.targets2])
+
+    self.trainer = th.function(
+        inputs=[self.input, self.targets, self.targets2, self.lr],
+        outputs=[self.cost, output1, output2],
+        updates=updates.momentum(self.params, (T.grad(self.cost, self.params)),
+                                 learning_rate=self.lr, mu=0.8))
+
+  def save(self, fn):
+    pp = [p.get_value() for p in self.params] 
+    np.savez(fn, *pp)
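
The Rnn class above rebuilds a stack of bidirectional GRU-style layers plus two softmax heads from an .npz archive written by Rnn.save(). A hedged usage sketch, assuming a checkpoint produced during training and the 4-feature float32 event windows that train.py feeds in (the file name is hypothetical):

    import numpy as np
    from rnn import Rnn

    ntwk = Rnn("dumpx-22.npz")  # a checkpoint written by train.py via Rnn.save()
    # One window of 400 events with 4 features each, as sampled by train.py.
    x = np.random.randn(400, 4).astype(np.float32)
    o1, o2 = ntwk.predict(x)    # two (400, 5) arrays of A/C/G/T/N probabilities
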
diff --git a/training/train.py b/training/train.py
new file mode 100644
index 0000000..4b81d2f
--- /dev/null
+++ b/training/train.py
@@ -0,0 +1,173 @@
+from rnn import Rnn
+import sys
+import numpy as np
+import datetime
+from collections import defaultdict
+import os
+from sklearn.metrics import confusion_matrix
+from multiprocessing import Pool
+
+def print_stats(o):
+  stats = defaultdict(int)
+  for x in o:
+    stats[x] += 1
+  print (stats)
+
+def flatten2(x):
+  return x.reshape((x.shape[0] * x.shape[1], -1))
+
+def realign(s):
+  # Recompute labels for read s: dump the current network outputs, let the
+  # external ./realign binary align them to the reference, and read back
+  # the per-event two-character labels.
+  ps = s
+  o1, o2 = ntwk.tester(data_x[ps])
+  f = open(base_dir + "tmpb-%s.in" % s, "w")
+  print >>f, refs[ps]
+  for a, b in zip(o1, o2):
+    print >>f, " ".join(map(str, a))
+    print >>f, " ".join(map(str, b))
+  f.close()
+
+  print "s", s
+  if os.system("./realign <%stmpb-%s.in >%stmpb-%s.out" % (base_dir, s, base_dir, s)) != 0:
+    print "realign failed on read", s
+    sys.exit(1)
+
+  f = open(base_dir + "tmpb-%s.out" % s)
+  for i, l in enumerate(f):
+    data_y[ps][i] = mapping[l[0]]
+    data_y2[ps][i] = mapping[l[1]]
+  f.close()
+
+  return data_y[ps], data_y2[ps]
+
+if __name__ == '__main__':
+  mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+  n_dims = 4
+  n_classes = 5
+
+  data_x = []
+  data_y = []
+  data_y2 = []
+  refs = []
+  names = []
+
+  # Each training file holds the reference on its first line, then one
+  # preprocessed event per line with a two-character label at the end.
+  for fn in sys.argv[2:]:
+    f = open(fn)
+    ref = f.readline()
+    if len(ref) > 30000:
+      print "skipping long reference", len(ref)
+      f.close()
+      continue
+    refs.append(ref.strip())
+    names.append(fn)
+    X = []
+    Y = []
+    Y2 = []
+    for l in f:
+      its = l.strip().split()
+      X.append(map(float, its[:-1]))
+      Y.append(mapping[its[-1][0]])
+      Y2.append(mapping[its[-1][1]])
+    f.close()
+    data_x.append(np.array(X, dtype=np.float32))
+    data_y.append(np.array(Y, dtype=np.int32))
+    data_y2.append(np.array(Y2, dtype=np.int32))
+
+  print ("done", sum(len(x) for x in refs))
+  sys.stdout.flush()
+
+  ntwk = Rnn(sys.argv[1])
+
+  print ("net rdy")
+
+  # Sample training windows with probability proportional to the number of
+  # valid window start positions in each read.
+  s_arr = []
+  p_arr = []
+  subseq_size = 400
+  for s in range(len(data_x)):
+    s_arr += [s]
+    p_arr += [len(data_x[s]) - subseq_size]
+
+  sum_p = sum(p_arr)
+  for i in range(len(p_arr)):
+    p_arr[i] = 1. * p_arr[i] / sum_p
+
+  base_dir = str(datetime.datetime.now())
+  base_dir = base_dir.replace(' ', '_')
+
+  os.mkdir(base_dir)
+  base_dir += "/"
+  batch_size = 1
+  n_batches = len(data_x) / batch_size
+  print len(data_x), batch_size, n_batches, datetime.datetime.now()
+
+  for epoch in range(1000):
+    # Every 20 epochs (including the first), refresh the training labels by
+    # realigning the current network outputs against the references.
+    if epoch % 20 == 0:
+      p = Pool(8)
+      new_labels = p.map(realign, range(len(data_x)))
+      p.close()
+      p.join()
+      for i in range(len(new_labels)):
+        data_y[i] = new_labels[i][0]
+        data_y2[i] = new_labels[i][1]
+
+    tc = 0
+    tc2 = 0
+    tc3 = 0
+    o1mm = []
+    y1mm = []
+    o2mm = []
+    y2mm = []
+    for s in range(len(data_x)):
+      s2 = np.random.choice(s_arr, p=p_arr)
+      r = np.random.randint(0, data_x[s2].shape[0] - subseq_size)
+      x = data_x[s2][r:r+subseq_size]
+      #    x[:,0] += np.random.binomial(n=1, p=0.1, size=x.shape[0]) * np.random.normal(scale=0.01, size=x.shape[0])
+      y = data_y[s2][r:r+subseq_size]
+      y2 = data_y2[s2][r:r+subseq_size]
+
+      lr = 1e-2
+  #    if epoch >= 3:
+  #      lr = 2e-1
+  #    if epoch >= 50:
+  #      lr = 2e-1
+      if epoch >= 970:
+        lr = 1e-3
+      cost, o1, o2 = ntwk.trainer(x, y, y2, lr)
+      tc += cost
+
+      o1m = (np.argmax(o1, 1))
+      o2m = (np.argmax(o2, 1))
+
+      o1mm += list(o1m)
+      o2mm += list(o2m)
+      y1mm += list(y)
+      y2mm += list(y2)
+
+      tc2 += np.sum(np.equal(o1m, y))
+      tc3 += np.sum(np.equal(o2m, y2))
+      sys.stdout.write('\r%d' % s)
+      sys.stdout.flush()
+
+    print
+
+    # Mean training cost per window and mean number of matching positions
+    # per window (out of subseq_size) for each of the two output heads.
+    print epoch, tc / n_batches, 1. * tc2 / n_batches / batch_size, 1. * tc3 / n_batches / batch_size, datetime.datetime.now()
+    print_stats(o1mm)
+    print_stats(o2mm)
+    print confusion_matrix(y1mm, o1mm)
+    print confusion_matrix(y2mm, o2mm)
+    sys.stdout.flush()
+
+    if epoch % 20 == 2:
+      ntwk.save(base_dir+"dumpx-%d.npz" % epoch)
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/deepnano.git



More information about the debian-med-commit mailing list