[med-svn] [Git][med-team/deepnano][upstream] New upstream version 0.0+git20170813.e8a621e
Andreas Tille
gitlab at salsa.debian.org
Fri Apr 27 08:45:03 BST 2018
Andreas Tille pushed to branch upstream at Debian Med / deepnano
Commits:
354bf44a by Andreas Tille at 2018-04-27T09:14:07+02:00
New upstream version 0.0+git20170813.e8a621e
- - - - -
25 changed files:
- README.md
- basecall_no_metrichor.py
- + r9/README.md
- + r9/basecall.py
- + r9/extract_events.py
- + r9/networks/r9.pkl
- + r9/networks/r94.pkl
- + r9/qrnnf.py
- + r9/rnnf.py
- + r9/training/README
- + r9/training/example_data/100.txt
- + r9/training/example_data/1007.txt
- + r9/training/qrnn.py
- + r9/training/realign
- + r9/training/realign.cc
- + r9/training/realign.py
- + r9/training/rnn.py
- + r9/training/theano_toolkit/README.md
- + r9/training/theano_toolkit/__init__.py
- + r9/training/theano_toolkit/hinton.py
- + r9/training/theano_toolkit/ops.py
- + r9/training/theano_toolkit/parameters.py
- + r9/training/theano_toolkit/updates.py
- + r9/training/theano_toolkit/utils.py
- + r9/training/train.py
Changes:
=====================================
README.md
=====================================
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
### DeepNano: alternative basecaller for MinION reads
+## DeepNano version for R9 and R9.4 chemistry can be found in R9 directory of this repository.
+
DeepNano is alternative basecaller for Oxford Nanopore MinION reads
based on deep recurrent neural networks.
@@ -50,6 +52,8 @@ For SQK-MAP-005 chemistry use:
`OMP_NUM_THREADS=1 python basecall.py --template_net nets_data/map5temp.npz --complement_net nets_data/map5comp.npz --big_net nets_data/map5-2d.npz <list of fast5 files>`
+For R9 and R9.4 check r9 directory.
+
Advanced arguments:
=================
=====================================
basecall_no_metrichor.py
=====================================
--- a/basecall_no_metrichor.py
+++ b/basecall_no_metrichor.py
@@ -185,7 +185,8 @@ def basecall(read_file_name, fo):
e += [1] + list(data["comp_events2"][comp_ind])
events_2d.append(e)
events_2d = np.array(events_2d, dtype=np.float32)
- predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
+ if len(events_2d) >= 5:
+ predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
parser = argparse.ArgumentParser()
parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
=====================================
r9/README.md
=====================================
--- /dev/null
+++ b/r9/README.md
@@ -0,0 +1,29 @@
+### DeepNano: alternative basecaller for MinION reads - R9(.4) version
+
+Requirements
+================
+
+We use Python 2.7.
+
+Here are versions of Python packages, that we used:
+
+- Cython==0.23.4
+- numpy==1.10.2
+- h5py==2.5.0
+- python-dateutil==2.5.0
+
+Basic usage:
+================
+
+`python basecall.py --chemistry <r9/r9.4> <list of fast5 files>`
+
+Advanced arguments:
+=================
+
+- `-h` - prints help message
+- `--output FILENAME` - output filename
+- `--directory DIRECTORY` Directory where read files are stored
+- `--cut-data` - sometimes read detection can produce huge regions with no bases and basecaller gets
+ confused. This option is workaround around that issue.
+
+Basecaller will perform event extraction, if events are not present.
=====================================
r9/basecall.py
=====================================
--- /dev/null
+++ b/r9/basecall.py
@@ -0,0 +1,193 @@
+from rnnf import Rnn
+from qrnnf import Rnn as Rnnq
+import h5py
+import argparse
+import os
+import datetime
+import numpy as np
+from extract_events import extract_events
+
+def scale94(X):
+ m25 = np.percentile(X[:,0], 25)
+ m75 = np.percentile(X[:,0], 75)
+ s50 = np.median(X[:,2])
+ me25 = -0.3
+ me75= 0.3
+ se50 = 0.6103758
+ ret = np.array(X)
+ scale = (me75 - me25) / (m75 - m25)
+ m25 *= scale
+ shift = me25 - m25
+ ret[:,0] = X[:,0] * scale + shift
+ ret[:,1] = ret[:,0]**2
+
+ sscale = se50 / s50
+
+ ret[:,2] = X[:,2] * sscale
+ return ret
+
+def scale(X):
+ m25 = np.percentile(X[:,0], 25)
+ m75 = np.percentile(X[:,0], 75)
+ s50 = np.median(X[:,2])
+ me25 = 0.07499809
+ me75 = 0.26622871
+ se50 = 0.6103758
+ ret = np.array(X)
+ scale = (me75 - me25) / (m75 - m25)
+ m25 *= scale
+ shift = me25 - m25
+ ret[:,0] = X[:,0] * scale + shift
+ ret[:,1] = ret[:,0]**2
+
+ sscale = se50 / s50
+
+ ret[:,2] = X[:,2] * sscale
+ return ret
+
+def get_events(h5):
+ if not args.event_detect:
+ try:
+ e = h5["Analyses/Basecall_RNN_1D_000/BaseCalled_template/Events"]
+ return e
+ except:
+ pass
+ try:
+ e = h5["Analyses/Basecall_1D_000/BaseCalled_template/Events"]
+ return e
+ except:
+ pass
+
+ return extract_events(h5, args.chemistry)
+
+def basecall(filename, output_file):
+ try:
+ h5 = h5py.File(filename, "r")
+ events = get_events(h5)
+ if events is None:
+ print "No events in file %s" % filename
+ h5.close()
+ return 0
+
+ if len(events) < 300:
+ print "Read %s too short, not basecalling" % filename
+ h5.close()
+ return 0
+
+ events = events[50:-50][:args.max_events]
+ mean = events["mean"]
+ std = events["stdv"]
+ length = events["length"]
+ scale_f = scale if args.chemistry == 'r9' else scale94
+ X = np.array(np.vstack([mean, mean*mean, std, length]).T, dtype=np.float32)
+
+ if len(X) < 2500 or args.cut_data == False:
+ X = scale_f(X)
+ o1, o2 = ntwk.predict(X)
+ else:
+ preds1 = []
+ preds2 = []
+ for i in range(0, len(X), 2000):
+ o1, o2 = ntwk.predict(scale_f(X[i:i+2500]))
+ if i > 0:
+ o1 = o1[250:]
+ o2 = o2[250:]
+ if i + 2500 < len(X):
+ o1 = o1[:-250]
+ o2 = o2[:-250]
+ preds1.append(o1)
+ preds2.append(o2)
+
+ o1 = np.vstack(preds1)
+ o2 = np.vstack(preds2)
+
+ o1m = (np.argmax(o1, 1))
+ o2m = (np.argmax(o2, 1))
+
+ om = np.vstack((o1m,o2m)).reshape((-1,),order='F')
+ output = "".join(map(lambda x: alph[x], om)).replace("N", "")
+ print >>output_file, ">%s_template_deepnano" % filename
+ print >>output_file, output
+ output_file.flush()
+
+ h5.close()
+ return len(events)
+ except Exception as e:
+ print "Read %s failed with %s" % (filename, e)
+ return 0
+
+alph = "ACGTN"
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--chemistry', choices=['r9', 'r9.4'], default='r9.4')
+parser.add_argument('--output', type=str, default="output.fasta")
+parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
+parser.add_argument('--watch', type=str, default='', help='Watched directory')
+parser.add_argument('reads', type=str, nargs='*')
+parser.add_argument('--debug', dest='debug', action='store_true')
+parser.add_argument('--no-debug', dest='debug', action='store_false')
+parser.add_argument('--event-detect', dest='event_detect', action='store_true')
+parser.add_argument('--max-events', type=int, default=50000, help='Max amount of events to basecall')
+parser.add_argument('--cut-data', dest='cut_data', action='store_true', help='Cut data into smaller chunks and basecall them separatelly. Helps in case of bad preprocessing.')
+parser.set_defaults(debug=False)
+parser.set_defaults(event_detect=False)
+parser.set_defaults(cut_data=False)
+
+args = parser.parse_args()
+
+assert len(args.reads) != 0 or len(args.directory) != 0 or len(args.watch) != 0, "Nothing to basecall"
+
+ntwks = {"r9": os.path.join("networks", "r9.pkl"), "r9.4": os.path.join("networks", "r94.pkl")}
+
+ntwk = Rnn() if args.chemistry == 'r9' else Rnnq()
+ntwk.load(ntwks[args.chemistry])
+
+if len(args.reads) or len(args.directory) != 0:
+ fo = open(args.output, "w")
+
+ files = args.reads
+ if len(args.directory):
+ files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]
+
+ total_events = 0
+ start_time = datetime.datetime.now()
+ for i, read in enumerate(files):
+ current_events = basecall(read, fo)
+ if args.debug:
+ total_events += current_events
+ time_diff = (datetime.datetime.now() - start_time).total_seconds() + 0.000001
+ print "Basecalled %d events in %f (%f ev/s)" % (total_events, time_diff, total_events / time_diff)
+
+ fo.close()
+
+if len(args.watch) != 0:
+ try:
+ from watchdog.observers import Observer
+ from watchdog.events import PatternMatchingEventHandler
+ except:
+ print "Please install watchdog to watch directories"
+ sys.exit()
+
+ class Fast5Handler(PatternMatchingEventHandler):
+ """Class for handling creation fo fast5-files"""
+ patterns = ["*.fast5"]
+ def on_created(self, event):
+ print "Calling", event
+ file_name = str(os.path.basename(event.src_path))
+ fasta_file_name = os.path.splitext(event.src_path)[0] + '.fasta'
+ with open(fasta_file_name, "w") as fo:
+ basecall(event.src_path, fo)
+ print('Watch dir: ' + args.watch)
+ observer = Observer()
+ print('Starting Observerer')
+ # start watching directory for fast5-files
+ observer.start()
+ observer.schedule(Fast5Handler(), path=args.watch)
+ try:
+ while True:
+ time.sleep(1)
+ # quit script using ctrl+c
+ except KeyboardInterrupt:
+ observer.stop()
+
+ observer.join()
=====================================
r9/extract_events.py
=====================================
--- /dev/null
+++ b/r9/extract_events.py
@@ -0,0 +1,202 @@
+import numpy as np
+import sys
+import datetime
+
+defs = {
+ 'r9.4': {
+ 'ed_params': {
+ 'window_lengths':[3, 6], 'thresholds':[1.4, 1.1],
+ 'peak_height':0.2
+ }
+ },
+ 'r9': {
+ 'ed_params': {
+ 'window_lengths':[5, 10], 'thresholds':[2.0, 1.1],
+ 'peak_height':1.2
+ }
+ }
+}
+
+def get_raw(h5):
+ rk = h5["Raw/Reads"].keys()[0]
+
+ raw = h5["Raw/Reads"][rk]["Signal"]
+ meta = h5["UniqueGlobalKey/channel_id"].attrs
+ offset = meta["offset"]
+ raw_unit = meta['range'] / meta['digitisation']
+ raw = (raw + offset) * raw_unit
+ sl = meta["sampling_rate"]
+
+ return raw, sl
+
+def find_stall(events, threshold):
+ count_above = 0
+ start_ev_ind = 0
+ for ev_ind, event in enumerate(events[:100]):
+ if event['mean'] <= threshold:
+ count_above = 0
+ else:
+ count_above += 1
+
+ if count_above == 2:
+ start_ev_ind = ev_ind - 1
+ break
+
+ new_start = 0
+ count = 0
+ for idx in range(start_ev_ind, len(events)):
+ if events['mean'][idx] > threshold:
+ count = 0
+ else:
+ count += 1
+
+ if count == 3:
+ new_start = idx - 2
+ break
+
+ return new_start
+
+def get_tstat(s, s2, wl):
+ eta = 1e-100
+ windows1 = np.concatenate([[s[wl-1]], s[wl:] - s[:-wl]])
+ windows2 = np.concatenate([[s2[wl-1]], s2[wl:] - s2[:-wl]])
+ means = windows1 / wl
+ variances = windows2 / wl - means * means
+ variances = np.maximum(variances, eta)
+ delta = means[wl:] - means[:-wl]
+ deltav = (variances[wl:] + variances[:-wl]) / wl
+ return np.concatenate([np.zeros(wl), np.abs(delta / np.sqrt(deltav)), np.zeros(wl-1)])
+
+
+def extract_events(h5, chem):
+ print "ed"
+ raw, sl = get_raw(h5)
+
+ events = event_detect(raw, sl, **defs[chem]["ed_params"])
+ med, mad = med_mad(events['mean'][-100:])
+ max_thresh = med + 1.48 * 2 + mad
+
+ first_event = find_stall(events, max_thresh)
+
+ return events[first_event:]
+
+def med_mad(data):
+ dmed = np.median(data)
+ dmad = np.median(abs(data - dmed))
+ return dmed, dmad
+
+def compute_prefix_sums(data):
+ data_sq = data * data
+ return np.cumsum(data), np.cumsum(data_sq)
+
+def peak_detect(short_data, long_data, short_window, long_window, short_threshold, long_threshold,
+peak_height):
+ long_mask = 0
+ NO_PEAK_POS = -1
+ NO_PEAK_VAL = 1e100
+ peaks = []
+ short_peak_pos = NO_PEAK_POS
+ short_peak_val = NO_PEAK_VAL
+ short_found_peak = False
+ long_peak_pos = NO_PEAK_POS
+ long_peak_val = NO_PEAK_VAL
+ long_found_peak = False
+
+ for i in range(len(short_data)):
+ val = short_data[i]
+ if short_peak_pos == NO_PEAK_POS:
+ if val < short_peak_val:
+ short_peak_val = val
+ elif val - short_peak_val > peak_height:
+ short_peak_val = val
+ short_peak_pos = i
+ else:
+ if val > short_peak_val:
+ short_peak_pos = i
+ short_peak_val = val
+ if short_peak_val > short_threshold:
+ long_mask = short_peak_pos + short_window
+ long_peak_pos = NO_PEAK_POS
+ long_peak_val = NO_PEAK_VAL
+ long_found_peak = False
+ if short_peak_val - val > peak_height and short_peak_val > short_threshold:
+ short_found_peak = True
+ if short_found_peak and (i - short_peak_pos) > short_window / 2:
+ peaks.append(short_peak_pos)
+ short_peak_pos = NO_PEAK_POS
+ short_peak_val = val
+ short_found_peak = False
+
+ if i <= long_mask:
+ continue
+ val = long_data[i]
+ if long_peak_pos == NO_PEAK_POS:
+ if val < long_peak_val:
+ long_peak_val = val
+ elif val - long_peak_val > peak_height:
+ long_peak_val = val
+ long_peak_pos = i
+ else:
+ if val > long_peak_val:
+ long_peak_pos = i
+ long_peak_val = val
+ if long_peak_val - val > peak_height and long_peak_val > long_threshold:
+ long_found_peak = True
+ if long_found_peak and (i - long_peak_pos) > long_window / 2:
+ peaks.append(long_peak_pos)
+ long_peak_pos = NO_PEAK_POS
+ long_peak_val = val
+ long_found_peak = False
+
+ return peaks
+
+def generate_events(ss1, ss2, peaks, sample_rate):
+ peaks.append(len(ss1))
+ events = np.empty(len(peaks), dtype=[('start', float), ('length', float),
+ ('mean', float), ('stdv', float)])
+ s = 0
+ s1 = 0
+ s2 = 0
+ for i, e in enumerate(peaks):
+ events[i]["start"] = s
+ l = e - s
+ events[i]["length"] = l
+ m = (ss1[e-1] - s1) / l
+ events[i]["mean"] = m
+ v = max(0.0, (ss2[e-1] - s2) / l - m*m)
+ events[i]["stdv"] = np.sqrt(v)
+ s = e
+ s1 = ss1[e-1]
+ s2 = ss2[e-1]
+
+ events["start"] /= sample_rate
+ events["length"] /= sample_rate
+
+ return events
+
+
+def event_detect(raw_data, sample_rate, window_lengths=[16, 40], thresholds=[8.0, 4.0], peak_height = 1.0):
+ """Basic, standard even detection using two t-tests
+
+ :param raw_data: ADC values
+ :param sample_rate: Sampling rate of data in Hz
+ :param window_lengths: Length 2 list of window lengths across
+ raw data from which `t_stats` are derived
+ :param thresholds: Length 2 list of thresholds on t-statistics
+ :peak_height: Absolute height a peak in signal must rise below
+ previous and following minima to be considered relevant
+ """
+ sums, sumsqs = compute_prefix_sums(raw_data)
+
+ tstats = []
+ for i, w_len in enumerate(window_lengths):
+ tstat = get_tstat(sums, sumsqs, w_len)
+ tstats.append(tstat)
+
+ peaks = peak_detect(tstats[0], tstats[1], window_lengths[0], window_lengths[1], thresholds[0],
+ thresholds[1], peak_height)
+ events = generate_events(sums, sumsqs, peaks, sample_rate)
+
+ return events
+
+
=====================================
r9/networks/r9.pkl
=====================================
The diff for this file was not included because it is too large.
=====================================
r9/networks/r94.pkl
=====================================
The diff for this file was not included because it is too large.
=====================================
r9/qrnnf.py
=====================================
--- /dev/null
+++ b/r9/qrnnf.py
@@ -0,0 +1,111 @@
+import numpy as np
+import pickle
+
+def sigmoid(x):
+ return 1 / (1 + np.exp(-x))
+
+class OutLayer:
+ def __init__(self):
+ self.n_params = 2
+ self.params = [None, None]
+
+ def calc(self, input):
+ otmp = np.dot(input, self.params[0]) + self.params[1]
+ e_x = np.exp(otmp - otmp.max(axis=1, keepdims=True))
+ return e_x / e_x.sum(axis=1, keepdims=True)
+
+class SimpleLayer:
+ def __init__(self):
+ self.n_params = 5
+ self.params = [None for i in range(5)]
+
+ def conv(self, input, w, b, width=7):
+ output = np.zeros((input.shape[0], w.shape[0]))
+ mid = width/2
+ for i in range(width):
+ wx = w[:,:,i,0].T
+ oo = np.dot(input, wx)
+ if i < mid:
+ output[:-(mid-i)] += oo[mid-i:]
+ elif i == mid:
+ output += oo
+ else:
+ output[i-mid:] += oo[:-(i-mid)]
+
+ output += b
+ return output
+
+ def calc(self, input):
+ state = self.params[4]
+
+ c1 = np.tanh(self.conv(input, self.params[0], self.params[1]))
+ f1 = sigmoid(self.conv(input, self.params[2], self.params[3]))
+
+ output = np.zeros((len(input), self.params[4].shape[0]), dtype=np.float32)
+ for i in range(len(input)):
+ state = f1[i] * state + (1 - f1[i]) * c1[i]
+ output[i] = state
+ return np.array(output)
+
+class BiSimpleLayer:
+ def __init__(self):
+ self.fwd = SimpleLayer()
+ self.bwd = SimpleLayer()
+
+ def calc(self, input):
+ return np.concatenate([self.fwd.calc(input), self.bwd.calc(input[::-1])[::-1]],
+ axis=1)
+
+class Rnn:
+ def __init__(self):
+ pass
+
+ def tester(self, input):
+ input = input[0]
+ l1 = self.layer1.calc(input)
+ l2 = self.layer2.calc(l1)
+ l3 = self.layer3.calc(l2)
+ l4 = self.layer4.calc(l3)
+ return [self.output1.calc(l4)], [self.output2.calc(l4)]
+
+ def predict(self, input):
+ l1 = self.layer1.calc(input)
+ l2 = self.layer2.calc(l1)
+ l3 = self.layer3.calc(l2)
+ l4 = self.layer4.calc(l3)
+ return self.output1.calc(l4), self.output2.calc(l4)
+
+ def debug(self, input):
+ l1 = self.layer1.calc(input)
+ l2 = self.layer2.calc(l1)
+ l3 = self.layer3.calc(l2)
+ return l1, l2, l3
+
+ def load(self, fn):
+ with open(fn, "rb") as f:
+ self.layer1 = BiSimpleLayer()
+ for i in range(5):
+ self.layer1.fwd.params[i] = pickle.load(f)
+ for i in range(5):
+ self.layer1.bwd.params[i] = pickle.load(f)
+ self.layer2 = BiSimpleLayer()
+ for i in range(5):
+ self.layer2.fwd.params[i] = pickle.load(f)
+ for i in range(5):
+ self.layer2.bwd.params[i] = pickle.load(f)
+ self.layer3 = BiSimpleLayer()
+ for i in range(5):
+ self.layer3.fwd.params[i] = pickle.load(f)
+ for i in range(5):
+ self.layer3.bwd.params[i] = pickle.load(f)
+ self.layer4 = BiSimpleLayer()
+ for i in range(5):
+ self.layer4.fwd.params[i] = pickle.load(f)
+ for i in range(5):
+ self.layer4.bwd.params[i] = pickle.load(f)
+ self.output1 = OutLayer()
+ self.output2 = OutLayer()
+ for i in range(2):
+ self.output1.params[i] = pickle.load(f)
+ for i in range(2):
+ self.output2.params[i] = pickle.load(f)
=====================================
r9/rnnf.py
=====================================
--- /dev/null
+++ b/r9/rnnf.py
@@ -0,0 +1,87 @@
+import numpy as np
+import pickle
+
+def sigmoid(x):
+ return 1 / (1 + np.exp(-x))
+
+class OutLayer:
+ def __init__(self):
+ self.n_params = 2
+ self.params = [None, None]
+
+ def calc(self, input):
+ otmp = np.dot(input, self.params[0]) + self.params[1]
+ e_x = np.exp(otmp - otmp.max(axis=1, keepdims=True))
+ return e_x / e_x.sum(axis=1, keepdims=True)
+
+class SimpleLayer:
+ def __init__(self):
+ self.n_params = 10
+ self.params = [None for i in range(10)]
+
+ def calc(self, input):
+ state = self.params[9]
+# output = []
+ output = np.zeros((len(input), self.params[2].shape[0]), dtype=np.float32)
+ for i in range(len(input)):
+ update_gate = sigmoid(np.dot(state, self.params[6]) +
+ np.dot(input[i], self.params[4]) +
+ self.params[8])
+ reset_gate = sigmoid(np.dot(state, self.params[5]) +
+ np.dot(input[i], self.params[3]) +
+ self.params[7])
+ new_val = np.tanh(np.dot(input[i], self.params[0]) +
+ reset_gate * np.dot(state, self.params[1]) +
+ self.params[2])
+ state = update_gate * state + (1 - update_gate) * new_val
+ output[i] = state
+ return np.array(output)
+
+class BiSimpleLayer:
+ def __init__(self):
+ self.fwd = SimpleLayer()
+ self.bwd = SimpleLayer()
+
+ def calc(self, input):
+ return np.concatenate([self.fwd.calc(input), self.bwd.calc(input[::-1])[::-1]],
+ axis=1)
+
+class Rnn:
+ def __init__(self):
+ pass
+
+ def predict(self, input):
+ l1 = self.layer1.calc(input)
+ l2 = self.layer2.calc(l1)
+ l3 = self.layer3.calc(l2)
+ return self.output1.calc(l3), self.output2.calc(l3)
+
+ def debug(self, input):
+ l1 = self.layer1.calc(input)
+ l2 = self.layer2.calc(l1)
+ l3 = self.layer3.calc(l2)
+ return l1, l2, l3
+
+ def load(self, fn):
+ with open(fn, "rb") as f:
+ self.layer1 = BiSimpleLayer()
+ for i in range(10):
+ self.layer1.fwd.params[i] = pickle.load(f)
+ for i in range(10):
+ self.layer1.bwd.params[i] = pickle.load(f)
+ self.layer2 = BiSimpleLayer()
+ for i in range(10):
+ self.layer2.fwd.params[i] = pickle.load(f)
+ for i in range(10):
+ self.layer2.bwd.params[i] = pickle.load(f)
+ self.layer3 = BiSimpleLayer()
+ for i in range(10):
+ self.layer3.fwd.params[i] = pickle.load(f)
+ for i in range(10):
+ self.layer3.bwd.params[i] = pickle.load(f)
+ self.output1 = OutLayer()
+ self.output2 = OutLayer()
+ for i in range(2):
+ self.output1.params[i] = pickle.load(f)
+ for i in range(2):
+ self.output2.params[i] = pickle.load(f)
=====================================
r9/training/README
=====================================
--- /dev/null
+++ b/r9/training/README
@@ -0,0 +1,31 @@
+You need to have theano and sklearn installed for training.
+Input files should have following format:
+
+<reference sequence>
+<mean of first event> <std of first event> <lenght of first event>
+<mean of second event> <std of second event> <lenght of second event>
+...
+
+See examples.
+
+Get alignment of events:
+python realign.py ../networks/r94.pkl <list of files>
+
+For each file with <name> it creates a file <name>r, so for file 1.txt it will create 1.txtr.
+
+For training:
+python train.py <optionally name of starting network like ../networks/r94.pkl> <list of files>
+
+Remember to put files ending with r into training.
+
+After each epoch the training will output:
+<number of epoch> <current training log likelihood> <accuracy of first softmax> <accuracy of second
+softmax> <approximate overall sequence identity> <time>
+<distribution of outputs from first softmax>
+<distribution of outputs from second softmax>
+<confusion matrix for first softmax>
+<confusion matrix for second softmax>
+
+In folder with timestamp of training start there will be file names latest.pkl and each 20
+epoch will be saved current snapshot of the network.
+
=====================================
r9/training/example_data/100.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
r9/training/example_data/1007.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
r9/training/qrnn.py
=====================================
--- /dev/null
+++ b/r9/training/qrnn.py
@@ -0,0 +1,158 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+from theano.tensor.nnet import conv2d
+import numpy as np
+from theano_toolkit import updates
+import pickle
+
+hidden_size = 128
+
+def orthonormal_wts(n, m):
+ nm = max(n, m)
+ svd_u = np.linalg.svd(np.random.randn(nm, nm))[0]
+ return svd_u.astype(th.config.floatX)[:n, :m]
+
+def init_wts(*argv):
+ return 0.1 * (np.random.rand(*argv) - 0.5)
+
+def share(array, dtype=th.config.floatX, name=None):
+ return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class ConvLayer:
+ def __init__(self, input, nin, nunits, width=7):
+ id = str(np.random.randint(0, 10000000))
+ wc = share(init_wts(nunits, nin, width, 1), name="wc"+id)
+ bc = share(np.zeros(nunits), name="bc"+id)
+ input = input.dimshuffle(1, 0, 2)
+ inpr = input.reshape((input.shape[0], input.shape[1], input.shape[2], 1)).dimshuffle(0, 2, 1, 3)
+
+ c1 = conv2d(input=inpr, filters=wc, border_mode='half') + bc.reshape((1, bc.shape[0], 1, 1))
+ self.output = T.tanh(c1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
+
+ self.params = [wc, bc]
+
+
+class BatchLayer:
+ def __init__(self, input, nin, nunits, width=7):
+ id = str(np.random.randint(0, 10000000))
+ wc = share(init_wts(nunits, nin, width, 1), name="wc"+id)
+ bc = share(np.zeros(nunits), name="bc"+id)
+ wf = share(init_wts(nunits, nin, width, 1), name="wf"+id)
+ bf = share(np.zeros(nunits), name="bf"+id)
+ h0 = share(np.zeros(nunits), name="h0"+id)
+ input = input.dimshuffle(1, 0, 2)
+ inpr = input.reshape((input.shape[0], input.shape[1], input.shape[2], 1)).dimshuffle(0, 2, 1, 3)
+
+ c1 = conv2d(input=inpr, filters=wc, border_mode='half') + bc.reshape((1, bc.shape[0], 1, 1))
+ c1 = T.tanh(c1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
+
+ f1 = conv2d(input=inpr, filters=wf, border_mode='half') + bf.reshape((1, bf.shape[0], 1, 1))
+ f1 = T.nnet.sigmoid(f1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
+ self.dbg = f1
+
+ def step(in_t, in_g, out_tm1):
+# return in_t
+ return in_g * out_tm1 + (1 - in_g) * in_t
+
+ self.output, _ = th.scan(
+ step, sequences=[c1, f1],
+ outputs_info=[T.alloc(h0, input.shape[0], nunits)])
+
+
+ self.params = [wc, bc, wf, bf, h0]
+
+class BiBatchLayer():
+ def __init__(self, input, nin, nunits):
+ fwd = BatchLayer(input, nin, nunits)
+ bwd = BatchLayer(input[::-1], nin, nunits)
+ self.params = fwd.params + bwd.params
+ self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=2)
+ self.dbg = fwd.dbg
+
+class OutBatchLayer:
+ def __init__(self, input, in_size, n_classes):
+ id = str(np.random.randint(0, 10000000))
+ w = share(init_wts(in_size, n_classes), name=id+"w")
+ b = share(np.zeros(n_classes), name=id+"b")
+ otmp = T.dot(input, w) + b
+ e_x = T.exp(otmp - otmp.max(axis=2, keepdims=True))
+ o2 = e_x / e_x.sum(axis=2, keepdims=True)
+ eps = 0.00000001
+ self.output = T.clip(o2, eps, 1-eps)
+ self.params = [w, b]
+
+class BatchNet:
+ def __init__(self, in_size, n_classes, only_test=False, with_jacob=False, use_first=True,
+use_second=True):
+ np.random.seed(49)
+ self.input = T.ftensor3()
+ self.targets = T.imatrix()
+ self.targets2 = T.imatrix()
+
+ self.layer = BiBatchLayer(self.input.dimshuffle(1, 0, 2), in_size, hidden_size)
+ self.layer2 = BiBatchLayer(self.layer.output, 2*hidden_size, hidden_size)
+# self.layer2 = ConvLayer(self.layer.output, 2*hidden_size, 2*hidden_size)
+ self.layer3 = BiBatchLayer(self.layer2.output, 2*hidden_size, hidden_size)
+ self.layer4 = BiBatchLayer(self.layer3.output, 2*hidden_size, hidden_size)
+# self.layer4 = ConvLayer(self.layer3.output, 2*hidden_size, 2*hidden_size)
+ self.out_layer = OutBatchLayer(self.layer4.output, 2*hidden_size, n_classes)
+ self.output = self.out_layer.output.dimshuffle(1, 0, 2)
+ self.output_flat = T.reshape(self.output, (self.output.shape[0] * self.output.shape[1], -1))
+ self.targets_flat = self.targets.flatten(ndim=1)
+ self.cost = 0
+ if use_first:
+ self.cost = -T.mean(T.log(self.output_flat)[T.arange(self.targets_flat.shape[0]), self.targets_flat])
+
+ self.out_layer2 = OutBatchLayer(self.layer4.output, 2*hidden_size, n_classes)
+ self.output2 = self.out_layer2.output.dimshuffle(1, 0, 2)
+ self.output2_flat = T.reshape(self.output2, (self.output2.shape[0] * self.output2.shape[1], -1))
+ self.targets2_flat = self.targets2.flatten(ndim=1)
+ if use_second:
+ self.cost += -T.mean(T.log(self.output2_flat)[T.arange(self.targets2_flat.shape[0]), self.targets2_flat])
+
+# self.coster = th.function(inputs=[self.input, self.targets, self.targets2], outputs=[self.cost])
+ #self.predict = th.function(inputs=[self.input], outputs=[self.output])
+ self.tester = th.function(inputs=[self.input], outputs=[self.output, self.output2])
+ self.debug = th.function(inputs=[self.input], outputs=[self.layer.output, self.layer2.output,
+ self.layer3.output])
+
+ self.layers = [self.layer, self.layer2, self.layer3, self.layer4, self.out_layer, self.out_layer2]
+
+ self.params = [p for l in self.layers for p in l.params]
+
+ self.grad_params = [p for l in self.layers[:-2] for p in l.params]
+ if use_first:
+ self.grad_params += self.out_layer.params
+ if use_second:
+ self.grad_params += self.out_layer2.params
+
+# self.top_prob_sum = T.mean(T.log(T.max(self.output[0], axis=1))) +\
+# T.mean(T.log(T.max(self.output2[0], axis=1)))
+
+# self.grad_in = th.function(inputs=[self.input], outputs=T.grad(self.top_prob_sum, self.input))
+
+ if not only_test:
+ self.lr = T.fscalar()
+ inputs = [self.input]
+ if use_first:
+ inputs.append(self.targets)
+ if use_second:
+ inputs.append(self.targets2)
+# inputs.append(self.lr)
+ self.trainer = th.function(
+ inputs=inputs,
+ outputs=[self.cost, self.output, self.output2],
+ updates=updates.adam(self.grad_params, (T.grad(self.cost, self.grad_params)),
+ learning_rate=0.0001, epsilon=1e-6))
+
+ def save(self, fn):
+ with open(fn, "wb") as f:
+ for p in self.params:
+ pickle.dump(p.get_value(), f)
+
+ def load(self, fn):
+ with open(fn, "rb") as f:
+ for p in self.params:
+ p.set_value(pickle.load(f))
+
=====================================
r9/training/realign
=====================================
Binary files /dev/null and b/r9/training/realign differ
=====================================
r9/training/realign.cc
=====================================
--- /dev/null
+++ b/r9/training/realign.cc
@@ -0,0 +1,178 @@
+#include <cstdio>
+#include <vector>
+#include <cmath>
+#include <map>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+class string2 {
+ public:
+ string2() {}
+ string2(const char*x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && x[i]; i++) {
+ x_[i] = x[i];
+ }
+ }
+
+ string2 operator=(const char*x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && x[i]; i++) {
+ x_[i] = x[i];
+ }
+ return *this;
+ }
+
+ string2 operator=(const string2 x) {
+ x_[0]= x.x_[0];
+ x_[1]= x.x_[1];
+ x_[2]= x.x_[2];
+ return *this;
+ }
+
+ string2 operator=(const string& x) {
+ x_[0] = 0;
+ x_[1] = 0;
+ x_[2] = 0;
+ for (int i = 0; i < 2 && i < x.size(); i++) {
+ x_[i] = x[i];
+ }
+ return *this;
+ }
+
+ char& operator[](int i) {
+ return x_[i];
+ }
+
+ char x_[3];
+};
+
+int main() {
+ char ss[100000];
+ scanf("%s\n", ss);
+
+ string ref(ss);
+ int mapping[256];
+ mapping['A'] = 0;
+ mapping['C'] = 1;
+ mapping['G'] = 2;
+ mapping['T'] = 3;
+ int range = 50;
+ vector<vector<double>> probs;
+ while (true) {
+ double a, c, g, t, n;
+ if (scanf("%lf %lf %lf %lf %lf", &a, &c, &g, &t, &n)<5) {
+ break;
+ }
+ probs.push_back(vector<double>({log(a), log(c), log(g), log(t), log(n)}));
+ }
+ int limit = min(100, (int)ref.size()/2);
+ while (range < 30000) {
+ vector<vector<double>> poses(probs.size()+1);
+ vector<vector<string2>> prevs(probs.size()+1);
+ for (int i = 0; i < poses.size(); i+=2) {
+ poses[i] = vector<double>(ref.size()+1, -1e30);
+ prevs[i] = vector<string2>(ref.size()+1, "");
+ }
+ poses[0][0] = 0;
+ for (int i = 0; i < poses[0].size() && i < limit; i++) {
+ poses[0][i] = 0;
+ }
+ int last_bp = 50;
+ for (int i = 2; i <= probs.size(); i+=2) {
+ for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+ // NN
+ if (i > 2) {
+ double np = poses[i-2][j] + probs[i-2][4] + probs[i-1][4];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = "NN";
+ }
+ }
+ // NX
+ double np = poses[i-2][j-1] + probs[i-2][4] + probs[i-1][mapping[ref[j-1]]];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = string("N") + ref[j-1];
+ }
+
+ //XX
+ if (j > 1) {
+ double np = poses[i-2][j-2] + probs[i-2][mapping[ref[j-2]]] + probs[i-1][mapping[ref[j-1]]];
+ if (np > poses[i][j]) {
+ poses[i][j] = np;
+ prevs[i][j] = ref.substr(j-2, 2);
+ }
+ }
+ }
+ int cur_bp = max(1, last_bp - range);
+ for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
+ if (poses[i][j] > poses[i][cur_bp]) {
+ cur_bp = j;
+ }
+ }
+ last_bp = cur_bp;
+ }
+// fprintf(stderr, "back size %d last_bp %d\n", poses.back().size(), last_bp);
+ int best_pos = poses.back().size()-40;
+ for (int i = poses.back().size()-limit; i < poses.back().size(); i++) {
+ if (poses.back()[i] > poses.back()[best_pos]) {
+ best_pos = i;
+ }
+ }
+ /*int total = 0;
+ int rel = 0;
+ int better = 0;
+ for (int i = 0; i < poses.size(); i += 2) {
+ for (int j = 0; j < poses[i].size(); j++) {
+ total += 1;
+ if (poses[i][j] > -1e29) {
+ rel += 1;
+ }
+ if (poses[i][j] > poses.back()[best_pos]) {
+ better += 1;
+ }
+ }
+ }
+ fprintf(stderr, "total %d rel %d better %d\n", total, rel, better);*/
+ if (poses.back()[best_pos] / probs.size() * 2 > -10) {
+ fprintf(stderr, "best pos %d %lf %d\n", best_pos, poses.back()[best_pos] / probs.size() * 2, range);
+ int ipos = poses.size()-1;
+ int jpos = best_pos;
+
+ vector<string2> out;
+ while (ipos > 0) {
+ auto back = prevs[ipos][jpos];
+ out.push_back(back);
+ if (back[0] == 'N' && back[1] == 'N') {
+ ipos -= 2;
+ }
+ else if (back[0] == 'N' && back[1] != 'N') {
+ ipos -= 2;
+ jpos -= 1;
+ }
+ else if (back[0] != 'N' && back[1] != 'N') {
+ ipos -= 2;
+ jpos -= 2;
+ }
+ }
+ reverse(out.begin(), out.end());
+ for (auto &o: out) {
+ printf("%s\n", o.x_);
+ }
+ fprintf(stderr, "start pos %d\n", jpos);
+ if (best_pos == poses.back().size() - limit || jpos == limit - 1) {
+ return 1;
+ }
+ return 47;
+ }
+ range *= 2;
+ }
+ return 1;
+}
=====================================
r9/training/realign.py
=====================================
--- /dev/null
+++ b/r9/training/realign.py
@@ -0,0 +1,131 @@
+from qrnn import BatchNet
+import pickle
+import sys
+import numpy as np
+import datetime
+from collections import defaultdict
+import os
+from sklearn.metrics import confusion_matrix
+import theano as th
+from multiprocessing import Pool
+import subprocess
+
+def scale(X):
+ m25 = np.percentile(X[:,0], 25)
+ m75 = np.percentile(X[:,0], 75)
+ s50 = np.median(X[:,2])
+ me25 = -0.3
+ me75= 0.3
+ se50 = 0.6103758
+ ret = np.array(X)
+ scale = (me75 - me25) / (m75 - m25)
+ m25 *= scale
+ shift = me25 - m25
+ ret[:,0] = X[:,0] * scale + shift
+ ret[:,1] = ret[:,0]**2
+
+ sscale = se50 / s50
+
+ ret[:,2] = X[:,2] * sscale
+ return ret
+
+def print_stats(o):
+ stats = defaultdict(int)
+ for x in o:
+ stats[x] += 1
+ print (stats)
+
+def flatten2(x):
+ return x.reshape((x.shape[0] * x.shape[1], -1))
+
+def realign(s):
+ ps = s
+ o1, o2 = ntwk.tester([scale(data_x[ps])])
+ o1m = (np.argmax(o1[0], 1))
+ o2m = (np.argmax(o2[0], 1))
+ alph = "ACGTN"
+# fo = open(base_dir+"output%s.fasta" % ps, "w")
+# print (">xx", file=fo)
+# for a, b in zip(o1m, o2m):
+# if a < 4:
+# fo.write(alph[a])
+# if b < 4:
+# fo.write(alph[b])
+# fo.close()
+ f = open(base_dir+"tmpb-%s.in" % s, "w")
+ print >>f, refs[ps]
+ for a, b in zip(o1[0], o2[0]):
+ print >>f, " ".join(map(str, a))
+ print >>f, " ".join(map(str, b))
+ f.close()
+
+ print "s", s
+ ret = subprocess.call("ulimit -v 32000000; ./realign <%stmpb-%s.in >%stmpb-%s.out" % (base_dir, s, base_dir, s), shell=True)
+
+ if ret != 47:
+ print "fail", s
+ return
+
+ f = open(base_dir+"tmpb-%s.out" % s)
+ for i, l in enumerate(f):
+ data_y[ps][i] = mapping[l[0]]
+ data_y2[ps][i] = mapping[l[1]]
+
+ fo = open(names[s] + "r", "w")
+ print >>fo, refs[s]
+ for x, y, y2 in zip(data_x[s], data_y[s], data_y2[s]):
+ print >>fo, " ".join(map(str, x)), "%c%c" % (alph[y], alph[y2])
+ fo.close()
+
+if __name__ == '__main__':
+
+ chars = "ACGT"
+ mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+ n_dims = 4
+ n_classes = 5
+
+ data_x = []
+ data_y = []
+ data_y2 = []
+ refs = []
+ names = []
+
+ for fn in sys.argv[2:]:
+ f = open(fn)
+ ref = f.readline()
+ if len(ref) > 30000:
+ print "out", len(ref)
+ continue
+ refs.append(ref.strip())
+ names.append(fn)
+ X = []
+ Y = []
+ Y2 = []
+ print "\rfn", fn,
+ sys.stdout.flush()
+ for l in f:
+ its = l.strip().split()
+ mean = float(its[0])
+ std = float(its[1])
+ length = float(its[2])
+ X.append([mean, mean*mean, std, length])
+ Y.append(4)
+ Y2.append(4)
+ data_x.append(np.array(X, dtype=th.config.floatX))
+ data_y.append(np.array(Y, dtype=np.int32))
+ data_y2.append(np.array(Y2, dtype=np.int32))
+
+ print ("done", sum(len(x) for x in refs))
+ sys.stdout.flush()
+
+ ntwk = BatchNet(n_dims, n_classes, only_test=True)
+ ntwk.load(sys.argv[1])
+ base_dir = str(datetime.datetime.now())
+ base_dir = base_dir.replace(' ', '_')
+
+ os.mkdir(base_dir)
+ base_dir += "/"
+
+ p = Pool(8)
+ p.map(realign, range(len(data_x)))
=====================================
r9/training/rnn.py
=====================================
--- /dev/null
+++ b/r9/training/rnn.py
@@ -0,0 +1,272 @@
+import theano as th
+import theano.tensor as T
+from theano.tensor.nnet import sigmoid
+import numpy as np
+from theano_toolkit import updates
+import pickle
+
+hidden_size = 100
+
+def orthonormal_wts(n, m):
+ nm = max(n, m)
+ svd_u = np.linalg.svd(np.random.randn(nm, nm))[0]
+ return svd_u.astype(th.config.floatX)[:n, :m]
+
+def init_wts(*argv):
+ return 0.4 * (np.random.rand(*argv) - 0.5)
+
+def share(array, dtype=th.config.floatX, name=None):
+ return th.shared(value=np.asarray(array, dtype=dtype), name=name)
+
+class OutLayer:
+ def __init__(self, input, in_size, n_classes):
+ id = str(np.random.randint(0, 10000000))
+ w = share(init_wts(in_size, n_classes), name=id+"w")
+ b = share(np.zeros(n_classes), name=id+"b")
+ eps = 0.0000001
+ self.output = T.clip(T.nnet.softmax(T.dot(input, w) + b), eps, 1-eps)
+ self.params = [w, b]
+
+class SimpleLayer:
+ def __init__(self, input, nin, nunits):
+ id = str(np.random.randint(0, 10000000))
+ wio = share(orthonormal_wts(nin, nunits), name="wio"+id) # input to output
+ wir = share(orthonormal_wts(nin, nunits), name="wir"+id) # input to output
+ wiu = share(orthonormal_wts(nin, nunits), name="wiu"+id) # input to output
+ woo = share(orthonormal_wts(nunits, nunits), name="woo"+id) # output to output
+ wou = share(orthonormal_wts(nunits, nunits), name="wou"+id) # output to output
+ wor = share(orthonormal_wts(nunits, nunits), name="wor"+id) # output to output
+ bo = share(np.zeros(nunits), name="bo"+id)
+ bu = share(np.zeros(nunits), name="bu"+id)
+ br = share(np.zeros(nunits), name="br"+id)
+ h0 = share(np.zeros(nunits), name="h0"+id)
+
+ def step(in_t, out_tm1):
+ update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+ reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+ new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+ return update_gate * out_tm1 + (1 - update_gate) * new_val
+
+ self.output, _ = th.scan(
+ step, sequences=[input],
+ outputs_info=[h0])
+
+ self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiSimpleLayer():
+ def __init__(self, input, nin, nunits):
+ fwd = SimpleLayer(input, nin, nunits)
+ bwd = SimpleLayer(input[::-1], nin, nunits)
+ self.params = fwd.params + bwd.params
+ self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=1)
+
+class SimpleNet:
+ def __init__(self, in_size, n_classes, only_test=False):
+ np.random.seed(47)
+ self.input = T.fmatrix()
+ self.targets = T.ivector()
+ self.targets2 = T.ivector()
+
+ self.layer = BiSimpleLayer(self.input, in_size, hidden_size)
+ self.layer2 = BiSimpleLayer(self.layer.output, 2*hidden_size, hidden_size)
+ self.layer3 = BiSimpleLayer(self.layer2.output, 2*hidden_size, hidden_size)
+ self.out_layer = OutLayer(self.layer3.output, 2*hidden_size, n_classes)
+ self.output = self.out_layer.output
+ self.out_layer2 = OutLayer(self.layer3.output, 2*hidden_size, n_classes)
+ self.output2 = self.out_layer2.output
+ self.cost = -T.mean(T.log(self.output)[T.arange(self.targets.shape[0]), self.targets])
+ self.cost += -T.mean(T.log(self.output2)[T.arange(self.targets2.shape[0]), self.targets2])
+
+# self.coster = th.function(inputs=[self.input, self.targets, self.targets2], outputs=self.cost)
+# self.predict = th.function(inputs=[self.input], outputs=[self.output])
+ self.tester = th.function(inputs=[self.input], outputs=[self.output, self.output2])
+
+ self.layers = [self.layer, self.layer2, self.layer3, self.out_layer, self.out_layer2]
+
+ self.params = [p for l in self.layers for p in l.params]
+ total_pars = 0
+ for p in self.params:
+ shape = p.get_value().shape
+ if len(shape) == 1:
+ total_pars += shape[0]
+ elif len(shape) == 2:
+ total_pars += shape[0] * shape[1]
+ else:
+ assert(False)
+ self.total_pars = total_pars
+ if not only_test:
+ self.lr = T.fscalar()
+ inputs = [self.input]
+ inputs.append(self.targets)
+ inputs.append(self.targets2)
+ inputs.append(self.lr)
+ self.trainer = th.function(
+ inputs=inputs,
+ outputs=[self.cost, self.output, self.output2],
+ updates=updates.momentum(self.params, (T.grad(self.cost, self.params)),
+ learning_rate=self.lr, mu=0.8))
+ self.cost_and_grad = th.function(
+ inputs=inputs[:3],
+ outputs=[self.cost] + T.grad(self.cost, self.params))
+
+ def save(self, fn):
+ with open(fn, "wb") as f:
+ for p in self.params:
+ pickle.dump(p.get_value(), f)
+
+ def load(self, fn):
+ with open(fn, "rb") as f:
+ for p in self.params:
+ p.set_value(pickle.load(f))
+
+ def flatten_grad(self, to_flatten):
+ ret = np.zeros(self.total_pars, dtype=th.config.floatX)
+ cur_pos = 0
+ for p in to_flatten:
+ par = p.flatten()
+ l = len(par)
+ ret[cur_pos:cur_pos + l] = par
+ cur_pos += l
+ return ret
+
+ def get_params(self):
+ ret = np.zeros(self.total_pars, dtype=th.config.floatX)
+ cur_pos = 0
+ for p in self.params:
+ par = p.get_value().flatten()
+ l = len(par)
+ ret[cur_pos:cur_pos + l] = par
+ cur_pos += l
+ return ret
+
+ def set_params(self, params):
+ cur_pos = 0
+ params = np.asarray(params, dtype=th.config.floatX)
+ for p in self.params:
+ l = len(p.get_value().flatten())
+ p.set_value(params[cur_pos:cur_pos+l].reshape(p.get_value().shape))
+ cur_pos += l
+
+class BatchLayer:
+ def __init__(self, input, nin, nunits):
+ id = str(np.random.randint(0, 10000000))
+ wio = share(orthonormal_wts(nin, nunits), name="wio"+id) # input to output
+ wir = share(orthonormal_wts(nin, nunits), name="wir"+id) # input to output
+ wiu = share(orthonormal_wts(nin, nunits), name="wiu"+id) # input to output
+ woo = share(orthonormal_wts(nunits, nunits), name="woo"+id) # output to output
+ wou = share(orthonormal_wts(nunits, nunits), name="wou"+id) # output to output
+ wor = share(orthonormal_wts(nunits, nunits), name="wor"+id) # output to output
+ bo = share(np.zeros(nunits), name="bo"+id)
+ bu = share(np.zeros(nunits), name="bu"+id)
+ br = share(np.zeros(nunits), name="br"+id)
+ h0 = share(np.zeros(nunits), name="h0"+id)
+
+ def step(in_t, out_tm1):
+ update_gate = sigmoid(T.dot(out_tm1, wou) + T.dot(in_t, wiu) + bu)
+ reset_gate = sigmoid(T.dot(out_tm1, wor) + T.dot(in_t, wir) + br)
+ new_val = T.tanh(T.dot(in_t, wio) + reset_gate * T.dot(out_tm1, woo) + bo)
+ return update_gate * out_tm1 + (1 - update_gate) * new_val
+
+ self.step_res = step(input[0], T.alloc(h0, input.shape[1]))
+ self.output, _ = th.scan(
+ step, sequences=[input],
+ outputs_info=[T.alloc(h0, input.shape[1], nunits)])
+
+ self.params = [wio, woo, bo, wir, wiu, wor, wou, br, bu, h0]
+
+class BiBatchLayer():
+ def __init__(self, input, nin, nunits):
+ fwd = BatchLayer(input, nin, nunits)
+ bwd = BatchLayer(input[::-1], nin, nunits)
+ self.params = fwd.params + bwd.params
+ self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=2)
+
+class OutBatchLayer:
+ def __init__(self, input, in_size, n_classes):
+ id = str(np.random.randint(0, 10000000))
+ w = share(init_wts(in_size, n_classes), name=id+"w")
+ b = share(np.zeros(n_classes), name=id+"b")
+ otmp = T.dot(input, w) + b
+ e_x = T.exp(otmp - otmp.max(axis=2, keepdims=True))
+ o2 = e_x / e_x.sum(axis=2, keepdims=True)
+ eps = 0.00000001
+ self.output = T.clip(o2, eps, 1-eps)
+ self.params = [w, b]
+
+class BatchNet:
+ def __init__(self, in_size, n_classes, only_test=False, with_jacob=False, use_first=True,
+use_second=True):
+ np.random.seed(47)
+ self.input = T.ftensor3()
+ self.targets = T.imatrix()
+ self.targets2 = T.imatrix()
+
+ self.layer = BiBatchLayer(self.input.dimshuffle(1, 0, 2), in_size, hidden_size)
+ self.layer2 = BiBatchLayer(self.layer.output, 2*hidden_size, hidden_size)
+ self.layer3 = BiBatchLayer(self.layer2.output, 2*hidden_size, hidden_size)
+ self.out_layer = OutBatchLayer(self.layer3.output, 2*hidden_size, n_classes)
+ self.output = self.out_layer.output.dimshuffle(1, 0, 2)
+ self.output_flat = T.reshape(self.output, (self.output.shape[0] * self.output.shape[1], -1))
+ self.targets_flat = self.targets.flatten(ndim=1)
+ self.cost = 0
+ if use_first:
+ self.cost = -T.mean(T.log(self.output_flat)[T.arange(self.targets_flat.shape[0]), self.targets_flat])
+
+ self.out_layer2 = OutBatchLayer(self.layer3.output, 2*hidden_size, n_classes)
+ self.output2 = self.out_layer2.output.dimshuffle(1, 0, 2)
+ self.output2_flat = T.reshape(self.output2, (self.output2.shape[0] * self.output2.shape[1], -1))
+ self.targets2_flat = self.targets2.flatten(ndim=1)
+ if use_second:
+ self.cost += -T.mean(T.log(self.output2_flat)[T.arange(self.targets2_flat.shape[0]), self.targets2_flat])
+
+# self.coster = th.function(inputs=[self.input, self.targets, self.targets2], outputs=[self.cost])
+ self.predict = th.function(inputs=[self.input], outputs=[self.output])
+ self.tester = th.function(inputs=[self.input], outputs=[self.output, self.output2])
+ self.debug = th.function(inputs=[self.input], outputs=[self.layer.output, self.layer2.output,
+ self.layer3.output])
+
+ self.layers = [self.layer, self.layer2, self.layer3, self.out_layer, self.out_layer2]
+
+ self.params = [p for l in self.layers for p in l.params]
+
+ self.grad_params = [p for l in self.layers[:-2] for p in l.params]
+ if use_first:
+ self.grad_params += self.out_layer.params
+ if use_second:
+ self.grad_params += self.out_layer2.params
+
+# self.top_prob_sum = T.mean(T.log(T.max(self.output[0], axis=1))) +\
+# T.mean(T.log(T.max(self.output2[0], axis=1)))
+
+# self.grad_in = th.function(inputs=[self.input], outputs=T.grad(self.top_prob_sum, self.input))
+
+ if with_jacob:
+ self.index = T.iscalar()
+ self.j1 = T.grad(T.log(T.max(self.output[0][self.index])), self.input)
+ self.j2 = T.grad(T.log(T.max(self.output2[0][self.index])), self.input)
+ self.jacob = th.function(
+ inputs=[self.input, self.index], outputs=[self.j1[0], self.j2[0]])
+ if not only_test:
+ self.lr = T.fscalar()
+ inputs = [self.input]
+ if use_first:
+ inputs.append(self.targets)
+ if use_second:
+ inputs.append(self.targets2)
+ inputs.append(self.lr)
+ self.trainer = th.function(
+ inputs=inputs,
+ outputs=[self.cost, self.output, self.output2],
+ updates=updates.momentum(self.grad_params, (T.grad(self.cost, self.grad_params)),
+ learning_rate=self.lr, mu=0.8))
+
+ def save(self, fn):
+ with open(fn, "wb") as f:
+ for p in self.params:
+ pickle.dump(p.get_value(), f)
+
+ def load(self, fn):
+ with open(fn, "rb") as f:
+ for p in self.params:
+ p.set_value(pickle.load(f))
+
=====================================
r9/training/theano_toolkit/README.md
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/README.md
@@ -0,0 +1,46 @@
+Helper methods for Theano
+=========================
+
+Some helper methods for working with Theano for neural networks.
+
+## `hinton.py`
+![Hinton Diagrams](https://blog.wtf.sg/wp-content/uploads/2014/05/Screenshot-from-2014-05-04-013804.png)
+Quick visualisation method of numpy matrices in the terminal. See the [blog post](https://blog.wtf.sg/2014/05/04/terminal-hinton-diagrams/).
+
+## `utils.py`
+
+Miscellaneous helper functions for initialising weight matrices, vector softmax, etc.
+
+## `parameters.py`
+
+Syntax sugar when declaring parameters.
+
+```python
+import theano_toolkit.parameters as Parameters
+
+P = Parameters()
+
+P.W = np.zeros(10,10)
+P.b = np.zeros(10)
+
+# build model here.
+
+params = P.values()
+gradients = T.grad(cost,wrt=params)
+```
+
+#### Experimental
+
+More syntax sugar for initialising parameters.
+```python
+P = Parameters()
+
+with P:
+ W = np.zeros(10,10)
+ b = np.zeros(10)
+
+# use W and b to define model instead of P.W and P.b
+
+params = P.values()
+:
+```
=====================================
r9/training/theano_toolkit/__init__.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/__init__.py
=====================================
r9/training/theano_toolkit/hinton.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/hinton.py
@@ -0,0 +1,41 @@
+# coding=utf-8
+from __future__ import print_function
+import numpy as np
+chars = [" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"]
+
+
+class BarHack(str):
+
+ def __str__(self):
+ return self.internal
+
+ def __len__(self):
+ return 1
+
+
+def plot(arr, max_val=None):
+ if max_val is None:
+ max_arr = arr
+ max_val = max(abs(np.max(max_arr)), abs(np.min(max_arr)))
+
+ opts = np.get_printoptions()
+ np.set_printoptions(edgeitems=500)
+ print(np.array2string(arr,
+ formatter={
+ 'float_kind': lambda x: visual(x, max_val),
+ 'int_kind': lambda x: visual(x, max_val)},
+ max_line_width=5000
+ ))
+ np.set_printoptions(**opts)
+
+
+def visual(val, max_val):
+ if abs(val) == max_val:
+ step = len(chars) - 1
+ else:
+ step = int(abs(float(val) / max_val) * len(chars))
+ colourstart = ""
+ colourend = ""
+ if val < 0:
+ colourstart, colourend = '\033[90m', '\033[0m'
+ return colourstart + chars[step] + colourend
=====================================
r9/training/theano_toolkit/ops.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/ops.py
@@ -0,0 +1,36 @@
+import theano.tensor as T
+
+
+def log_sum_exp(x, axis=None, keepdims=False):
+ k = T.max(x, axis=axis, keepdims=True)
+ sum_x_ = T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims))
+ return sum_x_ + k.reshape(sum_x_.shape)
+
+
+def log_softmax(x):
+ return x - log_sum_exp(x, axis=-1, keepdims=True)
+
+
+def log_sigmoid(x):
+ return -T.nnet.softplus(-x)
+
+
+def softmax(x):
+ e_x = T.exp(x - T.max(x, axis=-1, keepdims=True))
+ out = e_x / T.sum(e_x, axis=-1, keepdims=True)
+ return out
+
+
+def log_add(x, y):
+ k = T.maximum(x, y)
+ return T.log(T.exp(x - k) + T.exp(y - k)) + k
+
+
+def binary_crossentropy(sigmoid_x, y):
+ if sigmoid_x.owner.op == T.nnet.sigmoid:
+ def softplus(x):
+ return T.switch(x > 20, x, T.nnet.softplus(x))
+ x = sigmoid_x.owner.inputs[0]
+ return - (y * -softplus(-x) + (1 - y) * -softplus(x))
+ else:
+ return T.nnet.binary_crossentropy(sigmoid_x, y)
=====================================
r9/training/theano_toolkit/parameters.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/parameters.py
@@ -0,0 +1,91 @@
+import theano
+import numpy as np
+import cPickle as pickle
+from functools import reduce
+import inspect
+
+import sys
+
+
+class Parameters():
+
+ def __init__(self):
+ self.__dict__['params'] = {}
+
+ def __setattr__(self, name, array):
+ params = self.__dict__['params']
+ if name not in params:
+ params[name] = theano.shared(
+ value=np.asarray(
+ array,
+ dtype=theano.config.floatX
+ ),
+ borrow=True,
+ name=name
+ )
+ else:
+ print >> sys.stderr, "%s already assigned" % name
+
+ params[name].set_value(np.asarray(
+ array,
+ dtype=theano.config.floatX
+ ))
+
+ def __setitem__(self, name, array):
+ self.__setattr__(name, array)
+
+ def __getitem__(self, name):
+ return self.__getattr__(name)
+
+ def __getattr__(self, name):
+ params = self.__dict__['params']
+ return params[name]
+
+ def remove(self, name):
+ del self.__dict__['params'][name]
+
+ def values(self):
+ params = self.__dict__['params']
+ return params.values()
+
+ def save(self, filename):
+ params = self.__dict__['params']
+ with open(filename, 'wb') as f:
+ pickle.dump({p.name: p.get_value() for p in params.values()}, f, 2)
+
+ def load(self, filename):
+ params = self.__dict__['params']
+ loaded = pickle.load(open(filename, 'rb'))
+ for k in params:
+ if k in loaded:
+ params[k].set_value(loaded[k])
+ else:
+ print >> sys.stderr, "%s does not exist." % k
+
+ def __enter__(self):
+ _, _, _, env_locals = inspect.getargvalues(
+ inspect.currentframe().f_back)
+ self.__dict__['_env_locals'] = env_locals.keys()
+
+ def __exit__(self, type, value, traceback):
+ _, _, _, env_locals = inspect.getargvalues(
+ inspect.currentframe().f_back)
+ prev_env_locals = self.__dict__['_env_locals']
+ del self.__dict__['_env_locals']
+ for k in env_locals.keys():
+ if k not in prev_env_locals:
+ self.__setattr__(k, env_locals[k])
+ env_locals[k] = self.__getattr__(k)
+ return True
+
+ def parameter_count(self):
+ import operator
+ params = self.__dict__['params']
+ count = 0
+ for p in params.values():
+ shape = p.get_value().shape
+ if len(shape) == 0:
+ count += 1
+ else:
+ count += reduce(operator.mul, shape)
+ return count
=====================================
r9/training/theano_toolkit/updates.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/updates.py
@@ -0,0 +1,171 @@
+from itertools import izip
+import theano.tensor as T
+import numpy as np
+from parameters import Parameters
+from functools import reduce
+
+
+def clip_deltas(gradients, clip_size):
+ grad_mag = T.sqrt(sum(T.sum(T.sqr(w)) for w in gradients))
+ scale = clip_size / T.maximum(clip_size, grad_mag)
+ return [scale * g for g in gradients]
+
+
+def nan_shield(parameters, deltas, other_updates):
+ delta_sum = sum(T.sum(d) for d in deltas)
+ not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
+ parameter_updates = [(p, T.switch(not_finite, 0.9 * p, p - d))
+ for p, d in izip(parameters, deltas)]
+ other_updates = [(p, T.switch(not_finite, p, u))
+ for p, u in other_updates]
+ return parameter_updates, other_updates
+
+
+def track_parameters(update_fun):
+ def decorated_fun(parameters, gradients, **kwargs):
+ if "P" not in kwargs:
+ kwargs["P"] = Parameters()
+ deltas, updates = update_fun(parameters, gradients, **kwargs)
+ assert(len(deltas) == len(parameters))
+ parameter_updates, other_updates = nan_shield(parameters,
+ deltas, updates)
+ return parameter_updates + other_updates
+ return decorated_fun
+
+
+def create_param(P, name, w):
+ P[name] = w
+ return P[name]
+
+
+def get_shapes(parameters):
+ return [p.get_value().shape for p in parameters]
+
+
+ at track_parameters
+def adadelta(parameters, gradients,
+ rho=np.float32(0.95),
+ learning_rate=np.float32(1e-4),
+ P=None):
+ eps = learning_rate
+ shapes = get_shapes(parameters)
+
+ acc_gradients_sq = [create_param(P, "grad_sq_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+ acc_deltas_sq = [create_param(P, "deltas_sq_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+
+ gradients_sq = [T.sqr(g) for g in gradients]
+ gradients_sq_new = [rho * acc_g_sq + (np.float32(1) - rho) * g_sq
+ for acc_g_sq, g_sq in izip(
+ acc_gradients_sq, gradients_sq)]
+ learning_rate_sq = [(d_sq + eps) / (g_sq + eps)
+ for d_sq, g_sq in izip(
+ acc_deltas_sq, gradients_sq_new)]
+
+ deltas_sq = [lr_sq * g_sq for lr_sq,
+ g_sq in izip(learning_rate_sq, gradients_sq)]
+ deltas_sq_new = [rho * acc_d_sq + (np.float32(1.) - rho) *
+ d_sq for acc_d_sq, d_sq in izip(acc_deltas_sq, deltas_sq)]
+
+ deltas = [T.sqrt(lr_sq) * g for lr_sq,
+ g in izip(learning_rate_sq, gradients)]
+
+ gradient_sq_updates = zip(acc_gradients_sq, gradients_sq_new)
+ deltas_sq_updates = zip(acc_deltas_sq, deltas_sq_new)
+ return deltas, gradient_sq_updates + deltas_sq_updates
+
+
+ at track_parameters
+def adagrad(parameters, gradients, learning_rate=1e-4, P=None):
+ shapes = get_shapes(parameters)
+
+ grad_sq = [create_param(P, "acc_sq_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+
+ grad_sq_new = [g_sq + g**2 for g, g_sq in izip(gradients, grad_sq)]
+ deltas = [learning_rate * g / T.sqrt(g_sq + 1e-6)
+ for g, g_sq in izip(gradients, grad_sq_new)]
+ grad_sq_update = zip(grad_sq, grad_sq_new)
+
+ return deltas, grad_sq_update
+
+
+ at track_parameters
+def momentum(parameters, gradients, mu=0.9, learning_rate=1e-3, P=None):
+ eps = learning_rate
+ P.t = 1
+ m = (1 - 3.0 / (P.t + 5) < mu)
+ mu = m * (1 - 3.0 / (P.t + 5)) + (1 - m) * mu
+ shapes = get_shapes(parameters)
+ deltas = [create_param(P, "deltas_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+ delta_nexts = [mu * delta + eps * grad for delta,
+ grad in zip(deltas, gradients)]
+ delta_updates = [(delta, delta_next)
+ for delta, delta_next in zip(deltas, delta_nexts)]
+ return delta_nexts, delta_updates + [(P.t, P.t + 1)]
+
+
+ at track_parameters
+def rmsprop(parameters, gradients,
+ discount=0.95,
+ momentum=0.9,
+ learning_rate=1e-4,
+ epsilon=1e-4,
+ P=None):
+ shapes = get_shapes(parameters)
+ sq_acc = [create_param(P, "sq_acc_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+ acc = [create_param(P, "acc_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+ delta_acc = [create_param(P, "delta_acc_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+
+ sq_avg = [discount * sq_a + (1 - discount) * (g**2)
+ for sq_a, g in izip(sq_acc, gradients)]
+ avg = [discount * a + (1 - discount) * g for a,
+ g in izip(acc, gradients)]
+ scaled_grads = [g / T.sqrt(sq_a - a**2 + epsilon)
+ for g, a, sq_a in izip(gradients, acc, sq_acc)]
+ deltas = [momentum * d_a + learning_rate *
+ s_g for d_a, s_g in izip(delta_acc, scaled_grads)]
+
+ sq_acc_updates = [(sq_a, sq_aa) for sq_a, sq_aa in izip(sq_acc, sq_avg)]
+ acc_updates = [(a, aa) for a, aa in izip(acc, avg)]
+ delta_updates = [(d_a, d) for d_a, d in izip(delta_acc, deltas)]
+
+ return deltas, acc_updates + sq_acc_updates + delta_updates
+
+
+ at track_parameters
+def adam(parameters, gradients,
+ learning_rate=0.001,
+ moment1_decay=0.9,
+ moment2_decay=0.999,
+ epsilon=1e-8,
+ P=None):
+ shapes = get_shapes(parameters)
+ P.t = np.float32(1)
+
+ moment1_acc = [create_param(P, "moment1_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+
+ moment2_acc = [create_param(P, "moment2_" + p.name, np.zeros(s))
+ for p, s in izip(parameters, shapes)]
+
+ deltas = []
+ updates = []
+ updates.append((P.t, P.t + 1))
+ for m1, m2, g in izip(moment1_acc, moment2_acc, gradients):
+ new_m1 = moment1_decay * m1 + (1 - moment1_decay) * g
+ new_m2 = moment2_decay * m2 + (1 - moment2_decay) * T.sqr(g)
+ bc_m1 = new_m1 / (1 - moment1_decay**P.t)
+ bc_m2 = new_m2 / (1 - moment2_decay**P.t)
+ delta = learning_rate * bc_m1 / (T.sqrt(bc_m2) + epsilon)
+
+ deltas.append(delta)
+ updates.append((m1, new_m1))
+ updates.append((m2, new_m2))
+
+ return deltas, updates
=====================================
r9/training/theano_toolkit/utils.py
=====================================
--- /dev/null
+++ b/r9/training/theano_toolkit/utils.py
@@ -0,0 +1,37 @@
+import numpy as np
+import theano
+import theano.tensor as T
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+import random
+
+
+theano_rng = RandomStreams(np.random.RandomState(1234).randint(2**30))
+np.random.seed(1234)
+random.seed(1234)
+
+theano.config.floatX = 'float32'
+
+
+def initial_weights(*argv):
+ return np.asarray(
+ np.random.uniform(
+ low=-np.sqrt(6. / sum(argv)),
+ high=np.sqrt(6. / sum(argv)),
+ size=argv
+ ),
+ dtype=theano.config.floatX
+ )
+
+
+def create_shared(array, dtype=theano.config.floatX, name=None):
+ return theano.shared(
+ value=np.asarray(
+ array,
+ dtype=dtype
+ ),
+ name=name,
+ )
+
+
+def vector_softmax(vec):
+ return T.nnet.softmax(vec.reshape((1, vec.shape[0])))[0]
=====================================
r9/training/train.py
=====================================
--- /dev/null
+++ b/r9/training/train.py
@@ -0,0 +1,246 @@
+from qrnn import BatchNet
+import pickle
+import sys
+import numpy as np
+import datetime
+from collections import defaultdict
+import os
+from sklearn.metrics import confusion_matrix
+import theano as th
+from multiprocessing import Pool
+
+def scale(X):
+ m25 = np.percentile(X[:,0], 25)
+ m75 = np.percentile(X[:,0], 75)
+ s50 = np.median(X[:,2])
+# me25 = 0.07499809
+# me75 = 0.26622871
+ me25 = -0.3
+ me75= 0.3
+ se50 = 0.6103758
+ ret = np.array(X)
+ scale = (me75 - me25) / (m75 - m25)
+ m25 *= scale
+ shift = me25 - m25
+ ret[:,0] = X[:,0] * scale + shift
+ ret[:,1] = ret[:,0]**2
+
+ sscale = se50 / s50
+
+ ret[:,2] = X[:,2] * sscale
+ return ret
+
+def print_stats(o):
+ stats = defaultdict(int)
+ for x in o:
+ stats[x] += 1
+ print (stats)
+
+def flatten2(x):
+ return x.reshape((x.shape[0] * x.shape[1], -1))
+
+def realign(s):
+ ps = s
+ o1, o2 = ntwk.tester([scale(data_x[ps])])
+ o1m = (np.argmax(o1[0], 1))
+ o2m = (np.argmax(o2[0], 1))
+ alph = "ACGTN"
+# fo = open(base_dir+"output%s.fasta" % ps, "w")
+# print (">xx", file=fo)
+# for a, b in zip(o1m, o2m):
+# if a < 4:
+# fo.write(alph[a])
+# if b < 4:
+# fo.write(alph[b])
+# fo.close()
+ f = open(base_dir+"tmpb-%s.in" % s, "w")
+ lc = 0
+ print >>f, refs[ps]
+ for a, b in zip(o1[0], o2[0]):
+ print >>f, " ".join(map(str, a))
+ print >>f, " ".join(map(str, b))
+ lc += 1
+ f.close()
+
+ print "s", s, datetime.datetime.now(), len(refs[ps]), lc, len(data_x[ps])
+ if os.system("./realign %stmpb-%s.in <%stmpb-%s.in >%stmpb-%s.out" % (base_dir, s, base_dir, s, base_dir, s)) != 0:
+ print "watwat", s
+ sys.exit()
+
+ f = open(base_dir+"tmpb-%s.out" % s)
+ for i, l in enumerate(f):
+ data_y[ps][i] = mapping[l[0]]
+ data_y2[ps][i] = mapping[l[1]]
+
+ fo = open(names[s] + "r", "w")
+ print >>fo, refs[s]
+ for x, y, y2 in zip(data_x[s], data_y[s], data_y2[s]):
+ print >>fo, " ".join(map(str, x)), "%c%c" % (alph[y], alph[y2])
+ fo.close()
+ return data_y[ps], data_y2[ps]
+
+if __name__ == '__main__':
+ chars = "ACGT"
+ mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
+
+ n_dims = 4
+ n_classes = 5
+
+ data_x = []
+ data_y = []
+ data_y2 = []
+ refs = []
+ names = []
+
+ for fn in sys.argv[1:]:
+ if fn.endswith("pkl"):
+ continue
+ f = open(fn)
+ ref = f.readline()
+ if len(ref) > 30000:
+ print "out", len(ref)
+ continue
+ X = []
+ Y = []
+ Y2 = []
+ good = True
+ for l in f:
+ its = l.strip().split()
+ X.append(list(map(float, its[:4])))
+ try:
+ Y.append(mapping[its[4][0]])
+ Y2.append(mapping[its[4][1]])
+ except:
+ print "wat", fn
+ good = False
+ break
+ if good:
+ if len(X) > 2000:
+ print "\rfn", fn, len(X), len(Y), len(Y2), len(ref), len(refs),
+ sys.stdout.flush()
+ refs.append(ref.strip())
+ names.append(fn)
+ data_x.append(np.array(X, dtype=th.config.floatX))
+ data_y.append(np.array(Y, dtype=np.int32))
+ data_y2.append(np.array(Y2, dtype=np.int32))
+
+ print
+ print ("done", sum(len(x) for x in refs), sum(len(x) for x in data_x))
+ sys.stdout.flush()
+
+ ntwk = BatchNet(n_dims, n_classes)
+ if sys.argv[1].endswith("pkl"):
+ ntwk.load(sys.argv[1])
+
+ print ("net rdy")
+ sys.stdout.flush()
+
+ s_arr = []
+ p_arr = []
+ subseq_size = 2000
+ for s in range(len(data_x)):
+ s_arr += [s]
+ p_arr += [len(data_x[s]) - subseq_size]
+
+ sum_p = sum(p_arr)
+ for i in range(len(p_arr)):
+ p_arr[i] = 1.*p_arr[i] / sum_p
+
+ base_dir = str(datetime.datetime.now())
+ base_dir = base_dir.replace(' ', '_')
+
+ os.mkdir(base_dir)
+ base_dir += "/"
+ batch_size = 8
+ n_batches = len(data_x) / batch_size + 1
+ print len(data_x), batch_size, n_batches, datetime.datetime.now()
+
+ for epoch in range(1000):
+ taken_gc = []
+ out_gc = []
+ tc = 0
+ tcs = 0
+ tcb = 0
+ tc2 = 0
+ tc3 = 0
+ ccb = 0
+ ccs = 0
+ total_size = 0
+ o1mm = []
+ y1mm = []
+ o2mm = []
+ y2mm = []
+ for s in range(n_batches):
+ bx = []
+ by = []
+ by2 = []
+ if s % 10 == 0:
+ s2 = np.random.choice(s_arr)
+ x = data_x[s2]
+ y = data_y[s2]
+ y2 = data_y2[s2]
+ total_size += len(x)
+ bx.append(scale(x))
+ by.append(y)
+ by2.append(y2)
+ else:
+ for b in range(batch_size):
+ s2 = np.random.choice(s_arr, p=p_arr)
+ r = np.random.randint(0, data_x[s2].shape[0] - subseq_size)
+ x = data_x[s2][r:r+subseq_size]
+ total_size += len(x)
+ y = data_y[s2][r:r+subseq_size]
+ y2 = data_y2[s2][r:r+subseq_size]
+ bx.append(scale(x))
+ by.append(y)
+ by2.append(y2)
+
+ lr = 8e-3
+# lr = 1e-3
+ # if epoch >= 3:
+ # lr = 2e-1
+ if epoch >= 970:
+ lr = 1e-3
+ by = np.array(by)
+ by2 = np.array(by2)
+ cost, o1, o2 = ntwk.trainer(bx, by, by2)
+ tc += cost
+ if s % 20 == 0:
+ tcb += cost
+ ccb += 1
+ else:
+ tcs += cost
+ ccs += 1
+
+ o1m = (np.argmax(flatten2(o1), 1))
+ o2m = (np.argmax(flatten2(o2), 1))
+
+ o1mm += list(o1m)
+ o2mm += list(o2m)
+ y1mm += list(flatten2(by))
+ y2mm += list(flatten2(by2))
+
+ tc2 += np.sum(np.equal(o1m, by.flatten()))
+ tc3 += np.sum(np.equal(o2m, by2.flatten()))
+
+ sys.stdout.write('\r%d %f %f %f' % (s, tc / (s+1), tcs / max(1, ccs), tcb / max(1, ccb)))
+ sys.stdout.flush()
+
+ print
+ conf1 = confusion_matrix(y1mm, o1mm)
+ conf2 = confusion_matrix(y2mm, o2mm)
+ good = conf1[0,0] + conf1[1,1] + conf1[2,2] + conf1[3,3]
+ good += conf2[0,0] + conf2[1,1] + conf2[2,2] + conf2[3,3]
+ bad = np.sum(conf1) + np.sum(conf2) - good - conf1[4,4] - conf2[4,4]
+
+ print epoch, tc / n_batches, 1.*tc2 / total_size, 1.*tc3 / total_size, 1.*good / (good + bad), datetime.datetime.now()
+ print_stats(o1mm)
+ print_stats(o2mm)
+ print conf1
+ print conf2
+ # print "out", np.min(out_gc), np.median(out_gc), np.max(out_gc), len(out_gc)
+ sys.stdout.flush()
+
+ if epoch % 20 == 19:
+ ntwk.save(base_dir+"dumpx-%d.pkl" % epoch)
+ ntwk.save(base_dir+"latest.pkl")
View it on GitLab: https://salsa.debian.org/med-team/deepnano/commit/354bf44a8662c2977649d49f4c278cacae4d5726
---
View it on GitLab: https://salsa.debian.org/med-team/deepnano/commit/354bf44a8662c2977649d49f4c278cacae4d5726
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180427/2e4553cb/attachment-0001.html>
More information about the debian-med-commit
mailing list