[med-svn] [nanopolish] 01/08: Imported Upstream version 0.5.0
Afif Elghraoui
afif at moszumanska.debian.org
Fri Aug 12 07:08:42 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository nanopolish.
commit aa2ebdb1bad083827384db00bad62678d3a91bd6
Author: Afif Elghraoui <afif at debian.org>
Date: Thu Aug 11 19:55:08 2016 -0700
Imported Upstream version 0.5.0
---
.gitignore | 5 +
Makefile | 15 +-
README.md | 35 +-
etc/r9-models/nanopolish_models.fofn | 2 +
etc/r9-models/template_median68pA.5mers.model | 1030 ++++++
etc/r9-models/template_median68pA.model | 4101 +++++++++++++++++++++
scripts/consensus.make | 8 +-
scripts/dropmodel.py | 73 +
scripts/nanopolish_makerange.py | 13 +-
scripts/nanopolish_merge.py | 50 +-
src/alignment/nanopolish_alignment_db.cpp | 158 +-
src/alignment/nanopolish_alignment_db.h | 15 +-
src/alignment/nanopolish_anchor.cpp | 64 +-
src/alignment/nanopolish_anchor.h | 5 +-
src/alignment/nanopolish_eventalign.cpp | 158 +-
src/alignment/nanopolish_eventalign.h | 2 +-
src/common/logger.hpp | 260 ++
src/common/logsum.cpp | 3 +
src/common/logsumset.hpp | 80 +
src/common/nanopolish_alphabet.cpp | 125 +-
src/common/nanopolish_alphabet.h | 363 +-
src/common/nanopolish_common.cpp | 28 +
src/common/nanopolish_common.h | 18 +-
src/common/nanopolish_fast5_map.cpp | 4 +-
src/common/nanopolish_iupac.h | 2 +-
src/common/nanopolish_model_names.cpp | 66 +
src/common/nanopolish_model_names.h | 41 +
src/common/nanopolish_variant.cpp | 257 +-
src/common/nanopolish_variant.h | 26 +-
src/common/progress.h | 12 +-
src/hmm/invgauss.hpp | 149 +
src/hmm/nanopolish_duration_model.cpp | 90 +
src/hmm/nanopolish_duration_model.h | 57 +
src/hmm/nanopolish_emissions.h | 92 +-
src/hmm/nanopolish_hmm_input_sequence.h | 10 +-
src/hmm/nanopolish_pore_model_set.cpp | 100 +
src/hmm/nanopolish_pore_model_set.h | 72 +
src/hmm/nanopolish_profile_hmm.cpp | 94 +-
src/hmm/nanopolish_profile_hmm.h | 36 +-
src/hmm/nanopolish_profile_hmm.inl | 185 +-
src/hmm/nanopolish_transition_parameters.cpp | 200 +-
src/hmm/nanopolish_transition_parameters.h | 12 +-
src/main/nanopolish.cpp | 66 +-
src/nanopolish_call_variants.cpp | 697 +++-
src/nanopolish_call_variants.h | 2 +-
src/nanopolish_consensus.cpp | 92 +-
src/nanopolish_consensus.h | 2 +-
src/nanopolish_getmodel.cpp | 1 +
src/nanopolish_getmodel.h | 2 +-
src/nanopolish_haplotype.cpp | 36 +-
src/nanopolish_haplotype.h | 27 +-
src/nanopolish_methyltest.cpp | 155 +-
src/nanopolish_methyltrain.cpp | 859 +++--
src/nanopolish_methyltrain.h | 25 +-
src/nanopolish_poremodel.cpp | 214 +-
src/nanopolish_poremodel.h | 59 +-
src/nanopolish_scorereads.cpp | 593 +++
src/nanopolish_scorereads.h | 32 +
src/nanopolish_squiggle_read.cpp | 418 ++-
src/nanopolish_squiggle_read.h | 106 +-
src/nanopolish_train_poremodel_from_basecalls.cpp | 398 ++
src/nanopolish_train_poremodel_from_basecalls.h | 15 +
src/test/nanopolish_test.cpp | 416 ++-
src/training_core.cpp | 265 ++
src/training_core.hpp | 176 +
65 files changed, 11421 insertions(+), 1351 deletions(-)
diff --git a/.gitignore b/.gitignore
index 3b87bc6..c1adaa7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,8 @@ hdf5-*/
include/
lib/
share/
+
+hdf5-1.8.14.tar.gz
+3.2.5.tar.bz2
+eigen/
+local*
diff --git a/Makefile b/Makefile
index 40670fc..4adea67 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,15 @@ lib/libhdf5.a:
tar -xzf hdf5-1.8.14.tar.gz
cd hdf5-1.8.14; ./configure --enable-threadsafe --prefix=`pwd`/..; make; make install
+
+# Download and install eigen if not already downloaded
+EIGEN=eigen/INSTALL
+
+$(EIGEN):
+ wget http://bitbucket.org/eigen/eigen/get/3.2.5.tar.bz2
+ tar -xjvf 3.2.5.tar.bz2
+ mv eigen-eigen-bdd17ee3b1b3 eigen
+
#
# Source files
#
@@ -84,7 +93,7 @@ C_OBJ=$(C_SRC:.c=.o)
PHONY=depend
depend: .depend
-.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB)
+.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN)
rm -f ./.depend
$(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend;
@@ -98,7 +107,7 @@ include .depend
$(CC) -o $@ -c $(CFLAGS) -fPIC $<
# Link main executable
-$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB)
+$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN)
$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS)
# Link test executable
@@ -109,4 +118,4 @@ test: $(TEST_PROGRAM)
./$(TEST_PROGRAM)
clean:
- rm -f nanopolish nanopolish_test $(CPP_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o
+ rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o
diff --git a/README.md b/README.md
index 64ecc46..ca30dce 100644
--- a/README.md
+++ b/README.md
@@ -18,37 +18,52 @@ make
This will automatically download and install libhdf5.
-## Brief usage instructions
+## Computing a new consensus sequence for a draft assembly
-The reads that are input into the HMM must be output as a ```.fa``` file by ```poretools```. This is important as ```poretools``` writes the path to the original ```.fast5``` file (containing the signal data) in the fasta header. These paths must be correct or nanopolish cannot find the events for each read. Let's say you have exported your reads to ```reads.fa``` and you want to polish ```draft.fa```. You should run:
+The reads that are input into nanopolish must be output as a ```.fa``` file by ```poretools```. This is important as ```poretools``` writes the path to the original ```.fast5``` file (containing the signal data) in the fasta header. These paths must be correct or nanopolish cannot find the events for each read. Let's say you have exported your reads to ```reads.fa``` and you want to polish ```draft.fa```. First we need to map the reads in base and event space to the draft assembly.
```
-make -f scripts/consensus.make READS=reads.fa ASSEMBLY=draft.fa
-```
+# Index the reference genome
+bwa index draft.fa
+
+# Align the reads in base space
+bwa mem -x ont2d -t 8 draft.fa reads.fa | samtools view -Sb - | samtools sort -f - reads.sorted.bam
+samtools index reads.sorted.bam
+
+# Copy the nanopolish model files into the working directory
+cp /path/to/nanopolish/etc/r9-models/* .
-This will map the reads to the assembly with ```bwa mem -x ont2d``` and export a file mapping read names to fast5 files.
+# Align the reads in event space
+nanopolish eventalign -t 8 --sam -r reads.fa -b reads.sorted.bam -g draft.fa --models nanopolish_models.fofn | samtools view -Sb - | samtools sort -f - reads.eventalign.sorted.bam
+samtools index reads.eventalign.sorted.bam
+```
-You can then run ```nanopolish consensus```. It is recommended that you run this in parallel.
+Now, we use nanopolish to compute the consensus sequence. We'll run this in parallel:
```
-python nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 nanopolish consensus -o nanopolish.{1}.fa -w {1} --r reads.pp.fa -b reads.pp.sorted.bam -g draft.fa -t 4
+python nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 \
+ nanopolish variants --consensus polished.{1}.fa -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -e reads.eventalign.sorted.bam -t 4 --min-candidate-frequency 0.1 --models nanopolish_models.fofn
```
-This command will run the consensus algorithm on eight 100kbp segments of the genome at a time, using 4 threads each. Change the ```-P``` and ```--threads``` options as appropriate for the machines you have available.
+This command will run the consensus algorithm on eight 10kbp segments of the genome at a time, using 4 threads each. Change the ```-P``` and ```--threads``` options as appropriate for the machines you have available.
After all polishing jobs are complete, you can merge the individual segments together into the final assembly:
```
-python nanopolish_merge.py draft.fa nanopolish.*.fa > polished.fa
+python nanopolish_merge.py polished.*.fa > polished_genome.fa
```
+## Fixing homopolymers
+
+Nanopolish 0.5 contains an experimental ```--fix-homopolymers``` option that uses event durations to improve the consensus accuracy around homopolymers. This option has only been tested on deep (>100X) data, where it gives a minor improvement in accuracy. It is left off by default until it has been tested further.
+
## To run using docker
First build the image from the dockerfile:
```
docker build .
```
-Note the uuid given upon successful build.
+Note the uuid given upon successful build.
Then you can run nanopolish from the image:
```
docker run -v /path/to/local/data/data/:/data/ -it :image_id ./nanopolish eventalign -r /data/reads.fa -b /data/alignments.sorted.bam -g /data/ref.fa
diff --git a/etc/r9-models/nanopolish_models.fofn b/etc/r9-models/nanopolish_models.fofn
new file mode 100644
index 0000000..5216622
--- /dev/null
+++ b/etc/r9-models/nanopolish_models.fofn
@@ -0,0 +1,2 @@
+template_median68pA.5mers.model
+template_median68pA.model
diff --git a/etc/r9-models/template_median68pA.5mers.model b/etc/r9-models/template_median68pA.5mers.model
new file mode 100644
index 0000000..7a39f6b
--- /dev/null
+++ b/etc/r9-models/template_median68pA.5mers.model
@@ -0,0 +1,1030 @@
+#model_name template_median68pA.model.dropmodel
+#type base
+#strand template
+#kit SQK007
+#derived_from template_median68pA.model
+kmer level_mean level_stdv sd_mean sd_stdv
+AAAAA 82.2094415908 1.75979722868 0.0 0.0 0.0
+AAAAC 73.7029044233 1.82602316065 0.0 0.0 0.0
+AAAAG 80.5417816472 1.69276634736 0.0 0.0 0.0
+AAAAT 72.729167775 1.72231116112 0.0 0.0 0.0
+AAACA 83.742353859 1.89564775404 0.0 0.0 0.0
+AAACC 80.6855181611 1.70714347958 0.0 0.0 0.0
+AAACG 82.7861794506 1.64402039648 0.0 0.0 0.0
+AAACT 79.5852670828 1.6531869263 0.0 0.0 0.0
+AAAGA 79.7377016246 2.11586469128 0.0 0.0 0.0
+AAAGC 75.7011539274 1.58675764043 0.0 0.0 0.0
+AAAGG 79.3192548374 1.61314858891 0.0 0.0 0.0
+AAAGT 73.5903805167 1.68996338752 0.0 0.0 0.0
+AAATA 78.1147869333 1.8558849245 0.0 0.0 0.0
+AAATC 72.1748729636 1.853982548 0.0 0.0 0.0
+AAATG 75.5299696688 1.81151536301 0.0 0.0 0.0
+AAATT 66.3003167375 2.04808982034 0.0 0.0 0.0
+AACAA 103.333766366 2.24728022641 0.0 0.0 0.0
+AACAC 96.2581091391 1.85525001062 0.0 0.0 0.0
+AACAG 100.91287416 2.13438489415 0.0 0.0 0.0
+AACAT 95.6044468796 2.05507997095 0.0 0.0 0.0
+AACCA 103.074087522 2.14793093135 0.0 0.0 0.0
+AACCC 100.091300723 1.87081317307 0.0 0.0 0.0
+AACCG 101.912856441 2.12322258545 0.0 0.0 0.0
+AACCT 98.8770655688 1.95222839373 0.0 0.0 0.0
+AACGA 98.0993576458 2.50828178325 0.0 0.0 0.0
+AACGC 95.4434020393 1.99931568056 0.0 0.0 0.0
+AACGG 98.5108993864 2.24509468472 0.0 0.0 0.0
+AACGT 94.7797247025 2.26817355889 0.0 0.0 0.0
+AACTA 96.3594525981 2.09340519216 0.0 0.0 0.0
+AACTC 89.3276825104 1.95165903759 0.0 0.0 0.0
+AACTG 93.6080009681 2.14624848096 0.0 0.0 0.0
+AACTT 84.2265884821 1.85554013132 0.0 0.0 0.0
+AAGAA 75.2881118442 2.27538346958 0.0 0.0 0.0
+AAGAC 65.520878874 2.08300701393 0.0 0.0 0.0
+AAGAG 74.2228859385 2.09453730482 0.0 0.0 0.0
+AAGAT 64.9515143354 1.78516619267 0.0 0.0 0.0
+AAGCA 82.2069108173 1.94723580859 0.0 0.0 0.0
+AAGCC 78.7895419866 1.65494083477 0.0 0.0 0.0
+AAGCG 79.9691733191 1.96089591144 0.0 0.0 0.0
+AAGCT 77.264532206 1.66336256418 0.0 0.0 0.0
+AAGGA 73.4160397148 2.77659794911 0.0 0.0 0.0
+AAGGC 69.711558387 1.731481133 0.0 0.0 0.0
+AAGGG 74.8215205963 1.93513755154 0.0 0.0 0.0
+AAGGT 68.1111177178 1.74206540271 0.0 0.0 0.0
+AAGTA 71.7886426618 2.20941330654 0.0 0.0 0.0
+AAGTC 65.1063229057 1.97063643836 0.0 0.0 0.0
+AAGTG 67.9123150264 2.27606320873 0.0 0.0 0.0
+AAGTT 59.2460579615 2.14227148939 0.0 0.0 0.0
+AATAA 107.09961863 2.87834932026 0.0 0.0 0.0
+AATAC 103.527918003 2.97812121016 0.0 0.0 0.0
+AATAG 105.641856064 3.04290223213 0.0 0.0 0.0
+AATAT 105.539487898 3.68259818873 0.0 0.0 0.0
+AATCA 115.425699603 2.72621747588 0.0 0.0 0.0
+AATCC 115.179995709 2.69272048882 0.0 0.0 0.0
+AATCG 115.419047553 2.95040517923 0.0 0.0 0.0
+AATCT 115.524546725 2.59273467376 0.0 0.0 0.0
+AATGA 103.293053074 3.04041578361 0.0 0.0 0.0
+AATGC 104.557462465 3.09014497119 0.0 0.0 0.0
+AATGG 104.729978865 3.04715495198 0.0 0.0 0.0
+AATGT 108.146707585 3.41105838379 0.0 0.0 0.0
+AATTA 102.379906336 2.53019609877 0.0 0.0 0.0
+AATTC 97.7768802649 2.20304453716 0.0 0.0 0.0
+AATTG 100.352322908 2.57635559402 0.0 0.0 0.0
+AATTT 93.6212543252 2.0245142873 0.0 0.0 0.0
+ACAAA 88.5507616466 2.69247592545 0.0 0.0 0.0
+ACAAC 80.1039876685 2.1189050025 0.0 0.0 0.0
+ACAAG 87.191735049 2.55265839764 0.0 0.0 0.0
+ACAAT 78.4620172943 2.22847694847 0.0 0.0 0.0
+ACACA 90.6478274892 1.59810821745 0.0 0.0 0.0
+ACACC 88.3490616687 1.43420011762 0.0 0.0 0.0
+ACACG 89.8420747375 1.61503481432 0.0 0.0 0.0
+ACACT 86.2729889178 1.74038366353 0.0 0.0 0.0
+ACAGA 83.1545793342 2.97991619369 0.0 0.0 0.0
+ACAGC 79.9163391229 1.975540458 0.0 0.0 0.0
+ACAGG 83.6684421848 2.3953584494 0.0 0.0 0.0
+ACAGT 76.0533186288 2.31907048707 0.0 0.0 0.0
+ACATA 84.6886033185 2.3426100684 0.0 0.0 0.0
+ACATC 78.880324668 2.14445948687 0.0 0.0 0.0
+ACATG 82.0578186175 2.3878974599 0.0 0.0 0.0
+ACATT 72.369949321 2.56305035089 0.0 0.0 0.0
+ACCAA 99.692627765 1.82728307716 0.0 0.0 0.0
+ACCAC 92.7137330607 1.71944646092 0.0 0.0 0.0
+ACCAG 97.8665220629 1.88200156337 0.0 0.0 0.0
+ACCAT 91.3636689217 1.84661097711 0.0 0.0 0.0
+ACCCA 98.5476960529 2.0054085344 0.0 0.0 0.0
+ACCCC 95.5835456017 1.71625005096 0.0 0.0 0.0
+ACCCG 97.43398287 1.83912693949 0.0 0.0 0.0
+ACCCT 94.1302351699 1.69613075525 0.0 0.0 0.0
+ACCGA 93.9137483023 2.43686545088 0.0 0.0 0.0
+ACCGC 91.0277295135 1.71062441373 0.0 0.0 0.0
+ACCGG 94.5493170386 1.97182013862 0.0 0.0 0.0
+ACCGT 88.8953691174 1.9369036782 0.0 0.0 0.0
+ACCTA 92.9864359155 2.11427756768 0.0 0.0 0.0
+ACCTC 87.3305741333 2.04281629279 0.0 0.0 0.0
+ACCTG 90.4058893777 2.18488752241 0.0 0.0 0.0
+ACCTT 81.7789533047 2.16484731784 0.0 0.0 0.0
+ACGAA 81.3950298505 2.47744841438 0.0 0.0 0.0
+ACGAC 72.3315401577 2.05293532962 0.0 0.0 0.0
+ACGAG 80.4940199304 2.37760499653 0.0 0.0 0.0
+ACGAT 70.785378344 1.95863519861 0.0 0.0 0.0
+ACGCA 86.1597340646 1.69832192553 0.0 0.0 0.0
+ACGCC 83.4445403755 1.61181426561 0.0 0.0 0.0
+ACGCG 84.3808951926 1.93287230113 0.0 0.0 0.0
+ACGCT 81.1219977032 1.79828231405 0.0 0.0 0.0
+ACGGA 79.8157864263 2.94960954325 0.0 0.0 0.0
+ACGGC 76.8813613966 1.9652301327 0.0 0.0 0.0
+ACGGG 81.7747817623 2.313154641 0.0 0.0 0.0
+ACGGT 74.468979741 2.27124491113 0.0 0.0 0.0
+ACGTA 77.236259635 2.45396554013 0.0 0.0 0.0
+ACGTC 71.2356270619 2.10372591351 0.0 0.0 0.0
+ACGTG 73.8337258747 2.7450268771 0.0 0.0 0.0
+ACGTT 64.8665480264 2.42151112595 0.0 0.0 0.0
+ACTAA 105.573607673 2.25029589118 0.0 0.0 0.0
+ACTAC 102.237735197 1.99812824561 0.0 0.0 0.0
+ACTAG 103.53103067 2.0194024405 0.0 0.0 0.0
+ACTAT 102.829201704 2.51090226724 0.0 0.0 0.0
+ACTCA 110.764486382 2.17837913388 0.0 0.0 0.0
+ACTCC 110.092964679 2.26305036869 0.0 0.0 0.0
+ACTCG 110.39520472 2.35615327653 0.0 0.0 0.0
+ACTCT 110.072955613 2.27441975121 0.0 0.0 0.0
+ACTGA 100.618160629 2.40727835975 0.0 0.0 0.0
+ACTGC 100.394914243 2.13717409564 0.0 0.0 0.0
+ACTGG 101.43452411 2.33885996256 0.0 0.0 0.0
+ACTGT 101.40145933 2.48981830586 0.0 0.0 0.0
+ACTTA 98.9632902722 2.23238368831 0.0 0.0 0.0
+ACTTC 94.9670120893 1.80747700966 0.0 0.0 0.0
+ACTTG 97.0212787497 2.26572736559 0.0 0.0 0.0
+ACTTT 91.0044085919 1.96560750201 0.0 0.0 0.0
+AGAAA 75.6684034766 2.06368518208 0.0 0.0 0.0
+AGAAC 66.0683425748 2.04449181603 0.0 0.0 0.0
+AGAAG 74.0411642842 1.78116452179 0.0 0.0 0.0
+AGAAT 65.8530451447 1.85877621472 0.0 0.0 0.0
+AGACA 80.3285704766 2.15577403058 0.0 0.0 0.0
+AGACC 76.8216281829 2.11732622988 0.0 0.0 0.0
+AGACG 79.2296346396 1.87456294849 0.0 0.0 0.0
+AGACT 76.1322834323 1.79149794696 0.0 0.0 0.0
+AGAGA 74.9411870804 1.91350161129 0.0 0.0 0.0
+AGAGC 69.5476919258 1.6881297257 0.0 0.0 0.0
+AGAGG 74.9596086023 1.48151751649 0.0 0.0 0.0
+AGAGT 69.1447607955 1.60802041446 0.0 0.0 0.0
+AGATA 73.5163292092 2.06765164424 0.0 0.0 0.0
+AGATC 67.5932274385 2.15797666429 0.0 0.0 0.0
+AGATG 70.8683167717 2.03613133069 0.0 0.0 0.0
+AGATT 62.690625879 1.94337774895 0.0 0.0 0.0
+AGCAA 101.011718878 2.23540091464 0.0 0.0 0.0
+AGCAC 93.0438584556 1.77181958307 0.0 0.0 0.0
+AGCAG 98.3720654154 2.10694827642 0.0 0.0 0.0
+AGCAT 92.6838863744 1.96628709091 0.0 0.0 0.0
+AGCCA 101.573416245 2.17219461102 0.0 0.0 0.0
+AGCCC 98.6895723955 1.81793575284 0.0 0.0 0.0
+AGCCG 100.323183426 2.1615095806 0.0 0.0 0.0
+AGCCT 97.4618008789 1.95296628432 0.0 0.0 0.0
+AGCGA 96.3785781723 2.49079534288 0.0 0.0 0.0
+AGCGC 92.7347814513 1.89064960735 0.0 0.0 0.0
+AGCGG 96.7598654841 2.21124233552 0.0 0.0 0.0
+AGCGT 92.9331810625 2.25221299195 0.0 0.0 0.0
+AGCTA 94.5934413452 2.15476355558 0.0 0.0 0.0
+AGCTC 87.5288092724 1.9476007846 0.0 0.0 0.0
+AGCTG 91.8052117703 2.18276705765 0.0 0.0 0.0
+AGCTT 82.6602028274 1.98247762333 0.0 0.0 0.0
+AGGAA 68.3326317685 2.1326350693 0.0 0.0 0.0
+AGGAC 57.5400012471 2.11575677746 0.0 0.0 0.0
+AGGAG 66.6987647521 1.77205090251 0.0 0.0 0.0
+AGGAT 57.7358669351 1.71797289323 0.0 0.0 0.0
+AGGCA 78.9921829805 1.98422432978 0.0 0.0 0.0
+AGGCC 75.664524067 1.72643070576 0.0 0.0 0.0
+AGGCG 77.4866119211 1.8044754731 0.0 0.0 0.0
+AGGCT 74.2735904149 1.68921835817 0.0 0.0 0.0
+AGGGA 69.9763792932 2.37479056384 0.0 0.0 0.0
+AGGGC 64.981133341 1.85418834507 0.0 0.0 0.0
+AGGGG 71.3759136027 1.6107809254 0.0 0.0 0.0
+AGGGT 64.3024543399 1.66954837158 0.0 0.0 0.0
+AGGTA 68.3782509531 2.21782165352 0.0 0.0 0.0
+AGGTC 61.928675383 2.01481431464 0.0 0.0 0.0
+AGGTG 64.6832508367 2.11518837003 0.0 0.0 0.0
+AGGTT 57.0383725298 2.13563739541 0.0 0.0 0.0
+AGTAA 103.446660668 2.64574435081 0.0 0.0 0.0
+AGTAC 97.9678350016 2.6059468728 0.0 0.0 0.0
+AGTAG 101.395634663 2.59101559621 0.0 0.0 0.0
+AGTAT 100.7347633 3.1723036331 0.0 0.0 0.0
+AGTCA 111.617842031 2.75888428602 0.0 0.0 0.0
+AGTCC 110.467429664 2.72620668595 0.0 0.0 0.0
+AGTCG 111.166405242 2.74680535941 0.0 0.0 0.0
+AGTCT 111.090025806 2.70838157019 0.0 0.0 0.0
+AGTGA 100.425731951 2.7654742722 0.0 0.0 0.0
+AGTGC 100.531141847 2.80920135014 0.0 0.0 0.0
+AGTGG 101.927343982 2.81968713882 0.0 0.0 0.0
+AGTGT 104.232287104 3.17630410994 0.0 0.0 0.0
+AGTTA 98.6589296865 2.47125037915 0.0 0.0 0.0
+AGTTC 93.5797860074 2.26748755379 0.0 0.0 0.0
+AGTTG 96.5527417307 2.61021821083 0.0 0.0 0.0
+AGTTT 89.7304587817 1.85375208896 0.0 0.0 0.0
+ATAAA 89.9383351041 2.33283053352 0.0 0.0 0.0
+ATAAC 81.7088492823 2.3567755164 0.0 0.0 0.0
+ATAAG 88.6922116753 2.45099302282 0.0 0.0 0.0
+ATAAT 80.7017871241 2.59412091116 0.0 0.0 0.0
+ATACA 90.8915849936 1.53693404957 0.0 0.0 0.0
+ATACC 88.9444437883 1.54262000814 0.0 0.0 0.0
+ATACG 89.9397180037 1.63220496825 0.0 0.0 0.0
+ATACT 87.4587856084 1.66648490377 0.0 0.0 0.0
+ATAGA 86.8770002273 2.70051525325 0.0 0.0 0.0
+ATAGC 83.8197395681 3.34095040423 0.0 0.0 0.0
+ATAGG 87.3670730867 2.47721840778 0.0 0.0 0.0
+ATAGT 81.3892041413 2.70396222459 0.0 0.0 0.0
+ATATA 86.1669750156 2.12550330484 0.0 0.0 0.0
+ATATC 82.3764182934 2.35464961948 0.0 0.0 0.0
+ATATG 84.0778544678 2.39191386484 0.0 0.0 0.0
+ATATT 76.8130615577 2.98829125444 0.0 0.0 0.0
+ATCAA 100.589739537 2.09860848061 0.0 0.0 0.0
+ATCAC 94.8090796233 2.25097459402 0.0 0.0 0.0
+ATCAG 99.3951597265 2.20839009666 0.0 0.0 0.0
+ATCAT 93.0285654079 2.64639781733 0.0 0.0 0.0
+ATCCA 99.2224760705 2.05866602093 0.0 0.0 0.0
+ATCCC 96.7537789138 1.82632913627 0.0 0.0 0.0
+ATCCG 98.4502421003 2.03800680138 0.0 0.0 0.0
+ATCCT 95.4409894905 2.260704828 0.0 0.0 0.0
+ATCGA 96.9813400979 2.69079497701 0.0 0.0 0.0
+ATCGC 93.9367380394 2.46515032845 0.0 0.0 0.0
+ATCGG 97.4594861826 2.3607976133 0.0 0.0 0.0
+ATCGT 91.6356994458 4.1546782781 0.0 0.0 0.0
+ATCTA 94.2940943789 3.57531002297 0.0 0.0 0.0
+ATCTC 90.2567158487 3.28492671715 0.0 0.0 0.0
+ATCTG 92.5232037076 2.44045829941 0.0 0.0 0.0
+ATCTT 85.4688803749 3.55802970976 0.0 0.0 0.0
+ATGAA 83.3972699977 3.18746390219 0.0 0.0 0.0
+ATGAC 74.1545537649 2.56519404384 0.0 0.0 0.0
+ATGAG 83.2779145786 3.77880856948 0.0 0.0 0.0
+ATGAT 73.3512936272 2.76193201446 0.0 0.0 0.0
+ATGCA 88.938200653 1.72394221144 0.0 0.0 0.0
+ATGCC 86.7955828063 1.84839979212 0.0 0.0 0.0
+ATGCG 87.8913754268 1.89270416264 0.0 0.0 0.0
+ATGCT 85.006949716 1.96339789839 0.0 0.0 0.0
+ATGGA 85.5404281201 3.34967065398 0.0 0.0 0.0
+ATGGC 82.2681638484 2.89812929349 0.0 0.0 0.0
+ATGGG 86.9268009854 2.60847440086 0.0 0.0 0.0
+ATGGT 80.9985001434 3.37728598385 0.0 0.0 0.0
+ATGTA 82.804634736 2.60556797526 0.0 0.0 0.0
+ATGTC 79.4871244072 2.96062489778 0.0 0.0 0.0
+ATGTG 80.5481607214 3.04186086884 0.0 0.0 0.0
+ATGTT 73.759183776 3.2661407665 0.0 0.0 0.0
+ATTAA 103.535595706 1.836683406 0.0 0.0 0.0
+ATTAC 99.244435282 1.74946672832 0.0 0.0 0.0
+ATTAG 101.93386027 1.75098246385 0.0 0.0 0.0
+ATTAT 99.4514414439 2.17061351917 0.0 0.0 0.0
+ATTCA 106.119726434 2.10093305349 0.0 0.0 0.0
+ATTCC 104.718970569 2.08409544538 0.0 0.0 0.0
+ATTCG 105.569243666 2.1665148132 0.0 0.0 0.0
+ATTCT 105.331383572 1.99075447033 0.0 0.0 0.0
+ATTGA 99.3556471398 2.11390937961 0.0 0.0 0.0
+ATTGC 98.2859918514 1.77426074996 0.0 0.0 0.0
+ATTGG 99.9995454171 1.84068460006 0.0 0.0 0.0
+ATTGT 99.1174220321 2.08324737067 0.0 0.0 0.0
+ATTTA 96.613718553 2.01937254915 0.0 0.0 0.0
+ATTTC 92.5408097581 1.62147481703 0.0 0.0 0.0
+ATTTG 95.0457803803 2.08465942553 0.0 0.0 0.0
+ATTTT 89.0170103802 1.83958379813 0.0 0.0 0.0
+CAAAA 82.2618327694 1.95810869046 0.0 0.0 0.0
+CAAAC 73.2880647039 1.90805489629 0.0 0.0 0.0
+CAAAG 80.436873088 1.7588892838 0.0 0.0 0.0
+CAAAT 72.1653355724 1.76806317219 0.0 0.0 0.0
+CAACA 84.4995670022 1.67165575243 0.0 0.0 0.0
+CAACC 81.5679674847 1.56734328981 0.0 0.0 0.0
+CAACG 83.5273730693 1.59056624457 0.0 0.0 0.0
+CAACT 80.4162625397 1.56247219068 0.0 0.0 0.0
+CAAGA 79.8634933703 2.30074378917 0.0 0.0 0.0
+CAAGC 75.4220791792 1.56954981135 0.0 0.0 0.0
+CAAGG 79.4683708398 1.82345492112 0.0 0.0 0.0
+CAAGT 73.3038530854 1.69180153292 0.0 0.0 0.0
+CAATA 78.2640513405 1.84660414073 0.0 0.0 0.0
+CAATC 72.0075726569 1.8282248012 0.0 0.0 0.0
+CAATG 75.639555689 1.83532503715 0.0 0.0 0.0
+CAATT 66.4209626351 2.18630977127 0.0 0.0 0.0
+CACAA 101.910835003 2.12774278902 0.0 0.0 0.0
+CACAC 94.5330570986 1.55104025962 0.0 0.0 0.0
+CACAG 99.1998454323 1.85872404925 0.0 0.0 0.0
+CACAT 93.9097936973 1.71101478544 0.0 0.0 0.0
+CACCA 101.909527268 2.03650873047 0.0 0.0 0.0
+CACCC 98.682356222 1.84924889924 0.0 0.0 0.0
+CACCG 100.466364877 2.03501989315 0.0 0.0 0.0
+CACCT 97.4417149492 1.85955200527 0.0 0.0 0.0
+CACGA 95.8907676656 2.33179707715 0.0 0.0 0.0
+CACGC 92.9340683552 1.65095935411 0.0 0.0 0.0
+CACGG 96.1164366759 2.09816561464 0.0 0.0 0.0
+CACGT 92.2197451279 1.83074252096 0.0 0.0 0.0
+CACTA 95.0843151276 2.2647853373 0.0 0.0 0.0
+CACTC 87.9866025081 1.96708986629 0.0 0.0 0.0
+CACTG 91.6663757694 2.25947235462 0.0 0.0 0.0
+CACTT 82.7674471859 1.85798997761 0.0 0.0 0.0
+CAGAA 73.9566766468 1.90964852123 0.0 0.0 0.0
+CAGAC 64.129219802 2.11392101865 0.0 0.0 0.0
+CAGAG 72.4784121407 1.88551767858 0.0 0.0 0.0
+CAGAT 63.7251924383 1.7788178303 0.0 0.0 0.0
+CAGCA 81.9532607599 1.81528828319 0.0 0.0 0.0
+CAGCC 78.5742452818 1.52266848073 0.0 0.0 0.0
+CAGCG 80.113974538 1.65871337524 0.0 0.0 0.0
+CAGCT 77.151604925 1.52748941071 0.0 0.0 0.0
+CAGGA 72.6181183832 2.58189270703 0.0 0.0 0.0
+CAGGC 68.8513412199 1.73127004321 0.0 0.0 0.0
+CAGGG 73.7435630132 1.80753244712 0.0 0.0 0.0
+CAGGT 67.4008908304 1.65240689043 0.0 0.0 0.0
+CAGTA 71.5845975803 2.13398976586 0.0 0.0 0.0
+CAGTC 65.1420041986 2.02707055963 0.0 0.0 0.0
+CAGTG 68.0966621868 2.23837990402 0.0 0.0 0.0
+CAGTT 59.4087944324 2.30709385867 0.0 0.0 0.0
+CATAA 104.067099878 2.52091533401 0.0 0.0 0.0
+CATAC 100.239285402 2.33991794227 0.0 0.0 0.0
+CATAG 102.097400135 2.50136155634 0.0 0.0 0.0
+CATAT 102.331541119 2.95974789676 0.0 0.0 0.0
+CATCA 113.318816053 2.53208809081 0.0 0.0 0.0
+CATCC 112.645260629 2.66417215652 0.0 0.0 0.0
+CATCG 112.632693943 2.85231277782 0.0 0.0 0.0
+CATCT 113.388387928 2.38361440936 0.0 0.0 0.0
+CATGA 100.157567761 3.01081399985 0.0 0.0 0.0
+CATGC 100.86424486 2.68999351924 0.0 0.0 0.0
+CATGG 101.158199825 2.84799360378 0.0 0.0 0.0
+CATGT 104.178032766 2.99219040914 0.0 0.0 0.0
+CATTA 99.3273339077 2.52985673257 0.0 0.0 0.0
+CATTC 93.9474486945 3.51582956044 0.0 0.0 0.0
+CATTG 97.0878503761 2.80026814288 0.0 0.0 0.0
+CATTT 90.718112578 2.00892144879 0.0 0.0 0.0
+CCAAA 88.2480098829 2.50745092549 0.0 0.0 0.0
+CCAAC 79.7639261998 1.98661416033 0.0 0.0 0.0
+CCAAG 86.8024607417 2.30168352099 0.0 0.0 0.0
+CCAAT 78.2399692889 2.08573995237 0.0 0.0 0.0
+CCACA 91.3557388624 1.73234469027 0.0 0.0 0.0
+CCACC 88.8089988001 1.44715196555 0.0 0.0 0.0
+CCACG 90.4637605676 1.76768863991 0.0 0.0 0.0
+CCACT 86.6867977276 1.63954605017 0.0 0.0 0.0
+CCAGA 83.2645695255 2.91641436134 0.0 0.0 0.0
+CCAGC 79.8333646883 1.99065458014 0.0 0.0 0.0
+CCAGG 83.6414330347 2.38845257555 0.0 0.0 0.0
+CCAGT 76.0161719747 2.23593263889 0.0 0.0 0.0
+CCATA 84.8726612074 3.30517911401 0.0 0.0 0.0
+CCATC 78.5984564873 2.08071555447 0.0 0.0 0.0
+CCATG 82.0163871653 2.4766801604 0.0 0.0 0.0
+CCATT 72.297249081 2.47122310664 0.0 0.0 0.0
+CCCAA 100.746995804 1.64931971053 0.0 0.0 0.0
+CCCAC 93.6629344834 1.79947977969 0.0 0.0 0.0
+CCCAG 98.5248603939 1.77063011385 0.0 0.0 0.0
+CCCAT 92.0872141205 1.78940381034 0.0 0.0 0.0
+CCCCA 99.7404279439 1.96979331261 0.0 0.0 0.0
+CCCCC 96.933432755 1.7408905911 0.0 0.0 0.0
+CCCCG 98.678579761 1.81933554941 0.0 0.0 0.0
+CCCCT 95.2614225857 1.53247951669 0.0 0.0 0.0
+CCCGA 94.5511180725 2.29902472851 0.0 0.0 0.0
+CCCGC 91.7613200012 1.71331535173 0.0 0.0 0.0
+CCCGG 95.0773926742 1.88279083673 0.0 0.0 0.0
+CCCGT 89.667345608 1.87139311432 0.0 0.0 0.0
+CCCTA 94.4292738436 2.95856231568 0.0 0.0 0.0
+CCCTC 88.2856122931 2.04038070821 0.0 0.0 0.0
+CCCTG 91.1836814557 1.98701504191 0.0 0.0 0.0
+CCCTT 82.7449396252 2.23016453027 0.0 0.0 0.0
+CCGAA 80.8062068271 2.20115220004 0.0 0.0 0.0
+CCGAC 71.7532616585 1.8351282799 0.0 0.0 0.0
+CCGAG 79.6018428188 2.12418001944 0.0 0.0 0.0
+CCGAT 70.2476455221 1.75361890923 0.0 0.0 0.0
+CCGCA 86.8972066543 1.72406950534 0.0 0.0 0.0
+CCGCC 83.9693850204 1.57016690118 0.0 0.0 0.0
+CCGCG 85.0590883139 1.93517070335 0.0 0.0 0.0
+CCGCT 81.6836596785 1.74804208698 0.0 0.0 0.0
+CCGGA 79.4965937925 2.75304404631 0.0 0.0 0.0
+CCGGC 76.405525041 1.78060638961 0.0 0.0 0.0
+CCGGG 81.2099357628 2.01686543994 0.0 0.0 0.0
+CCGGT 73.9674238507 2.15898455534 0.0 0.0 0.0
+CCGTA 77.2186467449 2.37215852536 0.0 0.0 0.0
+CCGTC 70.9769371331 2.16585705758 0.0 0.0 0.0
+CCGTG 73.5674918747 2.5274095551 0.0 0.0 0.0
+CCGTT 64.6851681041 2.43052408414 0.0 0.0 0.0
+CCTAA 107.726185179 2.37355290634 0.0 0.0 0.0
+CCTAC 104.335694625 2.31888943418 0.0 0.0 0.0
+CCTAG 104.980273848 3.07362520075 0.0 0.0 0.0
+CCTAT 104.976228633 2.69403084327 0.0 0.0 0.0
+CCTCA 113.335164917 2.36711122077 0.0 0.0 0.0
+CCTCC 112.706029413 2.21492275908 0.0 0.0 0.0
+CCTCG 112.814945375 2.4328855104 0.0 0.0 0.0
+CCTCT 112.535900997 2.23976195826 0.0 0.0 0.0
+CCTGA 102.871965991 2.5983609078 0.0 0.0 0.0
+CCTGC 102.837908642 2.37850649148 0.0 0.0 0.0
+CCTGG 103.913260843 2.49766113912 0.0 0.0 0.0
+CCTGT 103.953977004 2.74194420848 0.0 0.0 0.0
+CCTTA 101.348508317 2.28109904009 0.0 0.0 0.0
+CCTTC 96.6937275735 1.81567907784 0.0 0.0 0.0
+CCTTG 99.4928775908 2.32659604088 0.0 0.0 0.0
+CCTTT 92.9762087634 1.90522395264 0.0 0.0 0.0
+CGAAA 76.2834522654 1.87553804698 0.0 0.0 0.0
+CGAAC 66.2889148133 2.13652421619 0.0 0.0 0.0
+CGAAG 74.1921035895 1.70702899977 0.0 0.0 0.0
+CGAAT 65.6865622294 1.85950734008 0.0 0.0 0.0
+CGACA 81.5645707012 2.01396100181 0.0 0.0 0.0
+CGACC 78.2552129861 1.79345032026 0.0 0.0 0.0
+CGACG 80.4466481702 1.8581070838 0.0 0.0 0.0
+CGACT 77.3958503491 1.6910684736 0.0 0.0 0.0
+CGAGA 75.5266231171 2.15792390699 0.0 0.0 0.0
+CGAGC 70.2729111477 1.70468094557 0.0 0.0 0.0
+CGAGG 75.6193187263 1.58853696161 0.0 0.0 0.0
+CGAGT 69.367681863 1.61248665511 0.0 0.0 0.0
+CGATA 74.1794010098 2.08969343237 0.0 0.0 0.0
+CGATC 67.8517255224 1.97198006215 0.0 0.0 0.0
+CGATG 71.2787910067 1.97800997289 0.0 0.0 0.0
+CGATT 62.8687697713 2.19128889788 0.0 0.0 0.0
+CGCAA 101.720013209 2.21796023016 0.0 0.0 0.0
+CGCAC 93.3171321021 1.7444202802 0.0 0.0 0.0
+CGCAG 98.9959208039 2.14614822758 0.0 0.0 0.0
+CGCAT 92.9328391188 1.87168069369 0.0 0.0 0.0
+CGCCA 102.442298883 2.11547132594 0.0 0.0 0.0
+CGCCC 99.2932841794 1.87027742877 0.0 0.0 0.0
+CGCCG 101.077473744 2.08380696501 0.0 0.0 0.0
+CGCCT 98.0752079436 1.95136852 0.0 0.0 0.0
+CGCGA 96.6769828832 2.54998150287 0.0 0.0 0.0
+CGCGC 92.7079702521 1.81785016589 0.0 0.0 0.0
+CGCGG 96.8259034762 2.20593401374 0.0 0.0 0.0
+CGCGT 92.6090673178 2.03511287943 0.0 0.0 0.0
+CGCTA 95.2850611549 2.2314798643 0.0 0.0 0.0
+CGCTC 87.5974797134 2.08992522986 0.0 0.0 0.0
+CGCTG 92.167263085 2.19863163689 0.0 0.0 0.0
+CGCTT 82.8599228486 1.88649518889 0.0 0.0 0.0
+CGGAA 68.8822170323 1.97137697776 0.0 0.0 0.0
+CGGAC 58.195202509 2.27877257992 0.0 0.0 0.0
+CGGAG 67.0289188638 1.88763181021 0.0 0.0 0.0
+CGGAT 58.0726196609 1.84595741618 0.0 0.0 0.0
+CGGCA 80.8204810613 1.86925104242 0.0 0.0 0.0
+CGGCC 77.5451712973 1.6540004769 0.0 0.0 0.0
+CGGCG 79.271041235 1.78892901739 0.0 0.0 0.0
+CGGCT 76.0681236913 1.58301492812 0.0 0.0 0.0
+CGGGA 71.2439269914 2.43649022894 0.0 0.0 0.0
+CGGGC 66.3575303556 1.86565304283 0.0 0.0 0.0
+CGGGG 72.7054743856 1.79915023263 0.0 0.0 0.0
+CGGGT 65.3926332881 1.69240476221 0.0 0.0 0.0
+CGGTA 69.8206531487 2.30068323026 0.0 0.0 0.0
+CGGTC 63.2587472432 2.31419847927 0.0 0.0 0.0
+CGGTG 66.0412617879 2.30540830253 0.0 0.0 0.0
+CGGTT 58.0920794731 2.53941300112 0.0 0.0 0.0
+CGTAA 103.353619676 2.61490191522 0.0 0.0 0.0
+CGTAC 98.1743401253 2.381010133 0.0 0.0 0.0
+CGTAG 101.14280283 2.58945040166 0.0 0.0 0.0
+CGTAT 100.2505703 3.07338876111 0.0 0.0 0.0
+CGTCA 112.229537379 2.65178569603 0.0 0.0 0.0
+CGTCC 111.054513206 2.60436467465 0.0 0.0 0.0
+CGTCG 111.717453334 2.82698320453 0.0 0.0 0.0
+CGTCT 111.721101825 2.47579564865 0.0 0.0 0.0
+CGTGA 99.9800071495 2.85484332315 0.0 0.0 0.0
+CGTGC 100.30015182 2.68939466302 0.0 0.0 0.0
+CGTGG 101.448847444 2.87617394987 0.0 0.0 0.0
+CGTGT 103.533713995 3.07591897436 0.0 0.0 0.0
+CGTTA 98.4608422838 2.86734012888 0.0 0.0 0.0
+CGTTC 92.9673481974 2.32518931855 0.0 0.0 0.0
+CGTTG 95.6885029243 2.71662081994 0.0 0.0 0.0
+CGTTT 88.9777058922 2.0844424327 0.0 0.0 0.0
+CTAAA 89.9393226309 2.32490403426 0.0 0.0 0.0
+CTAAC 81.8093367516 2.28634911127 0.0 0.0 0.0
+CTAAG 88.741406952 2.44090491628 0.0 0.0 0.0
+CTAAT 80.8520403281 2.50692072508 0.0 0.0 0.0
+CTACA 91.5993772801 1.67489141187 0.0 0.0 0.0
+CTACC 89.6158262552 1.60434486661 0.0 0.0 0.0
+CTACG 90.6118163087 1.66008710163 0.0 0.0 0.0
+CTACT 88.0697885715 1.69639442108 0.0 0.0 0.0
+CTAGA 87.1673750389 2.49045098178 0.0 0.0 0.0
+CTAGC 83.3645904505 2.19317689285 0.0 0.0 0.0
+CTAGG 87.6529843162 2.61188653664 0.0 0.0 0.0
+CTAGT 81.1312088352 2.56575440967 0.0 0.0 0.0
+CTATA 86.738974501 2.69296257082 0.0 0.0 0.0
+CTATC 83.1199622124 2.52438276463 0.0 0.0 0.0
+CTATG 84.710227557 2.63419342608 0.0 0.0 0.0
+CTATT 77.8404923629 2.98605772798 0.0 0.0 0.0
+CTCAA 99.5582591471 1.97422464807 0.0 0.0 0.0
+CTCAC 94.2679442878 2.09006482453 0.0 0.0 0.0
+CTCAG 98.1255734186 2.03767472286 0.0 0.0 0.0
+CTCAT 92.7286054527 2.1449477483 0.0 0.0 0.0
+CTCCA 98.1067331421 1.97188139995 0.0 0.0 0.0
+CTCCC 95.696345802 1.71014922468 0.0 0.0 0.0
+CTCCG 97.4401843822 1.99934830924 0.0 0.0 0.0
+CTCCT 94.8070042606 2.01934477762 0.0 0.0 0.0
+CTCGA 96.0396992647 2.60341680058 0.0 0.0 0.0
+CTCGC 92.8585373079 2.13420565477 0.0 0.0 0.0
+CTCGG 96.249131681 2.1277707269 0.0 0.0 0.0
+CTCGT 91.1847409527 2.29214940896 0.0 0.0 0.0
+CTCTA 93.5958357486 2.20376720004 0.0 0.0 0.0
+CTCTC 89.8920272555 2.58363469093 0.0 0.0 0.0
+CTCTG 91.9824993506 2.29191010777 0.0 0.0 0.0
+CTCTT 84.9507575254 2.47475721204 0.0 0.0 0.0
+CTGAA 82.3944411921 2.64547550257 0.0 0.0 0.0
+CTGAC 73.8076850895 2.28318687455 0.0 0.0 0.0
+CTGAG 82.0570093834 2.60583368751 0.0 0.0 0.0
+CTGAT 72.841404179 2.42831157057 0.0 0.0 0.0
+CTGCA 88.2919792812 1.84786642446 0.0 0.0 0.0
+CTGCC 86.0127822901 1.78653120758 0.0 0.0 0.0
+CTGCG 87.1615774014 1.86364479323 0.0 0.0 0.0
+CTGCT 84.5088175241 1.8552694988 0.0 0.0 0.0
+CTGGA 84.1373981021 2.82759921438 0.0 0.0 0.0
+CTGGC 80.4319435449 2.44869782106 0.0 0.0 0.0
+CTGGG 85.3173828437 2.38363573141 0.0 0.0 0.0
+CTGGT 79.3800550925 2.89864787852 0.0 0.0 0.0
+CTGTA 81.9387592296 2.57499827777 0.0 0.0 0.0
+CTGTC 78.3645602433 2.69894013498 0.0 0.0 0.0
+CTGTG 79.6711398015 2.75923440897 0.0 0.0 0.0
+CTGTT 73.3376000679 3.17878717765 0.0 0.0 0.0
+CTTAA 101.294314407 1.91738694513 0.0 0.0 0.0
+CTTAC 97.6998858132 1.85023826433 0.0 0.0 0.0
+CTTAG 99.3202187769 1.86165967364 0.0 0.0 0.0
+CTTAT 98.4405509387 2.20098535078 0.0 0.0 0.0
+CTTCA 104.225285498 2.62167451552 0.0 0.0 0.0
+CTTCC 102.727075336 2.61022808124 0.0 0.0 0.0
+CTTCG 103.325918261 2.53256896443 0.0 0.0 0.0
+CTTCT 103.598205221 2.3613828637 0.0 0.0 0.0
+CTTGA 96.9245442215 2.11915829661 0.0 0.0 0.0
+CTTGC 96.0326060178 1.99046278098 0.0 0.0 0.0
+CTTGG 97.3538439928 1.9655612276 0.0 0.0 0.0
+CTTGT 97.4634154517 2.19354217086 0.0 0.0 0.0
+CTTTA 94.5295227155 2.47565176134 0.0 0.0 0.0
+CTTTC 90.6332579172 1.81202042863 0.0 0.0 0.0
+CTTTG 92.7284967597 2.37496776288 0.0 0.0 0.0
+CTTTT 87.8278652733 1.6735819591 0.0 0.0 0.0
+GAAAA 80.2336868486 1.59917571277 0.0 0.0 0.0
+GAAAC 71.1291733858 1.77367260351 0.0 0.0 0.0
+GAAAG 78.1732339417 1.70230791392 0.0 0.0 0.0
+GAAAT 70.812995333 1.63727004353 0.0 0.0 0.0
+GAACA 83.0842940507 2.03697771248 0.0 0.0 0.0
+GAACC 79.8827115529 1.80261959358 0.0 0.0 0.0
+GAACG 81.9364266611 1.88667523872 0.0 0.0 0.0
+GAACT 79.1374998434 1.82214138805 0.0 0.0 0.0
+GAAGA 78.1946262446 1.99038812226 0.0 0.0 0.0
+GAAGC 73.637429027 1.62989461463 0.0 0.0 0.0
+GAAGG 77.7998049792 1.58116142132 0.0 0.0 0.0
+GAAGT 72.1786250533 1.55450763144 0.0 0.0 0.0
+GAATA 76.4491876327 2.03836663522 0.0 0.0 0.0
+GAATC 70.1153381908 2.13364374882 0.0 0.0 0.0
+GAATG 73.6478553601 1.94535527812 0.0 0.0 0.0
+GAATT 65.1971592421 1.8424058391 0.0 0.0 0.0
+GACAA 102.091278017 2.22889591264 0.0 0.0 0.0
+GACAC 94.5658402256 1.7183826646 0.0 0.0 0.0
+GACAG 99.3130197554 2.21187379926 0.0 0.0 0.0
+GACAT 93.8675570337 2.05699434061 0.0 0.0 0.0
+GACCA 101.981911057 2.10847849482 0.0 0.0 0.0
+GACCC 98.7932159256 2.00806844158 0.0 0.0 0.0
+GACCG 100.585829939 2.05600150883 0.0 0.0 0.0
+GACCT 97.5884810718 1.9080470598 0.0 0.0 0.0
+GACGA 97.1455293213 2.66194652083 0.0 0.0 0.0
+GACGC 94.0352742606 1.99203795012 0.0 0.0 0.0
+GACGG 97.4710789326 2.34641875326 0.0 0.0 0.0
+GACGT 93.4758647057 2.26767801834 0.0 0.0 0.0
+GACTA 95.444153834 2.36762588489 0.0 0.0 0.0
+GACTC 88.1077380548 1.9130223614 0.0 0.0 0.0
+GACTG 92.4682457561 2.23201618327 0.0 0.0 0.0
+GACTT 82.9498271776 1.99616805545 0.0 0.0 0.0
+GAGAA 72.5755290592 2.19679704127 0.0 0.0 0.0
+GAGAC 61.982483151 2.32956621172 0.0 0.0 0.0
+GAGAG 70.8959789932 1.99763143796 0.0 0.0 0.0
+GAGAT 62.1051465493 1.96074855008 0.0 0.0 0.0
+GAGCA 80.9974242406 2.04328606612 0.0 0.0 0.0
+GAGCC 77.5189806023 1.83049353616 0.0 0.0 0.0
+GAGCG 78.8136589666 1.96392134063 0.0 0.0 0.0
+GAGCT 75.9937882754 1.65469280952 0.0 0.0 0.0
+GAGGA 72.005816334 2.54594878066 0.0 0.0 0.0
+GAGGC 67.7395636769 1.80731420778 0.0 0.0 0.0
+GAGGG 73.4098453471 1.68519627853 0.0 0.0 0.0
+GAGGT 66.560328996 1.7030607386 0.0 0.0 0.0
+GAGTA 70.0259881907 2.11967768833 0.0 0.0 0.0
+GAGTC 63.3542590509 1.99883648442 0.0 0.0 0.0
+GAGTG 66.2651936724 2.11311588136 0.0 0.0 0.0
+GAGTT 57.6579708488 2.06841933295 0.0 0.0 0.0
+GATAA 106.334053724 2.64802975187 0.0 0.0 0.0
+GATAC 102.412630566 2.53077158419 0.0 0.0 0.0
+GATAG 104.405862769 2.50539956477 0.0 0.0 0.0
+GATAT 104.558634188 3.10078206154 0.0 0.0 0.0
+GATCA 113.590224694 2.60404404791 0.0 0.0 0.0
+GATCC 113.286134788 2.61237001578 0.0 0.0 0.0
+GATCG 113.53457595 2.76169884824 0.0 0.0 0.0
+GATCT 113.583955234 2.49546200612 0.0 0.0 0.0
+GATGA 102.670793081 2.72673281558 0.0 0.0 0.0
+GATGC 103.476932914 2.68287758909 0.0 0.0 0.0
+GATGG 104.002422402 2.69037971848 0.0 0.0 0.0
+GATGT 106.69608443 3.29736405604 0.0 0.0 0.0
+GATTA 100.791941788 2.45074374076 0.0 0.0 0.0
+GATTC 96.5950703844 2.17541739543 0.0 0.0 0.0
+GATTG 98.7338378461 2.47383191726 0.0 0.0 0.0
+GATTT 92.4718102825 1.88187708895 0.0 0.0 0.0
+GCAAA 87.1483345194 2.49003461447 0.0 0.0 0.0
+GCAAC 78.8229404382 1.99369720537 0.0 0.0 0.0
+GCAAG 85.7573280346 2.38967414414 0.0 0.0 0.0
+GCAAT 77.2592597843 2.04369779357 0.0 0.0 0.0
+GCACA 89.6643768234 1.61675918179 0.0 0.0 0.0
+GCACC 87.2049897958 1.47746411814 0.0 0.0 0.0
+GCACG 88.6930265096 1.6976717113 0.0 0.0 0.0
+GCACT 85.2071245588 1.69208788221 0.0 0.0 0.0
+GCAGA 82.0723483627 2.88114047173 0.0 0.0 0.0
+GCAGC 78.665584397 1.8786417379 0.0 0.0 0.0
+GCAGG 82.5556296938 2.33259238431 0.0 0.0 0.0
+GCAGT 75.0911164261 2.19533461118 0.0 0.0 0.0
+GCATA 83.5135574787 2.25902195522 0.0 0.0 0.0
+GCATC 77.6526812614 2.05292198214 0.0 0.0 0.0
+GCATG 80.8758009053 2.42997812414 0.0 0.0 0.0
+GCATT 71.4528271643 2.40652059373 0.0 0.0 0.0
+GCCAA 98.8671644871 1.76186088143 0.0 0.0 0.0
+GCCAC 91.7789520144 1.71668274414 0.0 0.0 0.0
+GCCAG 96.7960714654 1.83470608015 0.0 0.0 0.0
+GCCAT 90.4696308072 1.8536998298 0.0 0.0 0.0
+GCCCA 97.7760781136 1.84425956069 0.0 0.0 0.0
+GCCCC 95.012307617 1.74400328234 0.0 0.0 0.0
+GCCCG 96.8175563537 1.82208153603 0.0 0.0 0.0
+GCCCT 93.3285210384 1.7682893887 0.0 0.0 0.0
+GCCGA 93.0188808978 2.36756385336 0.0 0.0 0.0
+GCCGC 90.3209274729 1.64167483563 0.0 0.0 0.0
+GCCGG 93.6889869913 1.95328258269 0.0 0.0 0.0
+GCCGT 88.2924769848 1.97711494644 0.0 0.0 0.0
+GCCTA 92.6577016817 2.30006687737 0.0 0.0 0.0
+GCCTC 86.6774230835 2.04820376599 0.0 0.0 0.0
+GCCTG 89.6989898465 2.18029156023 0.0 0.0 0.0
+GCCTT 81.3662188129 2.27065940412 0.0 0.0 0.0
+GCGAA 80.4248960164 2.32331321687 0.0 0.0 0.0
+GCGAC 71.3171789967 1.97463063414 0.0 0.0 0.0
+GCGAG 79.3709589921 2.2294922481 0.0 0.0 0.0
+GCGAT 69.9365554733 1.92436507014 0.0 0.0 0.0
+GCGCA 85.7086241679 1.78393409064 0.0 0.0 0.0
+GCGCC 83.0250774012 1.57502456826 0.0 0.0 0.0
+GCGCG 83.8748761534 1.91165137437 0.0 0.0 0.0
+GCGCT 80.5807813565 1.70097053464 0.0 0.0 0.0
+GCGGA 79.2250607559 2.87599738272 0.0 0.0 0.0
+GCGGC 76.14741393 1.90819358813 0.0 0.0 0.0
+GCGGG 80.9989090704 2.25923857635 0.0 0.0 0.0
+GCGGT 73.7220438121 2.25157063888 0.0 0.0 0.0
+GCGTA 76.536856394 2.37558044894 0.0 0.0 0.0
+GCGTC 70.67637706 2.20065289515 0.0 0.0 0.0
+GCGTG 72.9885852652 2.54896722275 0.0 0.0 0.0
+GCGTT 64.2498143408 2.27919968618 0.0 0.0 0.0
+GCTAA 104.125791847 2.28225248768 0.0 0.0 0.0
+GCTAC 100.880526259 2.09157022193 0.0 0.0 0.0
+GCTAG 102.412219225 2.16963895759 0.0 0.0 0.0
+GCTAT 101.587494366 2.44251718983 0.0 0.0 0.0
+GCTCA 109.716370023 2.19695455208 0.0 0.0 0.0
+GCTCC 108.87252449 2.16205819687 0.0 0.0 0.0
+GCTCG 109.127173006 2.28964961148 0.0 0.0 0.0
+GCTCT 108.918739844 2.12886627244 0.0 0.0 0.0
+GCTGA 99.5234495922 2.29063921047 0.0 0.0 0.0
+GCTGC 99.2992838887 2.13117735854 0.0 0.0 0.0
+GCTGG 100.340369926 2.21151616411 0.0 0.0 0.0
+GCTGT 100.314128995 2.53317772254 0.0 0.0 0.0
+GCTTA 97.8725280246 2.18918729998 0.0 0.0 0.0
+GCTTC 93.8491657823 1.7892316196 0.0 0.0 0.0
+GCTTG 96.1176431984 2.19028501375 0.0 0.0 0.0
+GCTTT 90.0534448389 1.8764282603 0.0 0.0 0.0
+GGAAA 71.9042767075 1.93916173286 0.0 0.0 0.0
+GGAAC 62.1332679095 1.97683252006 0.0 0.0 0.0
+GGAAG 70.0623216925 1.73216577682 0.0 0.0 0.0
+GGAAT 62.4702411399 1.88743257708 0.0 0.0 0.0
+GGACA 77.0150187406 2.29428786839 0.0 0.0 0.0
+GGACC 73.5384024642 2.12543946093 0.0 0.0 0.0
+GGACG 76.0168009557 2.06337561762 0.0 0.0 0.0
+GGACT 73.1519305964 1.89233671226 0.0 0.0 0.0
+GGAGA 71.6126215005 1.83638633213 0.0 0.0 0.0
+GGAGC 66.1644898053 1.71295095569 0.0 0.0 0.0
+GGAGG 71.6918480579 1.49357546462 0.0 0.0 0.0
+GGAGT 66.0854151511 1.80267341879 0.0 0.0 0.0
+GGATA 70.3739741015 2.19075302137 0.0 0.0 0.0
+GGATC 64.2841431223 2.18655512462 0.0 0.0 0.0
+GGATG 67.6323924048 2.05513084747 0.0 0.0 0.0
+GGATT 60.2585930342 1.93936938659 0.0 0.0 0.0
+GGCAA 98.2051887186 2.18077335826 0.0 0.0 0.0
+GGCAC 90.243815688 1.8720453489 0.0 0.0 0.0
+GGCAG 95.6038676752 2.09607856112 0.0 0.0 0.0
+GGCAT 89.8744821997 1.85603313982 0.0 0.0 0.0
+GGCCA 98.6790679439 2.11614583838 0.0 0.0 0.0
+GGCCC 95.4120151639 1.83221559613 0.0 0.0 0.0
+GGCCG 97.3585815861 2.09145344657 0.0 0.0 0.0
+GGCCT 94.5747509322 2.03241795296 0.0 0.0 0.0
+GGCGA 93.6593391465 2.4030935461 0.0 0.0 0.0
+GGCGC 90.0173260702 1.81784746344 0.0 0.0 0.0
+GGCGG 93.9660144051 2.19775240591 0.0 0.0 0.0
+GGCGT 90.0775248088 2.10257128159 0.0 0.0 0.0
+GGCTA 92.0577983398 2.30377281772 0.0 0.0 0.0
+GGCTC 84.8890072074 1.91286401488 0.0 0.0 0.0
+GGCTG 89.0577233764 2.16588723397 0.0 0.0 0.0
+GGCTT 80.219660547 1.87942164387 0.0 0.0 0.0
+GGGAA 67.1046909752 2.13447000521 0.0 0.0 0.0
+GGGAC 56.2553266823 2.32310821019 0.0 0.0 0.0
+GGGAG 65.3360067163 1.76167564328 0.0 0.0 0.0
+GGGAT 56.5685656909 1.88665869325 0.0 0.0 0.0
+GGGCA 78.1470959027 1.98088695958 0.0 0.0 0.0
+GGGCC 74.8604032725 1.62367471791 0.0 0.0 0.0
+GGGCG 76.6749724708 1.79265286865 0.0 0.0 0.0
+GGGCT 73.6082606766 1.62569453981 0.0 0.0 0.0
+GGGGA 69.4744552174 2.2717156328 0.0 0.0 0.0
+GGGGC 64.2604760351 1.72836583197 0.0 0.0 0.0
+GGGGG 70.5530274359 1.58628237522 0.0 0.0 0.0
+GGGGT 63.9159803052 1.53487924487 0.0 0.0 0.0
+GGGTA 67.4233670632 2.1482409277 0.0 0.0 0.0
+GGGTC 60.9524520825 1.91233382599 0.0 0.0 0.0
+GGGTG 63.9580891078 2.04324323525 0.0 0.0 0.0
+GGGTT 56.1175452811 2.07332704552 0.0 0.0 0.0
+GGTAA 100.079563943 2.50188197657 0.0 0.0 0.0
+GGTAC 94.4784692949 2.33032702499 0.0 0.0 0.0
+GGTAG 97.851137218 2.41943843556 0.0 0.0 0.0
+GGTAT 96.7591053619 2.84961227085 0.0 0.0 0.0
+GGTCA 107.78416351 2.63052992586 0.0 0.0 0.0
+GGTCC 106.616601207 2.50209409901 0.0 0.0 0.0
+GGTCG 107.07812139 2.65997669438 0.0 0.0 0.0
+GGTCT 107.167993849 2.55836426524 0.0 0.0 0.0
+GGTGA 97.2465837896 2.78747165051 0.0 0.0 0.0
+GGTGC 96.8846559568 2.51189400908 0.0 0.0 0.0
+GGTGG 98.3700884484 2.70972394233 0.0 0.0 0.0
+GGTGT 100.106053436 3.29290731261 0.0 0.0 0.0
+GGTTA 95.0389495266 2.38297523259 0.0 0.0 0.0
+GGTTC 90.2340879353 2.73720255408 0.0 0.0 0.0
+GGTTG 92.8683473174 2.41355587196 0.0 0.0 0.0
+GGTTT 86.5259088753 1.99382023909 0.0 0.0 0.0
+GTAAA 87.1387334754 2.19256746947 0.0 0.0 0.0
+GTAAC 78.7511610023 2.13900855892 0.0 0.0 0.0
+GTAAG 85.8142299525 2.05611413078 0.0 0.0 0.0
+GTAAT 77.6209576575 2.27165903957 0.0 0.0 0.0
+GTACA 88.7091594126 1.55898684033 0.0 0.0 0.0
+GTACC 86.6163232908 1.48874521683 0.0 0.0 0.0
+GTACG 87.688601942 1.5497021358 0.0 0.0 0.0
+GTACT 85.1662632131 1.61773558566 0.0 0.0 0.0
+GTAGA 84.424620085 2.57034047366 0.0 0.0 0.0
+GTAGC 80.7429851893 2.07677320579 0.0 0.0 0.0
+GTAGG 85.1840517358 3.19921066815 0.0 0.0 0.0
+GTAGT 78.8871061287 2.20860876438 0.0 0.0 0.0
+GTATA 83.4836393187 2.09655072521 0.0 0.0 0.0
+GTATC 79.4975094406 2.19830379075 0.0 0.0 0.0
+GTATG 81.4697766227 2.36656171765 0.0 0.0 0.0
+GTATT 74.0673784714 2.71403533907 0.0 0.0 0.0
+GTCAA 98.3697755225 2.10893095356 0.0 0.0 0.0
+GTCAC 92.2719052352 2.13654297284 0.0 0.0 0.0
+GTCAG 96.8596238139 2.08087821741 0.0 0.0 0.0
+GTCAT 90.6940297692 2.13743762261 0.0 0.0 0.0
+GTCCA 97.0471061103 1.87362033978 0.0 0.0 0.0
+GTCCC 94.7135233509 1.67991344442 0.0 0.0 0.0
+GTCCG 96.4386324586 1.98499928919 0.0 0.0 0.0
+GTCCT 93.2671289977 1.85568352531 0.0 0.0 0.0
+GTCGA 94.7470601286 2.86614439471 0.0 0.0 0.0
+GTCGC 91.606264395 2.21692522467 0.0 0.0 0.0
+GTCGG 95.0995938645 2.23435044952 0.0 0.0 0.0
+GTCGT 90.0269919885 2.89817688859 0.0 0.0 0.0
+GTCTA 92.3546259448 2.31717091884 0.0 0.0 0.0
+GTCTC 88.1797416289 2.37151586583 0.0 0.0 0.0
+GTCTG 90.5388712651 2.49107060451 0.0 0.0 0.0
+GTCTT 82.6600662493 2.66738274614 0.0 0.0 0.0
+GTGAA 81.2064752379 2.78062243375 0.0 0.0 0.0
+GTGAC 71.8683723134 2.19281832759 0.0 0.0 0.0
+GTGAG 80.5377042738 2.77909768154 0.0 0.0 0.0
+GTGAT 71.0796826834 2.36214736609 0.0 0.0 0.0
+GTGCA 87.1865921356 1.69575980059 0.0 0.0 0.0
+GTGCC 84.8215898548 1.66360370246 0.0 0.0 0.0
+GTGCG 85.9801728011 1.88321722719 0.0 0.0 0.0
+GTGCT 83.1032189077 1.85567855265 0.0 0.0 0.0
+GTGGA 83.2051757949 2.98913311004 0.0 0.0 0.0
+GTGGC 79.7042258301 2.79050125645 0.0 0.0 0.0
+GTGGG 84.7748972216 2.59815238561 0.0 0.0 0.0
+GTGGT 78.6204953159 3.04453952644 0.0 0.0 0.0
+GTGTA 80.2705661835 2.54876222662 0.0 0.0 0.0
+GTGTC 76.6070380197 2.82430598114 0.0 0.0 0.0
+GTGTG 77.7685140633 2.75884937047 0.0 0.0 0.0
+GTGTT 70.9060079095 3.14666324704 0.0 0.0 0.0
+GTTAA 101.147518755 1.90924333131 0.0 0.0 0.0
+GTTAC 97.0434847667 1.70646646347 0.0 0.0 0.0
+GTTAG 99.5184565683 1.77288906319 0.0 0.0 0.0
+GTTAT 97.3287574274 2.14255777144 0.0 0.0 0.0
+GTTCA 104.221469936 2.05927376494 0.0 0.0 0.0
+GTTCC 103.007726016 2.01835301155 0.0 0.0 0.0
+GTTCG 103.662056167 2.18420759786 0.0 0.0 0.0
+GTTCT 103.312693407 1.9706102465 0.0 0.0 0.0
+GTTGA 97.2569462969 2.03438888198 0.0 0.0 0.0
+GTTGC 96.1473469603 1.84783126501 0.0 0.0 0.0
+GTTGG 97.9875264128 1.86957164256 0.0 0.0 0.0
+GTTGT 97.1811315637 2.17537733461 0.0 0.0 0.0
+GTTTA 94.7591315455 2.02997682733 0.0 0.0 0.0
+GTTTC 90.7863476884 1.7432525768 0.0 0.0 0.0
+GTTTG 93.1553947076 2.00907158466 0.0 0.0 0.0
+GTTTT 87.6727147155 1.95979688856 0.0 0.0 0.0
+TAAAA 84.1709509983 1.88539786492 0.0 0.0 0.0
+TAAAC 75.3226346503 1.88786738602 0.0 0.0 0.0
+TAAAG 82.4812360749 1.77194086015 0.0 0.0 0.0
+TAAAT 74.3211564677 1.7260188478 0.0 0.0 0.0
+TAACA 86.2759850497 1.76846306224 0.0 0.0 0.0
+TAACC 83.2847994314 1.55724253485 0.0 0.0 0.0
+TAACG 85.2039976744 1.60321059761 0.0 0.0 0.0
+TAACT 82.0541110561 1.52929437847 0.0 0.0 0.0
+TAAGA 82.0490745775 2.17715173431 0.0 0.0 0.0
+TAAGC 77.3957671312 1.70515431061 0.0 0.0 0.0
+TAAGG 81.5620753871 1.76109417064 0.0 0.0 0.0
+TAAGT 75.3409424685 1.80934854593 0.0 0.0 0.0
+TAATA 79.9398191626 1.79908382794 0.0 0.0 0.0
+TAATC 73.6034648328 1.87117867946 0.0 0.0 0.0
+TAATG 77.2114410913 1.80465913988 0.0 0.0 0.0
+TAATT 67.9715029591 2.21619894072 0.0 0.0 0.0
+TACAA 101.760316082 2.07033824765 0.0 0.0 0.0
+TACAC 94.3691908281 1.6347882998 0.0 0.0 0.0
+TACAG 99.1182609725 2.03895949594 0.0 0.0 0.0
+TACAT 93.5990748853 1.67418116028 0.0 0.0 0.0
+TACCA 101.499605866 1.98922262817 0.0 0.0 0.0
+TACCC 98.3053931934 1.83900416151 0.0 0.0 0.0
+TACCG 100.197451698 1.91837408841 0.0 0.0 0.0
+TACCT 96.9323513695 1.8557996184 0.0 0.0 0.0
+TACGA 96.5473935666 2.29863065622 0.0 0.0 0.0
+TACGC 93.3183612763 1.64034898965 0.0 0.0 0.0
+TACGG 96.7626874951 2.06796518926 0.0 0.0 0.0
+TACGT 92.4233546735 1.8074376245 0.0 0.0 0.0
+TACTA 95.2378195443 2.15248511678 0.0 0.0 0.0
+TACTC 88.2214331449 1.74930313678 0.0 0.0 0.0
+TACTG 92.0725014048 2.12413730992 0.0 0.0 0.0
+TACTT 83.0993306969 1.89044561043 0.0 0.0 0.0
+TAGAA 75.9105587904 1.95699178283 0.0 0.0 0.0
+TAGAC 65.8073785335 1.94101015031 0.0 0.0 0.0
+TAGAG 74.418357258 1.94466950447 0.0 0.0 0.0
+TAGAT 65.2682932026 1.67583152055 0.0 0.0 0.0
+TAGCA 82.3951345649 1.61558504471 0.0 0.0 0.0
+TAGCC 79.0147454793 1.46388015951 0.0 0.0 0.0
+TAGCG 79.849760159 1.8582035553 0.0 0.0 0.0
+TAGCT 77.3557384185 1.75243020069 0.0 0.0 0.0
+TAGGA 74.705365225 2.51675669495 0.0 0.0 0.0
+TAGGC 70.5974004247 1.84261842859 0.0 0.0 0.0
+TAGGG 75.9256943486 1.83897839526 0.0 0.0 0.0
+TAGGT 69.0476661738 1.77735900599 0.0 0.0 0.0
+TAGTA 72.7396166558 2.2549977783 0.0 0.0 0.0
+TAGTC 66.1763223096 2.22023309795 0.0 0.0 0.0
+TAGTG 68.9271926578 2.40879384736 0.0 0.0 0.0
+TAGTT 60.2178876918 2.39208167639 0.0 0.0 0.0
+TATAA 106.924879652 2.46676412613 0.0 0.0 0.0
+TATAC 103.127761686 2.45593289847 0.0 0.0 0.0
+TATAG 104.823116348 2.48696314837 0.0 0.0 0.0
+TATAT 104.829266333 3.03854921185 0.0 0.0 0.0
+TATCA 114.916541432 2.50540528521 0.0 0.0 0.0
+TATCC 114.194267 2.6125928348 0.0 0.0 0.0
+TATCG 114.402172964 2.65092855704 0.0 0.0 0.0
+TATCT 114.838903357 2.4933477582 0.0 0.0 0.0
+TATGA 102.560530817 2.62822234875 0.0 0.0 0.0
+TATGC 103.329160429 2.67421135782 0.0 0.0 0.0
+TATGG 103.765886278 2.63637263734 0.0 0.0 0.0
+TATGT 106.143916596 3.09260869837 0.0 0.0 0.0
+TATTA 101.367181901 2.45150203225 0.0 0.0 0.0
+TATTC 96.9892494469 2.08733274924 0.0 0.0 0.0
+TATTG 99.2612786387 2.4038461078 0.0 0.0 0.0
+TATTT 92.9615822214 1.89651592553 0.0 0.0 0.0
+TCAAA 87.4810485339 2.40050609872 0.0 0.0 0.0
+TCAAC 79.3785714825 1.98050869928 0.0 0.0 0.0
+TCAAG 86.2402932043 3.26251298833 0.0 0.0 0.0
+TCAAT 77.7709010643 2.01608958217 0.0 0.0 0.0
+TCACA 89.7487733026 1.61031059819 0.0 0.0 0.0
+TCACC 87.4573788341 1.48998380249 0.0 0.0 0.0
+TCACG 88.8396830035 1.66048769914 0.0 0.0 0.0
+TCACT 85.3697927263 1.687733249 0.0 0.0 0.0
+TCAGA 82.4157841244 2.81061076905 0.0 0.0 0.0
+TCAGC 78.9991368663 1.97576619309 0.0 0.0 0.0
+TCAGG 82.7884815574 2.29527190593 0.0 0.0 0.0
+TCAGT 75.4862567891 2.15804007679 0.0 0.0 0.0
+TCATA 83.5721963133 2.23973074726 0.0 0.0 0.0
+TCATC 77.8073698781 2.18050267906 0.0 0.0 0.0
+TCATG 81.1065235781 2.42243398947 0.0 0.0 0.0
+TCATT 71.7654244674 2.48625475501 0.0 0.0 0.0
+TCCAA 98.4768364814 1.80386008818 0.0 0.0 0.0
+TCCAC 91.654636408 1.6971977078 0.0 0.0 0.0
+TCCAG 96.2587903036 1.67317041277 0.0 0.0 0.0
+TCCAT 90.2124436346 1.71720250582 0.0 0.0 0.0
+TCCCA 97.2888903972 1.84318239149 0.0 0.0 0.0
+TCCCC 94.8975384019 1.88400520466 0.0 0.0 0.0
+TCCCG 96.3024780151 1.8022958594 0.0 0.0 0.0
+TCCCT 93.095717813 1.83403238772 0.0 0.0 0.0
+TCCGA 92.7016420971 2.31249474559 0.0 0.0 0.0
+TCCGC 89.8409959532 1.6746749253 0.0 0.0 0.0
+TCCGG 93.1641187708 1.88714719308 0.0 0.0 0.0
+TCCGT 87.970797439 1.88569045247 0.0 0.0 0.0
+TCCTA 92.1647448391 2.08010655027 0.0 0.0 0.0
+TCCTC 86.4037902316 2.10783649153 0.0 0.0 0.0
+TCCTG 89.3519341355 2.0677207408 0.0 0.0 0.0
+TCCTT 81.0826291233 2.17180897572 0.0 0.0 0.0
+TCGAA 80.2168165843 2.16398807309 0.0 0.0 0.0
+TCGAC 71.5670057811 1.88998801107 0.0 0.0 0.0
+TCGAG 79.154222362 2.109231852 0.0 0.0 0.0
+TCGAT 69.8916365322 1.79439374841 0.0 0.0 0.0
+TCGCA 84.8743758241 1.67902427497 0.0 0.0 0.0
+TCGCC 81.9989620082 1.60531199785 0.0 0.0 0.0
+TCGCG 82.9007271875 1.9209832325 0.0 0.0 0.0
+TCGCT 79.8108964791 1.75315960968 0.0 0.0 0.0
+TCGGA 78.9069418033 2.86409518883 0.0 0.0 0.0
+TCGGC 75.9683139773 1.83351446263 0.0 0.0 0.0
+TCGGG 80.5058854457 2.11372621908 0.0 0.0 0.0
+TCGGT 73.577703129 2.16764952862 0.0 0.0 0.0
+TCGTA 76.2949341272 2.31707769047 0.0 0.0 0.0
+TCGTC 70.4371756713 2.09616595174 0.0 0.0 0.0
+TCGTG 72.8861350188 2.52630398605 0.0 0.0 0.0
+TCGTT 64.2261520471 2.34171482633 0.0 0.0 0.0
+TCTAA 105.065441516 2.18237857764 0.0 0.0 0.0
+TCTAC 102.002392369 2.08498446312 0.0 0.0 0.0
+TCTAG 102.896019896 2.25897476826 0.0 0.0 0.0
+TCTAT 102.617387331 2.70094655082 0.0 0.0 0.0
+TCTCA 110.711474097 2.30299111445 0.0 0.0 0.0
+TCTCC 110.086888098 2.13697052006 0.0 0.0 0.0
+TCTCG 110.24148065 2.39036486539 0.0 0.0 0.0
+TCTCT 110.246536733 2.43914699476 0.0 0.0 0.0
+TCTGA 100.528614755 2.44727343582 0.0 0.0 0.0
+TCTGC 100.353470875 2.14765967176 0.0 0.0 0.0
+TCTGG 101.415230558 2.31964261982 0.0 0.0 0.0
+TCTGT 101.534870443 2.55293404138 0.0 0.0 0.0
+TCTTA 98.904253364 2.28008762182 0.0 0.0 0.0
+TCTTC 94.6760733359 1.81693595534 0.0 0.0 0.0
+TCTTG 96.9176704595 2.34223166754 0.0 0.0 0.0
+TCTTT 90.9765977852 1.89276248635 0.0 0.0 0.0
+TGAAA 74.8307935202 1.83637708611 0.0 0.0 0.0
+TGAAC 64.8567673762 2.10393162581 0.0 0.0 0.0
+TGAAG 72.4734332261 1.67318391624 0.0 0.0 0.0
+TGAAT 63.6779995465 2.07738787511 0.0 0.0 0.0
+TGACA 79.9049070956 1.91184605138 0.0 0.0 0.0
+TGACC 76.8026785476 1.7603712113 0.0 0.0 0.0
+TGACG 78.7642549433 1.74779036645 0.0 0.0 0.0
+TGACT 75.8769514744 1.68537177516 0.0 0.0 0.0
+TGAGA 74.3568428554 2.17991662463 0.0 0.0 0.0
+TGAGC 69.3009060901 1.73315825984 0.0 0.0 0.0
+TGAGG 74.4733176412 1.56820466915 0.0 0.0 0.0
+TGAGT 68.3125544518 1.57967780736 0.0 0.0 0.0
+TGATA 72.4740480383 1.96194562252 0.0 0.0 0.0
+TGATC 66.3899867537 1.95112260309 0.0 0.0 0.0
+TGATG 69.6831947977 1.89797055912 0.0 0.0 0.0
+TGATT 61.3994662967 2.20234816218 0.0 0.0 0.0
+TGCAA 98.9955788607 2.07808479744 0.0 0.0 0.0
+TGCAC 90.9815936203 1.69987915757 0.0 0.0 0.0
+TGCAG 96.1312825635 2.05254402563 0.0 0.0 0.0
+TGCAT 90.3508165634 1.76408979678 0.0 0.0 0.0
+TGCCA 99.5166816883 2.05299270109 0.0 0.0 0.0
+TGCCC 96.1965472263 1.82914679225 0.0 0.0 0.0
+TGCCG 98.2369706326 1.89634406022 0.0 0.0 0.0
+TGCCT 95.0971580542 1.9923871749 0.0 0.0 0.0
+TGCGA 94.0486203128 2.38425451446 0.0 0.0 0.0
+TGCGC 90.5456225182 1.66912251961 0.0 0.0 0.0
+TGCGG 94.4214187741 2.11754641467 0.0 0.0 0.0
+TGCGT 90.1448128784 1.90341596193 0.0 0.0 0.0
+TGCTA 92.4391036732 2.2980967384 0.0 0.0 0.0
+TGCTC 85.7465202221 1.86409411027 0.0 0.0 0.0
+TGCTG 89.6817300638 2.18923918774 0.0 0.0 0.0
+TGCTT 80.8075868514 1.92690017236 0.0 0.0 0.0
+TGGAA 69.1990136085 1.93590720288 0.0 0.0 0.0
+TGGAC 58.6729009919 1.96485127097 0.0 0.0 0.0
+TGGAG 67.494724148 1.8137665147 0.0 0.0 0.0
+TGGAT 58.3279680571 1.81416068869 0.0 0.0 0.0
+TGGCA 79.8011795618 1.6101666636 0.0 0.0 0.0
+TGGCC 76.6859858104 1.39347120714 0.0 0.0 0.0
+TGGCG 78.2668208248 1.620012985 0.0 0.0 0.0
+TGGCT 75.2159937556 1.59148786631 0.0 0.0 0.0
+TGGGA 71.7892235775 2.36759100388 0.0 0.0 0.0
+TGGGC 67.05609355 1.81697982724 0.0 0.0 0.0
+TGGGG 72.9256358395 1.66288700681 0.0 0.0 0.0
+TGGGT 66.040259821 1.68408991922 0.0 0.0 0.0
+TGGTA 69.941199191 2.23476691758 0.0 0.0 0.0
+TGGTC 63.4447503268 2.23286309692 0.0 0.0 0.0
+TGGTG 66.3799679742 2.31175723905 0.0 0.0 0.0
+TGGTT 58.1324051536 2.19043593806 0.0 0.0 0.0
+TGTAA 101.727744867 2.44751261625 0.0 0.0 0.0
+TGTAC 96.5568988151 2.22224771369 0.0 0.0 0.0
+TGTAG 99.4685028148 2.36594705215 0.0 0.0 0.0
+TGTAT 98.3649303744 2.84559248941 0.0 0.0 0.0
+TGTCA 109.890994438 2.35734144102 0.0 0.0 0.0
+TGTCC 108.834526466 2.43209711658 0.0 0.0 0.0
+TGTCG 109.378815382 2.6914340228 0.0 0.0 0.0
+TGTCT 109.468543975 2.86561878218 0.0 0.0 0.0
+TGTGA 98.466695106 2.90271591294 0.0 0.0 0.0
+TGTGC 98.3915862312 2.64315138095 0.0 0.0 0.0
+TGTGG 99.5987128882 2.75384511441 0.0 0.0 0.0
+TGTGT 101.315719101 3.01691335998 0.0 0.0 0.0
+TGTTA 96.8068347641 2.58627032759 0.0 0.0 0.0
+TGTTC 91.9577365971 2.14935656543 0.0 0.0 0.0
+TGTTG 94.4251595688 2.44849103295 0.0 0.0 0.0
+TGTTT 88.0147934969 1.99273874545 0.0 0.0 0.0
+TTAAA 88.6985813291 2.12937303358 0.0 0.0 0.0
+TTAAC 80.6171287777 2.10197582608 0.0 0.0 0.0
+TTAAG 87.3133297265 2.16628824371 0.0 0.0 0.0
+TTAAT 79.307810666 2.06848628693 0.0 0.0 0.0
+TTACA 90.1383564505 1.60374232114 0.0 0.0 0.0
+TTACC 88.0754105457 1.55867828769 0.0 0.0 0.0
+TTACG 89.139532019 1.63094923307 0.0 0.0 0.0
+TTACT 86.5261871941 1.63084005965 0.0 0.0 0.0
+TTAGA 85.6038411217 2.44704063697 0.0 0.0 0.0
+TTAGC 82.1997409714 2.05409606225 0.0 0.0 0.0
+TTAGG 85.946385832 2.01642931341 0.0 0.0 0.0
+TTAGT 80.1584384123 2.16533651089 0.0 0.0 0.0
+TTATA 85.0753203656 2.16212041654 0.0 0.0 0.0
+TTATC 80.7002719028 2.22642915004 0.0 0.0 0.0
+TTATG 82.8642387509 2.26208287072 0.0 0.0 0.0
+TTATT 75.2161321039 2.5356999779 0.0 0.0 0.0
+TTCAA 98.0634421638 1.86868762731 0.0 0.0 0.0
+TTCAC 92.2811814335 1.73627822145 0.0 0.0 0.0
+TTCAG 96.3749474736 1.86764839656 0.0 0.0 0.0
+TTCAT 90.6809325001 1.85243543377 0.0 0.0 0.0
+TTCCA 96.6581733473 1.76879387918 0.0 0.0 0.0
+TTCCC 94.0850708075 1.59017464709 0.0 0.0 0.0
+TTCCG 95.8513216758 1.98453922954 0.0 0.0 0.0
+TTCCT 92.940398258 1.8086869883 0.0 0.0 0.0
+TTCGA 94.1986227197 2.28370015097 0.0 0.0 0.0
+TTCGC 91.3035829506 1.91098255669 0.0 0.0 0.0
+TTCGG 94.6169645865 2.0758107353 0.0 0.0 0.0
+TTCGT 89.6611988573 2.08288751465 0.0 0.0 0.0
+TTCTA 92.291071373 2.11700881652 0.0 0.0 0.0
+TTCTC 87.3720513468 2.19157167452 0.0 0.0 0.0
+TTCTG 89.862930604 2.14458722316 0.0 0.0 0.0
+TTCTT 82.3381218698 2.43946976504 0.0 0.0 0.0
+TTGAA 81.8370227682 3.64966260088 0.0 0.0 0.0
+TTGAC 72.8506802879 2.11724773152 0.0 0.0 0.0
+TTGAG 80.7536074973 2.36649771236 0.0 0.0 0.0
+TTGAT 71.5667995008 2.23196329688 0.0 0.0 0.0
+TTGCA 86.7683686252 1.69070330826 0.0 0.0 0.0
+TTGCC 84.4414940686 1.68336468579 0.0 0.0 0.0
+TTGCG 85.6510169082 1.81156615914 0.0 0.0 0.0
+TTGCT 82.7585575196 1.7812102641 0.0 0.0 0.0
+TTGGA 82.9571298524 2.65274150451 0.0 0.0 0.0
+TTGGC 79.3841768767 2.41253481762 0.0 0.0 0.0
+TTGGG 84.018825574 2.29464755138 0.0 0.0 0.0
+TTGGT 78.1260658905 2.47911898913 0.0 0.0 0.0
+TTGTA 80.0805299978 2.3030806009 0.0 0.0 0.0
+TTGTC 76.0298991599 2.49426818717 0.0 0.0 0.0
+TTGTG 77.6499174636 2.63614687237 0.0 0.0 0.0
+TTGTT 70.5005932718 2.79991150229 0.0 0.0 0.0
+TTTAA 100.785122643 1.93357297059 0.0 0.0 0.0
+TTTAC 97.1272797756 1.87056267056 0.0 0.0 0.0
+TTTAG 99.1678814581 1.88595475877 0.0 0.0 0.0
+TTTAT 97.6431055801 2.15788483587 0.0 0.0 0.0
+TTTCA 104.619016316 2.35395828699 0.0 0.0 0.0
+TTTCC 103.289090333 2.2668102401 0.0 0.0 0.0
+TTTCG 103.89593473 2.52565098077 0.0 0.0 0.0
+TTTCT 103.746563996 2.29401951665 0.0 0.0 0.0
+TTTGA 96.8671352682 2.06683172099 0.0 0.0 0.0
+TTTGC 95.9967285201 1.88759736818 0.0 0.0 0.0
+TTTGG 97.6213646229 2.07188720233 0.0 0.0 0.0
+TTTGT 97.2449217956 2.21165395009 0.0 0.0 0.0
+TTTTA 94.5266364445 2.24404566794 0.0 0.0 0.0
+TTTTC 90.666225394 1.75185029821 0.0 0.0 0.0
+TTTTG 92.8257549205 2.17035458863 0.0 0.0 0.0
+TTTTT 87.5996767438 1.65816112042 0.0 0.0 0.0
diff --git a/etc/r9-models/template_median68pA.model b/etc/r9-models/template_median68pA.model
new file mode 100644
index 0000000..86d7df6
--- /dev/null
+++ b/etc/r9-models/template_median68pA.model
@@ -0,0 +1,4101 @@
+#model_name template_median68pA.model
+#type ONT
+#strand template
+#kit SQK007
+kmer level_mean level_stdv sd_mean sd_stdv ig_lambda weight
+AAAAAA 83.459321 1.591638 1.321178 0.548785 7.657366 4145.236890
+AAAAAC 81.128876 1.616835 1.463961 0.640110 7.657366 3357.464182
+AAAAAG 82.529619 1.615134 1.329729 0.554122 7.657366 3382.354662
+AAAAAT 81.752998 1.553822 1.516740 0.675036 7.657366 3660.623806
+AAAACA 74.185245 1.897392 1.742847 0.831475 7.657366 2529.696987
+AAAACC 72.891908 1.675615 1.706309 0.805465 7.657366 2846.207727
+AAAACG 74.041621 1.882359 1.727239 0.820331 7.657366 3017.200789
+AAAACT 73.728095 1.556456 1.582905 0.719685 7.657366 1905.276256
+AAAAGA 81.347862 1.696167 1.328098 0.553102 7.657366 2123.274964
+AAAAGC 79.996528 1.679097 1.377849 0.584471 7.657366 2707.701683
+AAAAGG 80.804441 1.611021 1.317006 0.546188 7.657366 1566.208197
+AAAAGT 80.171416 1.649446 1.479417 0.650274 7.657366 1527.216783
+AAAATA 73.449638 1.563134 1.576688 0.715450 7.657366 2493.059567
+AAAATC 71.710456 1.552421 1.497682 0.662353 7.657366 3073.260095
+AAAATG 72.739369 1.598583 1.499467 0.663538 7.657366 2813.989990
+AAAATT 72.877537 1.661542 1.672534 0.781669 7.657366 2315.278818
+AAACAA 84.398277 1.936599 1.714203 0.811061 7.657366 1861.900372
+AAACAC 83.014844 1.681348 1.496593 0.661631 7.657366 1428.022474
+AAACAG 83.659180 1.751278 1.605966 0.735470 7.657366 2930.724700
+AAACAT 83.729254 1.942612 1.627776 0.750503 7.657366 1681.928769
+AAACCA 80.985491 1.658047 1.640344 0.759212 7.657366 2693.207618
+AAACCC 80.074048 1.654096 1.537213 0.688750 7.657366 1490.946167
+AAACCG 80.915586 1.627754 1.645865 0.763047 7.657366 2923.851466
+AAACCT 80.766554 1.676588 1.623940 0.747852 7.657366 1495.674470
+AAACGA 83.677200 1.623275 1.510703 0.671010 7.657366 1383.656382
+AAACGC 82.191909 1.526688 1.449353 0.630553 7.657366 3302.344924
+AAACGG 82.761933 1.516278 1.545695 0.694458 7.657366 2054.148944
+AAACGT 82.559118 1.560106 1.524944 0.680521 7.657366 1781.652750
+AAACTA 80.478631 1.494948 1.473653 0.646477 7.657366 660.864488
+AAACTC 78.800534 1.566004 1.489758 0.657104 7.657366 1408.553187
+AAACTG 79.743792 1.576991 1.474391 0.646963 7.657366 2893.538750
+AAACTT 79.227441 1.381497 1.313049 0.543728 7.657366 1170.729934
+AAAGAA 81.554793 1.474459 1.351015 0.567480 7.657366 2371.368393
+AAAGAC 77.544191 1.290763 1.514563 0.673583 7.657366 1203.981299
+AAAGAG 80.741230 1.496750 1.335602 0.557796 7.657366 1861.485406
+AAAGAT 79.011024 1.411275 1.492385 0.658843 7.657366 2043.232278
+AAAGCA 76.283685 1.443684 1.390594 0.592599 7.657366 2130.638872
+AAAGCC 74.786841 1.466823 1.399001 0.597981 7.657366 2690.769063
+AAAGCG 75.920872 1.551416 1.338842 0.559828 7.657366 3427.479637
+AAAGCT 75.946677 1.473156 1.295799 0.533049 7.657366 1396.187530
+AAAGGA 80.494553 1.355408 1.287505 0.527939 7.657366 1009.653309
+AAAGGC 77.989424 1.277592 1.383473 0.588053 7.657366 2022.595239
+AAAGGG 79.564900 1.353195 1.301719 0.536706 7.657366 1019.430602
+AAAGGT 79.162312 1.348292 1.422991 0.613428 7.657366 1741.480781
+AAAGTA 74.705918 1.534961 1.370027 0.579501 7.657366 1123.894885
+AAAGTC 72.288133 1.453325 1.492711 0.659058 7.657366 1310.310594
+AAAGTG 73.732043 1.413239 1.444321 0.627272 7.657366 2045.877900
+AAAGTT 73.608174 1.425360 1.636509 0.756550 7.657366 1383.611994
+AAATAA 79.198897 1.627391 1.784137 0.861197 7.657366 2342.625233
+AAATAC 76.882560 1.637701 1.786796 0.863123 7.657366 1643.114530
+AAATAG 78.532158 1.544583 1.722479 0.816942 7.657366 1067.357110
+AAATAT 77.849877 1.732211 1.916657 0.958908 7.657366 2270.459779
+AAATCA 72.496151 1.808944 2.070469 1.076623 7.657366 2710.237997
+AAATCC 71.313829 1.711679 2.012612 1.031811 7.657366 1899.759527
+AAATCG 72.547710 1.661145 2.056514 1.065756 7.657366 2225.298911
+AAATCT 72.339855 1.875882 2.083359 1.086692 7.657366 1427.773940
+AAATGA 76.767095 1.658428 1.734207 0.825300 7.657366 1786.722603
+AAATGC 74.347666 1.425664 1.741680 0.830640 7.657366 2086.693832
+AAATGG 75.911693 1.547831 1.677174 0.784923 7.657366 1975.075915
+AAATGT 75.050034 1.648438 1.896726 0.943989 7.657366 1211.478747
+AAATTA 67.543936 1.848051 1.939902 0.976405 7.657366 1882.054980
+AAATTC 64.917819 1.749241 2.051651 1.061978 7.657366 1975.835047
+AAATTG 66.765844 1.832815 1.891822 0.940331 7.657366 2162.210430
+AAATTT 65.986463 1.766587 1.863576 0.919350 7.657366 2070.218567
+AACAAA 104.560905 1.948574 2.147401 1.137182 7.657366 2315.742160
+AACAAC 101.781412 2.007783 2.181526 1.164397 7.657366 2055.275375
+AACAAG 103.689085 1.990537 2.202001 1.180828 7.657366 1154.347688
+AACAAT 103.157018 1.956820 2.294183 1.255748 7.657366 1908.334464
+AACACA 96.611913 1.701498 1.700910 0.801646 7.657366 995.192427
+AACACC 95.371831 1.782432 1.719321 0.814696 7.657366 1958.522786
+AACACG 96.474509 1.814872 1.722071 0.816652 7.657366 1177.642974
+AACACT 96.576270 1.743177 1.764756 0.847202 7.657366 926.165922
+AACAGA 102.052335 1.958302 2.134335 1.126819 7.657366 1613.169180
+AACAGC 99.435402 1.866365 2.062956 1.070768 7.657366 3002.842285
+AACAGG 101.086598 1.816855 2.055420 1.064905 7.657366 2125.099766
+AACAGT 100.835276 2.097970 2.279839 1.243989 7.657366 1558.048332
+AACATA 96.094271 1.749695 1.908687 0.952933 7.657366 1071.254080
+AACATC 94.780266 2.051195 2.054606 1.064273 7.657366 2245.528167
+AACATG 95.353033 2.016368 1.985204 1.010806 7.657366 1458.167185
+AACATT 96.219737 2.097954 2.158412 1.145940 7.657366 1597.173907
+AACCAA 104.400321 1.981014 1.810483 0.880343 7.657366 1585.220782
+AACCAC 101.782938 1.780852 1.788697 0.864501 7.657366 2000.131618
+AACCAG 103.410060 1.883453 1.804253 0.875803 7.657366 2976.494638
+AACCAT 102.870244 1.941251 1.994290 1.017754 7.657366 1661.634922
+AACCCA 100.562033 1.766518 1.639453 0.758593 7.657366 1141.212040
+AACCCC 99.041382 1.692912 1.582546 0.719440 7.657366 666.354483
+AACCCG 100.581382 1.823834 1.608350 0.737108 7.657366 1872.708874
+AACCCT 100.116768 1.767846 1.728197 0.821014 7.657366 961.475082
+AACCGA 103.105938 1.949831 1.968968 0.998432 7.657366 1561.590295
+AACCGC 100.666081 1.884756 1.799610 0.872425 7.657366 2564.079247
+AACCGG 102.335367 1.868024 1.861999 0.918183 7.657366 2249.773696
+AACCGT 101.651606 2.014057 2.041089 1.053788 7.657366 1746.395959
+AACCTA 99.898953 1.761176 1.811339 0.880967 7.657366 412.081740
+AACCTC 97.717451 1.845016 1.879945 0.931490 7.657366 719.144174
+AACCTG 99.023649 1.784493 1.829112 0.893965 7.657366 2659.257180
+AACCTT 98.887134 1.862942 1.948424 0.982846 7.657366 1354.262187
+AACGAA 100.077837 1.851299 1.997848 1.020479 7.657366 1494.863518
+AACGAC 95.704313 1.779808 2.000047 1.022164 7.657366 1451.436739
+AACGAG 99.123116 1.867608 1.974685 1.002783 7.657366 727.966235
+AACGAT 97.652516 2.043353 2.157149 1.144934 7.657366 1902.701269
+AACGCA 95.722117 1.873598 1.836809 0.899614 7.657366 1891.757445
+AACGCC 94.467210 1.919080 1.816322 0.884605 7.657366 3567.282527
+AACGCG 95.669917 1.811109 1.828118 0.893237 7.657366 2520.999424
+AACGCT 95.921623 2.058383 1.997356 1.020101 7.657366 2483.276607
+AACGGA 99.798033 1.987094 2.132253 1.125171 7.657366 1218.795070
+AACGGC 96.723474 1.791071 1.958757 0.990675 7.657366 2752.508937
+AACGGG 99.000400 2.006531 2.066303 1.073374 7.657366 1212.875485
+AACGGT 98.526342 1.965278 2.110198 1.107758 7.657366 2243.269639
+AACGTA 95.273999 2.076003 2.071486 1.077416 7.657366 1168.267054
+AACGTC 93.773495 2.100931 2.079577 1.083734 7.657366 1857.160938
+AACGTG 94.708003 2.152767 2.074768 1.079977 7.657366 1646.593129
+AACGTT 95.431504 2.243709 2.269604 1.235621 7.657366 2203.459497
+AACTAA 98.185853 1.769092 1.864309 0.919892 7.657366 504.167139
+AACTAC 94.736240 1.575556 1.732625 0.824171 7.657366 704.742952
+AACTAG 96.631787 1.674116 1.790998 0.866170 7.657366 77.060198
+AACTAT 96.003379 1.709048 1.801495 0.873796 7.657366 788.572894
+AACTCA 89.885965 1.785686 1.810422 0.880299 7.657366 1227.490052
+AACTCC 88.122175 1.726640 1.739263 0.828911 7.657366 884.915688
+AACTCG 90.020540 1.720851 1.768201 0.849685 7.657366 876.547894
+AACTCT 89.488848 1.881682 1.874980 0.927802 7.657366 897.704246
+AACTGA 95.355499 1.903140 1.753425 0.839056 7.657366 1897.021921
+AACTGC 91.965121 1.604050 1.681565 0.788008 7.657366 2213.907047
+AACTGG 94.103038 1.761081 1.735871 0.826488 7.657366 3289.145009
+AACTGT 93.016082 1.602201 1.706891 0.805877 7.657366 1223.359573
+AACTTA 85.458850 1.587465 1.586887 0.722402 7.657366 795.280443
+AACTTC 82.595152 1.437492 1.498138 0.662656 7.657366 1502.185244
+AACTTG 84.877559 1.613256 1.514321 0.673422 7.657366 786.291222
+AACTTT 83.853647 1.511865 1.448039 0.629695 7.657366 1434.913496
+AAGAAA 76.913949 1.996216 1.525274 0.680741 7.657366 2396.766483
+AAGAAC 73.937303 1.593386 1.795329 0.869314 7.657366 1318.555199
+AAGAAG 76.024020 1.981984 1.593704 0.727062 7.657366 2013.333245
+AAGAAT 74.360568 2.019435 1.808873 0.879169 7.657366 1206.156613
+AAGACA 66.013009 2.049991 2.104081 1.102945 7.657366 849.351238
+AAGACC 64.661079 1.992052 2.101933 1.101257 7.657366 654.760742
+AAGACG 65.578305 2.181446 2.057608 1.066607 7.657366 1300.977651
+AAGACT 65.702368 1.842481 1.908177 0.952551 7.657366 525.571144
+AAGAGA 75.504752 2.066446 1.492778 0.659103 7.657366 1605.901525
+AAGAGC 73.261438 1.666474 1.563460 0.706465 7.657366 1137.932916
+AAGAGG 74.244663 2.118492 1.550019 0.697374 7.657366 1053.892039
+AAGAGT 73.619464 1.872784 1.642370 0.760619 7.657366 953.503160
+AAGATA 65.553565 1.793807 1.884409 0.934809 7.657366 1292.496279
+AAGATC 64.158274 1.638242 1.680051 0.786944 7.657366 1344.118577
+AAGATG 64.849453 1.692073 1.766210 0.848250 7.657366 1710.083400
+AAGATT 65.200281 1.811103 1.751288 0.837523 7.657366 1347.346470
+AAGCAA 83.280186 1.926793 1.656862 0.770708 7.657366 1500.333210
+AAGCAC 81.199724 1.710668 1.496820 0.661782 7.657366 1172.998103
+AAGCAG 82.348727 1.726258 1.596628 0.729065 7.657366 1778.482859
+AAGCAT 82.030749 1.667901 1.565822 0.708066 7.657366 1152.516066
+AAGCCA 79.273411 1.592352 1.668810 0.779060 7.657366 1898.460454
+AAGCCC 77.992778 1.412127 1.597981 0.729991 7.657366 1277.046795
+AAGCCG 79.186393 1.629798 1.675089 0.783461 7.657366 1920.887497
+AAGCCT 78.857187 1.673659 1.567786 0.709399 7.657366 1300.980299
+AAGCGA 81.515683 1.857489 1.586999 0.722479 7.657366 1829.388057
+AAGCGC 78.675699 1.514168 1.477371 0.648925 7.657366 2287.679675
+AAGCGG 80.398641 1.668388 1.614457 0.741310 7.657366 2236.842849
+AAGCGT 79.423794 1.589917 1.513145 0.672638 7.657366 1802.210332
+AAGCTA 78.186743 1.580482 1.581365 0.718635 7.657366 580.729528
+AAGCTC 76.220378 1.484049 1.450312 0.631179 7.657366 793.761153
+AAGCTG 77.552028 1.489268 1.497805 0.662435 7.657366 2074.149881
+AAGCTT 77.080338 1.253215 1.349144 0.566302 7.657366 668.089153
+AAGGAA 75.975643 1.836737 1.615997 0.742371 7.657366 1110.202413
+AAGGAC 70.642227 1.653719 1.924979 0.965160 7.657366 377.440863
+AAGGAG 74.807912 1.819210 1.655812 0.769975 7.657366 765.988850
+AAGGAT 72.236323 1.606660 1.900087 0.946500 7.657366 863.129432
+AAGGCA 70.200469 1.681516 1.776655 0.855786 7.657366 1479.358822
+AAGGCC 68.837497 1.656337 1.747949 0.835128 7.657366 976.995174
+AAGGCG 70.113237 1.673595 1.698718 0.800096 7.657366 2078.585423
+AAGGCT 69.741288 1.548645 1.590618 0.724952 7.657366 930.088763
+AAGGGA 76.239953 1.707672 1.468434 0.643046 7.657366 786.777541
+AAGGGC 73.353382 1.437153 1.581256 0.718561 7.657366 842.173680
+AAGGGG 75.283115 1.617473 1.413461 0.607276 7.657366 709.798078
+AAGGGT 74.510794 1.579853 1.669529 0.779563 7.657366 616.031345
+AAGGTA 68.827118 1.505667 1.570437 0.711199 7.657366 1011.694073
+AAGGTC 67.032663 1.569582 1.767461 0.849151 7.657366 944.696244
+AAGGTG 68.371473 1.672282 1.697383 0.799153 7.657366 1444.475654
+AAGGTT 68.280902 1.604559 1.820937 0.887979 7.657366 1263.911634
+AAGTAA 73.443978 1.812738 1.759184 0.843193 7.657366 1102.398830
+AAGTAC 70.031658 1.740854 1.947575 0.982204 7.657366 800.254863
+AAGTAG 72.364195 1.768471 1.786094 0.862614 7.657366 550.553005
+AAGTAT 71.363043 1.776363 1.937821 0.974834 7.657366 732.623499
+AAGTCA 65.538377 1.798957 2.231979 1.205023 7.657366 1068.229182
+AAGTCC 64.232218 1.891010 2.386152 1.332010 7.657366 575.789630
+AAGTCG 65.551345 1.905285 2.239690 1.211273 7.657366 1004.942495
+AAGTCT 65.346026 1.988291 2.420628 1.360982 7.657366 631.150823
+AAGTGA 69.852848 1.705517 1.806007 0.877080 7.657366 1535.571856
+AAGTGC 65.963530 1.630780 2.163738 1.150184 7.657366 1128.611480
+AAGTGG 68.679704 1.725187 1.942567 0.978418 7.657366 1405.829856
+AAGTGT 67.124942 1.710674 2.176350 1.160255 7.657366 766.509940
+AAGTTA 60.497369 1.792996 2.241304 1.212583 7.657366 962.531128
+AAGTTC 57.752619 1.800510 2.451453 1.387061 7.657366 910.700774
+AAGTTG 60.024343 1.962389 2.304799 1.264474 7.657366 1338.223251
+AAGTTT 58.798472 1.773006 2.161368 1.148295 7.657366 1203.711515
+AATAAA 107.673464 2.819317 2.903014 1.787450 7.657366 2620.843826
+AATAAC 105.722313 2.809410 2.850230 1.738922 7.657366 1875.950887
+AATAAG 106.849761 2.536955 2.770290 1.666281 7.657366 1029.489780
+AATAAT 107.866856 2.872366 3.098385 1.970894 7.657366 2490.816952
+AATACA 103.197262 2.722137 2.698446 1.601884 7.657366 999.111899
+AATACC 103.083136 2.914831 2.760588 1.657535 7.657366 2166.186329
+AATACG 102.772891 2.934910 2.738726 1.637884 7.657366 1648.610208
+AATACT 104.847160 2.919974 2.905113 1.789389 7.657366 988.120004
+AATAGA 105.822006 2.908786 2.800303 1.693432 7.657366 722.613611
+AATAGC 104.202416 2.930529 2.772987 1.668714 7.657366 1325.819688
+AATAGG 105.681868 2.699461 2.554251 1.475216 7.657366 436.348925
+AATAGT 106.826191 2.837199 2.923090 1.806024 7.657366 765.250706
+AATATA 104.225153 3.737060 3.047455 1.922500 7.657366 1287.261166
+AATATC 105.842665 3.177052 3.330506 2.196472 7.657366 2351.568613
+AATATG 104.109710 3.008751 3.145676 2.016189 7.657366 1627.950924
+AATATT 108.052518 3.447916 3.534784 2.401621 7.657366 2385.918228
+AATCAA 116.545305 2.614476 2.932418 1.814676 7.657366 1852.479571
+AATCAC 114.265931 2.483327 2.800280 1.693411 7.657366 2043.749784
+AATCAG 115.683078 2.700199 2.909068 1.793045 7.657366 2828.594867
+AATCAT 115.406783 2.653497 3.025695 1.901945 7.657366 1878.552414
+AATCCA 115.760040 2.447365 2.826571 1.717316 7.657366 1628.563489
+AATCCC 113.830721 2.547761 2.815653 1.707375 7.657366 1405.627938
+AATCCG 115.620957 2.617241 2.841121 1.730593 7.657366 1621.533149
+AATCCT 115.321967 2.640498 2.918738 1.801993 7.657366 978.888060
+AATCGA 116.224045 2.872656 3.047968 1.922985 7.657366 1265.860627
+AATCGC 113.693265 2.607078 2.781082 1.676027 7.657366 2743.487504
+AATCGG 115.705912 2.739131 2.868456 1.755628 7.657366 1403.386571
+AATCGT 115.769493 2.742861 3.042331 1.917653 7.657366 1130.019334
+AATCTA 116.032855 2.534986 2.998611 1.876466 7.657366 395.134108
+AATCTC 114.748042 2.580158 3.003677 1.881222 7.657366 1066.482394
+AATCTG 115.474531 2.475242 2.980832 1.859802 7.657366 1887.915576
+AATCTT 115.848026 2.548110 3.189344 2.058317 7.657366 1413.986535
+AATGAA 104.563783 2.776573 2.797242 1.690657 7.657366 1995.567225
+AATGAC 101.310120 2.659247 2.707039 1.609541 7.657366 1619.887642
+AATGAG 103.632264 2.760138 2.799482 1.692688 7.657366 987.699602
+AATGAT 103.669434 2.897090 2.956653 1.837218 7.657366 1903.634533
+AATGCA 104.024907 3.224793 2.854983 1.743274 7.657366 1500.804033
+AATGCC 103.992929 2.862718 2.818192 1.709685 7.657366 2937.147712
+AATGCG 104.213197 2.880393 2.766975 1.663291 7.657366 2264.874009
+AATGCT 106.047225 2.836437 2.891387 1.776723 7.657366 1539.561185
+AATGGA 105.323747 2.886754 2.894916 1.779976 7.657366 1187.960939
+AATGGC 103.398783 2.682675 2.763126 1.659821 7.657366 2838.926073
+AATGGG 104.670965 3.001631 2.802620 1.695534 7.657366 1101.835062
+AATGGT 105.770860 2.829131 2.945920 1.827224 7.657366 1925.855197
+AATGTA 107.047480 3.280752 3.203441 2.071979 7.657366 829.884736
+AATGTC 107.941277 3.203354 3.386312 2.251909 7.657366 1221.103043
+AATGTG 107.083668 3.037168 3.199498 2.068155 7.657366 1179.181606
+AATGTT 110.377833 2.891985 3.358933 2.224653 7.657366 1666.889927
+AATTAA 104.059548 2.245460 2.120552 1.115922 7.657366 1823.295297
+AATTAC 100.394015 2.064110 1.996301 1.019293 7.657366 1420.916764
+AATTAG 103.373137 2.134320 2.118923 1.114636 7.657366 550.013002
+AATTAT 101.762356 1.938742 2.063276 1.071017 7.657366 1540.354035
+AATTCA 98.213038 2.086938 2.294747 1.256211 7.657366 1482.615425
+AATTCC 96.598475 2.024633 2.138855 1.130400 7.657366 1294.604206
+AATTCG 98.331149 2.139103 2.226729 1.200774 7.657366 1115.938517
+AATTCT 98.144310 2.127732 2.260302 1.228033 7.657366 945.570566
+AATTGA 102.151215 2.185372 2.129565 1.123044 7.657366 1421.499324
+AATTGC 98.327182 2.041385 1.944663 0.980002 7.657366 2051.368676
+AATTGG 101.274386 2.193891 2.085428 1.088311 7.657366 880.603070
+AATTGT 99.778448 2.042245 2.029690 1.044973 7.657366 1051.975068
+AATTTA 94.408899 1.852415 1.973569 1.001933 7.657366 1432.413884
+AATTTC 92.305885 1.804893 1.897936 0.944893 7.657366 1860.609829
+AATTTG 93.915448 1.769187 1.919980 0.961403 7.657366 1756.955627
+AATTTT 93.821900 1.926079 2.030928 1.045929 7.657366 2453.957671
+ACAAAA 90.063313 2.363745 2.094429 1.095365 7.657366 2123.993538
+ACAAAC 86.974312 2.319915 2.279130 1.243409 7.657366 1679.231838
+ACAAAG 89.508237 2.653602 2.240750 1.212133 7.657366 1599.888236
+ACAAAT 87.939082 2.415400 2.367273 1.316233 7.657366 1329.143064
+ACAACA 80.652659 2.152424 1.977025 1.004567 7.657366 1600.387836
+ACAACC 79.395140 2.026335 1.924180 0.964559 7.657366 1404.842659
+ACAACG 80.333761 2.086247 1.922583 0.963359 7.657366 1630.689158
+ACAACT 79.982221 2.118189 1.875877 0.928468 7.657366 991.776100
+ACAAGA 88.424666 2.391666 2.089509 1.091508 7.657366 708.562786
+ACAAGC 86.231369 2.288474 2.184678 1.166921 7.657366 851.773453
+ACAAGG 87.523327 2.420967 2.033070 1.047584 7.657366 720.134652
+ACAAGT 86.735947 2.601031 2.272862 1.238283 7.657366 550.063989
+ACAATA 79.070992 2.118111 2.007411 1.027814 7.657366 1092.453586
+ACAATC 77.640937 2.139018 2.322793 1.279311 7.657366 1338.676776
+ACAATG 78.615337 2.038432 2.192624 1.173294 7.657366 1223.691787
+ACAATT 78.516350 2.333893 2.420640 1.360992 7.657366 1178.569423
+ACACAA 91.695338 1.497476 1.374794 0.582528 7.657366 766.443464
+ACACAC 89.238999 1.245656 1.372996 0.581385 7.657366 490.647186
+ACACAG 91.179018 1.316658 1.377675 0.584360 7.657366 793.815850
+ACACAT 90.416709 1.260885 1.377614 0.584321 7.657366 642.571195
+ACACCA 88.805875 1.339340 1.452483 0.632597 7.657366 1803.430552
+ACACCC 87.542436 1.362663 1.473311 0.646252 7.657366 887.140784
+ACACCG 88.644993 1.399088 1.483444 0.652931 7.657366 1743.931969
+ACACCT 88.263211 1.363454 1.424690 0.614527 7.657366 913.759663
+ACACGA 91.040998 1.292908 1.385252 0.589188 7.657366 508.768155
+ACACGC 88.669757 1.416422 1.444852 0.627618 7.657366 1137.063294
+ACACGG 90.253128 1.390962 1.429677 0.617756 7.657366 721.368383
+ACACGT 89.319556 1.369864 1.394155 0.594876 7.657366 478.217559
+ACACTA 87.427730 1.501863 1.561535 0.705161 7.657366 407.788094
+ACACTC 85.062331 1.483098 1.449416 0.630594 7.657366 502.328697
+ACACTG 86.457155 1.368971 1.463452 0.639776 7.657366 1012.825617
+ACACTT 86.105846 1.661801 1.526731 0.681717 7.657366 777.214381
+ACAGAA 85.772374 2.310634 2.065516 1.072761 7.657366 1281.004177
+ACAGAC 80.189158 1.902412 2.205651 1.183765 7.657366 988.520346
+ACAGAG 84.566708 2.014193 2.089922 1.091831 7.657366 654.481185
+ACAGAT 82.161732 1.979555 2.208178 1.185800 7.657366 1060.511419
+ACAGCA 80.397195 1.772131 1.729287 0.821790 7.657366 2397.476249
+ACAGCC 79.003802 2.054052 1.789192 0.864860 7.657366 1563.599167
+ACAGCG 80.315587 1.997967 1.736677 0.827064 7.657366 2485.246371
+ACAGCT 79.863749 1.767291 1.829126 0.893976 7.657366 1080.314744
+ACAGGA 85.436653 1.869541 1.955375 0.988110 7.657366 1080.393743
+ACAGGC 81.810627 2.008101 2.081222 1.085020 7.657366 1758.165990
+ACAGGG 84.397396 2.117810 2.028528 1.044076 7.657366 813.634626
+ACAGGT 83.188729 2.040872 2.077803 1.082347 7.657366 1462.173173
+ACAGTA 77.518384 1.954875 2.140696 1.131860 7.657366 811.874690
+ACAGTC 74.349435 2.086976 2.323598 1.279976 7.657366 651.975216
+ACAGTG 76.277486 1.939085 2.111614 1.108874 7.657366 987.285251
+ACAGTT 75.932127 2.002732 2.327065 1.282841 7.657366 1221.738459
+ACATAA 86.167844 1.957548 1.926062 0.965974 7.657366 1063.188562
+ACATAC 82.870710 1.955384 2.018904 1.036654 7.657366 753.194731
+ACATAG 85.834770 1.977576 1.952895 0.986231 7.657366 524.005163
+ACATAT 84.219363 1.878786 2.017196 1.035338 7.657366 837.144522
+ACATCA 79.404278 2.120286 2.324951 1.281094 7.657366 2320.043482
+ACATCC 77.915479 2.121388 2.363477 1.313069 7.657366 1259.970813
+ACATCG 79.403363 2.194057 2.264190 1.231202 7.657366 1650.850769
+ACATCT 78.652369 1.891692 2.292356 1.254248 7.657366 1001.358098
+ACATGA 83.735787 1.912482 1.907665 0.952168 7.657366 934.225778
+ACATGC 80.420999 2.051716 2.131368 1.124470 7.657366 1090.431000
+ACATGG 83.111171 2.136375 2.046194 1.057744 7.657366 1017.921685
+ACATGT 81.267887 1.926108 2.208331 1.185923 7.657366 560.425645
+ACATTA 73.922192 2.443556 2.545866 1.467958 7.657366 1225.503725
+ACATTC 70.752628 2.265505 2.551158 1.472538 7.657366 990.281616
+ACATTG 72.930467 2.356579 2.490253 1.420121 7.657366 1324.520286
+ACATTT 71.981825 2.133734 2.290733 1.252916 7.657366 1213.686198
+ACCAAA 100.831945 1.655119 1.580108 0.717778 7.657366 2006.487828
+ACCAAC 98.514445 1.618831 1.738785 0.828570 7.657366 1615.519894
+ACCAAG 99.804208 1.611859 1.672036 0.781320 7.657366 427.517394
+ACCAAT 99.504426 1.617436 1.776474 0.855655 7.657366 1528.893025
+ACCACA 92.938359 1.731365 1.489292 0.656795 7.657366 1435.487446
+ACCACC 91.992758 1.611301 1.544134 0.693407 7.657366 2673.410970
+ACCACG 92.928139 1.745944 1.496024 0.661254 7.657366 2070.305078
+ACCACT 92.940067 1.591975 1.595923 0.728582 7.657366 1436.769181
+ACCAGA 99.004425 1.758995 1.730544 0.822686 7.657366 2518.045119
+ACCAGC 96.976849 1.671711 1.753675 0.839236 7.657366 3782.691516
+ACCAGG 97.822921 1.740462 1.760603 0.844214 7.657366 1825.773429
+ACCAGT 97.685362 1.695141 1.810181 0.880123 7.657366 2042.404346
+ACCATA 91.942048 1.831924 1.753398 0.839037 7.657366 961.769959
+ACCATC 90.370848 1.789772 1.933071 0.971252 7.657366 2377.686573
+ACCATG 91.345761 1.733725 1.795336 0.869318 7.657366 1351.432064
+ACCATT 91.583598 1.784831 1.956380 0.988872 7.657366 1975.200779
+ACCCAA 99.813058 1.698307 1.223936 0.489326 7.657366 763.884232
+ACCCAC 97.298530 1.780802 1.335703 0.557860 7.657366 1025.108593
+ACCCAG 98.834642 1.789362 1.257195 0.509406 7.657366 1783.075162
+ACCCAT 98.366873 1.687462 1.401409 0.599526 7.657366 955.014379
+ACCCCA 96.099825 1.766839 1.296893 0.533724 7.657366 743.132471
+ACCCCC 94.812031 1.686205 1.229185 0.492477 7.657366 362.329745
+ACCCCG 95.931616 1.658486 1.235026 0.495992 7.657366 769.146051
+ACCCCT 95.577358 1.491062 1.196193 0.472783 7.657366 458.914493
+ACCCGA 98.609042 1.743703 1.306366 0.539582 7.657366 1019.985430
+ACCCGC 96.389669 1.776166 1.398983 0.597970 7.657366 2090.232446
+ACCCGG 97.616457 1.626954 1.285026 0.526415 7.657366 1509.378193
+ACCCGT 97.056498 1.621035 1.445248 0.627876 7.657366 1137.055264
+ACCCTA 94.763241 1.643448 1.350633 0.567240 7.657366 264.560435
+ACCCTC 93.235124 1.442696 1.450325 0.631188 7.657366 461.376721
+ACCCTG 94.411909 1.711723 1.426486 0.615689 7.657366 1412.457432
+ACCCTT 94.002582 1.654968 1.597365 0.729569 7.657366 675.099103
+ACCGAA 96.101191 1.892699 1.677014 0.784811 7.657366 1791.630744
+ACCGAC 91.512760 1.658595 1.891685 0.940229 7.657366 1659.934240
+ACCGAG 94.834858 1.681455 1.716127 0.812427 7.657366 964.834120
+ACCGAT 93.218633 1.709263 1.874898 0.927741 7.657366 1987.947325
+ACCGCA 91.367227 1.628444 1.531980 0.685236 7.657366 2171.615241
+ACCGCC 90.172321 1.621570 1.632895 0.754046 7.657366 3132.877972
+ACCGCG 91.277425 1.641552 1.594295 0.727467 7.657366 2354.796401
+ACCGCT 91.323612 1.534378 1.633313 0.754336 7.657366 2177.351030
+ACCGGA 95.992647 1.670561 1.691158 0.794761 7.657366 2107.334755
+ACCGGC 92.945615 1.610726 1.761406 0.844791 7.657366 2405.424965
+ACCGGG 94.902334 1.662945 1.646076 0.763194 7.657366 1550.462089
+ACCGGT 94.376729 1.591548 1.811015 0.880731 7.657366 2273.995896
+ACCGTA 89.937318 1.740024 1.798978 0.871965 7.657366 1357.079173
+ACCGTC 87.633889 1.716990 1.957920 0.990040 7.657366 1682.590419
+ACCGTG 89.051801 1.816155 1.923596 0.964120 7.657366 1533.278123
+ACCGTT 89.153767 1.865997 2.047306 1.058606 7.657366 2213.044363
+ACCTAA 94.464573 1.824950 1.613090 0.740369 7.657366 430.825199
+ACCTAC 91.432405 1.845388 1.726845 0.820050 7.657366 601.684556
+ACCTAG 93.472534 1.549919 1.528380 0.682822 7.657366 32.441437
+ACCTAT 92.576529 1.926447 1.755624 0.840635 7.657366 532.570029
+ACCTCA 87.869173 1.952716 1.999369 1.021644 7.657366 674.761607
+ACCTCC 86.220065 1.828007 1.922993 0.963667 7.657366 532.283826
+ACCTCG 87.666441 1.837862 1.955289 0.988045 7.657366 728.962105
+ACCTCT 87.570403 2.061411 1.947468 0.982123 7.657366 731.734061
+ACCTGA 92.090937 1.785285 1.645132 0.762538 7.657366 2007.887795
+ACCTGC 88.798051 1.763338 1.802303 0.874383 7.657366 2419.804136
+ACCTGG 90.927315 1.995589 1.732592 0.824147 7.657366 2016.424398
+ACCTGT 89.929171 1.786534 1.839527 0.901611 7.657366 1366.710596
+ACCTTA 83.111533 1.918042 1.978102 1.005387 7.657366 709.149666
+ACCTTC 80.106534 1.796219 1.996523 1.019464 7.657366 1742.557893
+ACCTTG 82.692537 1.967199 1.962735 0.993695 7.657366 842.234045
+ACCTTT 81.359747 1.720022 1.862300 0.918406 7.657366 1650.437940
+ACGAAA 83.018139 2.064464 2.061045 1.069280 7.657366 1393.341145
+ACGAAC 79.960122 1.948309 2.178391 1.161888 7.657366 1005.550291
+ACGAAG 81.881109 2.410628 2.136292 1.128369 7.657366 1207.661476
+ACGAAT 80.767466 2.229255 2.259164 1.227105 7.657366 1163.617927
+ACGACA 72.803490 2.067331 2.140638 1.131814 7.657366 1039.104672
+ACGACC 71.697542 2.052323 2.139168 1.130648 7.657366 1268.431134
+ACGACG 72.607965 2.088489 2.056486 1.065734 7.657366 1467.390260
+ACGACT 72.324052 1.904603 1.942844 0.978627 7.657366 691.734146
+ACGAGA 81.704846 2.282734 2.059216 1.067857 7.657366 563.298493
+ACGAGC 79.634711 2.275537 2.128101 1.121886 7.657366 863.873736
+ACGAGG 80.644606 2.239305 2.007206 1.027657 7.657366 428.630823
+ACGAGT 80.095364 2.095646 2.090967 1.092650 7.657366 492.657750
+ACGATA 71.432864 1.700293 1.900579 0.946867 7.657366 1561.921794
+ACGATC 70.009866 2.037245 2.089578 1.091561 7.657366 1481.989542
+ACGATG 70.637766 1.871261 1.926453 0.966269 7.657366 1648.436077
+ACGATT 71.016222 1.956295 2.278167 1.242621 7.657366 1119.301775
+ACGCAA 87.324649 1.457519 1.515928 0.674494 7.657366 1609.059675
+ACGCAC 84.879121 1.440604 1.538034 0.689301 7.657366 1415.761890
+ACGCAG 86.490220 1.537255 1.560342 0.704352 7.657366 2454.424145
+ACGCAT 85.747237 1.400972 1.523911 0.679829 7.657366 1688.201394
+ACGCCA 83.889880 1.531255 1.727982 0.820860 7.657366 3369.167153
+ACGCCC 82.671234 1.571043 1.727842 0.820760 7.657366 1958.251392
+ACGCCG 83.774821 1.568213 1.721766 0.816435 7.657366 3449.633304
+ACGCCT 83.486320 1.603835 1.685713 0.790926 7.657366 1823.958124
+ACGCGA 85.846706 1.516744 1.582379 0.719326 7.657366 1603.924918
+ACGCGC 83.031551 1.680321 1.744729 0.832822 7.657366 2813.044255
+ACGCGG 84.817920 1.616302 1.704033 0.803854 7.657366 2105.770725
+ACGCGT 83.828251 1.618462 1.775281 0.854793 7.657366 1636.363154
+ACGCTA 82.335796 1.705673 1.830472 0.894963 7.657366 1055.398505
+ACGCTC 79.974639 1.596753 1.722995 0.817309 7.657366 1355.869404
+ACGCTG 81.358574 1.596408 1.737508 0.827657 7.657366 4042.003026
+ACGCTT 80.790648 1.456478 1.510037 0.670567 7.657366 1746.142523
+ACGGAA 82.531670 2.068361 2.093503 1.094638 7.657366 1371.957640
+ACGGAC 77.097379 1.995991 2.381012 1.327708 7.657366 681.706790
+ACGGAG 81.110902 2.038553 2.111878 1.109081 7.657366 540.568880
+ACGGAT 78.704821 2.040601 2.312294 1.270647 7.657366 1268.202576
+ACGGCA 77.503481 1.869025 1.821369 0.888295 7.657366 2597.189984
+ACGGCC 75.892689 1.786595 1.730257 0.822482 7.657366 911.807146
+ACGGCG 77.103232 1.864995 1.803116 0.874975 7.657366 2817.499254
+ACGGCT 76.998404 1.919357 1.854500 0.912642 7.657366 1375.397967
+ACGGGA 83.358882 1.897131 2.028020 1.043683 7.657366 823.129482
+ACGGGC 80.156120 2.069727 2.097502 1.097776 7.657366 1449.541260
+ACGGGG 82.121182 2.080717 2.070189 1.076404 7.657366 499.585980
+ACGGGT 81.455066 2.000709 2.210719 1.187847 7.657366 1160.291926
+ACGGTA 75.643141 1.957142 2.085328 1.088233 7.657366 1798.153167
+ACGGTC 73.087305 2.289838 2.417689 1.358504 7.657366 1291.152708
+ACGGTG 74.589470 2.038446 2.234585 1.207134 7.657366 1895.591560
+ACGGTT 74.379635 2.163040 2.385223 1.331232 7.657366 1603.894766
+ACGTAA 78.943056 2.044119 2.130214 1.123557 7.657366 1235.376957
+ACGTAC 75.409360 1.942375 2.272335 1.237853 7.657366 838.387618
+ACGTAG 78.175965 2.094405 2.160884 1.147909 7.657366 609.090991
+ACGTAT 76.476820 1.968576 2.268631 1.234827 7.657366 724.157007
+ACGTCA 71.722020 2.059695 2.504087 1.431972 7.657366 1432.705498
+ACGTCC 70.076522 1.921047 2.568921 1.487943 7.657366 808.155008
+ACGTCG 71.742036 2.027234 2.625527 1.537393 7.657366 1154.435158
+ACGTCT 71.325964 1.962821 2.534754 1.458358 7.657366 879.952576
+ACGTGA 75.820352 2.310201 2.252569 1.221736 7.657366 988.857557
+ACGTGC 71.659615 1.965858 2.422669 1.362704 7.657366 1094.861942
+ACGTGG 74.665708 2.192639 2.356797 1.307505 7.657366 931.157008
+ACGTGT 73.032772 2.386924 2.654708 1.563095 7.657366 485.027499
+ACGTTA 66.428583 2.054295 2.588880 1.505318 7.657366 1417.417085
+ACGTTC 63.108333 2.193746 2.786351 1.680792 7.657366 1483.547909
+ACGTTG 65.584268 2.170741 2.661359 1.568973 7.657366 1868.635993
+ACGTTT 64.455579 1.971833 2.569231 1.488213 7.657366 1697.171815
+ACTAAA 106.558581 1.980188 2.184288 1.166609 7.657366 729.091790
+ACTAAC 104.212825 1.978811 2.173295 1.157813 7.657366 589.786310
+ACTAAG 105.541709 2.114305 2.092550 1.093891 7.657366 263.509830
+ACTAAT 105.971570 2.193831 2.290998 1.253134 7.657366 525.896221
+ACTACA 102.131459 1.917774 1.933424 0.971518 7.657366 646.758987
+ACTACC 101.903422 2.091575 2.039720 1.052728 7.657366 884.096587
+ACTACG 101.889629 1.875890 1.965434 0.995745 7.657366 840.421469
+ACTACT 102.943958 1.935829 2.199479 1.178800 7.657366 511.143112
+ACTAGA 103.923008 1.946870 1.902393 0.948223 7.657366 74.513465
+ACTAGC 102.955143 2.017906 2.003676 1.024947 7.657366 114.557331
+ACTAGG 103.316867 1.940837 1.964001 0.994656 7.657366 45.566636
+ACTAGT 103.857962 1.937936 1.959034 0.990885 7.657366 99.448158
+ACTATA 101.991040 2.256800 2.268952 1.235089 7.657366 413.179783
+ACTATC 102.766456 2.320536 2.397715 1.341703 7.657366 979.575960
+ACTATG 101.857483 2.238631 2.397673 1.341669 7.657366 605.145201
+ACTATT 104.605482 2.344758 2.630020 1.541341 7.657366 787.454077
+ACTCAA 111.819170 2.185517 2.193729 1.174180 7.657366 880.064812
+ACTCAC 109.526249 1.881097 2.052343 1.062515 7.657366 843.602613
+ACTCAG 111.113307 1.960733 2.186700 1.168541 7.657366 807.786386
+ACTCAT 110.770084 2.012340 2.218254 1.193925 7.657366 969.911720
+ACTCCA 110.471616 2.182818 2.204241 1.182630 7.657366 969.898847
+ACTCCC 108.987710 1.948603 2.136787 1.128761 7.657366 533.134430
+ACTCCG 110.571488 2.178212 2.168907 1.154308 7.657366 759.463748
+ACTCCT 110.127344 2.248020 2.142650 1.133410 7.657366 608.629837
+ACTCGA 111.499023 1.959571 2.271099 1.236843 7.657366 439.873551
+ACTCGC 108.889955 2.029477 2.123787 1.118476 7.657366 958.033566
+ACTCGG 110.971519 2.339973 2.282649 1.246290 7.657366 556.303093
+ACTCGT 110.383999 2.126802 2.340726 1.294154 7.657366 504.294740
+ACTCTA 110.150491 2.169948 2.336892 1.290976 7.657366 288.977470
+ACTCTC 109.359731 2.141134 2.336993 1.291060 7.657366 622.450924
+ACTCTG 110.291796 2.296990 2.276589 1.241330 7.657366 981.808375
+ACTCTT 110.530544 2.320552 2.414126 1.355502 7.657366 1065.651970
+ACTGAA 102.138344 2.034705 2.038371 1.051683 7.657366 1924.525261
+ACTGAC 98.450424 1.747148 1.946714 0.981552 7.657366 1621.307562
+ACTGAG 101.352389 2.099683 2.073979 1.079361 7.657366 681.331277
+ACTGAT 100.481721 2.077501 2.166203 1.152150 7.657366 1841.992586
+ACTGCA 100.353644 1.993440 2.003108 1.024511 7.657366 1348.280926
+ACTGCC 99.634902 2.097886 2.051645 1.061974 7.657366 1904.751759
+ACTGCG 100.403642 1.939288 1.994539 1.017944 7.657366 1913.517151
+ACTGCT 101.207900 2.155697 2.140710 1.131871 7.657366 1708.940322
+ACTGGA 102.339616 2.063320 2.176160 1.160103 7.657366 1938.346945
+ACTGGC 99.810748 2.112115 2.012363 1.031620 7.657366 3773.816301
+ACTGGG 101.605907 2.003813 2.136707 1.128698 7.657366 1220.514499
+ACTGGT 102.036138 2.145682 2.195543 1.175637 7.657366 2096.571601
+ACTGTA 101.031036 2.065525 2.326151 1.282086 7.657366 666.139679
+ACTGTC 100.857603 2.388966 2.375920 1.323452 7.657366 794.216279
+ACTGTG 100.769277 2.215194 2.309503 1.268347 7.657366 814.593167
+ACTGTT 102.930770 2.404880 2.589318 1.505700 7.657366 1475.708730
+ACTTAA 100.764246 1.784768 1.704960 0.804510 7.657366 834.626017
+ACTTAC 96.862568 1.767470 1.585505 0.721459 7.657366 699.331549
+ACTTAG 99.972223 1.679103 1.681628 0.788053 7.657366 272.315839
+ACTTAT 98.300142 1.559212 1.682302 0.788526 7.657366 700.267513
+ACTTCA 95.506770 1.745149 1.724193 0.818162 7.657366 1679.808427
+ACTTCC 93.790667 1.591378 1.696281 0.798375 7.657366 1473.024848
+ACTTCG 95.311761 1.664582 1.726147 0.819553 7.657366 1348.838717
+ACTTCT 95.085597 1.734586 1.699854 0.800899 7.657366 1098.529656
+ACTTGA 98.964303 1.782985 1.637716 0.757388 7.657366 499.805784
+ACTTGC 94.931286 1.588591 1.563994 0.706827 7.657366 820.946288
+ACTTGG 97.625896 1.788761 1.615706 0.742170 7.657366 287.743679
+ACTTGT 96.409328 1.693406 1.676606 0.784525 7.657366 499.532189
+ACTTTA 91.939898 1.575950 1.607970 0.736847 7.657366 1239.062143
+ACTTTC 89.364051 2.055414 1.560396 0.704389 7.657366 1684.188639
+ACTTTG 91.389728 1.726875 1.567655 0.709310 7.657366 1462.893267
+ACTTTT 91.474394 1.529973 1.624479 0.748224 7.657366 1522.875806
+AGAAAA 76.864654 1.902882 1.485100 0.654024 7.657366 2557.297224
+AGAAAC 74.705630 1.899814 1.495226 0.660725 7.657366 1727.273299
+AGAAAG 75.983422 2.057114 1.458164 0.636312 7.657366 1833.996673
+AGAAAT 75.041703 1.745925 1.481556 0.651685 7.657366 1801.990328
+AGAACA 66.301090 2.107950 1.922137 0.963023 7.657366 1421.856548
+AGAACC 65.338153 2.042419 2.014199 1.033032 7.657366 950.622646
+AGAACG 66.220443 2.038928 1.870481 0.924464 7.657366 1401.430967
+AGAACT 66.286170 1.849487 1.863086 0.918987 7.657366 969.538254
+AGAAGA 74.705545 1.822442 1.480128 0.650743 7.657366 1495.412993
+AGAAGC 73.811464 1.831128 1.369329 0.579058 7.657366 1397.801644
+AGAAGG 73.995715 1.750644 1.476897 0.648613 7.657366 1079.693476
+AGAAGT 73.788376 1.508419 1.377746 0.584405 7.657366 1033.375790
+AGAATA 66.481312 1.789884 1.744925 0.832962 7.657366 1347.748150
+AGAATC 64.998469 1.663073 1.653989 0.768704 7.657366 992.287922
+AGAATG 65.949510 1.806456 1.753027 0.838770 7.657366 1222.638825
+AGAATT 66.254322 1.737764 1.536577 0.688322 7.657366 1017.960676
+AGACAA 80.936748 2.444043 1.970017 0.999229 7.657366 946.306778
+AGACAC 79.712788 1.844803 1.728716 0.821383 7.657366 619.652731
+AGACAG 80.316325 2.070712 1.853701 0.912053 7.657366 874.950253
+AGACAT 80.421169 2.050472 1.766654 0.848570 7.657366 993.737659
+AGACCA 77.244791 2.073390 1.953071 0.986364 7.657366 1104.961794
+AGACCC 75.913348 1.967662 1.823441 0.889811 7.657366 458.338339
+AGACCG 77.027362 2.028148 1.951859 0.985447 7.657366 922.272826
+AGACCT 77.066571 2.076568 1.934017 0.971965 7.657366 526.638463
+AGACGA 80.067882 2.115350 1.903860 0.949320 7.657366 991.803375
+AGACGC 78.721423 1.689199 1.677998 0.785502 7.657366 1620.403419
+AGACGG 79.145238 1.780124 1.760879 0.844413 7.657366 1094.690557
+AGACGT 79.066944 1.741743 1.796571 0.870215 7.657366 900.722072
+AGACTA 76.792907 1.909967 1.741795 0.830722 7.657366 268.497844
+AGACTC 75.563226 1.626866 1.580148 0.717806 7.657366 498.177591
+AGACTG 76.258894 1.752776 1.679381 0.786474 7.657366 761.328743
+AGACTT 75.915204 1.572545 1.532187 0.685374 7.657366 555.021829
+AGAGAA 76.594402 1.627890 1.418375 0.610445 7.657366 1349.484492
+AGAGAC 73.262908 1.325034 1.520293 0.677409 7.657366 477.213435
+AGAGAG 75.562396 1.552207 1.372226 0.580896 7.657366 805.815240
+AGAGAT 74.421684 1.331703 1.474003 0.646707 7.657366 1298.524767
+AGAGCA 69.998265 1.553778 1.611401 0.739207 7.657366 1121.887210
+AGAGCC 68.667105 1.637465 1.653700 0.768503 7.657366 485.521532
+AGAGCG 70.011483 1.597752 1.575177 0.714421 7.657366 1188.881693
+AGAGCT 69.725546 1.510685 1.469489 0.643739 7.657366 790.049644
+AGAGGA 75.715396 1.528779 1.407384 0.603364 7.657366 537.920652
+AGAGGC 74.144127 1.277078 1.397486 0.597010 7.657366 764.173641
+AGAGGG 75.175483 1.385090 1.277554 0.521830 7.657366 508.884901
+AGAGGT 74.807801 1.368310 1.437921 0.623107 7.657366 717.811842
+AGAGTA 69.765781 1.613989 1.339579 0.560290 7.657366 642.218414
+AGAGTC 68.059176 1.330741 1.543768 0.693160 7.657366 465.919102
+AGAGTG 69.226619 1.497620 1.416348 0.609137 7.657366 768.226539
+AGAGTT 69.579172 1.500014 1.537536 0.688967 7.657366 973.507359
+AGATAA 74.568236 1.968897 2.078833 1.083153 7.657366 1874.614854
+AGATAC 72.365509 1.844313 1.994045 1.017567 7.657366 860.509996
+AGATAG 73.832648 1.716101 1.952935 0.986261 7.657366 805.746681
+AGATAT 73.432079 2.172514 2.202616 1.181322 7.657366 1251.782807
+AGATCA 67.834902 1.867309 2.233883 1.206566 7.657366 1166.092036
+AGATCC 66.669077 1.921730 2.300891 1.261260 7.657366 1219.340512
+AGATCG 67.866621 1.887472 2.309761 1.268560 7.657366 1428.027633
+AGATCT 68.040372 2.329420 2.396105 1.340353 7.657366 823.928531
+AGATGA 72.186090 1.688159 1.855694 0.913524 7.657366 1562.323613
+AGATGC 69.679835 1.649246 1.979453 1.006417 7.657366 1643.321773
+AGATGG 71.147262 1.840507 1.919364 0.960940 7.657366 1428.079363
+AGATGT 70.451145 1.891997 2.138444 1.130075 7.657366 967.779161
+AGATTA 63.780032 1.817400 1.996462 1.019417 7.657366 1091.745442
+AGATTC 61.230298 1.703207 1.915220 0.957829 7.657366 868.156036
+AGATTG 63.001398 1.735145 1.914172 0.957043 7.657366 1272.152535
+AGATTT 62.730700 1.813921 1.808202 0.878680 7.657366 1533.935229
+AGCAAA 102.332437 2.041531 2.158085 1.145680 7.657366 2582.560942
+AGCAAC 99.559141 1.859230 2.136273 1.128354 7.657366 2186.717017
+AGCAAG 101.319597 1.970563 2.141313 1.132350 7.657366 1295.775837
+AGCAAT 100.823749 2.077486 2.320540 1.277450 7.657366 1976.415940
+AGCACA 93.493271 1.659531 1.634747 0.755329 7.657366 1258.978667
+AGCACC 92.153505 1.638017 1.693658 0.796524 7.657366 1801.913222
+AGCACG 93.330223 1.674884 1.659143 0.772300 7.657366 1603.708757
+AGCACT 93.408623 1.696950 1.795102 0.869148 7.657366 1069.733824
+AGCAGA 99.488722 1.989939 2.167239 1.152976 7.657366 2045.623960
+AGCAGC 97.055230 1.952275 2.088789 1.090943 7.657366 2761.712504
+AGCAGG 98.581913 1.922495 2.105946 1.104412 7.657366 2494.770544
+AGCAGT 98.260512 1.873890 2.287566 1.250318 7.657366 1587.658004
+AGCATA 93.081731 1.677972 1.849296 0.908803 7.657366 1080.539739
+AGCATC 91.875294 1.920796 2.043693 1.055806 7.657366 2201.588358
+AGCATG 92.462133 1.786950 1.895100 0.942776 7.657366 1214.141028
+AGCATT 93.404388 2.123718 2.144750 1.135077 7.657366 1580.082532
+AGCCAA 102.836744 1.920734 1.803337 0.875136 7.657366 1365.294976
+AGCCAC 100.144195 1.871108 1.820828 0.887899 7.657366 1426.598246
+AGCCAG 102.000321 1.917401 1.831302 0.895571 7.657366 3348.833641
+AGCCAT 101.324320 1.971317 2.027807 1.043519 7.657366 1740.356367
+AGCCCA 99.266886 1.803837 1.631297 0.752939 7.657366 1120.041664
+AGCCCC 97.714934 1.589407 1.572734 0.712760 7.657366 847.500060
+AGCCCG 99.269628 1.677972 1.605759 0.735327 7.657366 1558.859262
+AGCCCT 98.661262 1.652557 1.761167 0.844620 7.657366 776.504436
+AGCCGA 101.280008 2.109897 2.071332 1.077295 7.657366 1307.028206
+AGCCGC 99.146742 1.838367 1.809962 0.879963 7.657366 2106.095978
+AGCCGG 100.724119 1.963043 1.794037 0.868375 7.657366 1384.467718
+AGCCGT 100.060612 2.025190 2.038648 1.051898 7.657366 1464.971776
+AGCCTA 98.311873 1.895547 1.802355 0.874421 7.657366 321.256544
+AGCCTC 96.380065 1.756508 1.808757 0.879085 7.657366 674.755006
+AGCCTG 97.520666 1.860897 1.833282 0.897024 7.657366 2146.556505
+AGCCTT 97.395879 1.825677 1.968073 0.997751 7.657366 1044.969434
+AGCGAA 98.201482 2.068942 2.094686 1.095567 7.657366 2233.178713
+AGCGAC 94.168692 1.927777 2.000644 1.022622 7.657366 1513.226452
+AGCGAG 97.139206 2.034173 2.154002 1.142430 7.657366 1073.475581
+AGCGAT 96.003319 2.012606 2.140722 1.131880 7.657366 2427.798032
+AGCGCA 93.149409 1.796549 1.781482 0.859276 7.657366 2841.847586
+AGCGCC 91.777628 1.754680 1.849659 0.909071 7.657366 3477.674253
+AGCGCG 92.968025 1.790074 1.861279 0.917651 7.657366 2831.792544
+AGCGCT 93.284999 1.872478 1.899488 0.946052 7.657366 980.157379
+AGCGGA 97.742344 2.246966 2.234574 1.207126 7.657366 1597.978640
+AGCGGC 95.212535 1.889063 2.010667 1.030316 7.657366 3059.487252
+AGCGGG 97.253285 2.007746 2.046229 1.057771 7.657366 1667.140217
+AGCGGT 96.913401 1.883320 2.146232 1.136254 7.657366 2354.698507
+AGCGTA 93.122521 2.119396 2.163973 1.150371 7.657366 1774.320830
+AGCGTC 91.970041 2.108346 2.167154 1.152909 7.657366 1956.088669
+AGCGTG 92.671581 2.163223 2.213273 1.189906 7.657366 1867.581016
+AGCGTT 93.761043 2.299069 2.376003 1.323521 7.657366 2484.443683
+AGCTAA 95.956226 1.948802 1.890745 0.939528 7.657366 597.020058
+AGCTAC 93.068947 1.637901 1.783527 0.860755 7.657366 654.907800
+AGCTAG 95.203092 1.908097 1.951968 0.985529 7.657366 74.586155
+AGCTAT 94.159220 1.854979 1.840209 0.902113 7.657366 699.837446
+AGCTCA 88.242356 1.747065 1.840702 0.902475 7.657366 948.099587
+AGCTCC 86.286718 1.577428 1.785775 0.862383 7.657366 779.641254
+AGCTCG 88.066342 1.733611 1.846895 0.907034 7.657366 870.846765
+AGCTCT 87.681755 2.081146 1.911635 0.955141 7.657366 818.842128
+AGCTGA 93.679508 1.846394 1.831065 0.895398 7.657366 1427.675286
+AGCTGC 90.280559 1.555409 1.690381 0.794213 7.657366 1995.716754
+AGCTGG 92.177139 1.830751 1.770917 0.851643 7.657366 2297.476790
+AGCTGT 91.045651 1.781507 1.833732 0.897354 7.657366 1110.676383
+AGCTTA 83.837850 1.691981 1.617614 0.743486 7.657366 811.506007
+AGCTTC 81.096750 1.557872 1.524727 0.680375 7.657366 1284.697878
+AGCTTG 83.365144 1.705424 1.589718 0.724337 7.657366 852.078443
+AGCTTT 82.406625 1.585879 1.463696 0.639936 7.657366 1328.078657
+AGGAAA 69.852085 1.961073 1.675699 0.783889 7.657366 1437.516097
+AGGAAC 67.054280 1.633944 1.822456 0.889090 7.657366 1048.836340
+AGGAAG 68.580202 1.996761 1.726708 0.819952 7.657366 1291.113172
+AGGAAT 67.647827 1.715359 1.826332 0.891928 7.657366 997.997274
+AGGACA 57.700778 1.934619 2.272135 1.237688 7.657366 477.890902
+AGGACC 57.253080 2.310861 2.455461 1.390464 7.657366 257.814055
+AGGACG 57.649252 2.198002 2.207161 1.184981 7.657366 619.572451
+AGGACT 57.727001 1.952481 2.091862 1.093351 7.657366 392.248832
+AGGAGA 67.491243 1.690418 1.731947 0.823687 7.657366 809.973375
+AGGAGC 66.072961 1.464715 1.602841 0.733324 7.657366 585.502634
+AGGAGG 66.657365 1.971725 1.832885 0.896733 7.657366 404.434648
+AGGAGT 66.486159 1.589074 1.611942 0.739578 7.657366 597.528771
+AGGATA 58.289691 1.920346 2.087915 1.090259 7.657366 880.292951
+AGGATC 57.082134 1.565403 1.897179 0.944328 7.657366 1039.599364
+AGGATG 57.692689 1.765863 2.056505 1.065749 7.657366 1118.497946
+AGGATT 57.984906 1.474735 1.672268 0.781483 7.657366 990.611217
+AGGCAA 79.972644 2.140588 1.767223 0.848980 7.657366 1590.864048
+AGGCAC 78.017960 1.674980 1.637259 0.757070 7.657366 1066.559503
+AGGCAG 79.111626 1.727479 1.730303 0.822514 7.657366 1287.725354
+AGGCAT 78.899179 1.938222 1.681793 0.788169 7.657366 1533.526877
+AGGCCA 75.986715 1.687706 1.763876 0.846569 7.657366 1638.825443
+AGGCCC 74.919822 1.599679 1.788863 0.864621 7.657366 451.518627
+AGGCCG 75.935722 1.633107 1.760109 0.843859 7.657366 1537.796226
+AGGCCT 75.770728 1.661480 1.737391 0.827573 7.657366 753.967685
+AGGCGA 78.666533 1.726417 1.700184 0.801132 7.657366 1819.098297
+AGGCGC 76.427401 1.521208 1.562695 0.705946 7.657366 1904.046639
+AGGCGG 77.778766 1.711593 1.705568 0.804941 7.657366 1684.947805
+AGGCGT 77.095491 1.568600 1.633200 0.754257 7.657366 1705.176524
+AGGCTA 75.045246 1.655045 1.624749 0.748410 7.657366 628.722782
+AGGCTC 73.435317 1.455885 1.583479 0.720077 7.657366 555.651372
+AGGCTG 74.508627 1.613353 1.588015 0.723173 7.657366 1533.590955
+AGGCTT 74.144837 1.648024 1.488344 0.656168 7.657366 1172.355095
+AGGGAA 72.006019 1.761336 1.500748 0.664388 7.657366 824.554862
+AGGGAC 67.598846 1.578254 1.919995 0.961414 7.657366 311.464688
+AGGGAG 71.210905 1.781025 1.510177 0.670660 7.657366 492.343234
+AGGGAT 69.152027 1.495681 1.705061 0.804582 7.657366 928.654437
+AGGGCA 65.468442 1.775157 1.857348 0.914746 7.657366 1057.846806
+AGGGCC 64.071273 1.756698 1.963851 0.994542 7.657366 304.400614
+AGGGCG 65.349085 1.831630 1.863063 0.918971 7.657366 1406.100560
+AGGGCT 65.127980 1.620974 1.772943 0.853105 7.657366 656.630876
+AGGGGA 72.408361 1.514172 1.365502 0.576632 7.657366 673.230289
+AGGGGC 70.337694 1.369251 1.480938 0.651277 7.657366 700.943391
+AGGGGG 71.543473 1.483017 1.326131 0.551874 7.657366 426.534264
+AGGGGT 71.246351 1.339125 1.482945 0.652601 7.657366 445.302724
+AGGGTA 65.075324 1.377717 1.530024 0.683924 7.657366 783.523138
+AGGGTC 63.029859 1.536643 1.684611 0.790150 7.657366 280.358411
+AGGGTG 64.328771 1.467227 1.586664 0.722250 7.657366 791.101281
+AGGGTT 64.805191 1.470909 1.761752 0.845040 7.657366 850.224495
+AGGTAA 69.920795 1.897039 1.933405 0.971504 7.657366 1290.735707
+AGGTAC 66.830838 1.848957 1.859770 0.916535 7.657366 624.210771
+AGGTAG 68.955255 1.747568 1.873972 0.927054 7.657366 543.097276
+AGGTAT 68.021308 1.984649 1.986712 1.011958 7.657366 883.556004
+AGGTCA 62.332526 1.916956 2.209855 1.187151 7.657366 1454.873655
+AGGTCC 60.986563 1.914581 2.243146 1.214078 7.657366 323.325452
+AGGTCG 62.225067 1.859361 2.155677 1.143762 7.657366 1097.588413
+AGGTCT 62.112016 2.139322 2.308954 1.267894 7.657366 687.970973
+AGGTGA 66.346092 1.759398 1.758576 0.842757 7.657366 1557.068357
+AGGTGC 62.933672 1.512546 1.975711 1.003565 7.657366 1195.192447
+AGGTGG 65.249889 1.651855 1.807782 0.878374 7.657366 1169.777958
+AGGTGT 64.136389 1.791074 1.982458 1.008710 7.657366 881.354712
+AGGTTA 58.189184 1.913107 2.333315 1.288014 7.657366 1116.763341
+AGGTTC 55.443946 1.821197 2.276861 1.241552 7.657366 848.810011
+AGGTTG 57.627741 2.057323 2.261548 1.229048 7.657366 1427.295074
+AGGTTT 56.875232 2.052507 2.216410 1.192437 7.657366 1537.007463
+AGTAAA 104.080696 2.546898 2.839583 1.729188 7.657366 1470.158619
+AGTAAC 102.112453 2.489962 2.903416 1.787822 7.657366 930.347632
+AGTAAG 103.262781 2.566095 2.777608 1.672887 7.657366 644.662125
+AGTAAT 104.067140 2.430214 2.875522 1.762119 7.657366 1279.290162
+AGTACA 97.859317 2.444718 2.597176 1.512559 7.657366 788.386040
+AGTACC 97.429948 2.587130 2.633198 1.544136 7.657366 1039.097398
+AGTACG 97.669043 2.373075 2.495849 1.424911 7.657366 1140.779147
+AGTACT 98.868422 2.797257 2.783778 1.678465 7.657366 462.350185
+AGTAGA 101.835155 2.788331 2.754595 1.652140 7.657366 473.450255
+AGTAGC 100.118738 2.209690 2.705109 1.607820 7.657366 735.819590
+AGTAGG 101.256513 2.419814 2.574824 1.493075 7.657366 255.820606
+AGTAGT 102.360451 2.289384 2.697891 1.601389 7.657366 484.431381
+AGTATA 99.575604 2.552407 2.913983 1.797591 7.657366 358.191614
+AGTATC 100.713247 2.937385 3.233641 2.101348 7.657366 855.030622
+AGTATG 99.495124 2.703176 2.973459 1.852905 7.657366 707.915405
+AGTATT 102.928409 2.981741 3.445935 2.311644 7.657366 1033.385360
+AGTCAA 112.647039 2.482853 2.743172 1.641874 7.657366 674.867144
+AGTCAC 110.454364 2.473513 2.919713 1.802895 7.657366 750.061594
+AGTCAG 111.819918 3.055046 3.056412 1.930982 7.657366 1258.084081
+AGTCAT 111.619242 2.810360 2.925376 1.808144 7.657366 997.807517
+AGTCCA 111.125371 2.892538 2.893768 1.778918 7.657366 581.927982
+AGTCCC 109.444704 2.548750 2.725323 1.625875 7.657366 383.791737
+AGTCCG 110.822817 2.591233 2.754073 1.651670 7.657366 676.261968
+AGTCCT 110.615039 2.389310 2.912247 1.795984 7.657366 403.049023
+AGTCGA 112.092739 2.883783 3.107851 1.979933 7.657366 756.142594
+AGTCGC 109.661872 2.448181 2.854936 1.743230 7.657366 1090.618937
+AGTCGG 111.301290 2.411854 2.918057 1.801362 7.657366 950.647998
+AGTCGT 111.296108 2.760210 3.144630 2.015184 7.657366 697.126150
+AGTCTA 111.028622 2.562984 2.817136 1.708724 7.657366 220.152483
+AGTCTC 110.237069 2.804768 3.110143 1.982124 7.657366 395.755765
+AGTCTG 111.354947 2.530083 2.887735 1.773358 7.657366 973.155184
+AGTCTT 111.773125 2.650610 3.032557 1.908420 7.657366 495.828761
+AGTGAA 101.705809 2.529008 2.781845 1.676716 7.657366 1571.129356
+AGTGAC 98.583669 2.315049 2.676283 1.582189 7.657366 976.736411
+AGTGAG 100.710511 2.604108 2.655452 1.563752 7.657366 731.598420
+AGTGAT 100.819765 2.560057 2.903523 1.787921 7.657366 1541.736642
+AGTGCA 100.223181 2.696779 2.740746 1.639696 7.657366 909.390996
+AGTGCC 99.765044 2.575573 2.757647 1.654886 7.657366 1257.359583
+AGTGCG 100.309199 2.632371 2.732107 1.631950 7.657366 1364.637131
+AGTGCT 101.847568 2.580613 2.892906 1.778123 7.657366 1162.141495
+AGTGGA 102.161288 3.060632 2.978321 1.857452 7.657366 838.581475
+AGTGGC 100.722427 2.590773 2.615751 1.528815 7.657366 1553.400876
+AGTGGG 101.816371 2.537168 2.717376 1.618769 7.657366 794.850359
+AGTGGT 103.066453 2.613655 2.845710 1.734788 7.657366 1252.007994
+AGTGTA 103.304961 2.971056 2.995043 1.873116 7.657366 444.625882
+AGTGTC 103.787559 2.778390 3.308904 2.175137 7.657366 568.627217
+AGTGTG 103.164023 2.957297 3.328528 2.194515 7.657366 611.716060
+AGTGTT 106.417849 2.878866 3.339016 2.204896 7.657366 826.649933
+AGTTAA 100.144052 2.440119 2.353610 1.304854 7.657366 1114.855121
+AGTTAC 96.913826 2.073146 2.035751 1.049657 7.657366 833.939670
+AGTTAG 99.230441 2.250202 2.282395 1.246082 7.657366 465.190495
+AGTTAT 98.286497 1.954105 2.114242 1.110944 7.657366 960.119267
+AGTTCA 94.022000 2.081429 2.399808 1.343461 7.657366 1506.773366
+AGTTCC 92.223658 2.438602 2.413418 1.354906 7.657366 1079.497177
+AGTTCG 93.975598 2.111372 2.471137 1.403801 7.657366 1223.004399
+AGTTCT 93.964380 2.042017 2.460076 1.394386 7.657366 917.230672
+AGTTGA 98.384811 2.295518 2.130093 1.123461 7.657366 1271.345478
+AGTTGC 94.521664 2.023241 2.081917 1.085564 7.657366 1621.421227
+AGTTGG 97.009426 2.176299 2.168470 1.153959 7.657366 842.225830
+AGTTGT 96.000020 2.172093 2.176769 1.160590 7.657366 1047.748037
+AGTTTA 90.427767 1.745115 2.143437 1.134034 7.657366 1013.378053
+AGTTTC 88.498631 1.723716 1.963198 0.994046 7.657366 1134.748931
+AGTTTG 90.115997 1.630822 1.962360 0.993410 7.657366 1643.980887
+AGTTTT 89.943349 1.690818 1.917960 0.959886 7.657366 2120.379764
+ATAAAA 91.099900 2.079440 2.042266 1.054700 7.657366 2716.358450
+ATAAAC 88.676227 2.116822 2.321561 1.278293 7.657366 2237.091001
+ATAAAG 90.480169 2.183183 2.178502 1.161976 7.657366 2201.207934
+ATAAAT 89.633769 2.329729 2.364763 1.314140 7.657366 2273.071785
+ATAACA 82.360961 2.355702 2.101217 1.100694 7.657366 1391.777223
+ATAACC 81.011251 2.222219 1.992298 1.016229 7.657366 2061.573137
+ATAACG 81.836937 2.321455 2.073660 1.079112 7.657366 2379.693812
+ATAACT 81.806704 2.467776 1.959645 0.991348 7.657366 947.909625
+ATAAGA 89.566823 2.074006 2.097013 1.097392 7.657366 870.726203
+ATAAGC 87.997421 2.170459 2.239662 1.211251 7.657366 1275.356497
+ATAAGG 88.591792 2.364657 2.105816 1.104310 7.657366 986.502224
+ATAAGT 88.666309 2.460587 2.299255 1.259914 7.657366 780.476410
+ATAATA 81.266605 2.450861 2.182503 1.165179 7.657366 1222.885264
+ATAATC 79.929108 2.368365 2.520680 1.446228 7.657366 2115.224955
+ATAATG 80.663359 2.534260 2.319416 1.276522 7.657366 1841.383790
+ATAATT 80.931725 2.633665 2.687607 1.592242 7.657366 1443.771932
+ATACAA 91.767359 1.425003 1.576693 0.715453 7.657366 749.994218
+ATACAC 90.139619 1.487206 1.524343 0.680118 7.657366 604.039897
+ATACAG 91.125546 1.356268 1.595275 0.728138 7.657366 1181.135275
+ATACAT 90.648653 1.393158 1.509136 0.669966 7.657366 811.485335
+ATACCA 89.313815 1.482398 1.735806 0.826442 7.657366 1498.820051
+ATACCC 88.346358 1.561048 1.766304 0.848318 7.657366 1255.190050
+ATACCG 89.134289 1.506962 1.749375 0.836151 7.657366 2139.959866
+ATACCT 88.914654 1.464175 1.674756 0.783227 7.657366 952.425650
+ATACGA 91.072926 1.403526 1.571479 0.711907 7.657366 589.002778
+ATACGC 89.126534 1.521366 1.617309 0.743275 7.657366 1728.430811
+ATACGG 90.149808 1.464659 1.588157 0.723270 7.657366 1236.674083
+ATACGT 89.483789 1.447365 1.589671 0.724305 7.657366 755.983900
+ATACTA 88.363417 1.497590 1.764279 0.846859 7.657366 296.141552
+ATACTC 86.589856 1.488065 1.615576 0.742081 7.657366 768.308036
+ATACTG 87.569689 1.572713 1.728974 0.821567 7.657366 1204.739294
+ATACTT 87.252445 1.558369 1.749856 0.836496 7.657366 774.194477
+ATAGAA 89.137806 1.983119 2.033243 1.047718 7.657366 742.252957
+ATAGAC 84.333875 1.995822 2.421320 1.361565 7.657366 463.585748
+ATAGAG 87.853220 1.908300 2.038284 1.051617 7.657366 709.511270
+ATAGAT 85.914803 2.031718 2.245507 1.215996 7.657366 666.415157
+ATAGCA 85.811106 4.963277 2.079718 1.083845 7.657366 2727.926288
+ATAGCC 82.740267 2.230880 2.134014 1.126565 7.657366 1200.685054
+ATAGCG 83.667170 2.190827 2.051900 1.062172 7.657366 1765.384861
+ATAGCT 83.344551 2.018993 2.106200 1.104612 7.657366 779.969806
+ATAGGA 88.528427 2.120203 2.052943 1.062981 7.657366 205.106633
+ATAGGC 85.871083 2.005270 2.219651 1.195053 7.657366 579.006000
+ATAGGG 87.822638 2.021378 2.088383 1.090625 7.657366 331.835702
+ATAGGT 87.424711 2.721350 2.255521 1.224139 7.657366 605.774331
+ATAGTA 82.317361 2.160502 2.347797 1.300023 7.657366 399.098589
+ATAGTC 80.054995 2.410205 2.544273 1.466580 7.657366 651.306138
+ATAGTG 81.517840 2.447885 2.388566 1.334032 7.657366 812.086475
+ATAGTT 81.595657 2.946105 2.797305 1.690714 7.657366 784.930010
+ATATAA 87.461803 1.906246 2.126329 1.120485 7.657366 1045.243449
+ATATAC 84.770132 1.914010 2.167144 1.152901 7.657366 678.453595
+ATATAG 86.670037 1.820794 2.171295 1.156215 7.657366 612.425743
+ATATAT 85.734259 2.002053 2.244563 1.215229 7.657366 912.274309
+ATATCA 82.660857 2.148482 2.619450 1.532058 7.657366 1899.425531
+ATATCC 81.542202 2.372390 2.728370 1.628603 7.657366 1725.394441
+ATATCG 82.547583 2.296535 2.599641 1.514713 7.657366 2384.417678
+ATATCT 82.472440 2.521034 2.765669 1.662113 7.657366 1356.653757
+ATATGA 85.669167 2.097905 2.288516 1.251098 7.657366 1112.377716
+ATATGC 82.527583 2.047133 2.337950 1.291853 7.657366 1376.306052
+ATATGG 84.518922 2.030116 2.225664 1.199913 7.657366 1216.687390
+ATATGT 83.546034 2.363362 2.450050 1.385871 7.657366 901.559611
+ATATTA 78.235412 2.918334 2.973009 1.852485 7.657366 1397.375968
+ATATTC 75.335682 2.774053 2.964956 1.844963 7.657366 2190.711836
+ATATTG 77.404209 2.844587 2.813630 1.705536 7.657366 2240.980489
+ATATTT 76.542562 2.561446 2.797211 1.690628 7.657366 2062.024912
+ATCAAA 101.571633 2.017860 2.065456 1.072714 7.657366 2370.772604
+ATCAAC 99.505428 1.995852 2.207753 1.185458 7.657366 2947.689111
+ATCAAG 100.868282 1.839766 2.041645 1.054218 7.657366 1224.364959
+ATCAAT 100.360146 1.885130 2.227130 1.201098 7.657366 2227.439675
+ATCACA 94.950527 2.234841 2.039308 1.052409 7.657366 1283.317540
+ATCACC 94.319742 2.216418 2.174581 1.158841 7.657366 3500.584600
+ATCACG 95.112855 2.398614 2.148386 1.137965 7.657366 1875.258393
+ATCACT 94.812034 2.166698 2.172965 1.157549 7.657366 1594.846217
+ATCAGA 100.367869 1.996761 2.199267 1.178630 7.657366 1813.958296
+ATCAGC 98.614329 2.228760 2.355637 1.306540 7.657366 3522.807983
+ATCAGG 99.428882 2.146958 2.220351 1.195619 7.657366 2937.187173
+ATCAGT 99.231222 2.112031 2.429894 1.368804 7.657366 1745.257994
+ATCATA 93.613756 2.511855 2.426117 1.365614 7.657366 884.688388
+ATCATC 92.360079 2.568062 2.485680 1.416212 7.657366 3267.165730
+ATCATG 92.776111 2.740934 2.373969 1.321821 7.657366 1972.169317
+ATCATT 93.466954 2.522390 2.510700 1.437648 7.657366 1843.042452
+ATCCAA 100.198562 2.061607 1.703384 0.803395 7.657366 546.476747
+ATCCAC 98.070430 1.828684 1.876675 0.929060 7.657366 1713.283014
+ATCCAG 99.651720 1.874650 1.778569 0.857169 7.657366 3121.045753
+ATCCAT 98.987419 1.866770 1.952730 0.986106 7.657366 1367.321962
+ATCCCA 97.140542 1.737243 1.652125 0.767405 7.657366 1219.800331
+ATCCCC 96.069919 1.874895 1.705334 0.804775 7.657366 1265.543118
+ATCCCG 97.137252 1.732086 1.691114 0.794730 7.657366 1662.436282
+ATCCCT 96.676022 1.852937 1.846692 0.906885 7.657366 970.525105
+ATCCGA 99.268381 1.849106 1.929588 0.968629 7.657366 747.720938
+ATCCGC 97.410277 1.965307 2.042691 1.055029 7.657366 2444.773652
+ATCCGG 98.755919 1.853660 1.890277 0.939179 7.657366 2470.123826
+ATCCGT 98.040321 1.879805 2.070750 1.076842 7.657366 1259.053506
+ATCCTA 96.379441 2.439911 2.092121 1.093554 7.657366 202.356498
+ATCCTC 94.412303 2.057046 2.145801 1.135911 7.657366 972.867883
+ATCCTG 95.603432 2.068081 2.011677 1.031093 7.657366 2482.635191
+ATCCTT 95.323689 1.983653 2.147163 1.136993 7.657366 912.714084
+ATCGAA 99.078545 2.193445 2.271748 1.237372 7.657366 1605.873568
+ATCGAC 94.614730 2.089173 2.436372 1.374282 7.657366 2181.734239
+ATCGAG 97.991546 2.121908 2.253771 1.222714 7.657366 1112.415461
+ATCGAT 96.223834 2.180025 2.482544 1.413532 7.657366 1739.165123
+ATCGCA 94.277127 2.391160 2.220817 1.195995 7.657366 1782.030743
+ATCGCC 93.162237 2.356179 2.332335 1.287202 7.657366 4449.413634
+ATCGCG 94.126762 2.552484 2.339264 1.292942 7.657366 2945.665826
+ATCGCT 94.008191 2.422227 2.289854 1.252195 7.657366 2620.210044
+ATCGGA 98.812857 2.051396 2.261576 1.229071 7.657366 740.034996
+ATCGGC 95.942451 2.106218 2.348239 1.300391 7.657366 2608.952146
+ATCGGG 97.894535 2.235413 2.225543 1.199815 7.657366 1173.288356
+ATCGGT 97.249038 2.197734 2.462547 1.396488 7.657366 1965.234354
+ATCGTA 93.072350 3.080841 2.553884 1.474898 7.657366 1021.237674
+ATCGTC 90.398466 4.963277 2.539262 1.462250 7.657366 4551.808477
+ATCGTG 90.755323 4.963277 2.387492 1.333132 7.657366 4210.422664
+ATCGTT 92.368499 2.471795 2.783323 1.678053 7.657366 1711.222178
+ATCTAA 95.913544 1.863392 2.015431 1.033980 7.657366 447.308121
+ATCTAC 91.289173 4.963277 2.121349 1.116551 7.657366 2625.912487
+ATCTAG 95.586683 2.394372 2.325716 1.281726 7.657366 41.763115
+ATCTAT 94.182685 1.949330 2.168808 1.154229 7.657366 655.292504
+ATCTCA 90.725175 2.379448 2.564098 1.483755 7.657366 1125.215414
+ATCTCC 89.340913 4.963277 2.287065 1.249908 7.657366 3779.589733
+ATCTCG 90.578950 2.381157 2.541954 1.464575 7.657366 984.530672
+ATCTCT 90.154215 2.268519 2.552717 1.473888 7.657366 1376.628827
+ATCTGA 94.108298 2.152226 2.083644 1.086915 7.657366 1445.589948
+ATCTGC 91.096910 2.179651 2.296090 1.257314 7.657366 2578.192766
+ATCTGG 93.059670 2.152942 2.235495 1.207872 7.657366 2727.485823
+ATCTGT 92.066835 2.241480 2.325751 1.281755 7.657366 1076.683085
+ATCTTA 86.309311 2.504632 2.608440 1.522409 7.657366 752.800065
+ATCTTC 83.241383 2.431948 2.674839 1.580908 7.657366 2318.425782
+ATCTTG 85.433899 2.491917 2.592333 1.508330 7.657366 866.514827
+ATCTTT 87.226029 4.963277 2.126887 1.120926 7.657366 12975.457853
+ATGAAA 84.902971 2.827913 2.529336 1.453684 7.657366 2468.663184
+ATGAAC 81.762409 3.136534 2.798482 1.691781 7.657366 1737.833716
+ATGAAG 83.836118 3.055646 2.523980 1.449069 7.657366 2409.427019
+ATGAAT 82.850320 2.834651 2.706195 1.608789 7.657366 1633.047796
+ATGACA 74.643927 2.588315 2.328612 1.284121 7.657366 1418.444049
+ATGACC 73.317110 2.403795 2.173678 1.158119 7.657366 1762.477203
+ATGACG 74.255438 2.499774 2.316845 1.274400 7.657366 2032.250836
+ATGACT 74.234605 2.602306 2.082743 1.086210 7.657366 948.484825
+ATGAGA 83.998328 2.962590 2.512107 1.438857 7.657366 751.023999
+ATGAGC 82.458087 3.494101 2.651677 1.560419 7.657366 1490.123498
+ATGAGG 82.838595 2.935904 2.474637 1.406784 7.657366 632.839619
+ATGAGT 83.930369 4.963277 2.491839 1.421478 7.657366 1792.723312
+ATGATA 73.979282 2.808191 2.409309 1.351447 7.657366 1259.279239
+ATGATC 72.665532 2.643389 2.670973 1.577483 7.657366 1686.462512
+ATGATG 73.062909 2.536933 2.441485 1.378610 7.657366 2453.984862
+ATGATT 73.666371 2.710977 2.806336 1.698907 7.657366 1910.978614
+ATGCAA 90.132827 1.481097 1.752950 0.838715 7.657366 1232.290884
+ATGCAC 87.815741 1.545728 1.625026 0.748601 7.657366 1155.239445
+ATGCAG 89.270204 1.505912 1.751674 0.837800 7.657366 2148.571084
+ATGCAT 88.531745 1.496948 1.612649 0.740066 7.657366 935.854627
+ATGCCA 87.324289 1.849244 1.989548 1.014126 7.657366 2262.987232
+ATGCCC 86.102784 1.758930 1.922959 0.963641 7.657366 1517.788873
+ATGCCG 87.060928 1.778490 1.962560 0.993561 7.657366 3201.234555
+ATGCCT 86.797980 1.690223 1.871878 0.925500 7.657366 1426.974817
+ATGCGA 89.098510 1.566576 1.718000 0.813757 7.657366 1178.628839
+ATGCGC 86.813977 1.763848 1.792829 0.867498 7.657366 2828.684559
+ATGCGG 88.144443 1.646107 1.829031 0.893906 7.657366 1987.704059
+ATGCGT 87.400444 1.666184 1.785376 0.862094 7.657366 1534.410843
+ATGCTA 85.829528 1.835122 1.903357 0.948944 7.657366 477.872371
+ATGCTC 84.108278 1.829891 1.879281 0.930996 7.657366 1074.743848
+ATGCTG 85.161528 1.780811 1.883174 0.933890 7.657366 3107.868999
+ATGCTT 85.155131 1.945601 2.006208 1.026891 7.657366 1134.070001
+ATGGAA 87.726981 2.508113 2.517659 1.443629 7.657366 1511.784062
+ATGGAC 83.146705 2.699024 2.772919 1.668653 7.657366 831.579911
+ATGGAG 86.513447 2.546715 2.467811 1.400968 7.657366 858.549423
+ATGGAT 84.746176 3.556104 2.794140 1.687845 7.657366 1487.461474
+ATGGCA 82.808033 2.835908 2.288614 1.251178 7.657366 2190.842791
+ATGGCC 81.433556 2.693035 2.439695 1.377094 7.657366 1056.430329
+ATGGCG 82.455012 2.981666 2.387219 1.332904 7.657366 3891.127079
+ATGGCT 82.043296 2.773162 2.403061 1.346193 7.657366 1735.663276
+ATGGGA 88.190600 2.243932 2.375104 1.322770 7.657366 649.900122
+ATGGGC 85.831382 2.430800 2.447587 1.383782 7.657366 1490.617433
+ATGGGG 86.928114 2.575080 2.336394 1.290564 7.657366 899.416075
+ATGGGT 86.934069 2.606363 2.665389 1.572539 7.657366 939.698376
+ATGGTA 81.786030 3.153114 2.739522 1.638598 7.657366 1103.300113
+ATGGTC 79.797370 3.241246 2.947791 1.828965 7.657366 1479.382714
+ATGGTG 80.964013 3.309434 2.842665 1.732004 7.657366 2588.820917
+ATGGTT 81.356618 3.202760 2.912138 1.795884 7.657366 1907.089217
+ATGTAA 84.665016 2.024142 2.344361 1.297170 7.657366 834.979056
+ATGTAC 80.983662 2.325353 2.509465 1.436587 7.657366 879.761655
+ATGTAG 83.313687 1.989308 2.261265 1.228818 7.657366 559.391528
+ATGTAT 82.408987 2.675781 2.658263 1.566236 7.657366 848.271683
+ATGTCA 79.833685 2.785613 3.001071 1.878775 7.657366 1073.390560
+ATGTCC 78.523605 3.017429 3.105521 1.977707 7.657366 797.331075
+ATGTCG 79.838157 2.926483 3.034362 1.910123 7.657366 1434.842832
+ATGTCT 79.408076 2.862889 3.040126 1.915568 7.657366 832.593215
+ATGTGA 82.580285 2.487528 2.541689 1.464347 7.657366 1064.655490
+ATGTGC 78.704010 2.679901 2.743116 1.641824 7.657366 1162.099515
+ATGTGG 81.135216 2.764406 2.793128 1.686928 7.657366 1166.339079
+ATGTGT 79.677613 2.762904 2.876671 1.763175 7.657366 640.422220
+ATGTTA 75.161718 3.124657 3.254846 2.122052 7.657366 1001.042495
+ATGTTC 71.997409 3.167687 3.209318 2.077683 7.657366 1625.194371
+ATGTTG 74.381972 2.974677 3.038781 1.914297 7.657366 1940.525860
+ATGTTT 73.468934 3.097604 3.084939 1.958078 7.657366 1728.471992
+ATTAAA 104.308353 1.612077 1.753423 0.839055 7.657366 2232.476249
+ATTAAC 102.462120 1.761136 1.898003 0.944943 7.657366 1886.734414
+ATTAAG 103.482608 1.647317 1.761933 0.845171 7.657366 1138.096980
+ATTAAT 103.754144 1.822419 1.936683 0.973976 7.657366 2168.782930
+ATTACA 99.311869 1.703245 1.618393 0.744023 7.657366 1139.761879
+ATTACC 98.743104 1.720978 1.719513 0.814833 7.657366 2118.522384
+ATTACG 99.130959 1.638701 1.663778 0.775538 7.657366 1737.484579
+ATTACT 99.876829 1.663781 1.732405 0.824014 7.657366 1336.109285
+ATTAGA 102.401456 1.564107 1.798873 0.871889 7.657366 330.449179
+ATTAGC 101.006348 1.630458 1.906073 0.950976 7.657366 875.570340
+ATTAGG 101.896543 1.500438 1.713833 0.810799 7.657366 326.743551
+ATTAGT 102.358260 1.912487 1.846282 0.906582 7.657366 575.507110
+ATTATA 98.863235 1.882214 1.885860 0.935889 7.657366 579.869585
+ATTATC 99.270106 2.056072 2.030755 1.045795 7.657366 2132.970853
+ATTATG 98.608620 2.093292 2.004056 1.025239 7.657366 1507.179571
+ATTATT 100.900308 1.974587 2.082375 1.085922 7.657366 2313.727730
+ATTCAA 107.023990 2.065574 1.766355 0.848354 7.657366 1225.730950
+ATTCAC 105.132192 1.918356 1.718675 0.814237 7.657366 1511.188140
+ATTCAG 106.319752 2.077624 1.780688 0.858701 7.657366 2650.314788
+ATTCAT 105.948155 2.047953 1.797627 0.870983 7.657366 1784.808187
+ATTCCA 105.186081 1.927274 1.733644 0.824898 7.657366 1212.454501
+ATTCCC 103.833934 1.961366 1.674413 0.782987 7.657366 1198.743020
+ATTCCG 105.161437 2.168856 1.735526 0.826242 7.657366 1727.194601
+ATTCCT 104.855084 1.831522 1.787646 0.863739 7.657366 1088.221403
+ATTCGA 106.319832 2.103358 1.953022 0.986327 7.657366 725.784916
+ATTCGC 104.594306 1.965419 1.740166 0.829557 7.657366 2054.247968
+ATTCGG 105.768823 2.025758 1.845778 0.906212 7.657366 911.317197
+ATTCGT 105.606713 2.060277 1.800957 0.873404 7.657366 1236.108416
+ATTCTA 105.801871 1.885498 1.747935 0.835119 7.657366 344.431162
+ATTCTC 104.741767 1.999193 1.905261 0.950369 7.657366 1140.450938
+ATTCTG 105.359304 2.070824 1.797292 0.870739 7.657366 2021.756650
+ATTCTT 105.526917 1.890478 1.903057 0.948720 7.657366 1187.656049
+ATTGAA 100.803045 1.635053 1.734616 0.825592 7.657366 2018.812496
+ATTGAC 97.363323 1.746043 2.010304 1.030037 7.657366 1141.135666
+ATTGAG 99.968271 1.617785 1.766039 0.848127 7.657366 1393.239583
+ATTGAT 99.291133 1.720679 1.968772 0.998282 7.657366 2322.309956
+ATTGCA 98.296979 1.728940 1.605473 0.735131 7.657366 1500.582514
+ATTGCC 97.645793 1.778764 1.666876 0.777706 7.657366 3137.053372
+ATTGCG 98.228927 1.693798 1.656869 0.770713 7.657366 2141.177271
+ATTGCT 98.977932 1.748724 1.706006 0.805251 7.657366 2206.839290
+ATTGGA 100.594943 1.796335 1.850507 0.909696 7.657366 328.621860
+ATTGGC 98.723637 1.633516 1.780901 0.858855 7.657366 1913.628795
+ATTGGG 100.218482 1.671054 1.695234 0.797636 7.657366 655.715940
+ATTGGT 100.391999 1.642020 1.837258 0.899944 7.657366 1728.778853
+ATTGTA 98.711815 1.850140 1.810696 0.880499 7.657366 555.985598
+ATTGTC 98.775737 1.992071 1.938524 0.975365 7.657366 1338.452285
+ATTGTG 98.579800 1.972930 1.878084 0.930106 7.657366 1365.500546
+ATTGTT 100.465523 1.952730 2.037881 1.051305 7.657366 1863.221570
+ATTTAA 97.944152 1.851365 1.450137 0.631065 7.657366 1614.537298
+ATTTAC 95.169001 1.707880 1.376627 0.583693 7.657366 1670.219495
+ATTTAG 97.205670 1.666894 1.382097 0.587176 7.657366 666.770578
+ATTTAT 96.201060 1.646744 1.427991 0.616664 7.657366 2104.188241
+ATTTCA 93.061875 1.473605 1.486213 0.654760 7.657366 2200.722247
+ATTTCC 91.331118 1.485992 1.488873 0.656518 7.657366 1952.873077
+ATTTCG 92.999026 1.500832 1.476553 0.648387 7.657366 1711.163833
+ATTTCT 92.698086 1.504346 1.514777 0.673726 7.657366 1600.249250
+ATTTGA 96.424049 1.854394 1.360485 0.573457 7.657366 1231.306137
+ATTTGC 93.479690 1.795052 1.409757 0.604890 7.657366 2453.058820
+ATTTGG 95.619755 1.779758 1.387302 0.590496 7.657366 1054.514994
+ATTTGT 94.551524 1.714959 1.437060 0.622548 7.657366 1399.579460
+ATTTTA 89.753798 1.634930 1.508332 0.669431 7.657366 1787.353975
+ATTTTC 87.581176 1.586283 1.493199 0.659382 7.657366 2959.858387
+ATTTTG 89.262092 1.570269 1.437718 0.622975 7.657366 2799.262075
+ATTTTT 89.093539 1.703119 1.418587 0.610582 7.657366 3661.669611
+CAAAAA 83.468189 1.808515 1.495790 0.661099 7.657366 3119.793860
+CAAAAC 81.063973 1.466721 1.698105 0.799663 7.657366 2237.174503
+CAAAAG 82.735684 1.921967 1.610618 0.738668 7.657366 1701.960805
+CAAAAT 81.660960 1.526909 1.736430 0.826887 7.657366 2474.420144
+CAAACA 73.560040 1.900962 1.958012 0.990109 7.657366 1946.192242
+CAAACC 72.585731 1.875771 1.920509 0.961800 7.657366 1825.518426
+CAAACG 73.657030 1.913358 1.924056 0.964466 7.657366 1866.999735
+CAAACT 73.410603 1.764929 1.788777 0.864559 7.657366 1791.392639
+CAAAGA 81.333304 1.728959 1.511998 0.671873 7.657366 1627.864356
+CAAAGC 79.729252 1.514714 1.610650 0.738690 7.657366 2111.168566
+CAAAGG 80.516742 1.716148 1.566201 0.708324 7.657366 1359.115177
+CAAAGT 79.974269 1.551789 1.707440 0.806266 7.657366 1385.228229
+CAAATA 73.040258 1.818546 1.761747 0.845037 7.657366 1379.156117
+CAAATC 70.979445 1.487305 1.613906 0.740931 7.657366 1668.872355
+CAAATG 72.310750 1.515219 1.667705 0.778286 7.657366 1472.737347
+CAAATT 72.351594 1.605087 1.849620 0.909042 7.657366 1779.111839
+CAACAA 85.143827 1.684250 1.596497 0.728975 7.657366 2396.776501
+CAACAC 83.836955 1.557116 1.481916 0.651922 7.657366 1389.129058
+CAACAG 84.557422 1.550201 1.583455 0.720060 7.657366 1833.800531
+CAACAT 84.442368 1.636165 1.532950 0.685886 7.657366 1916.323352
+CAACCA 81.843787 1.475504 1.604715 0.734611 7.657366 1931.456119
+CAACCC 80.912698 1.354222 1.579221 0.717174 7.657366 925.955169
+CAACCG 81.793866 1.549753 1.575124 0.714385 7.657366 1768.170611
+CAACCT 81.606992 1.567186 1.555611 0.701151 7.657366 1320.562015
+CAACGA 84.461830 1.607620 1.492330 0.658806 7.657366 1525.625199
+CAACGC 82.866690 1.393927 1.454752 0.634080 7.657366 2497.288004
+CAACGG 83.573571 1.441636 1.475183 0.647484 7.657366 1820.597357
+CAACGT 83.252792 1.466441 1.463096 0.639543 7.657366 1894.140540
+CAACTA 81.199534 1.616427 1.544258 0.693490 7.657366 469.545933
+CAACTC 79.678929 1.348667 1.455831 0.634785 7.657366 874.787227
+CAACTG 80.571425 1.420341 1.479034 0.650021 7.657366 1906.452352
+CAACTT 80.109887 1.453334 1.285990 0.527007 7.657366 1291.435974
+CAAGAA 81.969768 1.523207 1.496806 0.661772 7.657366 1052.506458
+CAAGAC 77.554478 1.458400 1.691999 0.795354 7.657366 533.153775
+CAAGAG 81.032399 1.467548 1.550477 0.697683 7.657366 541.463831
+CAAGAT 79.021564 1.534682 1.690604 0.794370 7.657366 933.473008
+CAAGCA 75.886110 1.473219 1.598538 0.730373 7.657366 982.790272
+CAAGCC 74.586447 1.528936 1.566859 0.708770 7.657366 1031.162107
+CAAGCG 75.935977 1.608639 1.583429 0.720042 7.657366 1103.536751
+CAAGCT 75.314585 1.399266 1.472778 0.645901 7.657366 789.496818
+CAAGGA 80.882826 1.508762 1.442420 0.626034 7.657366 687.778679
+CAAGGC 77.882458 1.309769 1.520963 0.677857 7.657366 915.272645
+CAAGGG 79.919041 1.479312 1.459347 0.637086 7.657366 493.036664
+CAAGGT 79.040065 1.325631 1.562941 0.706113 7.657366 764.523796
+CAAGTA 74.268508 1.440990 1.550203 0.697498 7.657366 368.307389
+CAAGTC 71.927071 1.413322 1.776274 0.855511 7.657366 418.895278
+CAAGTG 73.520466 1.511863 1.716579 0.812748 7.657366 495.791243
+CAAGTT 73.245776 1.614265 1.831977 0.896067 7.657366 701.867664
+CAATAA 79.389051 1.643938 1.737491 0.827645 7.657366 2024.698107
+CAATAC 76.936377 1.503840 1.671026 0.780612 7.657366 1637.110154
+CAATAG 78.932810 1.425336 1.729733 0.822108 7.657366 642.441216
+CAATAT 77.882949 1.529116 1.721451 0.816211 7.657366 2622.717405
+CAATCA 72.319971 1.696951 1.964609 0.995118 7.657366 2532.177024
+CAATCC 71.230605 1.735717 2.128064 1.121856 7.657366 1262.517243
+CAATCG 72.465009 1.732570 1.986982 1.012165 7.657366 1484.711248
+CAATCT 72.015805 1.793052 2.008826 1.028901 7.657366 1391.199885
+CAATGA 76.922473 1.547937 1.652352 0.767563 7.657366 1783.123300
+CAATGC 74.312912 1.535918 1.750793 0.837168 7.657366 1984.858825
+CAATGG 76.093977 1.482189 1.644808 0.762313 7.657366 1997.239503
+CAATGT 75.164651 1.530445 1.776904 0.855965 7.657366 1325.419453
+CAATTA 67.614262 2.042166 2.208461 1.186028 7.657366 1104.852977
+CAATTC 64.997116 1.933370 2.272681 1.238135 7.657366 1009.158059
+CAATTG 66.990622 1.904303 2.101185 1.100669 7.657366 1258.127334
+CAATTT 66.154986 1.933212 2.110620 1.108090 7.657366 1833.953885
+CACAAA 103.261179 1.970212 1.897991 0.944934 7.657366 1554.561940
+CACAAC 100.480835 1.722170 1.856550 0.914156 7.657366 1220.745850
+CACAAG 102.146134 1.970082 1.947739 0.982327 7.657366 605.369289
+CACAAT 101.620083 1.862432 2.043210 1.055431 7.657366 1291.227683
+CACACA 94.869515 1.574795 1.442697 0.626214 7.657366 654.420501
+CACACC 93.710270 1.409346 1.471473 0.645043 7.657366 1212.048195
+CACACG 94.817380 1.516341 1.437236 0.622662 7.657366 651.516045
+CACACT 94.827748 1.532529 1.562269 0.705658 7.657366 746.426340
+CACAGA 100.199006 1.775520 1.949239 0.983463 7.657366 788.217098
+CACAGC 98.045897 1.557324 1.744334 0.832540 7.657366 1572.825186
+CACAGG 99.499617 1.712747 1.843009 0.904173 7.657366 1112.082933
+CACAGT 99.058384 1.717869 1.916064 0.958463 7.657366 825.629215
+CACATA 94.306027 1.631630 1.600719 0.731869 7.657366 661.988434
+CACATC 93.116124 1.586960 1.630734 0.752549 7.657366 1612.686500
+CACATG 93.901903 1.528664 1.620716 0.745625 7.657366 683.801654
+CACATT 94.341608 1.673321 1.789206 0.864870 7.657366 1115.701862
+CACCAA 103.019591 1.875597 1.611020 0.738944 7.657366 1837.204029
+CACCAC 100.626497 1.666246 1.648333 0.764765 7.657366 2845.453084
+CACCAG 102.311513 1.901879 1.644274 0.761941 7.657366 3897.427893
+CACCAT 101.718737 1.913500 1.816318 0.884603 7.657366 2557.721698
+CACCCA 99.159855 1.778736 1.458501 0.636532 7.657366 1431.367651
+CACCCC 97.640308 1.653518 1.429700 0.617771 7.657366 579.667027
+CACCCG 99.093210 1.730370 1.469992 0.644069 7.657366 1480.700357
+CACCCT 98.586224 1.757048 1.544702 0.693789 7.657366 806.440447
+CACCGA 101.584280 1.861293 1.743962 0.832273 7.657366 2067.044240
+CACCGC 99.330327 1.670403 1.616417 0.742661 7.657366 3201.672924
+CACCGG 100.779424 1.938546 1.664608 0.776119 7.657366 2743.177966
+CACCGT 100.173176 1.868309 1.826535 0.892077 7.657366 2057.378089
+CACCTA 98.451055 1.728231 1.503708 0.666355 7.657366 417.569370
+CACCTC 96.366290 1.810627 1.689863 0.793848 7.657366 814.356345
+CACCTG 97.743191 1.696308 1.607094 0.736245 7.657366 2187.298852
+CACCTT 97.211223 1.653411 1.781229 0.859093 7.657366 1419.732008
+CACGAA 97.691873 2.039950 1.827054 0.892457 7.657366 1120.305242
+CACGAC 93.731484 1.618568 1.603244 0.733601 7.657366 1084.213414
+CACGAG 96.684747 1.765572 1.720290 0.815385 7.657366 480.735714
+CACGAT 95.368648 1.657061 1.825437 0.891273 7.657366 1339.877852
+CACGCA 93.028589 1.602584 1.520524 0.677564 7.657366 1802.314030
+CACGCC 92.084824 1.555345 1.536546 0.688301 7.657366 2737.708954
+CACGCG 93.219891 1.577070 1.541776 0.691818 7.657366 1896.528786
+CACGCT 93.330837 1.584700 1.601648 0.732506 7.657366 2006.873711
+CACGGA 97.570513 1.854051 1.837720 0.900283 7.657366 944.968814
+CACGGC 94.382181 1.558296 1.635073 0.755555 7.657366 1820.952846
+CACGGG 96.413411 1.815626 1.732538 0.824109 7.657366 927.644434
+CACGGT 96.201343 1.807520 1.852852 0.911426 7.657366 1686.077077
+CACGTA 92.717495 1.787218 1.688326 0.792766 7.657366 756.924002
+CACGTC 91.049549 1.722898 1.672484 0.781634 7.657366 918.669778
+CACGTG 92.455829 1.589861 1.630395 0.752315 7.657366 175.362733
+CACGTT 92.678679 1.757586 1.911745 0.955224 7.657366 1590.252886
+CACTAA 96.665678 2.025195 1.714662 0.811387 7.657366 715.486854
+CACTAC 93.299723 1.783563 1.621031 0.745843 7.657366 878.924170
+CACTAG 95.740086 1.434656 1.733434 0.824748 7.657366 61.730688
+CACTAT 94.597511 2.006895 1.741376 0.830422 7.657366 804.589824
+CACTCA 88.434989 1.759049 1.770737 0.851513 7.657366 831.239925
+CACTCC 86.865038 1.736891 1.712466 0.809829 7.657366 662.522193
+CACTCG 88.457772 2.052467 1.744856 0.832913 7.657366 548.768293
+CACTCT 88.100403 1.884646 1.851372 0.910334 7.657366 721.561640
+CACTGA 93.250198 2.234710 1.685755 0.790955 7.657366 1373.609334
+CACTGC 90.074080 1.601347 1.580633 0.718136 7.657366 1820.778676
+CACTGG 92.205887 1.916306 1.639403 0.758558 7.657366 2084.694431
+CACTGT 90.960495 1.777474 1.653214 0.768164 7.657366 1014.669775
+CACTTA 83.891353 1.536709 1.544705 0.693791 7.657366 596.579652
+CACTTC 81.332132 1.531289 1.632989 0.754111 7.657366 1863.473770
+CACTTG 83.435925 1.691536 1.568050 0.709578 7.657366 520.666494
+CACTTT 82.511623 1.533106 1.458250 0.636368 7.657366 2115.295942
+CAGAAA 75.339327 1.715878 1.739533 0.829105 7.657366 2932.643667
+CAGAAC 73.015601 1.621636 1.965674 0.995927 7.657366 1945.739025
+CAGAAG 74.106114 1.889126 1.813324 0.882416 7.657366 1448.782478
+CAGAAT 73.509192 1.609334 1.973685 1.002021 7.657366 2002.732672
+CAGACA 64.393269 2.116707 2.323562 1.279946 7.657366 1324.904814
+CAGACC 63.522967 2.119573 2.409138 1.351303 7.657366 1773.859543
+CAGACG 64.412167 2.154390 2.287239 1.250050 7.657366 2351.372848
+CAGACT 64.412840 1.985455 2.194089 1.174469 7.657366 1071.186693
+CAGAGA 73.391972 1.783137 1.707985 0.806652 7.657366 1271.731297
+CAGAGC 71.852318 1.765360 1.852312 0.911028 7.657366 1232.872739
+CAGAGG 72.629465 1.745873 1.739845 0.829327 7.657366 786.049605
+CAGAGT 72.094353 1.833606 1.851724 0.910594 7.657366 913.056327
+CAGATA 64.339425 1.896268 2.113033 1.109991 7.657366 1831.926966
+CAGATC 63.068806 1.598632 1.841909 0.903363 7.657366 1858.540485
+CAGATG 63.711223 1.705779 2.014588 1.033332 7.657366 1981.470670
+CAGATT 63.650901 1.745768 1.897256 0.944386 7.657366 1721.563678
+CAGCAA 82.899723 1.779157 1.692010 0.795362 7.657366 3802.656433
+CAGCAC 81.042758 1.578464 1.535884 0.687857 7.657366 3031.324667
+CAGCAG 82.115700 1.678524 1.642885 0.760976 7.657366 4229.876675
+CAGCAT 81.731417 1.704624 1.616029 0.742393 7.657366 2750.085215
+CAGCCA 79.007410 1.447929 1.669632 0.779635 7.657366 3416.645332
+CAGCCC 77.882719 1.408881 1.590754 0.725045 7.657366 1739.962978
+CAGCCG 79.018869 1.514828 1.638519 0.757945 7.657366 2180.557058
+CAGCCT 78.591350 1.410345 1.555059 0.700778 7.657366 1477.600865
+CAGCGA 81.361679 1.493052 1.600931 0.732014 7.657366 2741.103234
+CAGCGC 79.130296 1.358188 1.497046 0.661932 7.657366 4604.035975
+CAGCGG 80.365592 1.515189 1.576230 0.715138 7.657366 3578.856622
+CAGCGT 79.637130 1.447708 1.541173 0.691413 7.657366 3670.004041
+CAGCTA 77.911217 1.338702 1.509255 0.670046 7.657366 828.866170
+CAGCTC 76.250815 1.432982 1.513297 0.672739 7.657366 2053.102979
+CAGCTG 77.439365 1.377738 1.478359 0.649576 7.657366 2391.813722
+CAGCTT 76.952666 1.367094 1.362134 0.574500 7.657366 2191.066232
+CAGGAA 74.969661 1.689185 1.752865 0.838654 7.657366 2408.783603
+CAGGAC 69.938170 1.496915 1.959704 0.991393 7.657366 955.577626
+CAGGAG 73.997898 1.670586 1.844255 0.905090 7.657366 1133.321628
+CAGGAT 71.454996 1.524284 1.958552 0.990520 7.657366 2104.643297
+CAGGCA 69.339927 1.679721 1.917540 0.959571 7.657366 2794.824138
+CAGGCC 67.967429 1.670439 1.899635 0.946162 7.657366 2275.643612
+CAGGCG 69.221516 1.759060 1.914309 0.957146 7.657366 3786.482948
+CAGGCT 68.876764 1.640195 1.772203 0.852571 7.657366 2110.711076
+CAGGGA 75.117989 1.417335 1.657184 0.770933 7.657366 1035.567012
+CAGGGC 72.480778 1.465023 1.758100 0.842414 7.657366 1928.270254
+CAGGGG 74.192157 1.612912 1.612864 0.740213 7.657366 1025.691881
+CAGGGT 73.353744 1.624099 1.821615 0.888475 7.657366 1489.833851
+CAGGTA 68.134561 1.382815 1.628892 0.751275 7.657366 1517.256939
+CAGGTC 66.404805 1.575008 1.895123 0.942793 7.657366 1411.407641
+CAGGTG 67.554487 1.483827 1.763494 0.846294 7.657366 1995.073998
+CAGGTT 67.499758 1.575843 1.900505 0.946812 7.657366 2588.773042
+CAGTAA 73.033831 1.760993 1.862318 0.918419 7.657366 1987.244505
+CAGTAC 69.913638 1.672596 2.028910 1.044370 7.657366 1769.903602
+CAGTAG 72.443117 1.598268 1.871520 0.925235 7.657366 788.952873
+CAGTAT 70.949559 1.855655 1.965319 0.995657 7.657366 1282.542120
+CAGTCA 65.558157 1.823760 2.335302 1.289659 7.657366 1342.400004
+CAGTCC 64.130092 1.843051 2.399025 1.342804 7.657366 784.240915
+CAGTCG 65.632572 1.861634 2.352608 1.304022 7.657366 1330.471143
+CAGTCT 65.171523 2.081022 2.529822 1.454104 7.657366 748.705124
+CAGTGA 69.788966 1.738839 1.937676 0.974725 7.657366 1683.208565
+CAGTGC 66.349641 1.749762 2.273035 1.238425 7.657366 2164.675666
+CAGTGG 68.878727 1.847226 2.062418 1.070349 7.657366 1578.984128
+CAGTGT 67.157995 1.755637 2.147107 1.136949 7.657366 931.603497
+CAGTTA 60.678981 2.097953 2.536183 1.459591 7.657366 1194.751729
+CAGTTC 57.825754 1.917945 2.488557 1.418671 7.657366 2460.264700
+CAGTTG 60.051086 2.023894 2.500952 1.429283 7.657366 2014.450510
+CAGTTT 59.100467 2.011384 2.455218 1.390258 7.657366 2832.905518
+CATAAA 105.057752 2.322094 2.494315 1.423598 7.657366 2152.670068
+CATAAC 102.669365 2.331548 2.542318 1.464890 7.657366 1460.056982
+CATAAG 103.827349 2.196316 2.498918 1.427540 7.657366 572.988944
+CATAAT 104.810354 2.523647 2.753283 1.650960 7.657366 1467.653926
+CATACA 99.986649 2.084882 2.251654 1.220992 7.657366 873.343366
+CATACC 99.952134 2.225940 2.371015 1.319356 7.657366 1037.843953
+CATACG 99.968408 2.301011 2.348747 1.300812 7.657366 864.032316
+CATACT 101.203702 2.487418 2.629788 1.541137 7.657366 742.484178
+CATAGA 102.246031 2.587098 2.418820 1.359457 7.657366 567.087447
+CATAGC 101.083348 2.068640 2.379234 1.326221 7.657366 896.325932
+CATAGG 101.847063 2.539953 2.381848 1.328407 7.657366 379.247198
+CATAGT 103.304825 2.268839 2.593860 1.509663 7.657366 531.464655
+CATATA 101.180508 2.448833 2.671896 1.578300 7.657366 635.220489
+CATATC 102.497686 2.511352 2.902054 1.786564 7.657366 1330.984932
+CATATG 100.783012 2.553787 2.754544 1.652094 7.657366 863.348275
+CATATT 104.445924 2.801609 3.151966 2.022240 7.657366 1719.057185
+CATCAA 114.199078 2.595596 2.698632 1.602049 7.657366 3136.305496
+CATCAC 112.307342 2.266289 2.626159 1.537949 7.657366 2747.383260
+CATCAG 113.540569 2.792792 2.688727 1.593237 7.657366 3706.086916
+CATCAT 113.252970 2.395132 2.734017 1.633662 7.657366 2677.462205
+CATCCA 113.189398 2.415215 2.617880 1.530681 7.657366 2026.543849
+CATCCC 111.372735 2.510567 2.697752 1.601265 7.657366 1350.144962
+CATCCG 113.446677 2.805648 2.780721 1.675701 7.657366 2110.169486
+CATCCT 112.706070 2.402789 2.686490 1.591249 7.657366 1259.859641
+CATCGA 113.598498 2.738730 2.886095 1.771847 7.657366 1833.945804
+CATCGC 111.272286 2.502323 2.650103 1.559030 7.657366 3489.524300
+CATCGG 112.862101 2.842270 2.806936 1.699453 7.657366 2176.386578
+CATCGT 112.837098 2.939725 2.782212 1.677049 7.657366 1692.303853
+CATCTA 113.787569 2.092332 2.658977 1.566867 7.657366 432.269176
+CATCTC 112.634970 2.252435 2.673515 1.579735 7.657366 1185.932872
+CATCTG 113.454737 2.505517 2.756007 1.653411 7.657366 2091.854966
+CATCTT 113.500285 2.655342 3.069586 1.943480 7.657366 1780.956338
+CATGAA 101.606055 2.749568 2.516887 1.442965 7.657366 1468.215138
+CATGAC 98.040370 2.411364 2.374609 1.322356 7.657366 1323.278448
+CATGAG 100.475511 2.942205 2.458328 1.392900 7.657366 835.208629
+CATGAT 100.543008 2.630354 2.649396 1.558406 7.657366 1702.166347
+CATGCA 100.695052 2.656019 2.326198 1.282125 7.657366 1128.929907
+CATGCC 100.321492 2.622041 2.393538 1.338200 7.657366 1563.605348
+CATGCG 100.469182 2.515386 2.418393 1.359098 7.657366 1324.943847
+CATGCT 102.120235 2.622800 2.547806 1.469636 7.657366 1228.466078
+CATGGA 101.864086 3.191903 2.636931 1.547421 7.657366 974.021121
+CATGGC 99.659361 2.468535 2.463209 1.397051 7.657366 1584.735816
+CATGGG 101.074706 2.364809 2.526104 1.450899 7.657366 763.072403
+CATGGT 101.925236 2.834555 2.647450 1.556689 7.657366 1588.593884
+CATGTA 103.227494 2.395025 2.667155 1.574101 7.657366 588.206564
+CATGTC 104.113937 2.755594 2.979662 1.858706 7.657366 842.057310
+CATGTG 103.139235 2.639730 2.832747 1.722947 7.657366 643.175895
+CATGTT 106.281171 2.934083 3.151351 2.021647 7.657366 1339.523655
+CATTAA 100.838982 2.298007 2.188348 1.169863 7.657366 2089.214691
+CATTAC 97.375843 2.124451 1.945903 0.980939 7.657366 1806.032701
+CATTAG 99.964303 2.331000 2.109509 1.107216 7.657366 576.645609
+CATTAT 98.985049 2.135170 2.026503 1.042512 7.657366 1758.878855
+CATTCA 95.143474 2.276727 2.237769 1.209716 7.657366 1706.740424
+CATTCC 93.584204 2.247805 2.113580 1.110422 7.657366 1302.541073
+CATTCG 95.238328 2.232851 2.237122 1.209191 7.657366 1199.809148
+CATTCT 91.446820 4.963277 2.109579 1.107271 7.657366 5231.301935
+CATTGA 98.924908 2.907437 2.086496 1.089147 7.657366 2163.461565
+CATTGC 95.087503 2.002967 1.928000 0.967433 7.657366 2400.053932
+CATTGG 97.804438 2.455649 2.061749 1.069828 7.657366 1273.273739
+CATTGT 96.580433 2.074044 2.071877 1.077721 7.657366 1267.906817
+CATTTA 91.414428 1.924241 1.977960 1.005279 7.657366 1251.651753
+CATTTC 89.444010 1.789585 1.866877 0.921794 7.657366 1775.412536
+CATTTG 91.055248 1.768066 1.868394 0.922918 7.657366 1340.028058
+CATTTT 91.141069 1.789203 1.809337 0.879508 7.657366 2829.143094
+CCAAAA 89.631551 2.256962 1.958498 0.990478 7.657366 1514.721660
+CCAAAC 86.756066 1.964075 2.126669 1.120754 7.657366 1216.984714
+CCAAAG 89.025977 2.475831 1.994313 1.017772 7.657366 1435.974862
+CCAAAT 87.630038 2.122555 2.273670 1.238944 7.657366 892.640927
+CCAACA 80.252065 1.937043 1.900570 0.946861 7.657366 1375.712636
+CCAACC 79.003515 1.895960 1.817330 0.885342 7.657366 1292.194462
+CCAACG 80.046629 2.021115 1.928591 0.967878 7.657366 1590.955766
+CCAACT 79.858525 1.988809 1.789687 0.865218 7.657366 884.451098
+CCAAGA 87.963816 2.053834 1.912352 0.955679 7.657366 335.328192
+CCAAGC 85.852733 2.271471 2.065038 1.072389 7.657366 582.205517
+CCAAGG 87.100966 2.328818 2.005671 1.026479 7.657366 247.828038
+CCAAGT 86.432374 2.038798 2.200695 1.179778 7.657366 247.290368
+CCAATA 79.056300 1.862506 1.903561 0.949097 7.657366 1228.315880
+CCAATC 77.382158 2.110771 2.175604 1.159658 7.657366 1333.537615
+CCAATG 78.271065 1.715493 1.984234 1.010066 7.657366 1214.774296
+CCAATT 78.319479 2.106626 2.335372 1.289717 7.657366 767.667397
+CCACAA 92.541586 1.297019 1.346663 0.564740 7.657366 1327.790491
+CCACAC 89.843858 1.372114 1.301457 0.536543 7.657366 992.875432
+CCACAG 91.799021 1.262124 1.307603 0.540349 7.657366 1362.626539
+CCACAT 91.138724 1.786739 1.299357 0.535245 7.657366 1106.408739
+CCACCA 89.283835 1.362600 1.379201 0.585331 7.657366 3201.998288
+CCACCC 87.971864 1.436674 1.410409 0.605310 7.657366 912.352287
+CCACCG 89.116049 1.356915 1.380387 0.586086 7.657366 2332.375943
+CCACCT 88.798199 1.341868 1.348522 0.565910 7.657366 983.006021
+CCACGA 91.528908 1.441935 1.269472 0.516886 7.657366 1143.141336
+CCACGC 89.172082 1.614749 1.339052 0.559959 7.657366 2534.793406
+CCACGG 90.752977 1.366287 1.338231 0.559444 7.657366 1696.863522
+CCACGT 90.291817 1.790962 1.290491 0.529777 7.657366 925.139473
+CCACTA 87.703431 1.468625 1.428622 0.617073 7.657366 584.794888
+CCACTC 85.669562 1.390734 1.429209 0.617453 7.657366 794.019696
+CCACTG 86.871155 1.473512 1.414724 0.608090 7.657366 1618.300924
+CCACTT 86.434049 1.528651 1.423773 0.613934 7.657366 1670.383201
+CCAGAA 85.860605 2.045464 1.956686 0.989104 7.657366 2766.319589
+CCAGAC 80.439242 1.936027 2.177102 1.160857 7.657366 2535.309192
+CCAGAG 84.795848 2.302664 2.075801 1.080784 7.657366 1297.037459
+CCAGAT 82.175953 1.786773 2.148094 1.137733 7.657366 2240.894352
+CCAGCA 80.251776 1.768838 1.680024 0.786925 7.657366 5102.553836
+CCAGCC 78.895160 1.940538 1.757226 0.841786 7.657366 3354.475351
+CCAGCG 80.225510 1.951648 1.736299 0.826794 7.657366 5833.778648
+CCAGCT 79.977028 1.798961 1.808936 0.879215 7.657366 2317.037843
+CCAGGA 85.373897 1.929944 1.908251 0.952607 7.657366 1398.648873
+CCAGGC 81.630122 1.918027 2.037203 1.050780 7.657366 2541.802058
+CCAGGG 84.215903 1.945182 1.908841 0.953048 7.657366 1217.548591
+CCAGGT 83.187156 1.944640 2.049814 1.060552 7.657366 1857.513317
+CCAGTA 77.362477 1.894236 2.058972 1.067668 7.657366 1778.621438
+CCAGTC 74.456760 1.982742 2.231464 1.204606 7.657366 1474.131842
+CCAGTG 76.393779 2.012341 2.148273 1.137875 7.657366 2125.491326
+CCAGTT 75.912643 1.990358 2.301434 1.261705 7.657366 3158.899423
+CCATAA 86.147294 1.885639 1.791582 0.866594 7.657366 1611.812721
+CCATAC 82.743997 1.783807 1.955593 0.988276 7.657366 922.436536
+CCATAG 86.627545 4.963277 1.967516 0.997327 7.657366 1047.750783
+CCATAT 84.149239 1.878349 1.951820 0.985417 7.657366 1205.963846
+CCATCA 79.125056 2.006013 2.265553 1.232314 7.657366 2957.336227
+CCATCC 77.626659 1.992997 2.265964 1.232650 7.657366 1517.811007
+CCATCG 79.195481 2.028440 2.195437 1.175552 7.657366 2402.664605
+CCATCT 78.459284 1.907953 2.244751 1.215381 7.657366 1263.278048
+CCATGA 83.785786 1.947503 1.853067 0.911584 7.657366 1313.028680
+CCATGC 80.144407 1.962106 2.155719 1.143796 7.657366 1601.216709
+CCATGG 82.816449 2.132560 1.965326 0.995662 7.657366 784.617441
+CCATGT 81.189242 2.041933 2.097873 1.098067 7.657366 994.863629
+CCATTA 73.897150 2.179864 2.385670 1.331606 7.657366 1394.092150
+CCATTC 70.648725 2.096917 2.406065 1.348719 7.657366 1487.810422
+CCATTG 72.909640 2.190032 2.327914 1.283544 7.657366 2005.374053
+CCATTT 71.933181 2.115896 2.258334 1.226429 7.657366 2096.070474
+CCCAAA 101.945692 1.555142 1.556854 0.701992 7.657366 685.600632
+CCCAAC 99.804194 1.447575 1.545508 0.694332 7.657366 955.659490
+CCCAAG 100.796124 1.459455 1.516532 0.674897 7.657366 243.869844
+CCCAAT 100.508533 1.447882 1.586080 0.721852 7.657366 718.705683
+CCCACA 94.158096 1.702680 1.321659 0.549085 7.657366 917.429047
+CCCACC 92.846868 1.684547 1.392955 0.594109 7.657366 1031.162087
+CCCACG 93.902608 1.692780 1.334748 0.557262 7.657366 1225.644957
+CCCACT 93.845532 1.694221 1.428060 0.616709 7.657366 898.439737
+CCCAGA 99.658182 1.605332 1.567282 0.709057 7.657366 1133.198735
+CCCAGC 97.549931 1.615577 1.591115 0.725292 7.657366 2516.046398
+CCCAGG 98.516310 1.485198 1.543017 0.692654 7.657366 1309.840028
+CCCAGT 98.298027 1.656979 1.648571 0.764930 7.657366 1234.355175
+CCCATA 92.711035 1.670132 1.552317 0.698925 7.657366 845.753126
+CCCATC 91.124479 1.593727 1.708904 0.807303 7.657366 1265.881547
+CCCATG 92.207370 1.771842 1.652436 0.767622 7.657366 881.795163
+CCCATT 92.349712 1.575454 1.761922 0.845163 7.657366 1097.265795
+CCCCAA 100.709668 2.176578 1.275764 0.520734 7.657366 538.970170
+CCCCAC 98.476096 1.714300 1.290090 0.529530 7.657366 855.462235
+CCCCAG 100.093472 1.636620 1.257305 0.509473 7.657366 1470.967125
+CCCCAT 99.597821 1.603911 1.369289 0.579032 7.657366 922.394182
+CCCCCA 97.178429 1.774670 1.228466 0.492045 7.657366 623.679391
+CCCCCC 96.404430 1.456577 1.127182 0.432465 7.657366 252.889031
+CCCCCG 97.359240 1.973884 1.194330 0.471680 7.657366 550.357150
+CCCCCT 96.732217 1.518192 1.155785 0.449030 7.657366 421.868717
+CCCCGA 99.811617 1.652215 1.297620 0.534172 7.657366 546.588219
+CCCCGC 97.524744 1.436168 1.277153 0.521585 7.657366 1051.589836
+CCCCGG 98.809906 1.695631 1.275662 0.520671 7.657366 1181.421619
+CCCCGT 98.408447 1.649975 1.391392 0.593109 7.657366 552.947834
+CCCCTA 95.794918 1.651957 1.237641 0.497568 7.657366 209.533682
+CCCCTC 94.546639 1.337608 1.361635 0.574185 7.657366 326.491217
+CCCCTG 95.488160 1.530228 1.259909 0.511057 7.657366 976.056505
+CCCCTT 95.204338 1.427948 1.400456 0.598914 7.657366 670.541552
+CCCGAA 96.779669 1.536928 1.458025 0.636221 7.657366 960.509632
+CCCGAC 92.125411 1.461307 1.770901 0.851632 7.657366 895.089664
+CCCGAG 95.211906 1.647898 1.523674 0.679671 7.657366 438.976850
+CCCGAT 93.979075 1.569830 1.753268 0.838943 7.657366 1194.991846
+CCCGCA 91.970409 1.711951 1.377244 0.584086 7.657366 1998.997401
+CCCGCC 90.844044 1.614704 1.478861 0.649907 7.657366 1918.035119
+CCCGCG 92.107387 1.609212 1.427108 0.616092 7.657366 1665.937014
+CCCGCT 92.046330 1.591132 1.504187 0.666673 7.657366 1610.789987
+CCCGGA 96.511068 1.540342 1.520291 0.677408 7.657366 1535.416997
+CCCGGC 93.423395 1.507087 1.610984 0.738919 7.657366 2116.494138
+CCCGGG 95.372000 1.524357 1.504393 0.666810 7.657366 578.621589
+CCCGGT 95.001224 1.503876 1.621265 0.746005 7.657366 1722.968118
+CCCGTA 90.582637 1.667887 1.667934 0.778446 7.657366 776.592079
+CCCGTC 88.312430 1.577716 1.798387 0.871536 7.657366 950.260192
+CCCGTG 89.937361 1.802029 1.743777 0.832140 7.657366 876.395452
+CCCGTT 89.920230 1.713725 1.898802 0.945539 7.657366 1172.644756
+CCCTAA 95.516934 1.448478 1.432501 0.619587 7.657366 310.028126
+CCCTAC 92.861240 1.737986 1.570761 0.711419 7.657366 381.368135
+CCCTAG 95.215861 4.963277 1.821056 0.888066 7.657366 23.940710
+CCCTAT 94.149307 1.858157 1.587742 0.722987 7.657366 351.732634
+CCCTCA 88.773956 2.043431 1.779895 0.858128 7.657366 468.346584
+CCCTCC 87.099568 1.935042 1.789100 0.864793 7.657366 348.372285
+CCCTCG 88.666587 1.723203 1.816149 0.884479 7.657366 419.969888
+CCCTCT 88.372195 2.114294 1.895287 0.942916 7.657366 506.269804
+CCCTGA 92.799802 1.593002 1.451441 0.631916 7.657366 1586.238687
+CCCTGC 89.665576 1.573723 1.621530 0.746187 7.657366 1734.108716
+CCCTGG 91.600393 1.810819 1.514313 0.673416 7.657366 1429.489644
+CCCTGT 90.712546 1.686987 1.524179 0.680009 7.657366 993.578798
+CCCTTA 84.224760 1.961516 1.919623 0.961134 7.657366 449.209247
+CCCTTC 81.083675 1.804122 1.946806 0.981622 7.657366 1006.015600
+CCCTTG 83.456749 1.894423 1.913097 0.956237 7.657366 531.050604
+CCCTTT 82.301445 1.801886 1.767197 0.848961 7.657366 1039.010156
+CCGAAA 82.280518 1.911851 1.897052 0.944233 7.657366 1718.819946
+CCGAAC 79.371830 1.708561 2.088588 1.090786 7.657366 1195.496183
+CCGAAG 81.214434 2.248432 1.998158 1.020716 7.657366 1437.093350
+CCGAAT 80.334991 1.727515 2.090703 1.092443 7.657366 944.140841
+CCGACA 72.080861 1.962302 2.027957 1.043634 7.657366 1354.633309
+CCGACC 71.036583 1.725701 1.954533 0.987472 7.657366 1166.878322
+CCGACG 72.059002 1.812573 2.075791 1.080776 7.657366 1544.629443
+CCGACT 71.926979 1.557171 1.905939 0.950876 7.657366 890.795437
+CCGAGA 80.810220 1.944942 1.893854 0.941846 7.657366 745.328446
+CCGAGC 78.820135 1.866717 1.979705 1.006609 7.657366 968.580086
+CCGAGG 79.748057 2.063714 1.871623 0.925311 7.657366 713.462244
+CCGAGT 79.124925 2.174397 2.157513 1.145224 7.657366 523.909845
+CCGATA 70.995555 1.582594 1.810454 0.880322 7.657366 1719.058782
+CCGATC 69.449013 1.735861 2.028144 1.043779 7.657366 1238.696105
+CCGATG 70.144615 1.628788 1.789997 0.865444 7.657366 2199.191315
+CCGATT 70.339070 1.703947 2.127331 1.121277 7.657366 1358.168541
+CCGCAA 88.187556 1.488709 1.437618 0.622911 7.657366 2227.949788
+CCGCAC 85.637952 1.417318 1.444937 0.627673 7.657366 1660.596781
+CCGCAG 87.275454 1.525150 1.450344 0.631199 7.657366 2842.718432
+CCGCAT 86.389192 1.401124 1.408193 0.603884 7.657366 1967.247297
+CCGCCA 84.431916 1.515507 1.630710 0.752533 7.657366 4456.306674
+CCGCCC 83.204505 1.484451 1.609797 0.738103 7.657366 1583.200659
+CCGCCG 84.380577 1.523436 1.574571 0.714009 7.657366 3443.314910
+CCGCCT 83.940777 1.431579 1.530380 0.684162 7.657366 1804.594085
+CCGCGA 86.643522 1.562302 1.487397 0.655542 7.657366 2166.048992
+CCGCGC 83.689240 1.512084 1.604480 0.734449 7.657366 3171.547746
+CCGCGG 85.449717 1.512847 1.528124 0.682650 7.657366 846.086923
+CCGCGT 84.377778 1.539074 1.594597 0.727673 7.657366 1921.286672
+CCGCTA 82.749344 1.562669 1.673125 0.782083 7.657366 1158.173551
+CCGCTC 80.558504 1.476196 1.602390 0.733015 7.657366 1220.999188
+CCGCTG 82.057334 1.574317 1.618353 0.743995 7.657366 3678.263588
+CCGCTT 81.441148 1.464220 1.435913 0.621803 7.657366 2072.230985
+CCGGAA 82.175361 1.874902 1.968528 0.998097 7.657366 2864.129459
+CCGGAC 76.792594 1.723310 2.290544 1.252761 7.657366 1370.785416
+CCGGAG 80.784247 1.953668 2.049129 1.060020 7.657366 1392.182116
+CCGGAT 78.416296 1.790386 2.265669 1.232410 7.657366 2239.406547
+CCGGCA 76.854549 1.707165 1.792818 0.867490 7.657366 3301.674100
+CCGGCC 75.408482 1.540633 1.734514 0.825519 7.657366 863.930349
+CCGGCG 76.624321 1.688049 1.758662 0.842818 7.657366 2604.884556
+CCGGCT 76.607070 1.747723 1.732394 0.824006 7.657366 1291.437709
+CCGGGA 82.608620 1.699697 1.888151 0.937595 7.657366 1421.533437
+CCGGGC 79.759022 1.629900 2.006737 1.027297 7.657366 1783.796824
+CCGGGG 81.594900 1.786404 1.890789 0.939560 7.657366 1003.830042
+CCGGGT 80.742337 1.741521 1.976499 1.004165 7.657366 1521.641117
+CCGGTA 75.162134 1.935846 2.038196 1.051548 7.657366 2119.315032
+CCGGTC 72.637826 1.964550 2.245600 1.216071 7.657366 1194.842654
+CCGGTG 74.068997 1.849313 2.069108 1.075561 7.657366 2540.909601
+CCGGTT 74.052437 2.038281 2.268504 1.234723 7.657366 2220.604577
+CCGTAA 78.823316 1.922702 1.988985 1.013695 7.657366 1399.377144
+CCGTAC 75.450790 1.999493 2.202001 1.180828 7.657366 1038.497789
+CCGTAG 77.912213 1.821808 1.992573 1.016440 7.657366 932.342725
+CCGTAT 76.650817 2.104954 2.149189 1.138603 7.657366 1071.211462
+CCGTCA 71.462202 2.072036 2.447480 1.383691 7.657366 1923.198234
+CCGTCC 69.972288 2.044325 2.564881 1.484435 7.657366 770.126591
+CCGTCG 71.494452 2.010180 2.436808 1.374651 7.657366 1688.324337
+CCGTCT 71.123973 2.213470 2.583262 1.500420 7.657366 1123.746243
+CCGTGA 75.587352 2.129209 2.154971 1.143201 7.657366 1308.039728
+CCGTGC 71.689313 2.086290 2.403341 1.346429 7.657366 1363.707089
+CCGTGG 74.397147 2.298483 2.242695 1.213712 7.657366 1709.569601
+CCGTGT 72.549758 1.757733 2.265039 1.231895 7.657366 632.592972
+CCGTTA 66.263457 2.175116 2.554829 1.475717 7.657366 1669.642114
+CCGTTC 62.982304 2.011638 2.620085 1.532616 7.657366 1328.600163
+CCGTTG 65.391083 2.138235 2.528517 1.452979 7.657366 1859.777376
+CCGTTT 64.154678 1.942000 2.384089 1.330283 7.657366 2262.299201
+CCTAAA 108.761479 2.233040 2.400412 1.343968 7.657366 408.651777
+CCTAAC 106.095317 2.328202 2.475660 1.407657 7.657366 469.805908
+CCTAAG 107.953153 1.955919 2.318512 1.275776 7.657366 186.238986
+CCTAAT 108.194266 2.313661 2.525806 1.450642 7.657366 335.536165
+CCTACA 104.166413 2.097662 2.200681 1.179766 7.657366 587.922276
+CCTACC 103.793939 2.332130 2.193860 1.174286 7.657366 513.945413
+CCTACG 103.875183 2.115621 2.173092 1.157651 7.657366 598.296776
+CCTACT 105.524609 2.261013 2.332383 1.287242 7.657366 268.297323
+CCTAGA 104.379977 3.756365 2.985704 1.864362 7.657366 16.916798
+CCTAGC 104.371370 1.844571 2.129492 1.122986 7.657366 73.001132
+CCTAGG 104.960000 2.880000 2.575000 1.504000 7.657366 0.000000
+CCTAGT 106.126525 3.037812 2.610411 1.524136 7.657366 27.605229
+CCTATA 104.605844 2.177140 2.453628 1.388908 7.657366 299.047405
+CCTATC 104.722722 2.744743 2.659619 1.567435 7.657366 508.511605
+CCTATG 104.211168 2.646801 2.520050 1.445686 7.657366 473.082124
+CCTATT 106.558651 2.666926 2.892608 1.777849 7.657366 397.124522
+CCTCAA 114.307912 2.293411 2.395767 1.340069 7.657366 879.885179
+CCTCAC 111.952356 2.215016 2.407284 1.349744 7.657366 657.085464
+CCTCAG 113.765762 2.210364 2.398331 1.342221 7.657366 411.181599
+CCTCAT 113.331500 2.029491 2.175198 1.159334 7.657366 579.187353
+CCTCCA 113.289872 2.367398 2.350325 1.302124 7.657366 586.297548
+CCTCCC 111.676533 2.167907 2.331752 1.286719 7.657366 360.292360
+CCTCCG 113.017375 2.008099 2.213403 1.190011 7.657366 621.689873
+CCTCCT 112.731835 2.021471 2.364970 1.314313 7.657366 327.951862
+CCTCGA 113.996738 2.287669 2.409800 1.351860 7.657366 642.463163
+CCTCGC 111.260555 2.086768 2.284351 1.247684 7.657366 1075.270552
+CCTCGG 113.220473 2.213324 2.345396 1.298030 7.657366 720.453074
+CCTCGT 113.060810 2.103616 2.440215 1.377535 7.657366 461.232049
+CCTCTA 112.703557 2.127541 2.357885 1.308411 7.657366 288.209187
+CCTCTC 111.801474 2.211922 2.372465 1.320566 7.657366 492.171807
+CCTCTG 112.709424 2.137346 2.451202 1.386848 7.657366 753.519855
+CCTCTT 113.140211 2.256067 2.595725 1.511292 7.657366 908.945053
+CCTGAA 104.611932 2.221921 2.303995 1.263812 7.657366 2587.837468
+CCTGAC 100.664187 2.100541 2.194902 1.175122 7.657366 2121.990103
+CCTGAG 103.679604 2.222127 2.269652 1.235660 7.657366 939.653098
+CCTGAT 102.750844 2.264016 2.454356 1.389525 7.657366 2779.648618
+CCTGCA 102.758471 2.186810 2.204828 1.183103 7.657366 2360.053183
+CCTGCC 102.011683 2.221860 2.209714 1.187037 7.657366 2081.250289
+CCTGCG 102.622322 2.266217 2.220348 1.195616 7.657366 2656.255376
+CCTGCT 103.615094 2.366774 2.416361 1.357385 7.657366 2500.763547
+CCTGGA 104.884710 2.300571 2.333395 1.288080 7.657366 1404.224764
+CCTGGC 102.190933 2.104828 2.291223 1.253318 7.657366 2830.312777
+CCTGGG 104.045212 2.236950 2.330863 1.285983 7.657366 1393.539013
+CCTGGT 104.424169 2.266852 2.399244 1.342987 7.657366 1903.527042
+CCTGTA 103.556159 2.500634 2.552837 1.473992 7.657366 1081.145054
+CCTGTC 103.471816 2.635920 2.625556 1.537419 7.657366 852.045343
+CCTGTG 103.259376 2.525490 2.570873 1.489640 7.657366 1183.103652
+CCTGTT 105.592920 2.748007 2.885618 1.771408 7.657366 2080.140957
+CCTTAA 103.100332 1.745069 1.740727 0.829959 7.657366 822.252679
+CCTTAC 99.307612 1.674450 1.641634 0.760107 7.657366 742.584792
+CCTTAG 102.065958 1.997460 1.750425 0.836904 7.657366 322.924455
+CCTTAT 100.813608 1.848468 1.737980 0.827995 7.657366 963.506579
+CCTTCA 97.188930 1.654876 1.827024 0.892435 7.657366 1488.241746
+CCTTCC 95.452545 1.594742 1.776107 0.855390 7.657366 1412.477169
+CCTTCG 97.129019 1.748597 1.801717 0.873957 7.657366 1361.271291
+CCTTCT 96.833908 1.615094 1.870029 0.924130 7.657366 957.840812
+CCTTGA 101.125438 1.779547 1.713077 0.810263 7.657366 702.568514
+CCTTGC 97.439157 1.727521 1.617992 0.743746 7.657366 1275.101741
+CCTTGG 100.595934 1.731115 1.697368 0.799142 7.657366 276.585555
+CCTTGT 98.735205 1.764610 1.725654 0.819201 7.657366 697.729563
+CCTTTA 93.741088 1.694389 1.703632 0.803571 7.657366 1289.279798
+CCTTTC 91.617713 1.744351 1.693565 0.796459 7.657366 1343.297215
+CCTTTG 93.181175 1.602271 1.667689 0.778275 7.657366 1369.789839
+CCTTTT 93.041117 1.765486 1.758010 0.842349 7.657366 1553.213972
+CGAAAA 77.548659 1.690538 1.547715 0.695820 7.657366 2481.550518
+CGAAAC 75.238946 1.544110 1.721193 0.816027 7.657366 1647.411250
+CGAAAG 76.555097 1.751574 1.591439 0.725513 7.657366 1927.743650
+CGAAAT 75.760480 1.570699 1.726910 0.820096 7.657366 1736.381392
+CGAACA 66.410548 2.165318 2.172183 1.156924 7.657366 1565.392629
+CGAACC 65.506731 2.003847 2.245574 1.216050 7.657366 1107.722355
+CGAACG 66.766081 2.005538 2.207136 1.184961 7.657366 1326.140816
+CGAACT 66.552400 2.056037 2.166650 1.152506 7.657366 1219.252982
+CGAAGA 74.968273 1.780533 1.608207 0.737010 7.657366 1744.444121
+CGAAGC 73.764724 1.501006 1.567624 0.709289 7.657366 1725.297371
+CGAAGG 74.188077 1.653573 1.644051 0.761787 7.657366 1239.700794
+CGAAGT 73.753225 1.548154 1.580450 0.718012 7.657366 1181.955228
+CGAATA 66.408564 1.928762 2.075596 1.080624 7.657366 1231.589847
+CGAATC 65.021668 1.750059 1.939221 0.975891 7.657366 1126.547262
+CGAATG 65.728987 1.815281 2.049763 1.060513 7.657366 1249.766217
+CGAATT 65.683332 1.677223 1.830703 0.895132 7.657366 1066.498022
+CGACAA 82.178182 2.090032 1.896284 0.943659 7.657366 2013.082740
+CGACAC 80.940323 1.819108 1.702456 0.802739 7.657366 1115.744485
+CGACAG 81.471359 1.950727 1.838552 0.900895 7.657366 1417.864774
+CGACAT 81.541979 2.053579 1.779069 0.857530 7.657366 1424.292066
+CGACCA 78.557581 1.776105 1.809995 0.879987 7.657366 2090.665448
+CGACCC 77.712265 1.652200 1.726388 0.819725 7.657366 730.707820
+CGACCG 78.529156 1.812702 1.807597 0.878239 7.657366 1415.063447
+CGACCT 78.436286 1.786901 1.772987 0.853136 7.657366 1110.547144
+CGACGA 81.210779 1.918067 1.809058 0.879304 7.657366 1558.973434
+CGACGC 79.966390 1.679879 1.618906 0.744376 7.657366 2198.953746
+CGACGG 80.431074 1.772414 1.699609 0.800726 7.657366 1505.556118
+CGACGT 80.298881 1.749430 1.720255 0.815360 7.657366 1268.939853
+CGACTA 77.977705 1.712562 1.708771 0.807209 7.657366 630.736377
+CGACTC 76.827379 1.646502 1.632824 0.753997 7.657366 610.618928
+CGACTG 77.720137 1.621909 1.702778 0.802966 7.657366 1216.883539
+CGACTT 77.113857 1.553832 1.422606 0.613179 7.657366 1087.174935
+CGAGAA 77.412741 1.614056 1.493341 0.659476 7.657366 1479.955928
+CGAGAC 73.380993 1.304578 1.795418 0.869378 7.657366 436.489264
+CGAGAG 76.449874 1.484161 1.503046 0.665915 7.657366 456.526656
+CGAGAT 74.774064 1.513382 1.673513 0.782355 7.657366 1016.752530
+CGAGCA 70.638139 1.749100 1.863109 0.919005 7.657366 1495.478648
+CGAGCC 69.523507 1.569309 1.890590 0.939413 7.657366 582.103784
+CGAGCG 70.744223 1.726310 1.718149 0.813863 7.657366 1322.818953
+CGAGCT 70.226969 1.553370 1.681802 0.788175 7.657366 812.431111
+CGAGGA 76.678069 1.438634 1.437878 0.623079 7.657366 912.983769
+CGAGGC 74.715554 1.283205 1.507299 0.668743 7.657366 837.987521
+CGAGGG 75.895987 1.504194 1.434304 0.620758 7.657366 480.491224
+CGAGGT 75.366359 1.379233 1.499077 0.663279 7.657366 807.185660
+CGAGTA 70.169831 1.417677 1.534577 0.686979 7.657366 544.300942
+CGAGTC 68.157020 1.381781 1.719827 0.815056 7.657366 443.065943
+CGAGTG 69.573652 1.506934 1.625336 0.748816 7.657366 610.156493
+CGAGTT 69.649068 1.487844 1.758531 0.842724 7.657366 837.640316
+CGATAA 75.199666 1.913731 2.067752 1.074504 7.657366 2623.900740
+CGATAC 72.903739 1.778580 2.012169 1.031471 7.657366 1481.766101
+CGATAG 74.474332 1.728482 1.981031 1.007621 7.657366 1077.328892
+CGATAT 73.981205 2.065185 2.206513 1.184459 7.657366 2234.576078
+CGATCA 68.090669 1.844038 2.135961 1.128107 7.657366 2090.412284
+CGATCC 66.982727 1.878571 2.183287 1.165807 7.657366 1700.747170
+CGATCG 68.275557 1.779978 2.134784 1.127175 7.657366 1668.650910
+CGATCT 68.040386 2.038459 2.226676 1.200731 7.657366 1511.352474
+CGATGA 72.634115 1.791697 1.970032 0.999241 7.657366 2328.198032
+CGATGC 69.969146 1.604072 1.962475 0.993497 7.657366 2255.407899
+CGATGG 71.795510 1.704136 2.006160 1.026854 7.657366 2127.886108
+CGATGT 70.854772 1.840673 2.107606 1.105718 7.657366 1446.790863
+CGATTA 63.945133 1.907500 2.149571 1.138906 7.657366 1491.545775
+CGATTC 61.437853 1.916850 2.296399 1.257567 7.657366 1130.779779
+CGATTG 63.342161 1.999667 2.144873 1.135174 7.657366 1464.527707
+CGATTT 62.636226 1.992722 2.130114 1.123478 7.657366 2351.470242
+CGCAAA 103.209485 1.957018 2.124452 1.119001 7.657366 2648.053667
+CGCAAC 100.136878 1.884808 2.165662 1.151719 7.657366 2006.082331
+CGCAAG 102.007928 2.187244 2.169102 1.154464 7.657366 1261.540941
+CGCAAT 101.580272 1.867206 2.261747 1.229211 7.657366 2178.984849
+CGCACA 93.653804 1.692928 1.591208 0.725355 7.657366 1465.514495
+CGCACC 92.406170 1.607998 1.620354 0.745376 7.657366 2184.238923
+CGCACG 93.574387 1.678990 1.656988 0.770796 7.657366 1710.563199
+CGCACT 93.553043 1.649269 1.750908 0.837250 7.657366 1246.163879
+CGCAGA 100.230296 2.012450 2.150173 1.139385 7.657366 2236.083247
+CGCAGC 97.607255 1.827570 2.012033 1.031366 7.657366 3263.445617
+CGCAGG 99.270999 1.938503 2.104822 1.103528 7.657366 2661.483461
+CGCAGT 98.895497 2.033723 2.203127 1.181733 7.657366 2018.574415
+CGCATA 93.399434 1.732088 1.833554 0.897224 7.657366 1301.455933
+CGCATC 92.097821 1.821393 1.897263 0.944391 7.657366 2939.015082
+CGCATG 92.723061 1.737272 1.837567 0.900171 7.657366 1262.476540
+CGCATT 93.420402 1.816503 2.030036 1.045239 7.657366 2285.580745
+CGCCAA 103.637708 1.901709 1.757952 0.842308 7.657366 2876.971919
+CGCCAC 101.013783 1.765948 1.801979 0.874148 7.657366 3159.895818
+CGCCAG 102.901872 1.905353 1.821437 0.888345 7.657366 7059.787668
+CGCCAT 102.232151 1.931164 1.984105 1.009967 7.657366 3752.691043
+CGCCCA 99.912893 1.737641 1.558967 0.703421 7.657366 2454.881493
+CGCCCC 98.280860 1.726816 1.573310 0.713151 7.657366 1215.130211
+CGCCCG 99.856561 1.762882 1.588251 0.723334 7.657366 2483.322166
+CGCCCT 99.148868 1.726816 1.726298 0.819660 7.657366 1412.594565
+CGCCGA 102.184145 2.004341 1.982820 1.008986 7.657366 2784.914303
+CGCCGC 99.897003 1.752495 1.802857 0.874786 7.657366 4831.769761
+CGCCGG 101.294901 1.906143 1.808479 0.878882 7.657366 2869.678735
+CGCCGT 100.817146 1.981721 2.018561 1.036390 7.657366 2782.110042
+CGCCTA 98.978333 1.814328 1.737593 0.827718 7.657366 504.727893
+CGCCTC 96.811972 1.752205 1.793652 0.868096 7.657366 1083.662019
+CGCCTG 98.342408 1.690275 1.774924 0.854535 7.657366 3733.830949
+CGCCTT 97.971993 1.833629 1.893819 0.941820 7.657366 2199.498115
+CGCGAA 98.503542 2.047102 2.027995 1.043664 7.657366 2307.348699
+CGCGAC 94.227697 1.809558 1.954209 0.987226 7.657366 1968.276607
+CGCGAG 97.670678 1.993669 1.986894 1.012097 7.657366 1288.457075
+CGCGAT 96.243840 1.942508 2.078758 1.083094 7.657366 2734.071867
+CGCGCA 92.916196 1.724953 1.744527 0.832677 7.657366 2981.099238
+CGCGCC 91.747030 1.715820 1.775101 0.854663 7.657366 3775.699404
+CGCGCG 93.060532 1.687021 1.737733 0.827818 7.657366 2918.444419
+CGCGCT 93.130809 1.748146 1.866048 0.921180 7.657366 2757.913046
+CGCGGA 98.073477 2.122467 2.121783 1.116894 7.657366 1613.736277
+CGCGGC 95.087472 1.775964 1.971542 1.000390 7.657366 2655.723630
+CGCGGG 97.279821 2.001113 2.038448 1.051743 7.657366 1660.993251
+CGCGGT 96.857421 1.894204 2.140843 1.131977 7.657366 2173.014867
+CGCGTA 93.043384 1.892470 1.976845 1.004429 7.657366 1622.118217
+CGCGTC 91.539845 1.994021 2.037541 1.051041 7.657366 1908.843981
+CGCGTG 92.498445 1.845278 2.000577 1.022570 7.657366 1802.437503
+CGCGTT 93.302055 2.010158 2.203142 1.181746 7.657366 2573.195634
+CGCTAA 96.945447 2.183278 1.915442 0.957996 7.657366 1407.654449
+CGCTAC 93.525482 1.690586 1.793437 0.867940 7.657366 1490.440077
+CGCTAG 95.915109 1.598591 1.805042 0.876378 7.657366 145.837572
+CGCTAT 94.810007 1.924925 1.827548 0.892819 7.657366 1581.950225
+CGCTCA 88.077068 1.929869 1.852454 0.911132 7.657366 1860.077671
+CGCTCC 86.390938 1.857265 1.896521 0.943837 7.657366 942.201439
+CGCTCG 88.061572 2.032815 1.939556 0.976144 7.657366 1233.299257
+CGCTCT 87.648657 2.041935 1.977696 1.005078 7.657366 1223.674985
+CGCTGA 94.080683 1.901277 1.781997 0.859649 7.657366 3065.420022
+CGCTGC 90.545150 1.651341 1.702403 0.802701 7.657366 3478.779905
+CGCTGG 92.574391 1.796101 1.767200 0.848963 7.657366 5581.125788
+CGCTGT 91.463192 1.679278 1.772287 0.852632 7.657366 2346.758401
+CGCTTA 84.151262 1.730811 1.655854 0.770005 7.657366 1286.763834
+CGCTTC 81.317035 1.535893 1.576172 0.715098 7.657366 2502.227910
+CGCTTG 83.422324 1.587424 1.596109 0.728709 7.657366 1083.174336
+CGCTTT 82.589343 1.474443 1.513901 0.673142 7.657366 2934.458154
+CGGAAA 70.267692 1.789277 1.879148 0.930897 7.657366 2288.661122
+CGGAAC 67.718993 1.584100 2.102311 1.101554 7.657366 1522.462001
+CGGAAG 68.956446 1.759386 1.898840 0.945568 7.657366 1981.531619
+CGGAAT 68.514406 1.828362 2.127656 1.121534 7.657366 1638.363693
+CGGACA 58.235312 2.131928 2.491405 1.421107 7.657366 933.967293
+CGGACC 57.685713 2.116122 2.563656 1.483371 7.657366 609.480470
+CGGACG 58.533937 2.493603 2.580138 1.497699 7.657366 1226.431516
+CGGACT 58.393114 2.155173 2.268624 1.234821 7.657366 692.930911
+CGGAGA 67.771934 1.734151 1.930069 0.968990 7.657366 1031.025386
+CGGAGC 66.548683 1.766074 1.859369 0.916239 7.657366 849.454562
+CGGAGG 67.085805 1.960481 1.944244 0.979685 7.657366 606.396558
+CGGAGT 66.697164 1.772466 1.867643 0.922362 7.657366 746.716906
+CGGATA 58.643683 1.963379 2.283277 1.246804 7.657366 1774.203342
+CGGATC 57.335290 1.599262 2.025236 1.041534 7.657366 1328.361899
+CGGATG 58.186820 2.010921 2.172115 1.156870 7.657366 2021.133446
+CGGATT 58.042763 1.488428 1.790112 0.865527 7.657366 1659.565663
+CGGCAA 81.905564 1.876398 1.752201 0.838178 7.657366 4029.865237
+CGGCAC 79.921576 1.637299 1.581682 0.718851 7.657366 2373.178492
+CGGCAG 80.991246 1.713233 1.673737 0.782512 7.657366 3323.515279
+CGGCAT 80.530649 1.770100 1.612510 0.739970 7.657366 2865.035694
+CGGCCA 77.938932 1.569798 1.725260 0.818921 7.657366 1127.615290
+CGGCCC 76.837849 1.469311 1.655981 0.770094 7.657366 882.140882
+CGGCCG 77.806235 1.625580 1.744775 0.832855 7.657366 320.921869
+CGGCCT 77.737529 1.729960 1.695716 0.797976 7.657366 1430.230060
+CGGCGA 80.389087 1.678023 1.627868 0.750567 7.657366 2993.441789
+CGGCGC 78.373844 1.566435 1.572709 0.712743 7.657366 2886.734686
+CGGCGG 79.594010 1.677079 1.653787 0.768563 7.657366 3340.266690
+CGGCGT 78.835535 1.596130 1.628998 0.751348 7.657366 3177.997773
+CGGCTA 76.943175 1.575407 1.632097 0.753493 7.657366 904.677672
+CGGCTC 75.226576 1.419736 1.549539 0.697050 7.657366 937.172177
+CGGCTG 76.226909 1.478017 1.563415 0.706434 7.657366 2092.213958
+CGGCTT 75.961449 1.443638 1.390180 0.592334 7.657366 1932.761530
+CGGGAA 73.461643 1.847199 1.817911 0.885767 7.657366 1835.771730
+CGGGAC 68.923221 1.411090 1.992687 1.016527 7.657366 450.458433
+CGGGAG 72.410088 1.850453 1.845385 0.905922 7.657366 777.040900
+CGGGAT 70.299784 1.567696 1.928240 0.967614 7.657366 1575.010950
+CGGGCA 66.711773 1.898979 2.098634 1.098665 7.657366 2106.237243
+CGGGCC 65.535212 1.862273 2.153472 1.142008 7.657366 1042.019421
+CGGGCG 66.793071 1.856176 2.087530 1.089957 7.657366 2145.677509
+CGGGCT 66.443747 1.731098 1.987819 1.012805 7.657366 1364.635805
+CGGGGA 73.898807 1.755808 1.667535 0.778167 7.657366 987.988666
+CGGGGC 71.683241 1.425109 1.743934 0.832253 7.657366 991.231322
+CGGGGG 72.773436 1.532085 1.586158 0.721905 7.657366 503.568274
+CGGGGT 72.347716 1.528444 1.732528 0.824101 7.657366 735.037105
+CGGGTA 66.166456 1.481472 1.696621 0.798615 7.657366 1462.472786
+CGGGTC 64.391979 1.588829 1.891642 0.940197 7.657366 1023.806616
+CGGGTG 65.423235 1.556194 1.769970 0.850960 7.657366 1264.701265
+CGGGTT 65.676229 1.702739 1.975416 1.003340 7.657366 1643.258786
+CGGTAA 71.214991 2.029232 1.914886 0.957579 7.657366 3193.469819
+CGGTAC 68.142709 1.999622 2.107324 1.105496 7.657366 1712.101646
+CGGTAG 70.395356 1.866064 1.897444 0.944526 7.657366 1448.430260
+CGGTAT 69.477848 2.143273 2.111082 1.108455 7.657366 2148.710431
+CGGTCA 63.475468 2.077478 2.442275 1.379279 7.657366 2361.911883
+CGGTCC 62.276309 2.217455 2.689149 1.593612 7.657366 610.402989
+CGGTCG 63.670842 2.211697 2.459087 1.393546 7.657366 1420.155965
+CGGTCT 63.431100 2.438006 2.572465 1.491024 7.657366 931.527643
+CGGTGA 67.707079 1.925004 1.965411 0.995727 7.657366 2914.842121
+CGGTGC 64.330235 1.906089 2.245590 1.216063 7.657366 2501.961285
+CGGTGG 66.713626 1.939768 2.081358 1.085127 7.657366 2288.692264
+CGGTGT 65.320996 2.013536 2.284583 1.247874 7.657366 1725.099910
+CGGTTA 59.250754 2.313571 2.481455 1.412602 7.657366 1631.129049
+CGGTTC 56.385341 2.157087 2.474674 1.406816 7.657366 1470.712182
+CGGTTG 58.826493 2.408953 2.613937 1.527225 7.657366 1867.135824
+CGGTTT 57.749934 2.238147 2.474832 1.406951 7.657366 2891.663435
+CGTAAA 104.277984 2.521009 2.868657 1.755813 7.657366 1885.702961
+CGTAAC 101.847198 2.303722 2.814031 1.705900 7.657366 1485.694536
+CGTAAG 103.192590 2.629247 2.836703 1.726558 7.657366 776.791754
+CGTAAT 103.998738 2.464915 3.025307 1.901580 7.657366 1705.095021
+CGTACA 97.908697 2.262956 2.448476 1.384535 7.657366 892.312055
+CGTACC 97.627895 2.291578 2.536884 1.460196 7.657366 1480.863083
+CGTACG 97.770115 2.088947 2.467685 1.400860 7.657366 755.809831
+CGTACT 99.072348 2.608924 2.742126 1.640935 7.657366 1141.833430
+CGTAGA 101.228831 2.263981 2.714274 1.615998 7.657366 690.938699
+CGTAGC 99.879692 2.589534 2.698127 1.601599 7.657366 1247.268675
+CGTAGG 100.947197 2.388016 2.788330 1.682583 7.657366 504.371002
+CGTAGT 102.260902 2.670318 2.928491 1.811032 7.657366 804.623604
+CGTATA 99.054192 2.692887 2.953619 1.834392 7.657366 495.027859
+CGTATC 100.538753 2.637184 3.194745 2.063548 7.657366 1090.397021
+CGTATG 99.004226 2.561166 3.013298 1.890268 7.657366 852.554830
+CGTATT 102.621519 2.743744 3.321642 2.187709 7.657366 1525.286832
+CGTCAA 113.260981 2.870520 2.905261 1.789527 7.657366 1452.779427
+CGTCAC 111.225573 2.263947 2.783774 1.678461 7.657366 1558.092328
+CGTCAG 112.527290 2.623111 2.877495 1.763934 7.657366 2625.684391
+CGTCAT 112.227832 2.537745 2.904835 1.789133 7.657366 1998.415056
+CGTCCA 111.438459 2.691093 2.999083 1.876908 7.657366 1142.831965
+CGTCCC 109.936619 2.295042 2.817836 1.709361 7.657366 519.503732
+CGTCCG 111.663516 2.596425 2.780052 1.675096 7.657366 1351.813705
+CGTCCT 111.083838 2.487372 2.988555 1.867034 7.657366 610.608785
+CGTCGA 112.549114 2.620113 3.036636 1.912271 7.657366 1421.059039
+CGTCGC 110.153232 2.510850 2.874514 1.761193 7.657366 2112.639904
+CGTCGG 112.025820 2.684395 2.993142 1.871333 7.657366 1511.308200
+CGTCGT 111.863131 2.783081 3.131647 2.002716 7.657366 1342.222610
+CGTCTA 111.932840 2.176376 2.922921 1.805867 7.657366 323.126479
+CGTCTC 110.995657 2.388441 2.943693 1.825152 7.657366 726.823657
+CGTCTG 111.908734 2.524138 2.972436 1.851949 7.657366 2159.562344
+CGTCTT 111.889839 2.498011 3.216493 2.084655 7.657366 1353.805297
+CGTGAA 101.380168 2.481307 2.698357 1.601804 7.657366 1937.144226
+CGTGAC 97.779834 2.587918 2.772677 1.668435 7.657366 1283.093875
+CGTGAG 100.480247 2.758374 2.743088 1.641799 7.657366 934.275136
+CGTGAT 100.182127 2.297152 2.911674 1.795455 7.657366 1573.496015
+CGTGCA 99.864710 2.605087 2.611174 1.524804 7.657366 1225.709378
+CGTGCC 99.487259 2.727373 2.714166 1.615902 7.657366 1356.321030
+CGTGCG 100.023582 2.653683 2.684565 1.589539 7.657366 1633.698247
+CGTGCT 101.509791 2.343931 2.695756 1.599489 7.657366 1459.908513
+CGTGGA 101.969491 2.861322 2.834873 1.724887 7.657366 1273.692932
+CGTGGC 99.956556 2.501440 2.742506 1.641276 7.657366 1847.157943
+CGTGGG 101.236404 2.742023 2.773423 1.669108 7.657366 1017.631819
+CGTGGT 102.400814 2.739113 2.960357 1.840672 7.657366 1922.656674
+CGTGTA 102.498274 2.543411 3.091497 1.964326 7.657366 440.873049
+CGTGTC 103.130573 2.776691 3.186390 2.055459 7.657366 575.896538
+CGTGTG 102.551192 2.851147 3.174389 2.043857 7.657366 591.589430
+CGTGTT 105.638906 2.790895 3.323892 2.189932 7.657366 1087.599634
+CGTTAA 100.217671 2.633641 2.370727 1.319115 7.657366 2163.668483
+CGTTAC 96.423046 2.303644 2.207646 1.185371 7.657366 1631.051501
+CGTTAG 99.348798 2.670713 2.213548 1.190128 7.657366 864.850923
+CGTTAT 97.813653 2.153838 2.278089 1.242557 7.657366 2044.989854
+CGTTCA 93.507605 2.186327 2.474394 1.406577 7.657366 1858.326359
+CGTTCC 91.643940 2.251702 2.516115 1.442302 7.657366 1383.303323
+CGTTCG 93.590890 2.108217 2.508637 1.435876 7.657366 1222.704564
+CGTTCT 93.200863 2.156506 2.488603 1.418710 7.657366 1395.466713
+CGTTGA 97.539082 2.486422 2.262650 1.229947 7.657366 2378.755693
+CGTTGC 93.584241 2.099090 2.239802 1.211364 7.657366 2266.446266
+CGTTGG 96.712744 2.383006 2.264394 1.231369 7.657366 1620.641442
+CGTTGT 95.246914 2.259173 2.380535 1.327309 7.657366 1440.974005
+CGTTTA 89.891544 2.128921 2.333178 1.287900 7.657366 1582.635511
+CGTTTC 87.696913 1.882084 2.237386 1.209405 7.657366 1727.805612
+CGTTTG 89.319926 1.919851 2.224322 1.198827 7.657366 1769.912050
+CGTTTT 89.206633 1.808049 2.023010 1.039818 7.657366 3403.503201
+CTAAAA 91.047018 2.043658 1.966471 0.996533 7.657366 968.962355
+CTAAAC 88.759155 2.069691 2.127678 1.121551 7.657366 556.147334
+CTAAAG 90.453482 2.257948 2.017192 1.035335 7.657366 775.098141
+CTAAAT 89.751022 2.192708 2.310759 1.269382 7.657366 606.752507
+CTAACA 82.020331 2.134062 2.044792 1.056657 7.657366 714.854313
+CTAACC 81.448390 2.353847 2.052225 1.062424 7.657366 676.349761
+CTAACG 81.969957 2.280305 2.100038 1.099768 7.657366 814.950956
+CTAACT 82.113216 2.198533 2.079919 1.084001 7.657366 483.531652
+CTAAGA 89.548211 2.335820 2.279502 1.243713 7.657366 215.864591
+CTAAGC 88.161084 1.834331 2.104294 1.103113 7.657366 435.802516
+CTAAGG 88.608936 2.591584 2.171743 1.156573 7.657366 316.270424
+CTAAGT 88.670058 2.677611 2.248626 1.218529 7.657366 261.692930
+CTAATA 81.678373 2.304874 2.259973 1.227765 7.657366 462.714287
+CTAATC 80.098105 2.904278 2.347570 1.299834 7.657366 511.237785
+CTAATG 80.630622 2.318288 2.238195 1.210061 7.657366 579.986389
+CTAATT 81.025135 2.364889 2.563800 1.483497 7.657366 562.780869
+CTACAA 92.593460 1.586373 1.602082 0.732803 7.657366 896.834876
+CTACAC 90.550956 1.507499 1.471655 0.645162 7.657366 546.683159
+CTACAG 91.763542 1.486108 1.573549 0.713314 7.657366 644.111690
+CTACAT 91.452608 1.445622 1.443543 0.626766 7.657366 612.989839
+CTACCA 89.993823 1.682420 1.732390 0.824003 7.657366 926.735713
+CTACCC 89.038927 1.460547 1.662218 0.774448 7.657366 507.755688
+CTACCG 89.887021 1.587012 1.690135 0.794040 7.657366 1351.383336
+CTACCT 89.487879 1.490211 1.565968 0.708165 7.657366 609.685377
+CTACGA 91.700562 1.613548 1.525654 0.680995 7.657366 590.585067
+CTACGC 89.821379 1.476293 1.538111 0.689353 7.657366 1120.308374
+CTACGG 90.875855 1.455124 1.573046 0.712972 7.657366 906.433824
+CTACGT 90.051144 1.421222 1.530889 0.684504 7.657366 565.293163
+CTACTA 88.905473 1.540460 1.715623 0.812069 7.657366 270.688457
+CTACTC 87.252400 1.747587 1.677582 0.785210 7.657366 388.973990
+CTACTG 88.100225 1.468544 1.595255 0.728124 7.657366 783.815584
+CTACTT 87.981182 1.581499 1.723470 0.817647 7.657366 536.570514
+CTAGAA 88.180831 3.378807 1.988193 1.013090 7.657366 87.153246
+CTAGAC 85.937630 1.971176 2.504449 1.432282 7.657366 24.719957
+CTAGAG 87.771605 1.165329 1.943634 0.979224 7.657366 20.973343
+CTAGAT 86.637019 2.498174 2.511171 1.438053 7.657366 38.086690
+CTAGCA 84.118959 2.413450 2.077569 1.082165 7.657366 98.418745
+CTAGCC 82.183963 2.033801 1.873048 0.926368 7.657366 91.664508
+CTAGCG 83.739133 1.853022 1.816563 0.884782 7.657366 177.356464
+CTAGCT 83.418656 1.988629 2.078216 1.082670 7.657366 101.413986
+CTAGGA 89.025841 2.712222 2.643324 1.553052 7.657366 17.807137
+CTAGGC 85.568372 1.661012 1.894721 0.942493 7.657366 61.655043
+CTAGGG 88.694704 2.715699 3.008330 1.885595 7.657366 12.798935
+CTAGGT 87.076494 1.699929 1.948931 0.983230 7.657366 34.533136
+CTAGTA 81.702460 1.946476 1.952468 0.985907 7.657366 64.288138
+CTAGTC 79.594651 2.834214 2.468842 1.401846 7.657366 57.101852
+CTAGTG 81.620295 2.413491 2.246218 1.216573 7.657366 107.437372
+CTAGTT 81.505551 2.274368 2.435308 1.373381 7.657366 89.942278
+CTATAA 87.988590 1.958570 2.122071 1.117121 7.657366 648.820671
+CTATAC 85.491186 3.597482 2.138268 1.129935 7.657366 560.457267
+CTATAG 87.069828 1.778853 2.109765 1.107418 7.657366 299.577504
+CTATAT 86.348018 2.116209 2.234709 1.207234 7.657366 607.864098
+CTATCA 83.335591 2.394130 2.557760 1.478257 7.657366 1037.411381
+CTATCC 82.379152 2.457378 2.712355 1.614285 7.657366 626.158127
+CTATCG 83.435040 2.459502 2.626937 1.538632 7.657366 1153.743841
+CTATCT 83.140164 2.571990 2.684660 1.589623 7.657366 890.498806
+CTATGA 86.236989 2.178799 2.238151 1.210025 7.657366 673.657791
+CTATGC 83.171060 2.375373 2.373961 1.321815 7.657366 742.684117
+CTATGG 85.190668 2.087247 2.187698 1.169341 7.657366 604.171645
+CTATGT 84.265796 2.750263 2.674269 1.580404 7.657366 527.512781
+CTATTA 79.436778 3.087096 2.971969 1.851513 7.657366 561.724559
+CTATTC 76.242827 2.627176 2.818566 1.710026 7.657366 683.680379
+CTATTG 78.240591 2.488901 2.760531 1.657484 7.657366 752.566412
+CTATTT 77.489267 2.770615 2.781403 1.676317 7.657366 1165.998611
+CTCAAA 100.564202 1.899546 1.903507 0.949056 7.657366 1136.657482
+CTCAAC 98.695663 1.798532 2.086907 1.089469 7.657366 1324.811382
+CTCAAG 99.563637 1.767346 1.842217 0.903590 7.657366 638.629338
+CTCAAT 99.443069 1.948468 2.093613 1.094725 7.657366 1383.860110
+CTCACA 94.417027 1.983519 1.778118 0.856843 7.657366 515.868878
+CTCACC 93.779110 2.181420 1.972034 1.000764 7.657366 1448.281217
+CTCACG 94.388635 2.082889 1.879088 0.930852 7.657366 991.149464
+CTCACT 94.471761 2.031921 2.033849 1.048186 7.657366 777.051841
+CTCAGA 99.076133 1.940293 2.083269 1.086621 7.657366 395.094148
+CTCAGC 97.470985 2.009051 2.175989 1.159967 7.657366 965.016565
+CTCAGG 98.019300 1.802227 1.982298 1.008588 7.657366 1006.535057
+CTCAGT 98.095285 1.947781 2.251657 1.220995 7.657366 563.381121
+CTCATA 93.113620 1.968056 2.097622 1.097871 7.657366 656.005121
+CTCATC 92.111162 2.144575 2.284647 1.247927 7.657366 1449.059743
+CTCATG 92.612622 2.268561 2.196645 1.176522 7.657366 763.382473
+CTCATT 93.159702 2.000490 2.348189 1.300349 7.657366 1048.711537
+CTCCAA 99.128617 1.717494 1.509637 0.670300 7.657366 363.719999
+CTCCAC 96.984027 1.924870 1.676972 0.784782 7.657366 918.580690
+CTCCAG 98.454318 1.740542 1.594220 0.727416 7.657366 1856.038381
+CTCCAT 97.803671 1.850838 1.804328 0.875857 7.657366 858.118424
+CTCCCA 96.003680 1.646327 1.522705 0.679022 7.657366 454.203006
+CTCCCC 95.104600 1.614798 1.594512 0.727615 7.657366 576.871239
+CTCCCG 96.215444 1.687021 1.545794 0.694525 7.657366 754.737126
+CTCCCT 95.511521 1.673723 1.614645 0.741440 7.657366 475.850509
+CTCCGA 98.281839 1.795546 1.731741 0.823540 7.657366 380.883247
+CTCCGC 96.716816 1.892774 1.795296 0.869289 7.657366 1043.741204
+CTCCGG 97.497147 1.832495 1.688136 0.792632 7.657366 1555.450333
+CTCCGT 97.261554 2.044897 2.132083 1.125036 7.657366 596.123850
+CTCCTA 95.754489 1.703997 1.717859 0.813657 7.657366 132.402080
+CTCCTC 94.030955 1.906136 1.968864 0.998353 7.657366 482.753365
+CTCCTG 94.825691 1.990112 1.892566 0.940886 7.657366 1259.688375
+CTCCTT 94.678710 2.020901 2.094781 1.095641 7.657366 643.680418
+CTCGAA 98.019427 2.074371 2.162172 1.148936 7.657366 618.614460
+CTCGAC 93.945086 2.165199 2.246457 1.216767 7.657366 952.577801
+CTCGAG 96.680494 2.303797 2.375660 1.323234 7.657366 253.761898
+CTCGAT 95.466793 1.960376 2.305708 1.265222 7.657366 1094.656339
+CTCGCA 93.229964 2.149874 1.986876 1.012083 7.657366 715.395853
+CTCGCC 92.062615 2.106002 2.111847 1.109057 7.657366 1826.880062
+CTCGCG 93.084706 2.141979 2.106847 1.105121 7.657366 1314.555099
+CTCGCT 92.993717 2.114257 2.150204 1.139409 7.657366 1114.963964
+CTCGGA 97.348654 1.879362 1.978316 1.005550 7.657366 306.147636
+CTCGGC 94.993026 1.898387 2.188916 1.170319 7.657366 1256.494880
+CTCGGG 96.606580 2.070606 2.006172 1.026863 7.657366 412.084085
+CTCGGT 96.071246 1.926272 2.234837 1.207339 7.657366 1102.378759
+CTCGTA 91.640205 2.325003 2.148769 1.138269 7.657366 450.517444
+CTCGTC 90.205794 2.251858 2.483356 1.414226 7.657366 689.944768
+CTCGTG 91.501309 2.126381 2.401825 1.345155 7.657366 486.222117
+CTCGTT 91.566417 2.243684 2.536141 1.459555 7.657366 855.860778
+CTCTAA 94.929115 1.704860 1.895148 0.942812 7.657366 306.774986
+CTCTAC 92.472864 2.204202 2.184948 1.167137 7.657366 690.500958
+CTCTAG 93.724989 2.150488 1.985507 1.011038 7.657366 47.019824
+CTCTAT 93.196128 1.860517 2.095517 1.096218 7.657366 601.878961
+CTCTCA 90.522351 2.607486 2.619178 1.531820 7.657366 503.446769
+CTCTCC 88.950691 2.651674 2.450803 1.386510 7.657366 814.869916
+CTCTCG 90.110636 2.383578 2.365232 1.314531 7.657366 572.413222
+CTCTCT 89.905959 2.424916 2.562538 1.482401 7.657366 844.591324
+CTCTGA 93.460605 2.072182 2.107211 1.105407 7.657366 749.515797
+CTCTGC 90.621201 2.006050 2.211597 1.188555 7.657366 1377.372136
+CTCTGG 92.473364 2.020485 2.123781 1.118471 7.657366 1367.128997
+CTCTGT 91.362388 1.946930 2.255186 1.223866 7.657366 777.733505
+CTCTTA 86.207719 2.126495 2.338193 1.292054 7.657366 472.116983
+CTCTTC 83.560993 2.487207 2.543356 1.465787 7.657366 1979.522993
+CTCTTG 85.696992 2.313864 2.422501 1.362562 7.657366 479.594679
+CTCTTT 84.467246 2.062826 2.278749 1.243097 7.657366 1701.261133
+CTGAAA 83.631316 2.559074 2.337223 1.291250 7.657366 3215.679682
+CTGAAC 81.255425 2.499456 2.595102 1.510748 7.657366 2078.399307
+CTGAAG 82.810489 2.433500 2.391662 1.336626 7.657366 1708.320418
+CTGAAT 82.001946 2.368572 2.635201 1.545898 7.657366 2213.418504
+CTGACA 74.217154 2.391592 2.353830 1.305037 7.657366 1274.824097
+CTGACC 73.064057 2.232228 2.154962 1.143194 7.657366 2679.590854
+CTGACG 73.795421 2.304011 2.278379 1.242795 7.657366 2615.569189
+CTGACT 73.926692 2.188676 2.081324 1.085100 7.657366 1293.903464
+CTGAGA 82.877816 2.424836 2.354894 1.305922 7.657366 578.334287
+CTGAGC 81.628465 2.730427 2.483009 1.413930 7.657366 1211.247816
+CTGAGG 82.018389 2.690648 2.373268 1.321236 7.657366 337.721453
+CTGAGT 81.976228 2.349493 2.461548 1.395638 7.657366 835.925113
+CTGATA 73.250160 2.314713 2.242237 1.213340 7.657366 1861.188250
+CTGATC 72.334950 2.450615 2.674729 1.580811 7.657366 1725.155706
+CTGATG 72.624284 2.421501 2.420872 1.361187 7.657366 3094.584273
+CTGATT 73.044919 2.581203 2.748297 1.646478 7.657366 2773.365690
+CTGCAA 89.483609 1.761753 1.750760 0.837144 7.657366 2851.689056
+CTGCAC 87.095982 1.556850 1.523925 0.679838 7.657366 1859.458298
+CTGCAG 88.512089 1.646862 1.698088 0.799651 7.657366 1235.577390
+CTGCAT 87.898319 1.547615 1.556625 0.701837 7.657366 2190.234146
+CTGCCA 86.502320 1.797795 1.860556 0.917116 7.657366 3006.200110
+CTGCCC 85.312556 1.757692 1.819535 0.886954 7.657366 1516.837960
+CTGCCG 86.198978 1.795376 1.821406 0.888322 7.657366 3547.679172
+CTGCCT 85.888784 1.636114 1.763627 0.846389 7.657366 1351.318223
+CTGCGA 88.269419 1.618990 1.653642 0.768462 7.657366 1282.196159
+CTGCGC 86.245060 1.736959 1.688864 0.793144 7.657366 4032.126953
+CTGCGG 87.371314 1.680996 1.720407 0.815468 7.657366 2551.243514
+CTGCGT 86.629333 1.692605 1.673734 0.782510 7.657366 2322.197064
+CTGCTA 85.182261 1.673285 1.736401 0.826866 7.657366 733.049661
+CTGCTC 83.635166 1.773922 1.714905 0.811560 7.657366 1767.136240
+CTGCTG 84.697993 1.730947 1.788770 0.864554 7.657366 4458.156407
+CTGCTT 84.462113 1.808019 1.821077 0.888081 7.657366 1936.465344
+CTGGAA 86.051278 2.338692 2.287682 1.250414 7.657366 3606.332185
+CTGGAC 81.956096 2.406594 2.590014 1.506307 7.657366 1247.227096
+CTGGAG 85.083106 2.316462 2.312483 1.270802 7.657366 1637.047711
+CTGGAT 83.439631 2.503559 2.654210 1.562656 7.657366 2884.280261
+CTGGCA 80.921786 2.461368 2.072126 1.077915 7.657366 4201.110902
+CTGGCC 79.792854 2.390669 2.160780 1.147826 7.657366 1750.777460
+CTGGCG 80.607691 2.425608 2.118127 1.114008 7.657366 6703.269431
+CTGGCT 80.359613 2.500306 2.227630 1.201503 7.657366 3518.680984
+CTGGGA 86.572192 2.119580 2.214572 1.190954 7.657366 1089.388912
+CTGGGC 84.302756 2.223499 2.317762 1.275156 7.657366 1994.327454
+CTGGGG 85.428213 2.295161 2.249035 1.218862 7.657366 1488.059291
+CTGGGT 85.014900 2.252522 2.321302 1.278079 7.657366 1677.751830
+CTGGTA 80.299209 2.590625 2.512349 1.439065 7.657366 1854.083679
+CTGGTC 78.315417 2.921860 2.705917 1.608541 7.657366 1919.756436
+CTGGTG 79.167483 2.766135 2.621953 1.534255 7.657366 3807.642548
+CTGGTT 79.803058 2.740233 2.805703 1.698333 7.657366 2847.764691
+CTGTAA 83.602088 2.370682 2.308567 1.267576 7.657366 1309.562544
+CTGTAC 80.142438 1.976871 2.339623 1.293240 7.657366 1060.968975
+CTGTAG 82.468413 2.163613 2.282791 1.246406 7.657366 745.643304
+CTGTAT 81.505103 2.389913 2.393311 1.338009 7.657366 1172.263204
+CTGTCA 78.910199 2.662985 2.893773 1.778923 7.657366 1041.663245
+CTGTCC 77.741750 2.490901 2.902820 1.787272 7.657366 620.230165
+CTGTCG 78.654043 2.521608 2.767010 1.663322 7.657366 1508.543503
+CTGTCT 78.432334 2.773546 2.963175 1.843301 7.657366 803.696115
+CTGTGA 81.412272 2.549608 2.523668 1.448800 7.657366 749.397155
+CTGTGC 78.116154 2.536827 2.620981 1.533402 7.657366 1293.018337
+CTGTGG 80.126428 2.331893 2.430846 1.369609 7.657366 1444.388437
+CTGTGT 79.031948 2.596191 2.666006 1.573084 7.657366 726.234452
+CTGTTA 74.628462 2.835152 2.985931 1.864576 7.657366 1229.625512
+CTGTTC 71.573933 2.916117 3.097631 1.970175 7.657366 2216.172933
+CTGTTG 74.032903 2.994261 3.030355 1.906341 7.657366 2026.341889
+CTGTTT 72.983861 2.992607 2.938367 1.820201 7.657366 2850.785357
+CTTAAA 101.896766 1.860097 1.676381 0.784367 7.657366 857.126583
+CTTAAC 100.299264 1.748816 1.714245 0.811091 7.657366 1087.454561
+CTTAAG 101.288219 1.894202 1.699857 0.800901 7.657366 699.246918
+CTTAAT 101.820843 1.707277 1.822274 0.888957 7.657366 1094.166211
+CTTACA 97.646007 1.666798 1.574171 0.713737 7.657366 564.602697
+CTTACC 97.141210 1.780933 1.616614 0.742796 7.657366 1049.919686
+CTTACG 97.546435 1.719600 1.546337 0.694891 7.657366 871.038185
+CTTACT 98.411725 1.889894 1.692005 0.795358 7.657366 678.806305
+CTTAGA 99.571342 1.811201 1.849815 0.909186 7.657366 170.583041
+CTTAGC 98.636082 1.611115 1.686621 0.791565 7.657366 551.484986
+CTTAGG 99.016271 2.039777 1.712262 0.809684 7.657366 194.379967
+CTTAGT 100.048199 1.721720 1.731430 0.823318 7.657366 263.810547
+CTTATA 97.729442 2.044381 1.827558 0.892827 7.657366 513.097274
+CTTATC 98.616034 2.140728 1.920361 0.961689 7.657366 1443.741799
+CTTATG 97.424005 1.773763 1.803196 0.875034 7.657366 702.267140
+CTTATT 99.934478 1.874192 2.014295 1.033106 7.657366 1056.770339
+CTTCAA 104.939898 2.699605 1.894445 0.942287 7.657366 2174.413666
+CTTCAC 103.507473 2.249079 1.849706 0.909106 7.657366 1961.691511
+CTTCAG 104.111044 2.621786 1.921316 0.962407 7.657366 2025.671158
+CTTCAT 104.158761 2.607367 1.972040 1.000769 7.657366 2240.590619
+CTTCCA 103.104663 2.683661 1.835005 0.898289 7.657366 2609.190525
+CTTCCC 101.869605 2.284462 1.813974 0.882890 7.657366 1235.450997
+CTTCCG 102.919543 2.716852 1.848416 0.908155 7.657366 2295.880871
+CTTCCT 102.831182 2.586384 1.971992 1.000732 7.657366 1528.203528
+CTTCGA 103.936919 2.711759 1.990695 1.015003 7.657366 1332.699461
+CTTCGC 102.329493 2.351074 1.783933 0.861050 7.657366 2449.783042
+CTTCGG 103.422693 2.530305 1.942559 0.978411 7.657366 1588.617438
+CTTCGT 103.454004 2.388806 2.021374 1.038557 7.657366 1467.668475
+CTTCTA 103.708658 2.407328 1.975187 1.003166 7.657366 557.778213
+CTTCTC 103.096839 2.437025 2.041298 1.053950 7.657366 1114.318794
+CTTCTG 103.629994 2.304346 1.953967 0.987043 7.657366 1742.586376
+CTTCTT 103.878907 2.230798 2.062183 1.070166 7.657366 2026.688271
+CTTGAA 98.271494 1.888221 1.712656 0.809963 7.657366 713.100760
+CTTGAC 95.309201 1.821579 1.861881 0.918096 7.657366 333.782310
+CTTGAG 97.242471 1.880083 1.705428 0.804842 7.657366 722.605732
+CTTGAT 96.992949 1.850066 1.883105 0.933839 7.657366 1191.541861
+CTTGCA 95.878507 1.880347 1.551599 0.698441 7.657366 847.424238
+CTTGCC 95.488689 1.859352 1.614819 0.741559 7.657366 1400.416939
+CTTGCG 95.934469 1.961820 1.607043 0.736210 7.657366 1160.655568
+CTTGCT 96.958037 2.078605 1.669837 0.779779 7.657366 1258.789891
+CTTGGA 97.694084 2.082276 1.750322 0.836830 7.657366 147.822268
+CTTGGC 96.486220 1.776868 1.715334 0.811864 7.657366 585.198165
+CTTGGG 97.339929 1.705421 1.634814 0.755375 7.657366 263.172660
+CTTGGT 97.908974 1.915033 1.858437 0.915550 7.657366 414.922814
+CTTGTA 97.129171 2.020094 1.775929 0.855261 7.657366 458.549676
+CTTGTC 96.930681 2.184587 1.883883 0.934418 7.657366 608.601168
+CTTGTG 96.840002 1.973997 1.835791 0.898867 7.657366 565.978824
+CTTGTT 98.797416 2.043979 2.004611 1.025665 7.657366 1093.620182
+CTTTAA 95.692277 2.471652 1.615793 0.742231 7.657366 1654.878691
+CTTTAC 93.309259 2.337912 1.479955 0.650628 7.657366 1748.019823
+CTTTAG 94.953223 2.070737 1.524413 0.680165 7.657366 643.174338
+CTTTAT 94.071113 2.107172 1.516300 0.674742 7.657366 1968.060870
+CTTTCA 90.974836 1.724407 1.599419 0.730977 7.657366 2357.154704
+CTTTCC 89.558814 1.656422 1.574338 0.713851 7.657366 1449.942420
+CTTTCG 90.966995 1.682478 1.602693 0.733223 7.657366 1886.799809
+CTTTCT 90.881071 1.811968 1.620945 0.745784 7.657366 1660.647261
+CTTTGA 94.100214 2.703963 1.546924 0.695287 7.657366 1478.647978
+CTTTGC 91.009425 1.478887 1.437690 0.622957 7.657366 2125.587132
+CTTTGG 93.608446 2.332826 1.503313 0.666092 7.657366 1270.207814
+CTTTGT 92.304059 1.695860 1.506969 0.668524 7.657366 1421.699130
+CTTTTA 88.386913 1.519408 1.461644 0.638591 7.657366 1349.704244
+CTTTTC 86.548125 1.468550 1.460081 0.637567 7.657366 1579.867636
+CTTTTG 87.953996 1.437071 1.449009 0.630328 7.657366 1649.315408
+CTTTTT 88.246372 1.630793 1.360548 0.573497 7.657366 3192.524008
+GAAAAA 81.443868 1.512981 1.387132 0.590387 7.657366 4134.840543
+GAAAAC 79.324003 1.322059 1.412466 0.606635 7.657366 2735.985894
+GAAAAG 80.490477 1.477686 1.371396 0.580370 7.657366 1485.955148
+GAAAAT 79.746383 1.355598 1.494164 0.660021 7.657366 2888.476660
+GAAACA 71.451284 1.820842 1.636766 0.756728 7.657366 1483.410825
+GAAACC 70.434309 1.683026 1.654732 0.769222 7.657366 2122.696646
+GAAACG 71.512960 1.787899 1.630596 0.752454 7.657366 2088.501218
+GAAACT 71.309262 1.677986 1.540601 0.691028 7.657366 1411.923946
+GAAAGA 79.100737 1.742692 1.374355 0.582249 7.657366 1935.913842
+GAAAGC 77.681157 1.411647 1.303455 0.537779 7.657366 2648.773278
+GAAAGG 78.126461 1.776188 1.344629 0.563461 7.657366 1455.598766
+GAAAGT 77.771466 1.469373 1.415215 0.608407 7.657366 1555.433070
+GAAATA 71.509403 1.636262 1.490814 0.657802 7.657366 1733.752387
+GAAATC 69.881580 1.314739 1.347733 0.565413 7.657366 2280.651667
+GAAATG 70.789175 1.702195 1.448853 0.630227 7.657366 1867.365278
+GAAATT 70.975549 1.533145 1.509758 0.670381 7.657366 2153.341435
+GAACAA 83.813197 2.081598 1.835865 0.898921 7.657366 1925.757425
+GAACAC 82.409676 1.936057 1.736870 0.827201 7.657366 1089.273593
+GAACAG 83.051445 1.999387 1.828091 0.893217 7.657366 2277.777553
+GAACAT 83.194797 1.944082 1.765876 0.848010 7.657366 1571.603113
+GAACCA 80.223943 1.781736 1.778229 0.856923 7.657366 1187.550614
+GAACCC 79.128639 1.680090 1.790545 0.865841 7.657366 647.347948
+GAACCG 80.188242 1.750552 1.767940 0.849497 7.657366 1520.423045
+GAACCT 79.929327 1.764088 1.785923 0.862491 7.657366 891.696547
+GAACGA 82.755813 2.044907 1.722609 0.817035 7.657366 1099.160644
+GAACGC 81.262436 1.741456 1.649572 0.765627 7.657366 2124.658109
+GAACGG 81.943486 1.710723 1.764245 0.846835 7.657366 1517.753714
+GAACGT 81.809075 1.767163 1.682473 0.788646 7.657366 1575.942442
+GAACTA 80.011700 1.800592 1.713542 0.810592 7.657366 483.884476
+GAACTC 78.459889 1.835870 1.631196 0.752869 7.657366 904.581476
+GAACTG 79.236003 1.664432 1.605048 0.734839 7.657366 2725.380905
+GAACTT 78.901243 1.492028 1.392405 0.593757 7.657366 1012.552347
+GAAGAA 79.960367 1.421679 1.397424 0.596970 7.657366 2627.292669
+GAAGAC 76.337154 1.283602 1.411985 0.606325 7.657366 1149.372515
+GAAGAG 79.021351 1.417125 1.395117 0.595492 7.657366 1765.362756
+GAAGAT 77.449270 1.388013 1.460993 0.638164 7.657366 2371.517883
+GAAGCA 73.995740 1.409610 1.483574 0.653016 7.657366 1272.229699
+GAAGCC 72.860172 1.713646 1.411358 0.605921 7.657366 1813.340759
+GAAGCG 74.007720 1.614477 1.409253 0.604566 7.657366 2277.334611
+GAAGCT 73.740247 1.436628 1.311140 0.542543 7.657366 1303.193609
+GAAGGA 78.900080 1.328246 1.383494 0.588066 7.657366 832.083887
+GAAGGC 76.630585 1.307298 1.333216 0.556302 7.657366 1846.875603
+GAAGGG 78.196585 1.356665 1.338098 0.559361 7.657366 1007.737911
+GAAGGT 77.510504 1.352199 1.437064 0.622551 7.657366 1860.899758
+GAAGTA 72.875060 1.599513 1.291489 0.530392 7.657366 958.169462
+GAAGTC 71.141489 1.276245 1.394226 0.594922 7.657366 994.269952
+GAAGTG 72.227793 1.440514 1.369790 0.579350 7.657366 1774.612013
+GAAGTT 72.364004 1.298701 1.489343 0.656829 7.657366 1367.086407
+GAATAA 77.405090 1.856343 1.943474 0.979103 7.657366 1862.143370
+GAATAC 75.259816 1.730827 1.971898 1.000661 7.657366 1524.142062
+GAATAG 76.817604 1.711345 1.850929 0.910007 7.657366 690.475955
+GAATAT 76.328769 2.214702 2.101965 1.101282 7.657366 2164.165399
+GAATCA 70.360851 1.918484 2.181526 1.164396 7.657366 1341.699539
+GAATCC 69.273197 2.031930 2.282665 1.246303 7.657366 949.999965
+GAATCG 70.508405 1.939010 2.209110 1.186550 7.657366 1202.348324
+GAATCT 70.259945 2.236471 2.356010 1.306850 7.657366 1007.193621
+GAATGA 74.912010 1.703291 1.828316 0.893382 7.657366 1218.290994
+GAATGC 72.497394 1.509090 1.854604 0.912719 7.657366 1700.888092
+GAATGG 74.060728 1.663679 1.850789 0.909904 7.657366 1533.826315
+GAATGT 73.165339 1.833304 2.002551 1.024084 7.657366 1165.240972
+GAATTA 66.124238 1.662479 1.973440 1.001835 7.657366 1086.069247
+GAATTC 63.883209 1.702894 1.916103 0.958492 7.657366 749.027840
+GAATTG 65.544314 1.641405 1.847784 0.907689 7.657366 987.848878
+GAATTT 65.032668 1.691505 1.764374 0.846928 7.657366 1974.562115
+GACAAA 103.509079 2.036761 2.124212 1.118812 7.657366 1828.767236
+GACAAC 100.593351 1.975191 2.177203 1.160937 7.657366 1231.266319
+GACAAG 102.087371 2.267383 2.210710 1.187840 7.657366 573.960358
+GACAAT 102.007582 1.850874 2.287785 1.250498 7.657366 1389.949327
+GACACA 94.919546 1.584321 1.682110 0.788391 7.657366 578.579156
+GACACC 93.721071 1.626687 1.753358 0.839008 7.657366 1030.093950
+GACACG 94.811737 1.669822 1.762408 0.845512 7.657366 643.138857
+GACACT 94.661421 1.693202 1.889352 0.938490 7.657366 571.738541
+GACAGA 100.440169 2.087250 2.213798 1.190329 7.657366 788.060004
+GACAGC 98.086466 1.946736 2.091598 1.093145 7.657366 1548.158563
+GACAGG 99.528354 1.954120 2.105715 1.104230 7.657366 827.408539
+GACAGT 99.197399 2.101202 2.183703 1.166140 7.657366 831.997949
+GACATA 94.294061 1.766934 1.897553 0.944607 7.657366 834.560495
+GACATC 93.092615 1.927618 2.042707 1.055041 7.657366 1469.197676
+GACATG 93.574919 2.025710 1.966414 0.996490 7.657366 806.106480
+GACATT 94.588037 2.039853 2.158280 1.145835 7.657366 1214.784215
+GACCAA 103.186447 1.822440 1.783241 0.860548 7.657366 1262.509603
+GACCAC 100.478607 1.822566 1.838577 0.900913 7.657366 1354.814590
+GACCAG 102.505078 1.872742 1.879256 0.930978 7.657366 1946.142883
+GACCAT 101.773055 1.826668 2.004751 1.025772 7.657366 1551.741989
+GACCCA 99.282628 1.814499 1.691271 0.794840 7.657366 660.279748
+GACCCC 97.562345 1.849256 1.699953 0.800969 7.657366 298.332009
+GACCCG 99.260792 1.976227 1.731119 0.823097 7.657366 911.533381
+GACCCT 99.182073 1.762873 1.745610 0.833453 7.657366 337.564861
+GACCGA 101.801003 1.850399 1.934036 0.971979 7.657366 1232.246597
+GACCGC 99.220716 1.706984 1.868988 0.923358 7.657366 1488.063366
+GACCGG 100.933695 1.871103 1.871263 0.925044 7.657366 1247.519212
+GACCGT 100.233542 1.961696 2.049301 1.060154 7.657366 1339.745344
+GACCTA 98.676663 1.741368 1.737853 0.827904 7.657366 319.513317
+GACCTC 96.480154 1.738297 1.870345 0.924364 7.657366 477.523327
+GACCTG 97.862295 1.673419 1.845918 0.906314 7.657366 1444.341957
+GACCTT 97.482778 1.745205 1.995455 1.018646 7.657366 959.973492
+GACGAA 98.887088 2.198982 2.151051 1.140083 7.657366 1610.977879
+GACGAC 94.787347 2.030529 2.125677 1.119969 7.657366 1096.739397
+GACGAG 98.230885 2.068349 2.078420 1.082830 7.657366 777.617375
+GACGAT 96.647906 2.093674 2.172667 1.157311 7.657366 1985.709547
+GACGCA 94.366279 1.811723 1.825409 0.891252 7.657366 1590.554519
+GACGCC 92.950743 1.897327 1.892269 0.940664 7.657366 1937.663403
+GACGCG 94.285306 1.981976 1.855305 0.913237 7.657366 2002.957612
+GACGCT 94.503636 1.777150 1.908688 0.952933 7.657366 2018.735500
+GACGGA 98.465886 2.059638 2.164480 1.150776 7.657366 979.677415
+GACGGC 95.612829 1.956350 2.058150 1.067028 7.657366 1890.538264
+GACGGG 97.902385 1.986486 2.037849 1.051280 7.657366 961.180773
+GACGGT 97.552484 2.135496 2.158371 1.145907 7.657366 1903.327147
+GACGTA 93.910972 1.959735 2.057277 1.066349 7.657366 808.696901
+GACGTC 92.342894 2.101184 2.165811 1.151838 7.657366 962.338916
+GACGTG 93.428874 1.959181 1.972200 1.000891 7.657366 1154.721050
+GACGTT 94.127025 2.448337 2.352710 1.304106 7.657366 1904.536805
+GACTAA 97.328735 2.014446 1.952606 0.986012 7.657366 510.198645
+GACTAC 93.642412 1.703025 1.769345 0.850509 7.657366 573.381911
+GACTAG 96.006773 2.165553 1.915602 0.958116 7.657366 71.347892
+GACTAT 94.759409 1.715446 1.813691 0.882684 7.657366 688.917891
+GACTCA 88.751597 1.771715 1.855723 0.913545 7.657366 610.005311
+GACTCC 86.845907 1.830784 1.949521 0.983676 7.657366 530.124958
+GACTCG 88.675388 1.786187 1.772330 0.852663 7.657366 562.581930
+GACTCT 88.199066 1.681354 1.728935 0.821539 7.657366 520.853814
+GACTGA 94.373854 1.915747 1.800519 0.873086 7.657366 934.730587
+GACTGC 90.942486 1.671602 1.685562 0.790819 7.657366 1044.571209
+GACTGG 93.035618 1.921616 1.822849 0.889378 7.657366 1589.476477
+GACTGT 91.645170 1.765565 1.846655 0.906858 7.657366 571.388593
+GACTTA 84.412115 1.717827 1.710726 0.808595 7.657366 518.815838
+GACTTC 81.371034 1.559803 1.556909 0.702029 7.657366 1193.864281
+GACTTG 83.365544 1.735020 1.700355 0.801253 7.657366 370.199026
+GACTTT 82.670093 1.557342 1.459516 0.637197 7.657366 1268.293549
+GAGAAA 74.219540 2.003268 1.515737 0.674366 7.657366 1938.718074
+GAGAAC 71.488792 1.832310 1.727993 0.820868 7.657366 966.825897
+GAGAAG 72.800809 2.047783 1.539319 0.690165 7.657366 1124.971955
+GAGAAT 72.031166 2.014371 1.724230 0.818188 7.657366 1092.697639
+GAGACA 62.515512 2.408692 2.215440 1.191654 7.657366 561.155937
+GAGACC 61.053125 2.173578 2.153361 1.141919 7.657366 137.454867
+GAGACG 62.351139 2.350487 2.147755 1.137463 7.657366 687.962172
+GAGACT 61.953377 2.131394 1.961258 0.992573 7.657366 386.163552
+GAGAGA 71.785797 1.894730 1.518090 0.675937 7.657366 712.834758
+GAGAGC 70.344580 1.836297 1.516307 0.674747 7.657366 767.067004
+GAGAGG 71.025229 1.962034 1.541516 0.691644 7.657366 542.134141
+GAGAGT 70.385590 2.110730 1.541656 0.691738 7.657366 576.754956
+GAGATA 62.561344 2.055328 1.948954 0.983247 7.657366 1120.387125
+GAGATC 61.404530 1.806659 1.732917 0.824379 7.657366 1371.864361
+GAGATG 62.078372 2.013072 1.872693 0.926105 7.657366 1240.123745
+GAGATT 62.145656 1.817595 1.642351 0.760605 7.657366 1117.219965
+GAGCAA 82.251483 1.897832 1.860327 0.916947 7.657366 1541.399621
+GAGCAC 79.869896 1.653437 1.645680 0.762919 7.657366 811.820779
+GAGCAG 81.192162 1.865843 1.800431 0.873021 7.657366 1784.672918
+GAGCAT 80.721247 1.872749 1.736955 0.827262 7.657366 1164.742054
+GAGCCA 77.894733 1.707218 1.865536 0.920801 7.657366 875.144051
+GAGCCC 76.766824 1.949306 1.866521 0.921531 7.657366 203.159655
+GAGCCG 77.865532 1.700408 1.811391 0.881006 7.657366 876.124869
+GAGCCT 77.402152 1.771398 1.711402 0.809074 7.657366 544.391316
+GAGCGA 80.275916 1.773040 1.767551 0.849216 7.657366 1130.013005
+GAGCGC 77.547722 1.661416 1.629050 0.751384 7.657366 1535.758083
+GAGCGG 79.147853 1.661270 1.734831 0.825745 7.657366 1257.926943
+GAGCGT 78.262772 1.661118 1.675392 0.783673 7.657366 1409.513749
+GAGCTA 76.696983 1.621648 1.644772 0.762287 7.657366 429.472262
+GAGCTC 75.054806 1.523904 1.623972 0.747873 7.657366 194.804246
+GAGCTG 76.349441 1.551388 1.633110 0.754195 7.657366 1994.901786
+GAGCTT 75.875860 1.502056 1.458774 0.636711 7.657366 830.254822
+GAGGAA 74.441606 1.560249 1.510926 0.671159 7.657366 1034.432687
+GAGGAC 69.404495 1.460387 1.805491 0.876705 7.657366 274.866572
+GAGGAG 73.124187 1.772113 1.596845 0.729213 7.657366 444.632535
+GAGGAT 71.002831 1.667143 1.812021 0.881465 7.657366 939.806996
+GAGGCA 68.300590 1.722504 1.786724 0.863071 7.657366 694.238481
+GAGGCC 66.877538 1.817721 1.809070 0.879313 7.657366 568.485848
+GAGGCG 68.136660 1.706428 1.765315 0.847605 7.657366 1021.066633
+GAGGCT 67.649186 1.715477 1.650837 0.766507 7.657366 598.815578
+GAGGGA 74.470865 1.404281 1.344690 0.563500 7.657366 430.307456
+GAGGGC 72.432985 1.480194 1.490793 0.657788 7.657366 463.014046
+GAGGGG 73.597143 1.526066 1.334759 0.557269 7.657366 333.358022
+GAGGGT 73.057851 1.542828 1.545023 0.694005 7.657366 429.906714
+GAGGTA 67.231781 1.421407 1.471902 0.645325 7.657366 525.802363
+GAGGTC 65.546194 1.688081 1.650455 0.766242 7.657366 521.887742
+GAGGTG 66.615077 1.530985 1.541219 0.691443 7.657366 743.764157
+GAGGTT 66.905251 1.605478 1.727406 0.820450 7.657366 752.154857
+GAGTAA 71.640303 1.775090 1.899878 0.946344 7.657366 922.543659
+GAGTAC 68.350538 1.761833 1.851091 0.910127 7.657366 580.578527
+GAGTAG 70.531648 1.617522 1.865152 0.920517 7.657366 376.278010
+GAGTAT 69.604617 1.878225 1.915872 0.958319 7.657366 737.195276
+GAGTCA 63.543015 1.696442 2.021480 1.038638 7.657366 718.237219
+GAGTCC 62.471231 2.103483 2.153499 1.142029 7.657366 384.205816
+GAGTCG 63.723829 1.810720 2.103264 1.102303 7.657366 708.459348
+GAGTCT 63.700625 2.169290 2.243614 1.214458 7.657366 450.003265
+GAGTGA 67.996406 1.647733 1.786690 0.863046 7.657366 811.090477
+GAGTGC 64.515600 1.659552 1.963631 0.994375 7.657366 707.263398
+GAGTGG 67.008737 1.679256 1.791816 0.866763 7.657366 798.669725
+GAGTGT 65.530043 1.669970 2.009851 1.029689 7.657366 485.131471
+GAGTTA 58.856632 1.901102 2.127807 1.121654 7.657366 809.902705
+GAGTTC 56.129215 1.778992 2.270657 1.236482 7.657366 959.702563
+GAGTTG 58.300703 1.843617 2.191912 1.172722 7.657366 942.390811
+GAGTTT 57.335033 1.769195 2.121967 1.117039 7.657366 1412.719907
+GATAAA 107.257021 2.494357 2.813097 1.705051 7.657366 3104.819179
+GATAAC 104.927565 2.488758 2.827899 1.718526 7.657366 2243.073958
+GATAAG 106.072350 2.395636 2.681680 1.586978 7.657366 1545.736047
+GATAAT 107.144178 2.560220 3.120329 1.991870 7.657366 2200.158587
+GATACA 102.270920 2.238907 2.537735 1.460931 7.657366 823.728552
+GATACC 101.975029 2.551145 2.700584 1.603787 7.657366 1745.382416
+GATACG 101.967091 2.292596 2.570946 1.489703 7.657366 1184.455790
+GATACT 103.568676 2.566195 2.809149 1.701462 7.657366 874.384967
+GATAGA 104.679048 2.372963 2.668297 1.575112 7.657366 792.994415
+GATAGC 103.326827 2.417250 2.643525 1.553229 7.657366 1573.268040
+GATAGG 104.145899 2.239633 2.565893 1.485313 7.657366 463.013954
+GATAGT 105.501453 2.594493 3.003756 1.881297 7.657366 931.668864
+GATATA 103.412679 2.903230 3.007928 1.885217 7.657366 917.213270
+GATATC 104.642517 2.747233 3.136704 2.007570 7.657366 2585.305243
+GATATG 103.220762 2.743564 3.065925 1.940004 7.657366 1369.408209
+GATATT 106.795018 2.691217 3.343183 2.209024 7.657366 2491.578315
+GATCAA 114.632021 2.444792 2.757604 1.654848 7.657366 1739.454549
+GATCAC 112.177158 2.464868 2.774295 1.669896 7.657366 1601.639576
+GATCAG 113.981789 2.528743 2.883158 1.769143 7.657366 1818.121897
+GATCAT 113.464532 2.627709 2.968280 1.848067 7.657366 1630.067673
+GATCCA 113.950645 2.541801 2.785556 1.680073 7.657366 1294.929904
+GATCCC 112.400862 2.361837 2.759602 1.656647 7.657366 1218.618075
+GATCCG 113.962892 2.400506 2.814353 1.706193 7.657366 1552.867623
+GATCCT 113.056373 2.783006 3.093745 1.966469 7.657366 1134.620589
+GATCGA 114.655715 2.637110 2.837956 1.727702 7.657366 1541.333497
+GATCGC 112.023066 2.491508 2.850510 1.739178 7.657366 2381.836611
+GATCGG 113.769826 2.485945 2.904876 1.789170 7.657366 1381.210493
+GATCGT 113.630963 2.573932 3.152955 2.023192 7.657366 1557.179151
+GATCTA 113.880217 2.506063 3.224373 2.092320 7.657366 313.444965
+GATCTC 112.875844 2.376663 2.954035 1.834779 7.657366 1272.312245
+GATCTG 113.716237 2.710793 2.921569 1.804615 7.657366 1917.369475
+GATCTT 113.933055 2.397691 3.082968 1.956203 7.657366 1323.446935
+GATGAA 103.933714 2.514530 2.704253 1.607057 7.657366 3022.952339
+GATGAC 100.579458 2.365876 2.757246 1.654526 7.657366 1916.147842
+GATGAG 103.151246 2.438518 2.666806 1.573792 7.657366 1466.042249
+GATGAT 103.042150 2.517952 2.852142 1.740672 7.657366 3184.219424
+GATGCA 103.095648 2.682571 2.728901 1.629078 7.657366 1731.235934
+GATGCC 102.888882 2.694660 2.757711 1.654945 7.657366 2532.878014
+GATGCG 103.146556 2.515712 2.788240 1.682502 7.657366 2764.094620
+GATGCT 104.761754 2.598663 2.901575 1.786122 7.657366 2036.677072
+GATGGA 104.393417 2.540305 2.790927 1.684935 7.657366 1512.150905
+GATGGC 102.670093 2.461745 2.731713 1.631597 7.657366 2890.770891
+GATGGG 103.999922 2.651568 2.755819 1.653242 7.657366 1251.386345
+GATGGT 105.068920 2.559202 2.910481 1.794351 7.657366 2312.464241
+GATGTA 105.705456 2.916329 3.067010 1.941033 7.657366 1030.231769
+GATGTC 106.580502 2.957907 3.329740 2.195714 7.657366 1316.459161
+GATGTG 105.733813 2.866729 3.125439 1.996764 7.657366 1608.919873
+GATGTT 109.102015 3.129399 3.392700 2.258284 7.657366 2167.014431
+GATTAA 102.393676 1.989432 2.167515 1.153197 7.657366 1856.488711
+GATTAC 98.831593 1.826036 1.995454 1.018645 7.657366 1623.670516
+GATTAG 101.618559 2.328827 2.146148 1.136187 7.657366 524.541237
+GATTAT 100.198537 1.945310 2.113926 1.110696 7.657366 2022.030229
+GATTCA 97.025884 2.143921 2.288166 1.250811 7.657366 1458.354217
+GATTCC 95.228473 1.969522 2.141003 1.132104 7.657366 844.258754
+GATTCG 96.962541 2.105894 2.276340 1.241126 7.657366 1071.188329
+GATTCT 96.906612 2.022180 2.235051 1.207512 7.657366 951.345839
+GATTGA 100.520777 2.207851 2.104152 1.103001 7.657366 1696.644371
+GATTGC 96.624242 1.672016 1.958339 0.990358 7.657366 2070.928527
+GATTGG 99.462223 2.101349 2.092606 1.093935 7.657366 1314.486196
+GATTGT 98.156559 1.795649 2.106002 1.104456 7.657366 1333.398859
+GATTTA 93.181962 1.643380 1.914930 0.957612 7.657366 1756.948365
+GATTTC 91.058230 1.608039 1.824194 0.890363 7.657366 2397.560666
+GATTTG 92.631553 1.680989 1.912651 0.955903 7.657366 1748.102725
+GATTTT 92.870341 1.763877 2.037895 1.051316 7.657366 3187.686537
+GCAAAA 88.599623 2.188513 1.996218 1.019230 7.657366 3861.514162
+GCAAAC 85.729208 2.083043 2.179086 1.162443 7.657366 2553.274463
+GCAAAG 87.987695 2.409619 2.100872 1.100423 7.657366 2101.616289
+GCAAAT 86.686851 2.242723 2.265220 1.232043 7.657366 2345.979224
+GCAACA 79.186002 2.013042 1.919307 0.960897 7.657366 2666.549982
+GCAACC 78.219408 1.857516 1.835773 0.898853 7.657366 2059.632219
+GCAACG 79.167199 1.963083 1.869985 0.924097 7.657366 2713.081613
+GCAACT 78.697325 1.831540 1.773850 0.853760 7.657366 1631.477901
+GCAAGA 86.979287 2.130143 1.996378 1.019353 7.657366 1230.117123
+GCAAGC 84.766598 2.127453 2.089683 1.091644 7.657366 1515.638816
+GCAAGG 86.082393 2.283571 1.971861 1.000633 7.657366 1292.663998
+GCAAGT 85.325070 2.378010 2.251737 1.221059 7.657366 806.275642
+GCAATA 77.881516 1.940437 1.846801 0.906965 7.657366 2153.374611
+GCAATC 76.481589 2.045093 2.151251 1.140242 7.657366 1991.202577
+GCAATG 77.247938 2.027274 1.944915 0.980192 7.657366 2372.474786
+GCAATT 77.377300 2.062639 2.300428 1.260879 7.657366 1993.712860
+GCACAA 90.769909 1.395538 1.393394 0.594389 7.657366 1522.184875
+GCACAC 88.430586 1.392707 1.358844 0.572420 7.657366 953.110453
+GCACAG 89.995422 1.322077 1.339699 0.560365 7.657366 1371.634094
+GCACAT 89.342979 1.309711 1.285073 0.526444 7.657366 1250.633753
+GCACCA 87.678639 1.342401 1.411770 0.606186 7.657366 2686.957835
+GCACCC 86.420555 1.353357 1.404999 0.601830 7.657366 865.028866
+GCACCG 87.430020 1.393646 1.399934 0.598579 7.657366 2760.547458
+GCACCT 87.257636 1.455269 1.385475 0.589330 7.657366 1294.636892
+GCACGA 89.891563 1.340905 1.319662 0.547841 7.657366 1239.396311
+GCACGC 87.596576 1.456187 1.411057 0.605727 7.657366 2122.410454
+GCACGG 89.110459 1.387292 1.383967 0.588368 7.657366 1508.780188
+GCACGT 88.232670 1.426087 1.353560 0.569084 7.657366 1059.094566
+GCACTA 86.132986 1.595176 1.558939 0.703403 7.657366 664.626860
+GCACTC 84.097736 1.522572 1.452764 0.632780 7.657366 780.342920
+GCACTG 85.324302 1.415835 1.450933 0.631584 7.657366 2186.620407
+GCACTT 85.101276 1.530092 1.414030 0.607643 7.657366 1163.412596
+GCAGAA 84.494680 2.052840 1.941970 0.977967 7.657366 2710.309894
+GCAGAC 79.141562 1.905164 2.149757 1.139054 7.657366 1470.654237
+GCAGAG 83.561342 2.068876 2.024703 1.041123 7.657366 1453.880110
+GCAGAT 81.041647 1.952378 2.128127 1.121906 7.657366 2361.563901
+GCAGCA 79.258137 1.719075 1.678037 0.785529 7.657366 3936.058249
+GCAGCC 77.689103 1.748373 1.684347 0.789965 7.657366 2033.799305
+GCAGCG 78.977833 1.845644 1.669974 0.779875 7.657366 3592.583570
+GCAGCT 78.869541 1.904448 1.762237 0.845389 7.657366 1912.257122
+GCAGGA 84.254156 1.776168 1.866946 0.921845 7.657366 2293.906648
+GCAGGC 80.539407 1.829472 2.015557 1.034077 7.657366 3224.780775
+GCAGGG 83.254763 1.880942 1.882084 0.933079 7.657366 1763.016086
+GCAGGT 82.164089 1.930279 2.030595 1.045671 7.657366 2302.550646
+GCAGTA 76.500479 1.778018 1.948005 0.982529 7.657366 1481.656321
+GCAGTC 73.431780 1.916665 2.154038 1.142459 7.657366 1005.067134
+GCAGTG 75.558973 1.751989 1.989428 1.014035 7.657366 1678.870578
+GCAGTT 75.098636 2.128587 2.353517 1.304777 7.657366 2168.146946
+GCATAA 84.962961 1.868155 1.807894 0.878456 7.657366 1770.563163
+GCATAC 81.905817 2.118690 2.014903 1.033574 7.657366 1063.968455
+GCATAG 84.335713 1.824721 1.799979 0.872693 7.657366 723.128501
+GCATAT 82.979270 1.827913 1.892536 0.940863 7.657366 1418.450969
+GCATCA 78.201699 1.947527 2.301387 1.261667 7.657366 3168.821765
+GCATCC 76.694951 1.890479 2.262178 1.229562 7.657366 2027.708811
+GCATCG 78.143332 2.044922 2.242020 1.213164 7.657366 2545.232158
+GCATCT 77.636971 2.074718 2.236124 1.208381 7.657366 1536.279547
+GCATGA 82.622066 1.948887 1.890415 0.939282 7.657366 1656.858932
+GCATGC 79.477720 2.272269 2.111127 1.108490 7.657366 736.342874
+GCATGG 81.445568 2.093733 1.952458 0.985900 7.657366 1610.441202
+GCATGT 79.995603 2.007228 2.027004 1.042899 7.657366 1091.054804
+GCATTA 72.739347 2.016809 2.364175 1.313650 7.657366 2066.990350
+GCATTC 69.885855 2.161864 2.415413 1.356586 7.657366 1666.419362
+GCATTG 72.038658 2.213629 2.385009 1.331053 7.657366 2209.884764
+GCATTT 71.299246 2.223873 2.354191 1.305338 7.657366 2050.659536
+GCCAAA 100.050810 1.566515 1.542959 0.692615 7.657366 2259.246304
+GCCAAC 97.646649 1.504978 1.701988 0.802407 7.657366 1932.784427
+GCCAAG 99.010757 1.628534 1.595397 0.728222 7.657366 632.520730
+GCCAAT 98.586668 1.580838 1.740917 0.830094 7.657366 1953.878943
+GCCACA 91.993500 1.720589 1.414952 0.608237 7.657366 1626.422598
+GCCACC 91.076341 1.683421 1.515034 0.673897 7.657366 2297.342945
+GCCACG 92.043485 1.763789 1.491554 0.658292 7.657366 2034.227417
+GCCACT 91.978172 1.512147 1.530186 0.684032 7.657366 1431.205429
+GCCAGA 97.865546 1.701591 1.690933 0.794603 7.657366 3619.435685
+GCCAGC 95.882299 1.659970 1.720581 0.815592 7.657366 6054.860030
+GCCAGG 96.866807 1.754311 1.710907 0.808723 7.657366 2791.100157
+GCCAGT 96.737547 1.734156 1.849534 0.908979 7.657366 3512.792768
+GCCATA 91.151393 1.798304 1.716680 0.812820 7.657366 1606.780831
+GCCATC 89.581952 1.703736 1.857947 0.915188 7.657366 3042.335346
+GCCATG 90.444884 1.665147 1.795589 0.869502 7.657366 1594.360688
+GCCATT 90.692447 1.722669 1.932864 0.971096 7.657366 2784.238258
+GCCCAA 98.708915 1.560805 1.269065 0.516638 7.657366 889.862615
+GCCCAC 96.699427 1.855472 1.307422 0.540236 7.657366 1162.141250
+GCCCAG 97.996577 1.709217 1.269660 0.517001 7.657366 2019.286653
+GCCCAT 97.757384 1.524444 1.376754 0.583774 7.657366 1559.831567
+GCCCCA 95.688050 1.734074 1.312846 0.543602 7.657366 1251.177668
+GCCCCC 94.159983 1.537043 1.248523 0.504145 7.657366 712.358299
+GCCCCG 95.428024 1.611430 1.250398 0.505281 7.657366 967.858304
+GCCCCT 94.662055 1.688814 1.240018 0.499002 7.657366 695.123519
+GCCCGA 97.906187 1.662774 1.300540 0.535977 7.657366 1149.712043
+GCCCGC 95.780842 1.650809 1.357718 0.571708 7.657366 2303.040101
+GCCCGG 96.893682 1.785573 1.279326 0.522916 7.657366 2129.795983
+GCCCGT 96.458067 1.558396 1.430268 0.618140 7.657366 1471.643409
+GCCCTA 93.856208 1.826146 1.323472 0.550215 7.657366 314.084591
+GCCCTC 92.593599 1.812387 1.448586 0.630052 7.657366 455.454845
+GCCCTG 93.674390 1.729244 1.380948 0.586444 7.657366 2087.989531
+GCCCTT 93.289073 1.475410 1.489457 0.656904 7.657366 1007.608262
+GCCGAA 95.099981 1.775793 1.622293 0.746714 7.657366 2162.032985
+GCCGAC 90.675981 1.695031 1.859621 0.916425 7.657366 1830.956724
+GCCGAG 93.996511 1.663386 1.674601 0.783118 7.657366 1362.432221
+GCCGAT 92.379212 1.604049 1.815806 0.884229 7.657366 2840.083815
+GCCGCA 90.584901 1.629270 1.485350 0.654189 7.657366 2973.380444
+GCCGCC 89.422491 1.590401 1.574989 0.714293 7.657366 4103.317667
+GCCGCG 90.595053 1.607895 1.564201 0.706967 7.657366 2691.259907
+GCCGCT 90.518060 1.573792 1.613562 0.740694 7.657366 2859.223391
+GCCGGA 94.983712 1.665560 1.642947 0.761019 7.657366 3790.970901
+GCCGGC 92.057878 1.507691 1.670106 0.779967 7.657366 365.661897
+GCCGGG 94.121804 1.770009 1.652624 0.767753 7.657366 2311.222087
+GCCGGT 93.522151 1.571351 1.753016 0.838763 7.657366 2336.064708
+GCCGTA 89.168911 1.832134 1.825341 0.891202 7.657366 1656.065735
+GCCGTC 87.004545 1.833929 1.971058 1.000022 7.657366 1803.304781
+GCCGTG 88.511858 1.755475 1.893958 0.941924 7.657366 1749.156279
+GCCGTT 88.591079 1.847773 2.045543 1.057239 7.657366 2772.694494
+GCCTAA 94.138590 2.043871 1.612328 0.739845 7.657366 462.251021
+GCCTAC 90.883302 1.794467 1.729186 0.821718 7.657366 719.951500
+GCCTAG 93.376648 1.817066 1.466448 0.641742 7.657366 46.638555
+GCCTAT 92.135716 2.132842 1.838444 0.900816 7.657366 598.649845
+GCCTCA 87.140121 1.884868 1.872409 0.925894 7.657366 705.495026
+GCCTCC 85.850885 2.047256 2.062525 1.070432 7.657366 602.395656
+GCCTCG 86.950042 1.807217 1.909565 0.953591 7.657366 885.246244
+GCCTCT 86.694119 1.973015 1.931010 0.969699 7.657366 744.370845
+GCCTGA 91.322568 1.785696 1.609007 0.737560 7.657366 3135.281350
+GCCTGC 88.129472 1.675729 1.763308 0.846160 7.657366 3364.978668
+GCCTGG 90.051770 1.885449 1.691855 0.795253 7.657366 2996.890007
+GCCTGT 89.271615 1.887485 1.836189 0.899159 7.657366 1806.138161
+GCCTTA 82.772028 1.957643 1.960809 0.992232 7.657366 881.282170
+GCCTTC 79.640301 1.787497 2.011773 1.031166 7.657366 1843.771711
+GCCTTG 82.022324 1.862643 1.906239 0.951100 7.657366 861.962198
+GCCTTT 80.998010 1.936885 1.885397 0.935544 7.657366 2252.926464
+GCGAAA 81.919695 1.999633 1.956090 0.988652 7.657366 3071.655028
+GCGAAC 78.919387 2.038301 2.118469 1.114277 7.657366 1868.942894
+GCGAAG 80.862285 2.155499 2.045487 1.057196 7.657366 2151.518395
+GCGAAT 79.812448 2.003518 2.141839 1.132767 7.657366 1969.562117
+GCGACA 71.767191 2.041209 2.144969 1.135251 7.657366 1764.565463
+GCGACC 70.629569 1.932151 2.071224 1.077211 7.657366 1802.973354
+GCGACG 71.454500 1.982351 2.105006 1.103673 7.657366 2219.460174
+GCGACT 71.366822 1.738286 2.006493 1.027109 7.657366 1085.624571
+GCGAGA 80.408392 1.912054 1.891243 0.939899 7.657366 1160.745116
+GCGAGC 78.439462 2.167962 2.086220 1.088931 7.657366 1438.256130
+GCGAGG 79.467211 2.042121 1.929689 0.968704 7.657366 936.385587
+GCGAGT 79.086903 2.191262 2.230082 1.203487 7.657366 901.553485
+GCGATA 70.720630 1.740972 1.873844 0.926959 7.657366 2767.382185
+GCGATC 69.063019 1.890995 2.058997 1.067686 7.657366 2255.258658
+GCGATG 69.744602 1.681867 1.880181 0.931665 7.657366 3360.247744
+GCGATT 70.073144 1.934373 2.234610 1.207155 7.657366 2524.325934
+GCGCAA 86.946944 1.573740 1.530255 0.684079 7.657366 3281.123593
+GCGCAC 84.524167 1.474533 1.497575 0.662282 7.657366 2248.139480
+GCGCAG 86.084193 1.571551 1.552136 0.698803 7.657366 4097.997107
+GCGCAT 85.387493 1.475326 1.499689 0.663685 7.657366 2898.717025
+GCGCCA 83.489045 1.526325 1.672125 0.781382 7.657366 4418.094205
+GCGCCC 82.407065 1.534117 1.700337 0.801240 7.657366 1741.901948
+GCGCCG 83.304400 1.520331 1.633389 0.754388 7.657366 3298.930498
+GCGCCT 83.082683 1.533340 1.657154 0.770911 7.657366 1915.563636
+GCGCGA 85.376822 1.597757 1.565275 0.707695 7.657366 3009.769553
+GCGCGC 82.614585 1.596812 1.686277 0.791323 7.657366 3298.908030
+GCGCGG 84.290660 1.622576 1.633900 0.754742 7.657366 3466.306964
+GCGCGT 83.221053 1.560151 1.665234 0.776557 7.657366 2984.256087
+GCGCTA 81.566039 1.526582 1.719656 0.814935 7.657366 1349.060229
+GCGCTC 79.614699 1.566017 1.692576 0.795761 7.657366 1447.410516
+GCGCTG 80.795666 1.615982 1.723306 0.817530 7.657366 4791.678139
+GCGCTT 80.184414 1.415620 1.435263 0.621380 7.657366 2389.389643
+GCGGAA 81.782354 2.129156 2.092652 1.093971 7.657366 2500.546532
+GCGGAC 76.397944 1.944787 2.362201 1.312005 7.657366 924.844815
+GCGGAG 80.484877 2.191034 2.174617 1.158869 7.657366 1026.173885
+GCGGAT 78.184445 1.954418 2.310272 1.268980 7.657366 2363.700137
+GCGGCA 76.749527 1.831408 1.827318 0.892651 7.657366 4569.314484
+GCGGCC 75.186143 1.742631 1.776183 0.855445 7.657366 1320.272690
+GCGGCG 76.376836 1.755694 1.781102 0.859001 7.657366 4944.411605
+GCGGCT 76.112211 1.866340 1.796272 0.869999 7.657366 2021.426191
+GCGGGA 82.388773 2.010568 2.007610 1.027967 7.657366 1541.309546
+GCGGGC 79.570912 1.834835 2.070688 1.076793 7.657366 2331.209948
+GCGGGG 81.285604 1.959726 1.929920 0.968879 7.657366 1218.712982
+GCGGGT 80.722196 2.082225 2.126946 1.120973 7.657366 2067.578475
+GCGGTA 74.912629 2.061602 2.097632 1.097878 7.657366 2504.175170
+GCGGTC 72.441025 2.096158 2.360042 1.310207 7.657366 1471.903671
+GCGGTG 73.839846 2.025582 2.214930 1.191242 7.657366 2979.309007
+GCGGTT 73.855034 2.210997 2.434496 1.372694 7.657366 2515.789322
+GCGTAA 78.219446 2.008077 2.065799 1.072982 7.657366 2463.405024
+GCGTAC 74.615535 1.787099 2.163954 1.150357 7.657366 1529.999638
+GCGTAG 77.373476 2.012550 2.057909 1.066841 7.657366 1169.183592
+GCGTAT 76.021949 1.901616 2.237125 1.209193 7.657366 1622.145667
+GCGTCA 71.093481 2.030489 2.502744 1.430820 7.657366 2476.578294
+GCGTCC 69.746395 2.201360 2.715896 1.617447 7.657366 991.618246
+GCGTCG 71.175762 2.097243 2.565575 1.485037 7.657366 2175.744935
+GCGTCT 70.694943 2.121920 2.508016 1.435343 7.657366 1573.623730
+GCGTGA 74.964689 2.138655 2.209747 1.187064 7.657366 2405.473072
+GCGTGC 71.052957 1.938513 2.511624 1.438442 7.657366 1934.616748
+GCGTGG 73.702706 2.095230 2.298612 1.259386 7.657366 2238.370074
+GCGTGT 72.187418 2.035062 2.488970 1.419024 7.657366 1063.234865
+GCGTTA 65.716852 2.099579 2.585944 1.502758 7.657366 2386.181423
+GCGTTC 62.576888 1.998427 2.646925 1.556227 7.657366 1881.851728
+GCGTTG 64.805946 2.059020 2.517961 1.443889 7.657366 2619.320396
+GCGTTT 63.796313 1.978357 2.528102 1.452621 7.657366 3225.983956
+GCTAAA 105.197769 2.064309 2.126401 1.120542 7.657366 1320.069995
+GCTAAC 102.650009 2.089703 2.194859 1.175087 7.657366 1186.533209
+GCTAAG 104.167450 2.028431 2.063412 1.071123 7.657366 581.355851
+GCTAAT 104.528094 2.128202 2.295260 1.256632 7.657366 984.137502
+GCTACA 100.889118 1.957196 2.000506 1.022516 7.657366 851.543559
+GCTACC 100.417004 2.102341 2.091897 1.093379 7.657366 1306.825473
+GCTACG 100.554925 2.001770 1.995629 1.018779 7.657366 1195.576485
+GCTACT 101.690465 2.182124 2.177093 1.160849 7.657366 835.941124
+GCTAGA 103.192387 2.351715 2.144485 1.134867 7.657366 70.414196
+GCTAGC 101.200090 2.137420 1.997935 1.020545 7.657366 217.491917
+GCTAGG 102.633929 1.672750 1.991539 1.015648 7.657366 70.677287
+GCTAGT 102.627966 2.175093 2.370637 1.319040 7.657366 97.942255
+GCTATA 101.034596 2.152090 2.258541 1.226598 7.657366 785.527034
+GCTATC 101.441888 2.339790 2.391388 1.336396 7.657366 1582.888096
+GCTATG 100.710480 2.021017 2.212996 1.189683 7.657366 960.621150
+GCTATT 103.276543 2.404052 2.653848 1.562336 7.657366 1291.994567
+GCTCAA 110.718972 1.993877 2.118591 1.114374 7.657366 1536.072764
+GCTCAC 108.580184 2.020666 2.056864 1.066028 7.657366 1247.604852
+GCTCAG 110.165756 2.015838 2.179290 1.162607 7.657366 1211.562853
+GCTCAT 109.444472 2.128258 2.191038 1.172020 7.657366 1387.456735
+GCTCCA 109.497488 1.911854 2.109584 1.107275 7.657366 1163.668877
+GCTCCC 107.861339 2.108626 2.095131 1.095916 7.657366 553.729200
+GCTCCG 109.282126 1.958988 2.205850 1.183925 7.657366 864.211517
+GCTCCT 108.962965 2.032715 2.042602 1.054960 7.657366 706.888948
+GCTCGA 110.138423 2.211380 2.312203 1.270572 7.657366 949.661109
+GCTCGC 107.699918 2.101711 2.134880 1.127250 7.657366 1503.767082
+GCTCGG 109.696173 2.179598 2.116764 1.112933 7.657366 1093.459549
+GCTCGT 109.076162 2.121370 2.199919 1.179153 7.657366 788.284225
+GCTCTA 109.266868 2.000750 2.141571 1.132554 7.657366 475.447947
+GCTCTC 108.236855 2.078619 2.218077 1.193782 7.657366 891.248481
+GCTCTG 108.753852 2.208824 2.227410 1.201325 7.657366 1349.166497
+GCTCTT 109.486178 1.971570 2.301272 1.261573 7.657366 1102.783047
+GCTGAA 100.924700 1.983581 2.092644 1.093965 7.657366 3512.300764
+GCTGAC 97.428691 1.862885 1.960212 0.991779 7.657366 2947.133007
+GCTGAG 100.389581 1.850201 1.941680 0.977748 7.657366 1002.523063
+GCTGAT 99.438425 1.998743 2.125249 1.119631 7.657366 3895.483559
+GCTGCA 99.154755 1.890771 1.934064 0.972000 7.657366 2562.411753
+GCTGCC 98.589569 2.086042 1.996791 1.019669 7.657366 3111.735769
+GCTGCG 99.093478 2.036866 1.972011 1.000747 7.657366 3447.845450
+GCTGCT 100.135689 2.145704 2.108849 1.106696 7.657366 3005.832641
+GCTGGA 101.170398 2.061048 2.173135 1.157685 7.657366 4136.960064
+GCTGGC 98.857987 2.008467 2.033808 1.048154 7.657366 6407.042501
+GCTGGG 100.646134 2.012145 2.092171 1.093593 7.657366 2477.895717
+GCTGGT 100.845884 2.115803 2.180493 1.163569 7.657366 3894.241630
+GCTGTA 99.864490 2.265869 2.336293 1.290480 7.657366 1585.697416
+GCTGTC 99.777021 2.430798 2.387376 1.333035 7.657366 1609.959868
+GCTGTG 99.545046 2.375253 2.354096 1.305259 7.657366 1496.444186
+GCTGTT 101.853872 2.434554 2.562567 1.482427 7.657366 3125.572132
+GCTTAA 99.563493 1.746958 1.706097 0.805315 7.657366 1494.374417
+GCTTAC 95.812736 1.538735 1.537126 0.688691 7.657366 1108.577269
+GCTTAG 98.730791 1.863814 1.671908 0.781230 7.657366 408.179076
+GCTTAT 97.374293 1.565231 1.663827 0.775573 7.657366 1235.013072
+GCTTCA 94.408218 1.661918 1.739324 0.828955 7.657366 2117.490000
+GCTTCC 92.735747 1.542171 1.690354 0.794194 7.657366 1713.830231
+GCTTCG 94.353012 1.689111 1.715438 0.811938 7.657366 1830.938641
+GCTTCT 94.018551 1.668056 1.711096 0.808857 7.657366 1592.498489
+GCTTGA 97.864705 1.674362 1.619178 0.744564 7.657366 1000.424515
+GCTTGC 94.016633 1.646492 1.578854 0.716924 7.657366 1592.059930
+GCTTGG 96.962732 1.658896 1.602092 0.732810 7.657366 566.841347
+GCTTGT 95.590765 1.578317 1.597737 0.729824 7.657366 950.858790
+GCTTTA 90.971282 1.610574 1.588866 0.723754 7.657366 2038.097253
+GCTTTC 88.598833 1.660118 1.533065 0.685964 7.657366 2818.848628
+GCTTTG 90.264235 1.622886 1.547788 0.695869 7.657366 2097.196555
+GCTTTT 90.498123 1.647306 1.584481 0.720760 7.657366 2596.222177
+GGAAAA 73.072714 1.806078 1.587788 0.723018 7.657366 2529.023195
+GGAAAC 71.185375 1.659521 1.428526 0.617010 7.657366 1602.913820
+GGAAAG 71.950032 1.937020 1.561052 0.704833 7.657366 1412.015307
+GGAAAT 71.423738 1.784959 1.453401 0.633197 7.657366 2000.517325
+GGAACA 62.279479 1.994514 1.876319 0.928796 7.657366 1659.715594
+GGAACC 61.514233 1.911726 1.950668 0.984544 7.657366 854.232230
+GGAACG 62.201385 1.983405 1.806420 0.877382 7.657366 1435.557047
+GGAACT 62.463325 1.991847 1.842274 0.903632 7.657366 1147.102160
+GGAAGA 70.608164 1.728753 1.581191 0.718517 7.657366 1983.863081
+GGAAGC 70.150361 1.753919 1.453450 0.633229 7.657366 1741.757537
+GGAAGG 69.800711 1.709372 1.602416 0.733033 7.657366 1370.587037
+GGAAGT 69.669136 1.575497 1.455730 0.634719 7.657366 1424.089051
+GGAATA 62.985703 2.050302 1.842641 0.903902 7.657366 1490.530867
+GGAATC 61.686073 1.781444 1.664951 0.776359 7.657366 848.916260
+GGAATG 62.461589 1.918438 1.769897 0.850907 7.657366 1367.376629
+GGAATT 62.616729 1.551842 1.460036 0.637537 7.657366 1276.184037
+GGACAA 77.614233 2.404462 2.169896 1.155098 7.657366 818.209851
+GGACAC 76.461226 1.939259 1.899313 0.945922 7.657366 415.831523
+GGACAG 76.747255 2.109956 2.012075 1.031399 7.657366 587.330580
+GGACAT 77.047009 2.498343 2.038201 1.051552 7.657366 778.439544
+GGACCA 73.573050 1.936543 1.981283 1.007813 7.657366 499.807813
+GGACCC 73.071001 2.327962 1.975959 1.003754 7.657366 261.810448
+GGACCG 73.689382 2.184960 2.082757 1.086221 7.657366 606.374956
+GGACCT 73.900676 2.119034 2.065162 1.072485 7.657366 293.157788
+GGACGA 76.588328 2.108170 1.954015 0.987080 7.657366 928.863010
+GGACGC 75.484998 1.777823 1.823113 0.889571 7.657366 1031.499376
+GGACGG 75.817588 1.881212 1.889489 0.938592 7.657366 841.852932
+GGACGT 75.895852 2.211985 1.923397 0.963970 7.657366 822.517122
+GGACTA 73.935619 1.724244 1.826102 0.891760 7.657366 255.671336
+GGACTC 72.379494 1.761044 1.754246 0.839646 7.657366 434.595135
+GGACTG 73.363817 1.808320 1.776143 0.855416 7.657366 802.036031
+GGACTT 72.947645 1.884998 1.635362 0.755755 7.657366 588.168913
+GGAGAA 73.129655 1.342023 1.394573 0.595144 7.657366 1264.211853
+GGAGAC 70.085507 1.280385 1.471824 0.645274 7.657366 457.973017
+GGAGAG 72.211051 1.572369 1.396591 0.596436 7.657366 801.152996
+GGAGAT 71.003464 1.400531 1.475702 0.647826 7.657366 1333.785269
+GGAGCA 66.384479 1.613051 1.684951 0.790390 7.657366 941.099491
+GGAGCC 65.245770 1.612594 1.683368 0.789276 7.657366 596.569393
+GGAGCG 66.568975 1.694621 1.613752 0.740825 7.657366 871.471839
+GGAGCT 66.344154 1.630062 1.604556 0.734502 7.657366 662.521520
+GGAGGA 72.381735 1.376970 1.410368 0.605284 7.657366 536.592749
+GGAGGC 71.030835 1.487809 1.399607 0.598369 7.657366 693.270152
+GGAGGG 71.765939 1.362528 1.376930 0.583886 7.657366 283.031820
+GGAGGT 71.665852 1.336528 1.382413 0.587377 7.657366 492.003494
+GGAGTA 66.854548 1.602685 1.422802 0.613306 7.657366 623.889441
+GGAGTC 64.876639 1.847890 1.383685 0.588188 7.657366 555.103797
+GGAGTG 65.831098 1.593990 1.367168 0.577687 7.657366 672.857255
+GGAGTT 66.665531 1.339935 1.554643 0.700497 7.657366 921.992398
+GGATAA 71.326577 2.196014 2.171174 1.156119 7.657366 2099.224272
+GGATAC 69.372011 1.968142 2.186029 1.168004 7.657366 895.845210
+GGATAG 70.601615 1.755320 2.096272 1.096811 7.657366 760.524396
+GGATAT 70.146058 2.335868 2.257838 1.226026 7.657366 1749.894268
+GGATCA 64.590049 2.086482 2.391578 1.336556 7.657366 1710.643328
+GGATCC 63.459531 2.246107 2.476483 1.408359 7.657366 565.708108
+GGATCG 64.456951 2.001935 2.302671 1.262723 7.657366 1679.972932
+GGATCT 64.548619 2.203997 2.507806 1.435163 7.657366 1208.516597
+GGATGA 68.855777 1.939710 2.098219 1.098339 7.657366 1908.199647
+GGATGC 66.402401 1.753484 2.004110 1.025281 7.657366 1853.367299
+GGATGG 68.072810 1.723967 2.104559 1.103321 7.657366 1355.337894
+GGATGT 67.272794 1.989561 2.240079 1.211589 7.657366 1289.802087
+GGATTA 61.239508 1.747645 2.082909 1.086340 7.657366 1308.200077
+GGATTC 58.997933 1.641767 2.008977 1.029017 7.657366 804.393832
+GGATTG 60.683147 1.833389 2.054318 1.064049 7.657366 1147.524395
+GGATTT 60.156368 1.743802 1.971366 1.000256 7.657366 1894.244851
+GGCAAA 99.459523 1.960271 2.065226 1.072535 7.657366 3801.543474
+GGCAAC 96.895285 1.931339 2.057496 1.066520 7.657366 2618.450069
+GGCAAG 98.614322 1.949369 2.038291 1.051622 7.657366 1442.966411
+GGCAAT 98.004372 2.101391 2.221533 1.196574 7.657366 3186.100453
+GGCACA 90.392384 1.797411 1.531465 0.684890 7.657366 1455.888573
+GGCACC 89.385702 1.725866 1.591859 0.725801 7.657366 1913.647914
+GGCACG 90.343257 1.858309 1.625613 0.749007 7.657366 1483.435872
+GGCACT 90.785721 1.717181 1.681920 0.788258 7.657366 1265.594943
+GGCAGA 96.662519 1.986583 2.091329 1.092934 7.657366 2404.621223
+GGCAGC 94.419605 1.842534 1.983992 1.009881 7.657366 3071.874880
+GGCAGG 95.647570 1.950119 1.998828 1.021229 7.657366 2211.310679
+GGCAGT 95.553910 2.010107 2.155969 1.143995 7.657366 1768.044209
+GGCATA 90.297216 1.645243 1.775701 0.855096 7.657366 1415.708908
+GGCATC 89.003322 1.876364 1.866574 0.921570 7.657366 2731.354566
+GGCATG 89.672399 1.578451 1.827075 0.892472 7.657366 1629.339430
+GGCATT 90.518575 1.960987 2.027205 1.043054 7.657366 2912.986201
+GGCCAA 99.424053 2.132438 1.736609 0.827015 7.657366 1006.075904
+GGCCAC 97.425850 1.763881 1.717730 0.813566 7.657366 804.158180
+GGCCAG 99.112064 2.155587 1.736310 0.826802 7.657366 1758.903424
+GGCCAT 98.656492 1.872479 1.860015 0.916717 7.657366 1133.131105
+GGCCCA 95.961399 1.580515 1.545455 0.694296 7.657366 722.247464
+GGCCCC 94.051163 1.647388 1.525482 0.680881 7.657366 560.754763
+GGCCCG 95.826619 1.672972 1.554559 0.700440 7.657366 995.770252
+GGCCCT 95.684306 1.795855 1.698876 0.800208 7.657366 333.000394
+GGCCGA 98.505066 2.048322 1.839743 0.901770 7.657366 1024.781713
+GGCCGC 96.245311 1.736410 1.782146 0.859756 7.657366 1318.242002
+GGCCGG 97.582263 1.835879 1.724805 0.818597 7.657366 944.388718
+GGCCGT 97.063189 1.930559 1.996217 1.019229 7.657366 1017.067564
+GGCCTA 95.910992 1.845608 1.721210 0.816040 7.657366 557.895354
+GGCCTC 93.415098 1.738756 1.825010 0.890960 7.657366 508.052262
+GGCCTG 94.656845 1.754145 1.710100 0.808151 7.657366 2328.880911
+GGCCTT 94.225074 1.900250 1.850835 0.909938 7.657366 1051.494336
+GGCGAA 95.260585 2.044077 2.003015 1.024440 7.657366 3465.470123
+GGCGAC 91.580760 1.894090 1.919331 0.960915 7.657366 2260.207780
+GGCGAG 94.459471 2.040595 2.011298 1.030801 7.657366 1694.511936
+GGCGAT 93.190832 1.933720 2.078149 1.082618 7.657366 4464.169602
+GGCGCA 90.260851 1.732058 1.702708 0.802917 7.657366 3641.270532
+GGCGCC 89.154239 1.778954 1.669053 0.779230 7.657366 132.232478
+GGCGCG 90.151755 1.691361 1.706814 0.805823 7.657366 3795.043376
+GGCGCT 90.410823 1.813324 1.825429 0.891267 7.657366 3787.880585
+GGCGGA 95.002953 2.085657 2.035283 1.049295 7.657366 2073.661812
+GGCGGC 92.438886 1.880824 1.928704 0.967963 7.657366 4235.620323
+GGCGGG 94.382461 1.943952 1.991099 1.015312 7.657366 1927.527266
+GGCGGT 94.157464 1.970523 2.035874 1.049752 7.657366 3271.161164
+GGCGTA 90.382561 1.945195 1.911460 0.955010 7.657366 2220.210250
+GGCGTC 89.138326 1.949148 2.020271 1.037707 7.657366 1888.000260
+GGCGTG 89.937425 1.969647 1.986089 1.011483 7.657366 2698.714648
+GGCGTT 90.864264 2.172559 2.196243 1.176199 7.657366 3724.295765
+GGCTAA 93.593951 2.143316 1.903577 0.949109 7.657366 991.850434
+GGCTAC 90.113128 1.766534 1.771803 0.852283 7.657366 1005.269219
+GGCTAG 93.003878 1.842471 1.965899 0.996098 7.657366 102.810271
+GGCTAT 91.589503 1.940961 1.846900 0.907038 7.657366 1251.852351
+GGCTCA 85.512410 1.785805 1.869835 0.923985 7.657366 763.888914
+GGCTCC 83.837596 1.733085 1.828060 0.893194 7.657366 616.003869
+GGCTCG 85.368274 1.707360 1.756374 0.841174 7.657366 715.812295
+GGCTCT 84.845851 1.970161 1.926578 0.966363 7.657366 624.767647
+GGCTGA 90.824279 2.017995 1.807409 0.878102 7.657366 2235.242591
+GGCTGC 87.605816 1.631432 1.688360 0.792789 7.657366 2071.644192
+GGCTGG 89.408293 1.788856 1.777431 0.856347 7.657366 3204.446194
+GGCTGT 88.509697 1.688516 1.740925 0.830100 7.657366 1503.193259
+GGCTTA 81.397536 1.720432 1.659751 0.772725 7.657366 1017.908898
+GGCTTC 78.731415 1.516703 1.568731 0.710040 7.657366 1909.483463
+GGCTTG 80.861956 1.641879 1.633149 0.754222 7.657366 1002.753912
+GGCTTT 79.932044 1.540410 1.491640 0.658349 7.657366 2490.785323
+GGGAAA 68.787299 2.035300 1.696918 0.798825 7.657366 1715.993192
+GGGAAC 66.002117 1.646046 1.854215 0.912432 7.657366 870.667522
+GGGAAG 67.298783 1.885708 1.744601 0.832731 7.657366 1238.811236
+GGGAAT 66.515368 1.793384 1.837544 0.900154 7.657366 1022.423293
+GGGACA 56.525732 2.586219 2.379542 1.326479 7.657366 460.942419
+GGGACC 55.213513 1.797299 2.066455 1.073493 7.657366 240.601723
+GGGACG 56.649741 2.467660 2.326030 1.281986 7.657366 535.555533
+GGGACT 56.536833 2.055712 2.303184 1.263145 7.657366 365.868662
+GGGAGA 66.220798 1.739245 1.765757 0.847923 7.657366 789.471456
+GGGAGC 64.849979 1.491610 1.639357 0.758526 7.657366 580.530999
+GGGAGG 65.086610 1.809484 1.838128 0.900584 7.657366 350.078820
+GGGAGT 65.205804 1.726486 1.657162 0.770917 7.657366 574.081640
+GGGATA 57.121838 2.206244 2.141056 1.132146 7.657366 1018.512912
+GGGATC 55.869272 1.692025 1.968210 0.997855 7.657366 1165.473949
+GGGATG 56.346974 1.863106 2.057390 1.066437 7.657366 1219.116036
+GGGATT 56.803118 1.553734 1.747440 0.834764 7.657366 1331.307799
+GGGCAA 79.230646 1.959207 1.912310 0.955648 7.657366 1855.510593
+GGGCAC 77.027482 1.684137 1.631201 0.752873 7.657366 735.646294
+GGGCAG 78.308852 1.624673 1.785318 0.862052 7.657366 1413.837483
+GGGCAT 77.880537 1.821750 1.747792 0.835016 7.657366 1432.164354
+GGGCCA 75.277515 1.581897 1.867099 0.921958 7.657366 1071.884832
+GGGCCC 74.229410 1.590037 1.828103 0.893226 7.657366 68.606681
+GGGCCG 75.067267 1.594665 1.918738 0.960470 7.657366 970.208957
+GGGCCT 74.920382 1.589796 1.842027 0.903450 7.657366 481.121903
+GGGCGA 77.817403 1.807175 1.753023 0.838768 7.657366 2161.257376
+GGGCGC 75.612525 1.524504 1.694817 0.797342 7.657366 1663.376721
+GGGCGG 76.932039 1.595064 1.724733 0.818546 7.657366 1509.448786
+GGGCGT 76.250182 1.567278 1.680845 0.787502 7.657366 1799.922554
+GGGCTA 74.429374 1.448094 1.617996 0.743749 7.657366 665.528005
+GGGCTC 72.736313 1.591172 1.629414 0.751636 7.657366 181.744731
+GGGCTG 73.646218 1.527036 1.679171 0.786326 7.657366 1800.842528
+GGGCTT 73.578387 1.571968 1.521927 0.678502 7.657366 1341.089205
+GGGGAA 71.504716 1.660768 1.474861 0.647272 7.657366 1081.062486
+GGGGAC 67.188383 1.354445 1.751636 0.837773 7.657366 323.090610
+GGGGAG 70.714844 1.643991 1.499353 0.663462 7.657366 700.251527
+GGGGAT 68.623436 1.550004 1.709268 0.807561 7.657366 1131.222868
+GGGGCA 64.636285 1.623740 1.821089 0.888090 7.657366 987.543176
+GGGGCC 63.393720 1.611772 1.925889 0.965845 7.657366 531.593168
+GGGGCG 64.631718 1.643348 1.836938 0.899709 7.657366 1188.345898
+GGGGCT 64.387168 1.677574 1.789029 0.864742 7.657366 896.019794
+GGGGGA 71.258790 1.493057 1.374437 0.582301 7.657366 546.413921
+GGGGGC 69.731016 1.440592 1.491773 0.658437 7.657366 705.312565
+GGGGGG 70.817177 1.514192 1.317807 0.546686 7.657366 306.179810
+GGGGGT 70.379282 1.451975 1.416219 0.609054 7.657366 332.596892
+GGGGTA 64.639552 1.398397 1.481033 0.651340 7.657366 626.052625
+GGGGTC 62.944666 1.417148 1.649076 0.765281 7.657366 266.440188
+GGGGTG 63.837395 1.378130 1.512910 0.672481 7.657366 616.236375
+GGGGTT 64.169983 1.375379 1.768677 0.850027 7.657366 678.321427
+GGGTAA 68.868097 1.878664 2.017560 1.035619 7.657366 1593.302035
+GGGTAC 65.793103 1.738678 1.854911 0.912946 7.657366 650.299523
+GGGTAG 68.021526 1.919247 1.934573 0.972385 7.657366 540.589042
+GGGTAT 67.138533 1.945389 1.992850 1.016652 7.657366 1053.589005
+GGGTCA 61.268495 1.722803 2.087300 1.089777 7.657366 906.518844
+GGGTCC 59.856083 1.812521 1.952970 0.986288 7.657366 264.259382
+GGGTCG 61.446485 1.827837 2.142679 1.133434 7.657366 791.675470
+GGGTCT 61.237710 2.003632 2.149297 1.138688 7.657366 431.782012
+GGGTGA 65.678872 1.733750 1.818745 0.886376 7.657366 1535.172238
+GGGTGC 62.423171 1.610359 1.953770 0.986894 7.657366 886.557772
+GGGTGG 64.468424 1.593066 1.800494 0.873067 7.657366 863.790714
+GGGTGT 63.240236 1.605391 1.948235 0.982703 7.657366 778.378982
+GGGTTA 57.254849 1.837633 2.211731 1.188663 7.657366 1427.866839
+GGGTTC 54.733139 1.939031 2.273531 1.238830 7.657366 778.847364
+GGGTTG 56.590356 1.877222 2.212938 1.189636 7.657366 1068.610974
+GGGTTT 55.789530 1.760674 2.007018 1.027513 7.657366 1325.127712
+GGTAAA 100.942964 2.289132 2.623250 1.535393 7.657366 2916.741828
+GGTAAC 98.826048 2.260428 2.686711 1.591445 7.657366 1656.814660
+GGTAAG 100.123571 2.517155 2.694846 1.598679 7.657366 1113.997569
+GGTAAT 100.496725 2.419130 2.869690 1.756761 7.657366 1953.582882
+GGTACA 94.251897 2.093374 2.317223 1.274712 7.657366 994.099878
+GGTACC 93.866031 2.142784 2.328912 1.284369 7.657366 652.644713
+GGTACG 94.147327 2.232351 2.367540 1.316455 7.657366 1678.471055
+GGTACT 95.507845 2.451037 2.582362 1.499637 7.657366 1004.312664
+GGTAGA 98.103666 2.354990 2.498321 1.427029 7.657366 932.750163
+GGTAGC 96.873306 2.411221 2.542331 1.464902 7.657366 1283.178903
+GGTAGG 97.557335 2.086103 2.464062 1.397777 7.657366 469.300220
+GGTAGT 98.809055 2.385383 2.695572 1.599325 7.657366 836.119967
+GGTATA 95.916990 2.552356 2.812877 1.704851 7.657366 834.402298
+GGTATC 96.775152 2.548973 2.899772 1.784457 7.657366 1721.025829
+GGTATG 95.510510 2.514425 2.678220 1.583907 7.657366 1192.897922
+GGTATT 98.830751 2.504054 3.048582 1.923566 7.657366 2040.833279
+GGTCAA 108.887985 2.443787 2.643460 1.553172 7.657366 1143.488251
+GGTCAC 106.769439 2.361775 2.563284 1.483048 7.657366 1311.954389
+GGTCAG 108.037836 2.645232 2.663397 1.570776 7.657366 2719.829717
+GGTCAT 107.636096 2.526407 2.707421 1.609882 7.657366 1906.509261
+GGTCCA 107.146864 2.363775 2.580388 1.497918 7.657366 504.450683
+GGTCCC 105.364044 2.148820 2.373473 1.321407 7.657366 196.841584
+GGTCCG 107.122860 2.337507 2.567869 1.487029 7.657366 710.100727
+GGTCCT 106.821743 2.623027 2.499496 1.428036 7.657366 277.281073
+GGTCGA 107.959667 2.470002 2.707753 1.610178 7.657366 1160.702114
+GGTCGC 105.914549 2.417755 2.658654 1.566582 7.657366 1752.953277
+GGTCGG 107.444503 2.601777 2.741635 1.640494 7.657366 1256.921834
+GGTCGT 107.235004 2.535061 2.828278 1.718871 7.657366 1184.222242
+GGTCTA 107.120172 2.601660 2.994951 1.873031 7.657366 242.662746
+GGTCTC 106.435683 2.535998 2.763526 1.660181 7.657366 135.776968
+GGTCTG 107.145274 2.555321 2.792027 1.685930 7.657366 1893.829187
+GGTCTT 107.735107 2.288758 2.844344 1.733539 7.657366 720.686176
+GGTGAA 98.423070 2.688534 2.615197 1.528330 7.657366 2763.287613
+GGTGAC 95.383434 2.373746 2.624242 1.536265 7.657366 1651.528378
+GGTGAG 97.706420 2.484416 2.613549 1.526885 7.657366 1360.401170
+GGTGAT 97.495680 2.464819 2.791896 1.685812 7.657366 3146.007548
+GGTGCA 96.705233 2.337862 2.474449 1.406624 7.657366 1583.144229
+GGTGCC 96.225510 2.478059 2.472954 1.405350 7.657366 2027.941223
+GGTGCG 96.618785 2.412148 2.572083 1.490691 7.657366 2118.479206
+GGTGCT 97.993444 2.470980 2.734126 1.633759 7.657366 1833.805731
+GGTGGA 98.763236 2.679925 2.716870 1.618317 7.657366 1566.999903
+GGTGGC 97.108245 2.565622 2.534455 1.458100 7.657366 2375.736349
+GGTGGG 98.267420 2.618581 2.514466 1.440884 7.657366 1129.445593
+GGTGGT 99.231028 2.479288 2.695003 1.598818 7.657366 2512.522451
+GGTGTA 99.204767 3.163647 2.995405 1.873456 7.657366 1219.447307
+GGTGTC 99.698184 3.101071 3.208491 2.076881 7.657366 1154.064516
+GGTGTG 99.133487 2.964611 3.085235 1.958361 7.657366 1206.804398
+GGTGTT 102.132991 2.835661 3.200315 2.068947 7.657366 1884.596246
+GGTTAA 96.751813 2.289215 2.297568 1.258528 7.657366 2129.011372
+GGTTAC 93.296083 1.893762 2.030128 1.045311 7.657366 1229.579291
+GGTTAG 95.594090 2.172160 2.233928 1.206602 7.657366 700.590731
+GGTTAT 94.501353 1.884667 2.177515 1.161187 7.657366 1937.340435
+GGTTCA 90.796988 3.393528 2.383145 1.329493 7.657366 1510.464028
+GGTTCC 88.959329 1.907371 2.221742 1.196743 7.657366 861.943376
+GGTTCG 90.782666 2.456084 2.359928 1.310112 7.657366 1199.706986
+GGTTCT 90.636037 2.557274 2.446260 1.382656 7.657366 967.886094
+GGTTGA 94.322533 2.265445 2.148949 1.138412 7.657366 1866.520994
+GGTTGC 91.162950 1.954322 2.008428 1.028596 7.657366 2095.026002
+GGTTGG 93.667568 1.926639 2.160196 1.147361 7.657366 1336.517097
+GGTTGT 92.329473 2.061184 2.146750 1.136665 7.657366 1337.833405
+GGTTTA 87.406065 2.007295 2.054863 1.064473 7.657366 1682.752930
+GGTTTC 85.318098 1.852309 1.908492 0.952787 7.657366 2288.343450
+GGTTTG 86.979841 1.838944 2.004541 1.025611 7.657366 1747.775792
+GGTTTT 86.407747 1.704518 1.804855 0.876242 7.657366 3171.775073
+GTAAAA 88.363699 1.913884 1.952826 0.986179 7.657366 2249.998818
+GTAAAC 85.965551 2.010686 2.146393 1.136381 7.657366 1627.031978
+GTAAAG 87.555120 1.926490 2.010044 1.029837 7.657366 1722.458431
+GTAAAT 86.777526 2.184223 2.219785 1.195162 7.657366 1606.049027
+GTAACA 79.173380 2.273329 2.108228 1.106207 7.657366 1099.820268
+GTAACC 78.120486 2.118213 2.021299 1.038499 7.657366 1490.723101
+GTAACG 79.071981 2.220109 2.075408 1.080477 7.657366 1788.523440
+GTAACT 78.674735 1.857891 1.877435 0.929625 7.657366 941.262475
+GTAAGA 86.781079 1.930548 1.902083 0.947992 7.657366 602.704230
+GTAAGC 85.100513 1.766650 2.034238 1.048487 7.657366 1052.051192
+GTAAGG 85.754358 1.899862 1.982943 1.009080 7.657366 654.504710
+GTAAGT 85.622680 2.303263 2.278967 1.243275 7.657366 689.500435
+GTAATA 78.075753 1.854620 1.909425 0.953486 7.657366 1402.098562
+GTAATC 76.937468 2.266762 2.334770 1.289218 7.657366 1449.042911
+GTAATG 77.412864 2.202861 1.977926 1.005253 7.657366 1791.046351
+GTAATT 78.038326 2.437507 2.473085 1.405461 7.657366 1460.543176
+GTACAA 89.561593 1.470506 1.516462 0.674851 7.657366 865.374618
+GTACAC 87.795914 1.374112 1.483278 0.652821 7.657366 603.096660
+GTACAG 88.953749 1.444210 1.528645 0.682999 7.657366 1312.520891
+GTACAT 88.552260 1.346895 1.428270 0.616845 7.657366 880.584112
+GTACCA 86.966226 1.451372 1.710950 0.808754 7.657366 1081.993754
+GTACCC 85.992121 1.450533 1.603673 0.733895 7.657366 727.256287
+GTACCG 86.871639 1.514125 1.662393 0.774570 7.657366 1820.257129
+GTACCT 86.669523 1.479428 1.707627 0.806399 7.657366 563.827084
+GTACGA 88.657501 1.366673 1.472281 0.645574 7.657366 740.769377
+GTACGC 86.818345 1.353275 1.501507 0.664892 7.657366 1616.445692
+GTACGG 87.961854 1.455217 1.542978 0.692628 7.657366 1156.290485
+GTACGT 87.212579 1.362957 1.459089 0.636917 7.657366 945.656228
+GTACTA 85.905571 1.486818 1.713452 0.810528 7.657366 322.365327
+GTACTC 84.416501 1.568163 1.589990 0.724522 7.657366 633.794004
+GTACTG 85.259186 1.579791 1.638917 0.758221 7.657366 1783.484868
+GTACTT 85.023166 1.483859 1.587654 0.722927 7.657366 807.637005
+GTAGAA 86.653985 1.920401 1.805239 0.876521 7.657366 927.352519
+GTAGAC 82.060002 1.873762 2.063511 1.071200 7.657366 446.060466
+GTAGAG 85.326327 1.845283 1.892327 0.940707 7.657366 638.741118
+GTAGAT 83.511560 1.966941 2.074513 1.079778 7.657366 800.712745
+GTAGCA 81.057150 1.960945 1.702186 0.802547 7.657366 828.776986
+GTAGCC 80.035778 2.179259 1.945015 0.980268 7.657366 950.204805
+GTAGCG 81.064903 1.908787 1.796263 0.869992 7.657366 1589.052987
+GTAGCT 80.894729 2.159287 1.933417 0.971513 7.657366 678.073108
+GTAGGA 86.909389 4.963277 2.129989 1.123379 7.657366 628.749581
+GTAGGC 83.713708 1.757819 1.979011 1.006080 7.657366 824.139781
+GTAGGG 85.101438 1.900329 1.919195 0.960814 7.657366 388.965873
+GTAGGT 84.549195 1.822008 2.023667 1.040325 7.657366 498.511342
+GTAGTA 80.064896 2.146186 2.116538 1.112755 7.657366 546.747612
+GTAGTC 77.641188 2.165511 2.368102 1.316925 7.657366 582.975748
+GTAGTG 78.994399 1.915423 2.221897 1.196868 7.657366 902.324551
+GTAGTT 79.161768 2.128138 2.390654 1.335781 7.657366 790.900631
+GTATAA 84.738142 1.760560 1.936962 0.974186 7.657366 722.161095
+GTATAC 81.978213 1.729655 1.956677 0.989097 7.657366 553.838988
+GTATAG 84.090313 1.735946 2.124481 1.119025 7.657366 390.898068
+GTATAT 83.194587 1.910427 2.081541 1.085270 7.657366 747.973869
+GTATCA 79.884410 2.100198 2.587534 1.504144 7.657366 1266.512947
+GTATCC 78.787137 2.151759 2.600469 1.515437 7.657366 849.718638
+GTATCG 79.819405 2.233354 2.539073 1.462087 7.657366 1611.687960
+GTATCT 79.570562 2.146994 2.541237 1.463956 7.657366 771.725514
+GTATGA 83.030299 1.953421 2.099949 1.099698 7.657366 912.116451
+GTATGC 79.896359 2.002922 2.125790 1.120059 7.657366 1101.137273
+GTATGG 82.089363 2.120986 2.193090 1.173667 7.657366 1066.071124
+GTATGT 80.897550 2.274519 2.297270 1.258283 7.657366 764.115860
+GTATTA 75.405892 2.517761 2.671173 1.577660 7.657366 1009.877474
+GTATTC 72.620093 2.312864 2.765201 1.661691 7.657366 1426.217457
+GTATTG 74.431307 2.381919 2.626092 1.537890 7.657366 1658.058678
+GTATTT 73.865056 2.574292 2.621999 1.534296 7.657366 1616.867842
+GTCAAA 99.407571 1.799102 1.887957 0.937451 7.657366 1288.943420
+GTCAAC 97.271867 2.003706 2.148240 1.137849 7.657366 1090.700408
+GTCAAG 98.550798 2.197104 2.020844 1.038149 7.657366 349.272550
+GTCAAT 98.118756 1.912460 2.135326 1.127604 7.657366 1171.607183
+GTCACA 92.492171 2.288920 1.908425 0.952737 7.657366 730.098967
+GTCACC 91.941042 2.156959 2.027784 1.043501 7.657366 1674.344724
+GTCACG 92.495130 2.014488 1.888613 0.937939 7.657366 1261.615987
+GTCACT 92.283818 1.932867 2.077087 1.081789 7.657366 945.374999
+GTCAGA 97.712436 1.868414 2.026216 1.042291 7.657366 1394.111215
+GTCAGC 96.049347 2.083774 2.162174 1.148937 7.657366 2843.502930
+GTCAGG 96.913219 2.085066 2.073499 1.078987 7.657366 2144.969562
+GTCAGT 96.693419 1.910056 2.172729 1.157360 7.657366 1537.204196
+GTCATA 91.242982 1.975487 2.079285 1.083506 7.657366 1093.890418
+GTCATC 89.915405 2.100455 2.271403 1.237091 7.657366 2223.026593
+GTCATG 90.559291 1.969091 2.086162 1.088886 7.657366 1288.622427
+GTCATT 90.860894 2.179457 2.337781 1.291713 7.657366 1756.525468
+GTCCAA 97.850468 1.539669 1.483642 0.653062 7.657366 290.430014
+GTCCAC 96.005281 1.761952 1.809569 0.879677 7.657366 626.663839
+GTCCAG 97.520459 1.759798 1.647913 0.764472 7.657366 1337.982751
+GTCCAT 96.859856 1.880709 1.801151 0.873546 7.657366 897.242012
+GTCCCA 95.111631 1.553812 1.487813 0.655817 7.657366 445.484244
+GTCCCC 93.731740 1.447015 1.423023 0.613449 7.657366 391.371339
+GTCCCG 94.957458 1.479023 1.517238 0.675368 7.657366 483.056793
+GTCCCT 94.904362 1.806503 1.725521 0.819107 7.657366 303.431387
+GTCCGA 97.492905 1.807708 1.787541 0.863663 7.657366 473.819657
+GTCCGC 95.373386 1.760587 1.889065 0.938276 7.657366 1038.929481
+GTCCGG 96.706876 1.781700 1.705404 0.804824 7.657366 1506.699998
+GTCCGT 96.116455 1.863410 1.914868 0.957566 7.657366 675.751018
+GTCCTA 93.878341 1.771203 1.636639 0.756640 7.657366 111.651650
+GTCCTC 92.252640 1.488670 1.909522 0.953559 7.657366 245.810812
+GTCCTG 93.729301 1.759907 1.874681 0.927580 7.657366 915.042912
+GTCCTT 93.186582 1.963738 2.009055 1.029077 7.657366 525.062722
+GTCGAA 96.769354 2.389628 2.186104 1.168063 7.657366 1300.334298
+GTCGAC 92.595988 2.418478 2.221540 1.196580 7.657366 798.749635
+GTCGAG 95.586284 2.429361 2.205442 1.183597 7.657366 1050.751302
+GTCGAT 93.986308 2.083148 2.379200 1.326193 7.657366 2117.386976
+GTCGCA 92.044938 1.996778 1.958629 0.990578 7.657366 1313.028724
+GTCGCC 90.846599 2.299852 2.185695 1.167736 7.657366 2413.221315
+GTCGCG 91.805686 2.204664 2.060958 1.069212 7.657366 1996.461427
+GTCGCT 91.757715 2.139764 2.092020 1.093476 7.657366 1567.486839
+GTCGGA 96.220664 1.989358 2.118405 1.114227 7.657366 673.673208
+GTCGGC 93.685816 2.001571 2.203460 1.182001 7.657366 1843.274934
+GTCGGG 95.585958 2.046728 2.143302 1.133927 7.657366 992.893390
+GTCGGT 94.947107 2.016814 2.203142 1.181746 7.657366 1694.576899
+GTCGTA 90.764238 2.418635 2.274448 1.239579 7.657366 860.980267
+GTCGTC 89.033771 4.044988 2.480777 1.412023 7.657366 1521.247728
+GTCGTG 90.026675 2.032756 2.317304 1.274779 7.657366 1041.757324
+GTCGTT 90.411628 2.209219 2.441991 1.379038 7.657366 1392.054368
+GTCTAA 93.894310 1.808190 1.929267 0.968387 7.657366 241.398963
+GTCTAC 90.971447 2.128557 2.208236 1.185847 7.657366 437.458367
+GTCTAG 92.794281 2.134021 1.890396 0.939268 7.657366 28.730922
+GTCTAT 91.893797 1.871373 2.146696 1.136622 7.657366 519.292046
+GTCTCA 88.695549 2.235069 2.492049 1.421658 7.657366 313.981079
+GTCTCC 87.214685 2.099674 2.400691 1.344203 7.657366 452.619230
+GTCTCG 88.544388 2.381891 2.634349 1.545149 7.657366 498.706980
+GTCTCT 88.158661 2.274440 2.400273 1.343851 7.657366 513.016005
+GTCTGA 92.161605 2.077816 2.138933 1.130462 7.657366 1245.953316
+GTCTGC 88.896296 2.127554 2.207657 1.185380 7.657366 1608.815290
+GTCTGG 90.930694 2.185383 2.186258 1.168187 7.657366 2663.744826
+GTCTGT 89.895690 2.141098 2.225689 1.199933 7.657366 912.642008
+GTCTTA 84.294871 2.598239 2.628040 1.539601 7.657366 533.648390
+GTCTTC 81.021591 2.279761 2.540853 1.463624 7.657366 1092.809401
+GTCTTG 83.250891 2.327518 2.476766 1.408601 7.657366 586.553176
+GTCTTT 82.179141 2.262383 2.393844 1.338456 7.657366 1299.749904
+GTGAAA 82.589541 2.443576 2.274035 1.239241 7.657366 2607.318711
+GTGAAC 79.952811 2.752813 2.678318 1.583994 7.657366 1580.325814
+GTGAAG 81.605337 2.574622 2.391865 1.336796 7.657366 1924.965001
+GTGAAT 80.692541 2.537379 2.629797 1.541145 7.657366 1467.733547
+GTGACA 72.278979 2.357405 2.302858 1.262877 7.657366 1002.196382
+GTGACC 71.205640 2.183295 2.235663 1.208008 7.657366 1312.466667
+GTGACG 71.981996 2.156528 2.265918 1.232612 7.657366 1643.963165
+GTGACT 72.076215 2.100885 2.026746 1.042699 7.657366 856.920817
+GTGAGA 81.452139 2.502078 2.309858 1.268640 7.657366 712.308273
+GTGAGC 79.956132 2.756892 2.555968 1.476704 7.657366 1108.228275
+GTGAGG 80.587795 2.393029 2.287715 1.250441 7.657366 599.845687
+GTGAGT 80.439349 3.005173 2.626844 1.538550 7.657366 818.200961
+GTGATA 71.471149 2.129047 2.062832 1.070671 7.657366 1425.028414
+GTGATC 70.560106 2.424814 2.570015 1.488894 7.657366 1326.815044
+GTGATG 70.773259 2.246641 2.229904 1.203343 7.657366 2484.880029
+GTGATT 71.485744 2.650280 2.689182 1.593642 7.657366 2144.941491
+GTGCAA 88.211079 1.559939 1.636918 0.756834 7.657366 1317.945547
+GTGCAC 86.086221 1.527346 1.662265 0.774481 7.657366 700.068983
+GTGCAG 87.504678 1.544527 1.740770 0.829989 7.657366 1749.649450
+GTGCAT 86.779939 1.472475 1.550422 0.697646 7.657366 1126.779394
+GTGCCA 85.220159 1.669789 1.883825 0.934375 7.657366 1862.840041
+GTGCCC 84.150173 1.658028 1.828145 0.893256 7.657366 713.927786
+GTGCCG 85.071439 1.668355 1.810488 0.880347 7.657366 2375.246644
+GTGCCT 84.816064 1.554603 1.791242 0.866347 7.657366 1011.810420
+GTGCGA 87.234168 1.617739 1.692940 0.796018 7.657366 1094.417668
+GTGCGC 84.942068 1.753899 1.750766 0.837148 7.657366 2373.004525
+GTGCGG 86.333318 1.664525 1.758095 0.842411 7.657366 1739.346345
+GTGCGT 85.397207 1.515832 1.707144 0.806056 7.657366 1351.849730
+GTGCTA 83.935987 1.655308 1.901251 0.947369 7.657366 476.894181
+GTGCTC 82.177969 1.819299 1.715233 0.811792 7.657366 866.994481
+GTGCTG 83.300249 1.801472 1.852443 0.911124 7.657366 3329.450633
+GTGCTT 83.150975 1.883797 1.796466 0.870139 7.657366 1055.107529
+GTGGAA 85.191435 2.337196 2.274788 1.239857 7.657366 1587.846196
+GTGGAC 80.721154 2.388224 2.650599 1.559467 7.657366 587.711756
+GTGGAG 84.201452 2.546290 2.361385 1.311325 7.657366 889.675097
+GTGGAT 82.548674 2.581640 2.615302 1.528421 7.657366 1603.596718
+GTGGCA 79.956148 2.578018 2.046275 1.057807 7.657366 1947.370992
+GTGGCC 79.078589 2.859423 2.298904 1.259626 7.657366 699.471669
+GTGGCG 80.001488 2.834859 2.204191 1.182590 7.657366 3176.546918
+GTGGCT 79.779977 2.648621 2.299133 1.259814 7.657366 1438.584130
+GTGGGA 86.059121 2.547061 2.309476 1.268324 7.657366 736.741571
+GTGGGC 83.626259 2.336208 2.358239 1.308706 7.657366 1094.898692
+GTGGGG 84.994929 2.446745 2.304980 1.264623 7.657366 803.893208
+GTGGGT 84.442809 2.460914 2.360518 1.310604 7.657366 937.527568
+GTGGTA 79.253259 2.700985 2.553372 1.474455 7.657366 1166.623688
+GTGGTC 77.553804 3.294070 2.751948 1.649760 7.657366 1318.512063
+GTGGTG 78.543300 2.852136 2.644426 1.554023 7.657366 2660.029803
+GTGGTT 79.052606 2.938916 2.828307 1.718898 7.657366 1951.499342
+GTGTAA 82.152152 2.204363 2.318377 1.275664 7.657366 817.002159
+GTGTAC 78.560331 2.009209 2.370432 1.318869 7.657366 556.371652
+GTGTAG 80.905829 2.142338 2.248302 1.218266 7.657366 518.866365
+GTGTAT 79.727080 2.294275 2.409221 1.351373 7.657366 672.062142
+GTGTCA 77.092510 2.659979 2.852202 1.740727 7.657366 842.123573
+GTGTCC 76.134057 2.860064 2.951339 1.832267 7.657366 455.557699
+GTGTCG 76.891355 2.910434 2.801182 1.694230 7.657366 1186.198692
+GTGTCT 76.444044 2.756717 2.998985 1.876816 7.657366 532.633234
+GTGTGA 79.631519 2.155995 2.375086 1.322755 7.657366 687.023301
+GTGTGC 75.959732 2.349386 2.693664 1.597627 7.657366 773.689123
+GTGTGG 78.518541 2.692369 2.534007 1.457713 7.657366 1049.651906
+GTGTGT 77.155596 2.344977 2.746607 1.644959 7.657366 464.317194
+GTGTTA 72.426120 2.938693 3.050165 1.925064 7.657366 937.488651
+GTGTTC 69.430433 3.165989 3.231142 2.098912 7.657366 1055.127880
+GTGTTG 71.364193 2.767879 2.947071 1.828294 7.657366 1498.688768
+GTGTTT 70.447963 2.936482 2.944976 1.826345 7.657366 1366.680620
+GTTAAA 101.944313 1.709929 1.749027 0.835902 7.657366 2017.772806
+GTTAAC 99.964390 1.710190 1.853774 0.912107 7.657366 1978.457594
+GTTAAG 101.195707 1.801047 1.810112 0.880073 7.657366 979.440132
+GTTAAT 101.423259 1.763044 1.906847 0.951556 7.657366 1994.675385
+GTTACA 96.906110 1.627658 1.583759 0.720268 7.657366 837.988082
+GTTACC 96.686078 1.650671 1.675126 0.783486 7.657366 1560.636616
+GTTACG 96.935454 1.603516 1.640077 0.759026 7.657366 1410.626932
+GTTACT 97.604491 1.745003 1.725421 0.819036 7.657366 951.540747
+GTTAGA 99.790180 1.808887 1.779459 0.857812 7.657366 412.061445
+GTTAGC 98.741131 1.667371 1.772930 0.853096 7.657366 1172.394819
+GTTAGG 99.523985 1.559392 1.739200 0.828867 7.657366 414.180635
+GTTAGT 100.056140 1.844297 1.948974 0.983263 7.657366 569.562065
+GTTATA 96.919139 1.989199 1.913064 0.956213 7.657366 904.179408
+GTTATC 97.193645 1.976637 1.978440 1.005645 7.657366 2232.396675
+GTTATG 96.384433 1.827292 1.934574 0.972385 7.657366 1325.133993
+GTTATT 98.712799 2.001122 2.053564 1.063464 7.657366 1887.817887
+GTTCAA 105.093617 2.114014 1.800426 0.873018 7.657366 1536.911498
+GTTCAC 103.289083 1.960880 1.734013 0.825161 7.657366 1607.522513
+GTTCAG 104.413819 1.836103 1.725000 0.818736 7.657366 2306.274731
+GTTCAT 103.965828 2.009770 1.860157 0.916821 7.657366 1719.227713
+GTTCCA 103.563730 1.913286 1.736290 0.826787 7.657366 1461.840701
+GTTCCC 101.931977 1.868349 1.659283 0.772397 7.657366 953.191880
+GTTCCG 103.402152 1.941468 1.722315 0.816825 7.657366 1578.596616
+GTTCCT 103.040009 1.852355 1.761157 0.844612 7.657366 1107.772209
+GTTCGA 104.532020 2.183533 1.882270 0.933218 7.657366 1157.592036
+GTTCGC 102.435603 1.940342 1.720356 0.815432 7.657366 1986.790248
+GTTCGG 103.995712 1.949063 1.741843 0.830757 7.657366 1294.244653
+GTTCGT 103.690856 2.025266 1.911997 0.955413 7.657366 1150.152530
+GTTCTA 103.773770 1.832614 1.804339 0.875866 7.657366 434.303261
+GTTCTC 102.715705 2.054639 1.890070 0.939025 7.657366 998.597736
+GTTCTG 103.304643 1.923134 1.823045 0.889522 7.657366 2056.138803
+GTTCTT 103.524060 1.960092 2.030116 1.045302 7.657366 1413.726090
+GTTGAA 98.630686 1.756689 1.741627 0.830602 7.657366 2036.295394
+GTTGAC 95.549388 1.668938 1.922672 0.963425 7.657366 1095.723418
+GTTGAG 97.738845 1.711460 1.769408 0.850554 7.657366 1373.010694
+GTTGAT 97.244682 1.726440 1.867493 0.922250 7.657366 2796.995553
+GTTGCA 96.119896 1.748803 1.598787 0.730543 7.657366 1867.347160
+GTTGCC 95.546081 1.786067 1.645245 0.762616 7.657366 2678.787193
+GTTGCG 96.073461 1.783047 1.608071 0.736916 7.657366 2045.506200
+GTTGCT 96.812539 1.888620 1.722983 0.817300 7.657366 2401.159935
+GTTGGA 98.770253 1.823858 1.795233 0.869244 7.657366 616.941697
+GTTGGC 96.697540 1.677453 1.724251 0.818203 7.657366 2038.377013
+GTTGGG 98.123166 1.753182 1.678624 0.785942 7.657366 1024.151186
+GTTGGT 98.248149 1.700378 1.760127 0.843872 7.657366 1709.339034
+GTTGTA 96.825308 1.936593 1.836285 0.899229 7.657366 1107.210106
+GTTGTC 96.768542 1.971351 1.921948 0.962881 7.657366 1242.064409
+GTTGTG 96.588267 2.108161 1.884985 0.935238 7.657366 1175.999357
+GTTGTT 98.508001 2.057684 2.049262 1.060123 7.657366 2370.127027
+GTTTAA 96.282521 1.756510 1.520202 0.677349 7.657366 1741.205900
+GTTTAC 93.164493 1.575221 1.350700 0.567282 7.657366 2427.487820
+GTTTAG 95.511565 1.641899 1.422993 0.613429 7.657366 661.663299
+GTTTAT 94.163307 1.670873 1.432730 0.619736 7.657366 2507.595338
+GTTTCA 91.188917 1.626296 1.482880 0.652558 7.657366 1990.428242
+GTTTCC 89.751950 1.629361 1.504766 0.667058 7.657366 1474.555901
+GTTTCG 91.172587 1.686584 1.482903 0.652574 7.657366 1757.064855
+GTTTCT 90.888107 1.622677 1.510331 0.670762 7.657366 1624.857377
+GTTTGA 94.388123 1.854905 1.428987 0.617309 7.657366 1621.761137
+GTTTGC 91.624077 1.677300 1.395608 0.595807 7.657366 2790.934115
+GTTTGG 93.856641 1.579640 1.406046 0.602503 7.657366 1447.742861
+GTTTGT 92.657593 1.700810 1.409822 0.604932 7.657366 1864.354547
+GTTTTA 88.722476 1.922498 1.443376 0.626657 7.657366 2064.362202
+GTTTTC 86.412088 1.708405 1.455613 0.634642 7.657366 2908.342304
+GTTTTG 87.960336 1.805540 1.446480 0.628679 7.657366 2448.738274
+GTTTTT 87.361267 1.894608 1.362227 0.574559 7.657366 3479.558417
+TAAAAA 85.378847 1.793722 1.503117 0.665962 7.657366 3080.062468
+TAAAAC 82.916796 1.474028 1.680725 0.787418 7.657366 1903.337917
+TAAAAG 84.695416 1.733421 1.557021 0.702105 7.657366 1317.682951
+TAAAAT 83.726265 1.530636 1.749194 0.836021 7.657366 1729.079170
+TAAACA 75.587164 1.833649 1.870333 0.924354 7.657366 1698.439008
+TAAACC 74.508603 1.726198 1.810577 0.880412 7.657366 1664.863638
+TAAACG 75.695144 1.859031 1.821324 0.888262 7.657366 1664.579899
+TAAACT 75.450303 1.744531 1.725399 0.819020 7.657366 1037.943741
+TAAAGA 83.453044 1.654411 1.503154 0.665987 7.657366 1851.515967
+TAAAGC 81.735834 1.509764 1.583883 0.720352 7.657366 1974.096265
+TAAAGG 82.731553 1.787802 1.562867 0.706063 7.657366 1454.605850
+TAAAGT 81.935521 1.637834 1.674959 0.783369 7.657366 1243.285203
+TAAATA 75.096492 1.501511 1.627662 0.750424 7.657366 1794.662099
+TAAATC 73.406976 1.805179 1.821276 0.888227 7.657366 1517.968654
+TAAATG 74.210491 1.501557 1.649836 0.765811 7.657366 1219.661069
+TAAATT 74.601879 1.627613 1.915460 0.958010 7.657366 1614.695136
+TAACAA 86.952785 1.789792 1.522127 0.678636 7.657366 1468.501590
+TAACAC 85.294136 1.585009 1.498421 0.662844 7.657366 861.620782
+TAACAG 86.365526 1.591151 1.521684 0.678340 7.657366 1183.979466
+TAACAT 86.260532 1.589024 1.444075 0.627112 7.657366 1055.106547
+TAACCA 83.738891 1.513662 1.595238 0.728113 7.657366 1793.067787
+TAACCC 82.610098 1.419258 1.538901 0.689884 7.657366 1341.819660
+TAACCG 83.598363 1.604586 1.583861 0.720337 7.657366 1566.248470
+TAACCT 83.311242 1.531851 1.525255 0.680728 7.657366 1091.153404
+TAACGA 86.188608 1.448992 1.475726 0.647842 7.657366 1484.042932
+TAACGC 84.442851 1.469018 1.464480 0.640450 7.657366 2472.693998
+TAACGG 85.352947 1.453237 1.467498 0.642431 7.657366 1761.729866
+TAACGT 84.948208 1.410658 1.454779 0.634097 7.657366 1413.718674
+TAACTA 82.902910 1.434670 1.492053 0.658623 7.657366 469.729421
+TAACTC 81.354959 1.378919 1.436477 0.622169 7.657366 763.776579
+TAACTG 82.301146 1.436049 1.455829 0.634784 7.657366 1203.954062
+TAACTT 81.723428 1.354818 1.285617 0.526778 7.657366 1048.110041
+TAAGAA 83.850677 1.425717 1.505972 0.667861 7.657366 808.863423
+TAAGAC 79.754962 1.374726 1.692722 0.795864 7.657366 441.141876
+TAAGAG 83.171552 1.569271 1.670170 0.780012 7.657366 402.341655
+TAAGAT 81.295469 1.518207 1.734313 0.825375 7.657366 699.856598
+TAAGCA 77.915895 1.592840 1.510776 0.671059 7.657366 1114.838426
+TAAGCC 76.510071 1.461372 1.515450 0.674175 7.657366 1123.496490
+TAAGCG 77.806408 1.574162 1.482946 0.652602 7.657366 1243.240193
+TAAGCT 77.193147 1.706167 1.475843 0.647919 7.657366 799.388256
+TAAGGA 82.835408 1.409571 1.467724 0.642579 7.657366 784.845104
+TAAGGC 80.043317 1.523882 1.615627 0.742116 7.657366 910.516977
+TAAGGG 82.096038 1.400675 1.474852 0.647266 7.657366 438.034174
+TAAGGT 81.325904 1.367580 1.585782 0.721648 7.657366 649.770506
+TAAGTA 76.371288 1.569066 1.688916 0.793181 7.657366 570.655003
+TAAGTC 74.038351 1.571000 1.778847 0.857370 7.657366 564.872154
+TAAGTG 75.485386 1.532128 1.662803 0.774857 7.657366 633.426812
+TAAGTT 75.466103 1.584257 1.937265 0.974415 7.657366 846.805934
+TAATAA 80.980168 1.575089 1.747364 0.834710 7.657366 1629.174069
+TAATAC 78.709460 1.491155 1.696963 0.798856 7.657366 992.446602
+TAATAG 80.360815 1.527577 1.733771 0.824989 7.657366 612.539629
+TAATAT 79.734172 1.688010 1.765713 0.847892 7.657366 1170.751015
+TAATCA 73.787863 1.671316 1.961538 0.992785 7.657366 2135.383738
+TAATCC 72.766143 1.809478 2.026622 1.042604 7.657366 1483.475517
+TAATCG 73.923232 1.788996 1.973781 1.002095 7.657366 1530.150368
+TAATCT 73.806293 1.951644 2.146491 1.136459 7.657366 1125.394233
+TAATGA 78.520858 1.595035 1.669770 0.779732 7.657366 1808.326673
+TAATGC 75.897275 1.520950 1.745447 0.833336 7.657366 2070.058936
+TAATGG 77.702043 1.483860 1.678097 0.785572 7.657366 1480.246033
+TAATGT 76.714028 1.447814 1.760384 0.844056 7.657366 1199.584660
+TAATTA 69.177831 1.946136 2.183970 1.166354 7.657366 1223.475322
+TAATTC 66.448262 1.994506 2.314339 1.272333 7.657366 1127.612632
+TAATTG 68.521667 2.032913 2.197563 1.177260 7.657366 1127.055465
+TAATTT 67.630030 1.846766 2.066686 1.073673 7.657366 1880.540932
+TACAAA 103.102386 1.892186 1.895864 0.943346 7.657366 1285.898399
+TACAAC 100.333042 1.866771 1.869162 0.923487 7.657366 1068.005603
+TACAAG 101.961703 1.908048 1.862257 0.918374 7.657366 482.618736
+TACAAT 101.713330 1.673651 1.945606 0.980714 7.657366 574.268785
+TACACA 94.751316 1.735045 1.437444 0.622798 7.657366 555.031927
+TACACC 93.702315 1.381964 1.429377 0.617562 7.657366 1191.329096
+TACACG 94.567368 1.584950 1.461740 0.638654 7.657366 492.014323
+TACACT 94.576099 1.562679 1.513190 0.672668 7.657366 544.817955
+TACAGA 100.265699 1.852069 1.915913 0.958350 7.657366 815.079082
+TACAGC 97.869215 1.705732 1.772534 0.852810 7.657366 1717.871828
+TACAGG 99.212180 2.031116 1.869421 0.923679 7.657366 1143.027086
+TACAGT 99.042717 1.829254 1.970085 0.999281 7.657366 698.929464
+TACATA 93.951365 1.444150 1.515302 0.674077 7.657366 573.591633
+TACATC 92.913345 1.675117 1.638613 0.758010 7.657366 1131.156855
+TACATG 93.543834 1.552877 1.600165 0.731488 7.657366 738.755187
+TACATT 94.047715 1.772725 1.770779 0.851544 7.657366 936.891580
+TACCAA 102.723551 1.865189 1.555746 0.701243 7.657366 1045.226597
+TACCAC 100.168771 1.570242 1.527339 0.682124 7.657366 1288.698507
+TACCAG 101.956323 1.762149 1.559999 0.704120 7.657366 1849.422915
+TACCAT 101.264128 1.764993 1.750013 0.836608 7.657366 1044.580997
+TACCCA 98.806678 1.658707 1.367267 0.577751 7.657366 1339.433049
+TACCCC 97.256251 1.665374 1.364166 0.575786 7.657366 648.466065
+TACCCG 98.845570 1.826847 1.437356 0.622740 7.657366 1461.923559
+TACCCT 98.361845 1.634800 1.475745 0.647854 7.657366 757.432735
+TACCGA 101.424377 1.777060 1.658718 0.772003 7.657366 1717.355290
+TACCGC 99.025432 1.617496 1.586515 0.722149 7.657366 2671.681178
+TACCGG 100.455349 1.747369 1.614232 0.741156 7.657366 2349.424403
+TACCGT 99.837820 1.738499 1.743532 0.831965 7.657366 1697.772399
+TACCTA 97.824796 1.643781 1.553752 0.699895 7.657366 391.446525
+TACCTC 96.005877 1.621868 1.583932 0.720386 7.657366 536.879337
+TACCTG 97.289623 1.786514 1.596285 0.728829 7.657366 1712.727517
+TACCTT 96.727703 1.779773 1.719359 0.814723 7.657366 1129.727530
+TACGAA 98.195606 1.995998 1.859256 0.916155 7.657366 866.098507
+TACGAC 94.281758 1.577752 1.705001 0.804539 7.657366 884.059659
+TACGAG 97.409401 1.878423 1.695500 0.797824 7.657366 378.338786
+TACGAT 96.183833 1.745561 1.889013 0.938237 7.657366 903.379255
+TACGCA 93.452218 1.492511 1.527586 0.682289 7.657366 1532.899651
+TACGCC 92.423245 1.591696 1.556759 0.701927 7.657366 2203.546400
+TACGCG 93.540451 1.601263 1.577704 0.716141 7.657366 1670.158686
+TACGCT 93.754857 1.554728 1.586487 0.722129 7.657366 1761.390297
+TACGGA 97.927010 1.932600 1.874513 0.927456 7.657366 825.337668
+TACGGC 95.069631 1.593695 1.642334 0.760593 7.657366 1609.652087
+TACGGG 97.225448 1.859990 1.763994 0.846654 7.657366 872.950503
+TACGGT 96.649200 1.745321 1.830829 0.895224 7.657366 1384.333816
+TACGTA 92.950127 1.822048 1.638692 0.758065 7.657366 812.818275
+TACGTC 91.509007 1.644651 1.719180 0.814596 7.657366 769.244675
+TACGTG 92.356247 1.672819 1.680677 0.787384 7.657366 852.811114
+TACGTT 92.960834 1.797521 1.929440 0.968517 7.657366 1337.685608
+TACTAA 96.636652 1.864601 1.729482 0.821929 7.657366 327.589959
+TACTAC 93.368527 1.489382 1.578803 0.716890 7.657366 575.597266
+TACTAG 96.346675 1.434555 1.602654 0.733196 7.657366 55.061723
+TACTAT 94.624830 1.894937 1.686969 0.791810 7.657366 416.093826
+TACTCA 88.700956 1.681038 1.683039 0.789045 7.657366 719.858274
+TACTCC 87.213204 1.530461 1.675865 0.784005 7.657366 573.010227
+TACTCG 88.754586 1.541279 1.715105 0.811701 7.657366 553.815805
+TACTCT 88.227604 1.740741 1.746470 0.834069 7.657366 625.973494
+TACTGA 93.805194 1.997115 1.552242 0.698875 7.657366 1456.004602
+TACTGC 90.425731 1.552319 1.535970 0.687915 7.657366 1679.088445
+TACTGG 92.535730 1.764390 1.584369 0.720684 7.657366 1991.700581
+TACTGT 91.563453 1.640205 1.546548 0.695033 7.657366 866.039611
+TACTTA 84.405930 1.589934 1.520109 0.677287 7.657366 576.429041
+TACTTC 81.537323 1.571738 1.576554 0.715358 7.657366 1005.750730
+TACTTG 83.471657 1.607036 1.482984 0.652627 7.657366 481.334057
+TACTTT 82.821752 1.460822 1.469677 0.643862 7.657366 1130.406748
+TAGAAA 77.479435 1.783060 1.789496 0.865080 7.657366 762.270415
+TAGAAC 74.794833 1.581342 1.950778 0.984628 7.657366 454.936307
+TAGAAG 76.219952 1.988849 1.901764 0.947753 7.657366 459.957252
+TAGAAT 75.222758 1.513722 1.979536 1.006481 7.657366 241.744400
+TAGACA 66.181690 2.037955 2.208297 1.185896 7.657366 377.028773
+TAGACC 65.208072 2.002599 2.176670 1.160511 7.657366 257.079296
+TAGACG 66.183360 1.850729 2.239060 1.210762 7.657366 355.841973
+TAGACT 65.882571 1.863307 2.172565 1.157229 7.657366 168.066136
+TAGAGA 75.491269 1.947426 1.695844 0.798066 7.657366 611.065283
+TAGAGC 73.814301 1.562669 1.877917 0.929983 7.657366 460.730611
+TAGAGG 74.365936 1.928781 1.724988 0.818728 7.657366 322.805985
+TAGAGT 74.114767 1.788505 1.910859 0.954560 7.657366 318.390922
+TAGATA 65.896469 1.677984 1.983753 1.009699 7.657366 553.366204
+TAGATC 64.458497 1.521452 1.701781 0.802262 7.657366 415.313082
+TAGATG 65.102544 1.426590 1.903121 0.948768 7.657366 428.307198
+TAGATT 65.568651 1.823495 1.968873 0.998359 7.657366 421.485983
+TAGCAA 83.385933 1.507939 1.568130 0.709632 7.657366 923.178767
+TAGCAC 81.281605 1.351282 1.424242 0.614237 7.657366 449.089158
+TAGCAG 82.665594 1.446218 1.585797 0.721659 7.657366 725.733196
+TAGCAT 82.252112 1.462166 1.500152 0.663993 7.657366 479.798879
+TAGCCA 79.479169 1.408224 1.594447 0.727571 7.657366 1188.545099
+TAGCCC 78.290968 1.314357 1.555596 0.701141 7.657366 752.356145
+TAGCCG 79.292350 1.346154 1.576622 0.715405 7.657366 964.198339
+TAGCCT 79.069883 1.435632 1.546147 0.694763 7.657366 637.112018
+TAGCGA 81.455159 1.463398 1.488963 0.656578 7.657366 1301.449461
+TAGCGC 78.401753 1.441793 1.562336 0.705703 7.657366 1342.439092
+TAGCGG 80.195017 1.350375 1.493538 0.659606 7.657366 1377.257010
+TAGCGT 79.167894 1.590527 1.574585 0.714019 7.657366 1079.860067
+TAGCTA 78.272604 1.542347 1.485695 0.654418 7.657366 369.992219
+TAGCTC 76.202456 1.530722 1.541245 0.691461 7.657366 395.612368
+TAGCTG 77.884550 1.560537 1.528202 0.682703 7.657366 852.995954
+TAGCTT 77.127800 1.426021 1.419464 0.611148 7.657366 632.731765
+TAGGAA 77.207360 1.465800 1.842814 0.904030 7.657366 182.598261
+TAGGAC 71.817168 1.438326 1.932950 0.971161 7.657366 126.590277
+TAGGAG 75.908827 1.436967 1.781673 0.859414 7.657366 100.034330
+TAGGAT 73.797377 1.599670 2.060639 1.068964 7.657366 219.003670
+TAGGCA 71.126330 1.836255 1.821969 0.888734 7.657366 443.376654
+TAGGCC 69.599581 1.820330 1.825581 0.891378 7.657366 578.635320
+TAGGCG 71.048650 1.789898 1.872664 0.926083 7.657366 491.063649
+TAGGCT 70.663146 1.498556 1.712473 0.809834 7.657366 355.167302
+TAGGGA 77.334508 1.538283 1.794611 0.868792 7.657366 254.724761
+TAGGGC 74.474415 1.460199 1.826425 0.891997 7.657366 267.560344
+TAGGGG 76.504827 1.522884 1.668951 0.779158 7.657366 183.466929
+TAGGGT 75.352932 1.491731 1.741098 0.830224 7.657366 277.513444
+TAGGTA 69.819908 1.472168 1.714100 0.810988 7.657366 331.293877
+TAGGTC 68.002932 1.797573 1.972972 1.001479 7.657366 312.851696
+TAGGTG 69.255429 1.546868 1.880701 0.932051 7.657366 383.880896
+TAGGTT 69.186397 1.660476 2.005938 1.026684 7.657366 422.587414
+TAGTAA 74.276750 1.789634 1.885451 0.935585 7.657366 543.692448
+TAGTAC 70.945845 1.870465 2.133783 1.126382 7.657366 290.165355
+TAGTAG 73.656760 1.839762 1.944193 0.979647 7.657366 317.590681
+TAGTAT 72.196307 1.833028 2.135779 1.127963 7.657366 299.990491
+TAGTCA 66.478108 1.881961 2.386682 1.332453 7.657366 629.094039
+TAGTCC 65.159533 1.970838 2.598176 1.513433 7.657366 284.405876
+TAGTCG 66.776630 2.310601 2.509171 1.436335 7.657366 530.713627
+TAGTCT 66.263232 2.238770 2.513002 1.439626 7.657366 325.312163
+TAGTGA 70.811956 1.825498 2.013083 1.032174 7.657366 841.443518
+TAGTGC 67.263420 2.020245 2.419857 1.360332 7.657366 621.356005
+TAGTGG 69.624063 1.907888 2.041371 1.054006 7.657366 647.945908
+TAGTGT 67.891826 2.015925 2.263715 1.230816 7.657366 378.348066
+TAGTTA 61.742382 2.203793 2.628778 1.540250 7.657366 511.147598
+TAGTTC 58.386319 1.715031 2.565993 1.485400 7.657366 418.488744
+TAGTTG 61.086230 2.217032 2.756630 1.653971 7.657366 515.625664
+TAGTTT 59.715488 1.935541 2.460052 1.394366 7.657366 650.686102
+TATAAA 107.697079 2.375280 2.486994 1.417335 7.657366 1410.706981
+TATAAC 105.582267 2.414488 2.600641 1.515587 7.657366 933.084581
+TATAAG 106.748853 2.173643 2.421855 1.362017 7.657366 484.016263
+TATAAT 107.572551 2.230124 2.660483 1.568198 7.657366 554.589233
+TATACA 102.517775 2.494867 2.345374 1.298011 7.657366 565.682188
+TATACC 102.816717 2.489827 2.517019 1.443078 7.657366 773.425010
+TATACG 102.662579 2.358630 2.507117 1.434571 7.657366 530.900892
+TATACT 104.489966 2.087782 2.449674 1.385552 7.657366 394.817359
+TATAGA 105.310085 2.160079 2.369314 1.317936 7.657366 487.472699
+TATAGC 103.644378 2.357600 2.472512 1.404973 7.657366 733.111521
+TATAGG 104.325443 2.318978 2.384652 1.330754 7.657366 360.419066
+TATAGT 105.875851 2.836725 2.809349 1.701645 7.657366 346.932781
+TATATA 103.499644 2.585530 2.770385 1.666367 7.657366 542.870818
+TATATC 104.923056 2.516639 3.018582 1.895243 7.657366 793.130556
+TATATG 103.607079 2.686009 2.781689 1.676576 7.657366 710.388198
+TATATT 107.422001 2.693013 3.086331 1.959405 7.657366 1219.840316
+TATCAA 116.016391 2.484689 2.540214 1.463072 7.657366 2134.966000
+TATCAC 113.719927 2.123302 2.452267 1.387753 7.657366 1312.911430
+TATCAG 115.240884 2.327620 2.571373 1.490075 7.657366 1936.052151
+TATCAT 114.815290 2.663214 2.681052 1.586420 7.657366 1237.745112
+TATCCA 114.964259 2.340040 2.562922 1.482734 7.657366 1631.394283
+TATCCC 112.992392 2.607396 2.574860 1.493107 7.657366 953.145332
+TATCCG 114.853645 2.394880 2.616331 1.529323 7.657366 1600.418469
+TATCCT 114.025756 2.618409 2.739941 1.638974 7.657366 915.332670
+TATCGA 115.394247 2.639096 2.747441 1.645708 7.657366 2022.935986
+TATCGC 112.928886 2.302576 2.545707 1.467820 7.657366 2667.027880
+TATCGG 114.556326 2.477548 2.672225 1.578591 7.657366 1646.573169
+TATCGT 114.820038 2.468836 2.801011 1.694074 7.657366 1447.341041
+TATCTA 115.216879 2.277816 2.494075 1.423392 7.657366 627.757683
+TATCTC 114.235594 2.215692 2.614603 1.527809 7.657366 1033.153131
+TATCTG 115.082054 2.629540 2.592389 1.508379 7.657366 1973.910681
+TATCTT 114.967481 2.625900 2.991477 1.869772 7.657366 1422.670756
+TATGAA 103.867826 2.216352 2.403837 1.346845 7.657366 1289.659013
+TATGAC 100.464011 2.479319 2.480019 1.411376 7.657366 1170.115866
+TATGAG 103.125990 2.440814 2.389473 1.334792 7.657366 585.809418
+TATGAT 102.927741 2.161481 2.554734 1.475635 7.657366 913.223216
+TATGCA 102.787603 2.528013 2.414570 1.355876 7.657366 1213.875063
+TATGCC 102.639312 2.283258 2.347905 1.300113 7.657366 1366.899497
+TATGCG 103.057560 2.407534 2.378791 1.325851 7.657366 1254.081717
+TATGCT 104.592726 2.747391 2.555420 1.476229 7.657366 1084.123530
+TATGGA 104.268529 2.473772 2.495515 1.424625 7.657366 1004.808245
+TATGGC 102.492392 2.358533 2.333770 1.288390 7.657366 1595.915377
+TATGGG 103.698757 2.416586 2.385466 1.331435 7.657366 836.733815
+TATGGT 104.663910 2.678775 2.646960 1.556258 7.657366 986.363117
+TATGTA 105.111909 2.448406 2.801058 1.694117 7.657366 558.106188
+TATGTC 105.729462 2.977870 3.068911 1.942838 7.657366 753.609878
+TATGTG 105.028729 2.658848 2.865977 1.753353 7.657366 648.191241
+TATGTT 108.546197 2.858692 3.147191 2.017646 7.657366 1053.283375
+TATTAA 103.014646 2.340173 1.925697 0.965700 7.657366 1520.753842
+TATTAC 99.397907 1.992747 1.810587 0.880419 7.657366 1420.182170
+TATTAG 102.209779 1.872992 1.833060 0.896861 7.657366 504.941890
+TATTAT 100.860725 1.891729 1.923748 0.964234 7.657366 1255.040961
+TATTCA 97.423998 2.111422 2.076305 1.081177 7.657366 2165.201661
+TATTCC 95.759441 1.872498 2.010551 1.030227 7.657366 1474.268977
+TATTCG 97.419582 1.960503 2.090452 1.092247 7.657366 1168.822206
+TATTCT 97.264287 2.014494 2.053800 1.063647 7.657366 1294.103711
+TATTGA 100.933460 2.106550 1.900031 0.946458 7.657366 2015.491914
+TATTGC 97.318426 1.896281 1.818415 0.886135 7.657366 2263.904800
+TATTGG 100.037783 1.985185 1.924624 0.964893 7.657366 1240.780369
+TATTGT 98.820396 1.904838 1.914016 0.956927 7.657366 1214.087565
+TATTTA 93.672438 1.757052 1.808801 0.879116 7.657366 1604.519232
+TATTTC 91.570518 1.688504 1.705524 0.804909 7.657366 1515.237781
+TATTTG 93.353641 1.624727 1.751315 0.837542 7.657366 1319.629552
+TATTTT 93.368194 1.836258 1.848952 0.908550 7.657366 2695.989630
+TCAAAA 88.977006 2.039956 1.973446 1.001840 7.657366 1872.699006
+TCAAAC 86.106341 2.190712 2.192133 1.172899 7.657366 1581.717668
+TCAAAG 88.107460 2.335994 2.087882 1.090233 7.657366 1430.595797
+TCAAAT 86.842912 2.183513 2.263567 1.230695 7.657366 1213.350044
+TCAACA 79.842998 1.975552 1.919826 0.961287 7.657366 2258.568201
+TCAACC 78.621360 1.871640 1.839385 0.901507 7.657366 1800.093070
+TCAACG 79.702009 2.053196 1.892815 0.941071 7.657366 2287.971101
+TCAACT 79.249263 1.842367 1.809397 0.879551 7.657366 1265.850317
+TCAAGA 87.349539 2.075885 2.006216 1.026897 7.657366 717.772439
+TCAAGC 85.022512 2.102532 2.123276 1.118072 7.657366 876.979347
+TCAAGG 86.789162 4.963277 2.014449 1.033225 7.657366 1328.330859
+TCAAGT 85.754647 2.498108 2.253006 1.222092 7.657366 466.845833
+TCAATA 78.396397 1.973218 1.905905 0.950850 7.657366 1904.183114
+TCAATC 77.066750 2.029630 2.234460 1.207033 7.657366 1446.935925
+TCAATG 77.841529 1.878949 1.969142 0.998564 7.657366 1817.787947
+TCAATT 77.733519 1.998741 2.315406 1.273213 7.657366 1160.144080
+TCACAA 90.893800 1.328782 1.405297 0.602022 7.657366 1078.868990
+TCACAC 88.538802 1.292498 1.335643 0.557822 7.657366 715.744575
+TCACAG 90.190631 1.405536 1.360872 0.573702 7.657366 782.280790
+TCACAT 89.484135 1.363124 1.339168 0.560032 7.657366 968.932681
+TCACCA 87.846552 1.395636 1.444858 0.627622 7.657366 3051.090681
+TCACCC 86.725804 1.421896 1.484744 0.653789 7.657366 1645.213666
+TCACCG 87.767169 1.380016 1.429573 0.617689 7.657366 3080.323668
+TCACCT 87.407800 1.431970 1.416469 0.609215 7.657366 1505.167085
+TCACGA 90.139961 1.465939 1.318961 0.547404 7.657366 1203.583711
+TCACGC 87.873949 1.530860 1.428039 0.616695 7.657366 2576.941860
+TCACGG 89.213652 1.407725 1.399475 0.598285 7.657366 1397.312298
+TCACGT 88.345963 1.383738 1.336988 0.558665 7.657366 903.014094
+TCACTA 86.425696 1.508626 1.561919 0.705421 7.657366 828.302039
+TCACTC 84.438155 1.488899 1.458600 0.636597 7.657366 945.721856
+TCACTG 85.697086 1.584846 1.477319 0.648891 7.657366 1773.984470
+TCACTT 85.085530 1.639129 1.521733 0.678372 7.657366 1335.887688
+TCAGAA 84.923551 1.968882 1.993556 1.017192 7.657366 1607.078825
+TCAGAC 79.664449 1.759157 2.181012 1.163985 7.657366 1218.614103
+TCAGAG 83.742894 2.109482 2.014308 1.033116 7.657366 755.432101
+TCAGAT 81.327731 1.794249 2.167909 1.153511 7.657366 1406.302516
+TCAGCA 79.575531 1.706779 1.708070 0.806712 7.657366 3758.617149
+TCAGCC 78.107976 1.818714 1.729656 0.822054 7.657366 2329.786279
+TCAGCG 79.305745 1.817096 1.739760 0.829267 7.657366 3565.642849
+TCAGCT 79.169423 2.196294 1.936497 0.973836 7.657366 1436.284535
+TCAGGA 84.510223 1.939007 1.941447 0.977572 7.657366 2064.676192
+TCAGGC 80.981014 1.758350 1.999121 1.021454 7.657366 2885.352290
+TCAGGG 83.365389 1.932112 1.959256 0.991053 7.657366 1559.984991
+TCAGGT 82.368781 1.912518 2.060554 1.068898 7.657366 2007.622690
+TCAGTA 76.792639 1.915895 2.092389 1.093765 7.657366 1202.892270
+TCAGTC 74.031470 1.904308 2.207645 1.185371 7.657366 956.406781
+TCAGTG 75.724352 1.812859 2.106322 1.104708 7.657366 1354.837376
+TCAGTT 75.413354 2.052320 2.369757 1.318305 7.657366 1803.117325
+TCATAA 85.019421 1.872602 1.839726 0.901758 7.657366 1394.121193
+TCATAC 81.859034 2.000872 2.017176 1.035323 7.657366 900.658324
+TCATAG 84.317843 1.982450 1.850472 0.909671 7.657366 670.514316
+TCATAT 83.041056 1.735241 1.911132 0.954765 7.657366 1067.889362
+TCATCA 78.362203 2.063848 2.320705 1.277586 7.657366 3588.530931
+TCATCC 76.872827 2.072503 2.349347 1.301311 7.657366 1962.829418
+TCATCG 78.347483 2.176862 2.320946 1.277785 7.657366 2632.959221
+TCATCT 77.789762 1.955908 2.352731 1.304123 7.657366 1619.545035
+TCATGA 82.886095 1.972318 1.920283 0.961630 7.657366 1392.983001
+TCATGC 79.422510 2.108129 2.169490 1.154774 7.657366 1700.118008
+TCATGG 81.796356 2.053938 1.984980 1.010635 7.657366 1434.619462
+TCATGT 80.190256 1.965514 2.065934 1.073087 7.657366 916.773876
+TCATTA 73.046789 2.295099 2.432523 1.371026 7.657366 1825.810470
+TCATTC 70.097900 2.065407 2.410996 1.352867 7.657366 1193.910891
+TCATTG 72.238410 2.412882 2.515801 1.442031 7.657366 1628.031022
+TCATTT 71.535354 2.079515 2.344194 1.297032 7.657366 1771.087711
+TCCAAA 99.387429 1.444697 1.500888 0.664481 7.657366 426.700959
+TCCAAC 97.496814 1.656032 1.588862 0.723752 7.657366 697.903157
+TCCAAG 98.990152 1.991355 1.582671 0.719526 7.657366 126.780730
+TCCAAT 98.143502 1.393245 1.593163 0.726693 7.657366 339.601059
+TCCACA 91.755431 1.696952 1.314152 0.544413 7.657366 1113.882075
+TCCACC 90.977630 1.648287 1.406743 0.602951 7.657366 1541.007490
+TCCACG 91.777845 1.713883 1.364205 0.575811 7.657366 1287.341240
+TCCACT 91.870549 1.609228 1.419169 0.610958 7.657366 1021.319775
+TCCAGA 97.352072 1.523759 1.539986 0.690614 7.657366 2048.150526
+TCCAGC 95.405223 1.510740 1.569778 0.710751 7.657366 4164.289697
+TCCAGG 96.187360 1.577372 1.567898 0.709475 7.657366 1504.570609
+TCCAGT 96.153513 1.484066 1.627435 0.750267 7.657366 1928.026591
+TCCATA 90.908531 1.629164 1.603615 0.733855 7.657366 1017.635599
+TCCATC 89.367763 1.531466 1.719005 0.814471 7.657366 1643.338790
+TCCATG 90.282288 1.653905 1.624082 0.747950 7.657366 983.810422
+TCCATT 90.351077 1.613435 1.829962 0.894588 7.657366 1209.601856
+TCCCAA 98.212213 1.698504 1.315823 0.545452 7.657366 523.013272
+TCCCAC 96.073453 1.893668 1.291360 0.530312 7.657366 809.442638
+TCCCAG 97.472252 1.664221 1.284563 0.526130 7.657366 1199.783364
+TCCCAT 97.232150 1.590814 1.368260 0.578380 7.657366 652.829802
+TCCCCA 95.496686 1.894205 1.299780 0.535507 7.657366 1111.368547
+TCCCCC 94.263184 1.808198 1.271213 0.517950 7.657366 572.878075
+TCCCCG 95.255551 1.887942 1.292370 0.530934 7.657366 1007.864296
+TCCCCT 94.501058 1.677423 1.238087 0.497837 7.657366 687.723875
+TCCCGA 97.389680 1.675399 1.293221 0.531458 7.657366 887.254082
+TCCCGC 95.359029 1.709910 1.271670 0.518229 7.657366 1658.918348
+TCCCGG 96.422959 1.730912 1.259222 0.510639 7.657366 1441.073786
+TCCCGT 96.064224 1.438816 1.363229 0.575193 7.657366 741.682096
+TCCCTA 93.787596 1.544515 1.251140 0.505731 7.657366 300.655918
+TCCCTC 92.219110 1.653386 1.372277 0.580929 7.657366 429.017032
+TCCCTG 93.236047 2.031204 1.317683 0.546609 7.657366 1133.338130
+TCCCTT 93.053553 1.602330 1.460287 0.637701 7.657366 808.332595
+TCCGAA 94.810921 1.693131 1.570415 0.711184 7.657366 686.393570
+TCCGAC 90.390136 1.547565 1.758742 0.842876 7.657366 639.700168
+TCCGAG 93.447824 1.729283 1.538149 0.689379 7.657366 352.097721
+TCCGAT 92.176801 1.606433 1.741292 0.830362 7.657366 742.716206
+TCCGCA 90.120799 1.641528 1.430175 0.618079 7.657366 1762.284948
+TCCGCC 89.001957 1.538544 1.503823 0.666431 7.657366 2155.928435
+TCCGCG 90.230360 1.584621 1.466569 0.641821 7.657366 1583.482837
+TCCGCT 90.161053 1.544758 1.519971 0.677195 7.657366 1682.293778
+TCCGGA 94.480911 1.617340 1.571332 0.711807 7.657366 1175.594597
+TCCGGC 91.634509 1.532536 1.595839 0.728524 7.657366 3331.377730
+TCCGGG 93.498380 1.532856 1.504943 0.667176 7.657366 1705.086491
+TCCGGT 92.964720 1.560889 1.661359 0.773848 7.657366 1881.650939
+TCCGTA 88.850954 1.732359 1.770980 0.851689 7.657366 797.412482
+TCCGTC 86.727319 1.794007 1.978606 1.005772 7.657366 977.297884
+TCCGTG 88.076195 1.742095 1.791614 0.866616 7.657366 861.858807
+TCCGTT 88.251991 1.671077 1.948521 0.982919 7.657366 1148.572669
+TCCTAA 93.711555 2.149972 1.619957 0.745102 7.657366 227.925993
+TCCTAC 90.736213 1.656363 1.654108 0.768787 7.657366 265.804495
+TCCTAG 92.158205 1.369622 1.366647 0.577357 7.657366 21.450318
+TCCTAT 91.835737 1.806862 1.519667 0.676991 7.657366 192.693029
+TCCTCA 86.755550 1.986449 1.946001 0.981013 7.657366 813.892343
+TCCTCC 85.556738 2.094879 1.969781 0.999050 7.657366 542.173438
+TCCTCG 86.597076 1.913328 1.871954 0.925557 7.657366 816.111195
+TCCTCT 86.721719 2.108207 1.993910 1.017463 7.657366 613.419042
+TCCTGA 90.914159 1.774208 1.522626 0.678969 7.657366 2229.329854
+TCCTGC 87.860511 1.535605 1.665494 0.776739 7.657366 2271.672109
+TCCTGG 89.744925 1.724274 1.607130 0.736270 7.657366 1632.243389
+TCCTGT 88.758918 1.700268 1.658508 0.771857 7.657366 1167.293819
+TCCTTA 82.440080 2.092739 2.052325 1.062501 7.657366 716.701384
+TCCTTC 79.485759 1.778348 2.024090 1.040651 7.657366 779.084857
+TCCTTG 81.663808 1.824034 1.894336 0.942206 7.657366 664.765225
+TCCTTT 80.677683 1.731346 1.812842 0.882064 7.657366 937.486184
+TCGAAA 81.693397 1.979676 2.003022 1.024445 7.657366 1381.426279
+TCGAAC 78.970099 1.816871 2.217737 1.193508 7.657366 1078.271123
+TCGAAG 80.641033 1.987554 2.057980 1.066896 7.657366 1128.243455
+TCGAAT 79.623649 1.829690 2.245063 1.215635 7.657366 664.262833
+TCGACA 72.001630 1.932306 2.139571 1.130968 7.657366 1581.627707
+TCGACC 70.897291 1.695767 2.093165 1.094373 7.657366 1095.927532
+TCGACG 71.792582 2.026364 2.114278 1.110973 7.657366 1395.797944
+TCGACT 71.607927 1.758808 1.933686 0.971715 7.657366 799.890173
+TCGAGA 80.383961 1.916882 2.016334 1.034675 7.657366 741.548538
+TCGAGC 78.235524 1.818040 2.055915 1.065290 7.657366 834.018014
+TCGAGG 79.194648 2.053507 1.990134 1.014574 7.657366 683.955393
+TCGAGT 78.712045 1.966811 2.096325 1.096853 7.657366 501.076975
+TCGATA 70.653353 1.613147 1.852899 0.911461 7.657366 2130.173522
+TCGATC 69.063594 1.843097 2.095272 1.096026 7.657366 1272.101600
+TCGATG 69.691778 1.575412 1.866058 0.921188 7.657366 1795.218656
+TCGATT 70.185677 1.754195 2.167488 1.153176 7.657366 1291.250164
+TCGCAA 85.886922 1.426413 1.524211 0.680030 7.657366 1393.376297
+TCGCAC 83.612402 1.434514 1.573186 0.713067 7.657366 1030.807023
+TCGCAG 85.244794 1.565542 1.585401 0.721388 7.657366 1388.060065
+TCGCAT 84.570305 1.377864 1.508331 0.669430 7.657366 1190.904497
+TCGCCA 82.390022 1.554528 1.772875 0.853056 7.657366 4533.058077
+TCGCCC 81.298867 1.494702 1.736450 0.826901 7.657366 2154.825116
+TCGCCG 82.298897 1.529819 1.710413 0.808373 7.657366 3361.069449
+TCGCCT 82.108933 1.608389 1.739469 0.829058 7.657366 1987.852576
+TCGCGA 84.374920 1.539164 1.628322 0.750880 7.657366 1864.346758
+TCGCGC 81.525712 1.635823 1.760116 0.843864 7.657366 2948.075827
+TCGCGG 83.350274 1.603110 1.721754 0.816426 7.657366 2059.250929
+TCGCGT 82.182516 1.672353 1.783539 0.860764 7.657366 1472.239613
+TCGCTA 80.759222 1.625378 1.776192 0.855451 7.657366 1273.553850
+TCGCTC 78.910564 1.638034 1.802260 0.874352 7.657366 1202.929834
+TCGCTG 80.142651 1.641344 1.776829 0.855911 7.657366 2972.181823
+TCGCTT 79.527846 1.437960 1.534652 0.687029 7.657366 2028.030958
+TCGGAA 81.423764 2.018140 2.030801 1.045831 7.657366 778.506837
+TCGGAC 76.264630 1.849531 2.251034 1.220488 7.657366 489.553814
+TCGGAG 80.196692 1.864961 2.131761 1.124781 7.657366 353.569743
+TCGGAT 77.967456 2.038324 2.315249 1.273083 7.657366 763.490195
+TCGGCA 76.448951 1.746726 1.821219 0.888185 7.657366 2946.462204
+TCGGCC 75.210289 1.756505 1.776471 0.855653 7.657366 997.862456
+TCGGCG 76.212617 1.701939 1.774500 0.854229 7.657366 2784.431646
+TCGGCT 75.893850 1.842810 1.820626 0.887752 7.657366 1238.120246
+TCGGGA 81.877669 1.767782 1.959180 0.990996 7.657366 827.122785
+TCGGGC 79.120561 1.889068 2.125625 1.119928 7.657366 1081.932095
+TCGGGG 80.926371 1.778511 2.031123 1.046079 7.657366 542.943678
+TCGGGT 80.130402 1.956281 2.206375 1.184348 7.657366 898.008807
+TCGGTA 74.703414 1.824083 2.169652 1.154902 7.657366 1557.198325
+TCGGTC 72.398711 1.974375 2.319869 1.276896 7.657366 1125.847654
+TCGGTG 73.727787 1.989484 2.215510 1.191710 7.657366 1837.947438
+TCGGTT 73.569576 2.130692 2.395702 1.340015 7.657366 1416.579139
+TCGTAA 78.100580 1.885671 2.141869 1.132791 7.657366 1020.898244
+TCGTAC 74.436619 1.848281 2.158980 1.146392 7.657366 701.371224
+TCGTAG 77.035260 1.853659 2.083518 1.086816 7.657366 629.178998
+TCGTAT 75.719472 1.899084 2.236051 1.208322 7.657366 646.884670
+TCGTCA 70.979483 1.997417 2.593355 1.509222 7.657366 1690.156319
+TCGTCC 69.630573 1.972324 2.666588 1.573600 7.657366 929.074427
+TCGTCG 70.925838 2.128531 2.523217 1.448412 7.657366 1545.740621
+TCGTCT 70.415011 1.970817 2.561296 1.481323 7.657366 993.011428
+TCGTGA 74.668094 1.953652 2.093310 1.094487 7.657366 1387.892835
+TCGTGC 70.976764 1.912434 2.386844 1.332589 7.657366 1123.693845
+TCGTGG 73.762575 2.255402 2.329673 1.284999 7.657366 1167.244855
+TCGTGT 72.204654 2.174717 2.570750 1.489532 7.657366 479.140359
+TCGTTA 65.728510 2.204501 2.650854 1.559693 7.657366 1425.192451
+TCGTTC 62.480085 1.843194 2.556121 1.476837 7.657366 999.742374
+TCGTTG 64.903196 2.185815 2.603165 1.517794 7.657366 1459.844741
+TCGTTT 63.890329 1.891409 2.442286 1.379289 7.657366 1339.692588
+TCTAAA 106.168238 2.090928 2.188545 1.170021 7.657366 481.526739
+TCTAAC 103.807003 2.089663 2.190465 1.171560 7.657366 431.513156
+TCTAAG 105.085711 1.763959 2.143323 1.133944 7.657366 198.966130
+TCTAAT 105.114572 2.337979 2.497661 1.426463 7.657366 304.591211
+TCTACA 102.024918 1.913293 2.010447 1.030147 7.657366 663.310970
+TCTACC 101.583972 2.033294 2.181031 1.164000 7.657366 913.029066
+TCTACG 101.530544 2.008909 2.082144 1.085742 7.657366 695.707292
+TCTACT 102.970243 1.902409 2.120033 1.115512 7.657366 509.073349
+TCTAGA 103.262156 2.082103 2.342963 1.296010 7.657366 43.407141
+TCTAGC 102.268945 2.143601 2.283971 1.247373 7.657366 53.639570
+TCTAGG 103.001844 1.591897 2.222017 1.196964 7.657366 9.971345
+TCTAGT 103.176441 2.987841 2.334586 1.289066 7.657366 68.962710
+TCTATA 101.894151 2.246749 2.460917 1.395102 7.657366 471.073401
+TCTATC 102.649358 2.548660 2.551962 1.473233 7.657366 782.602734
+TCTATG 101.469029 2.555791 2.514581 1.440983 7.657366 577.447988
+TCTATT 104.313337 2.452969 2.688061 1.592645 7.657366 783.051158
+TCTCAA 111.703067 2.179832 2.280125 1.244223 7.657366 989.557991
+TCTCAC 109.307847 2.134701 2.231586 1.204705 7.657366 727.292295
+TCTCAG 111.214564 2.045943 2.200742 1.179816 7.657366 560.693080
+TCTCAT 110.483300 2.106049 2.287955 1.250637 7.657366 872.701398
+TCTCCA 110.533317 2.053780 2.198758 1.178220 7.657366 1056.136092
+TCTCCC 109.145909 2.164469 2.087889 1.090238 7.657366 807.223076
+TCTCCG 110.761670 2.061632 2.138752 1.130319 7.657366 1131.997071
+TCTCCT 110.009184 1.918228 2.118849 1.114578 7.657366 770.283298
+TCTCGA 111.383713 2.113926 2.286215 1.249211 7.657366 802.907624
+TCTCGC 108.750981 2.072939 2.181139 1.164087 7.657366 1181.689373
+TCTCGG 110.592417 2.161883 2.278113 1.242577 7.657366 754.279226
+TCTCGT 110.405878 2.376751 2.428471 1.367602 7.657366 665.475716
+TCTCTA 110.467052 2.098709 2.264787 1.231690 7.657366 506.397347
+TCTCTC 109.302264 2.352201 2.426851 1.366234 7.657366 715.098042
+TCTCTG 110.528923 2.981736 2.321617 1.278339 7.657366 1400.471440
+TCTCTT 110.337019 2.261889 2.596313 1.511805 7.657366 1530.602855
+TCTGAA 102.089945 1.992110 2.165635 1.151697 7.657366 1648.133058
+TCTGAC 98.434733 1.997213 2.124345 1.118917 7.657366 1427.064101
+TCTGAG 101.096975 1.872956 2.160121 1.147301 7.657366 445.080207
+TCTGAT 100.495767 2.034902 2.284824 1.248071 7.657366 1655.973528
+TCTGCA 100.272129 2.036448 2.067524 1.074326 7.657366 1823.710469
+TCTGCC 99.753404 2.008076 2.145643 1.135786 7.657366 2346.174803
+TCTGCG 100.087923 2.139137 2.126454 1.120584 7.657366 2329.225886
+TCTGCT 101.276743 2.124286 2.218206 1.193887 7.657366 2059.033480
+TCTGGA 102.264552 2.172309 2.242419 1.213488 7.657366 2031.281224
+TCTGGC 99.986279 2.070616 2.119637 1.115199 7.657366 3865.016852
+TCTGGG 101.471025 2.019979 2.165390 1.151502 7.657366 1214.468649
+TCTGGT 101.856598 2.199698 2.308726 1.267707 7.657366 2571.150090
+TCTGTA 101.201071 2.227185 2.432316 1.370851 7.657366 718.046450
+TCTGTC 101.289800 2.556796 2.472111 1.404631 7.657366 858.027310
+TCTGTG 100.792251 2.333312 2.505269 1.432986 7.657366 784.625294
+TCTGTT 103.080628 2.579908 2.708576 1.610912 7.657366 1652.132107
+TCTTAA 100.686641 1.681054 1.746642 0.834193 7.657366 765.150199
+TCTTAC 96.963896 1.787956 1.699593 0.800714 7.657366 708.437933
+TCTTAG 99.795883 1.787392 1.767800 0.849396 7.657366 221.334267
+TCTTAT 98.185340 1.778091 1.818991 0.886556 7.657366 867.867003
+TCTTCA 95.234441 1.667608 1.794359 0.868609 7.657366 2502.050512
+TCTTCC 93.541002 1.566357 1.739305 0.828941 7.657366 2149.172783
+TCTTCG 95.129135 1.844237 1.832454 0.896416 7.657366 1916.930703
+TCTTCT 94.945641 1.635329 1.806947 0.877766 7.657366 1718.362866
+TCTTGA 98.888483 1.839411 1.637194 0.757025 7.657366 831.497090
+TCTTGC 94.803112 1.731386 1.668164 0.778607 7.657366 1216.522568
+TCTTGG 97.560894 1.943857 1.728910 0.821521 7.657366 369.273822
+TCTTGT 96.375230 1.695077 1.730872 0.822920 7.657366 655.481823
+TCTTTA 91.784966 1.578254 1.656873 0.770716 7.657366 1966.662422
+TCTTTC 89.461070 1.801220 1.643098 0.761124 7.657366 1990.916772
+TCTTTG 91.154760 1.628964 1.636799 0.756752 7.657366 1672.819977
+TCTTTT 91.487383 1.651689 1.717306 0.813264 7.657366 1942.899829
+TGAAAA 76.034262 1.740922 1.631453 0.753047 7.657366 3438.131808
+TGAAAC 73.886620 1.630017 1.740742 0.829969 7.657366 1954.108179
+TGAAAG 74.907092 1.679227 1.661700 0.774086 7.657366 2326.705691
+TGAAAT 74.351450 1.621211 1.739322 0.828953 7.657366 2241.552296
+TGAACA 64.945231 2.128885 2.263806 1.230890 7.657366 2002.858076
+TGAACC 64.292639 1.940777 2.259543 1.227415 7.657366 1234.251878
+TGAACG 65.168652 2.245739 2.274345 1.239495 7.657366 1943.522075
+TGAACT 65.134716 2.074549 2.143974 1.134461 7.657366 1674.775183
+TGAAGA 73.304531 1.653293 1.684370 0.789981 7.657366 2328.081180
+TGAAGC 72.063776 1.434429 1.575652 0.714744 7.657366 2031.076901
+TGAAGG 72.642544 1.744907 1.755442 0.840505 7.657366 1633.519944
+TGAAGT 72.032442 1.437507 1.595385 0.728213 7.657366 1652.623809
+TGAATA 64.514103 2.246976 2.259218 1.227150 7.657366 2006.800605
+TGAATC 62.730862 1.757061 2.033198 1.047683 7.657366 1336.768974
+TGAATG 63.925173 2.147708 2.227877 1.201703 7.657366 1558.154941
+TGAATT 63.460528 1.670152 1.764646 0.847123 7.657366 1694.953071
+TGACAA 80.504836 1.928380 1.756490 0.841257 7.657366 1278.479336
+TGACAC 79.119789 1.810487 1.638436 0.757887 7.657366 778.804270
+TGACAG 79.722390 1.752590 1.705071 0.804589 7.657366 1122.885689
+TGACAT 79.903039 1.917053 1.711685 0.809275 7.657366 1023.717053
+TGACCA 77.101627 1.631540 1.766689 0.848595 7.657366 2301.783533
+TGACCC 76.181255 1.599168 1.727385 0.820434 7.657366 781.938140
+TGACCG 77.098345 1.651322 1.734478 0.825493 7.657366 2220.719664
+TGACCT 76.915397 1.863911 1.718141 0.813858 7.657366 1248.871574
+TGACGA 79.599992 1.704154 1.665012 0.776401 7.657366 1850.725995
+TGACGC 78.117264 1.596539 1.607993 0.736862 7.657366 2735.279851
+TGACGG 78.833820 1.625497 1.695185 0.797602 7.657366 1900.504788
+TGACGT 78.573321 1.734007 1.653971 0.768692 7.657366 1458.601757
+TGACTA 76.606627 1.663992 1.666895 0.777719 7.657366 605.322025
+TGACTC 75.198129 1.663606 1.699597 0.800717 7.657366 687.257956
+TGACTG 76.019156 1.648314 1.623850 0.747789 7.657366 1192.972832
+TGACTT 75.747676 1.516074 1.483114 0.652713 7.657366 975.923955
+TGAGAA 76.218432 1.706226 1.507402 0.668812 7.657366 1012.426765
+TGAGAC 72.338055 1.447474 1.735029 0.825886 7.657366 351.263043
+TGAGAG 75.367147 1.561322 1.527642 0.682327 7.657366 481.036440
+TGAGAT 73.565422 1.432440 1.675707 0.783894 7.657366 1009.361005
+TGAGCA 69.720697 1.671290 1.868701 0.923145 7.657366 1614.146418
+TGAGCC 68.502615 1.669852 1.910134 0.954017 7.657366 810.014757
+TGAGCG 69.629033 1.700299 1.837329 0.899996 7.657366 1866.160346
+TGAGCT 69.411382 1.615370 1.757798 0.842197 7.657366 946.767523
+TGAGGA 75.479158 1.455222 1.430593 0.618350 7.657366 767.247825
+TGAGGC 73.636421 1.303208 1.491093 0.657987 7.657366 638.120486
+TGAGGG 74.559821 1.532209 1.381727 0.586940 7.657366 454.477201
+TGAGGT 74.224304 1.386623 1.522371 0.678799 7.657366 633.336008
+TGAGTA 68.968134 1.364663 1.543299 0.692844 7.657366 941.297278
+TGAGTC 67.282570 1.442082 1.805554 0.876750 7.657366 590.732161
+TGAGTG 68.328177 1.480275 1.626542 0.749649 7.657366 822.884522
+TGAGTT 68.637093 1.444640 1.830570 0.895034 7.657366 1171.513573
+TGATAA 73.585596 1.713768 1.928017 0.967446 7.657366 2279.520159
+TGATAC 71.076972 1.584929 1.907409 0.951976 7.657366 1211.918167
+TGATAG 72.964789 1.595823 1.905107 0.950253 7.657366 949.246658
+TGATAT 72.221374 1.874355 1.985583 1.011096 7.657366 1948.744112
+TGATCA 66.701778 1.894224 2.145408 1.135599 7.657366 1928.444098
+TGATCC 65.615169 1.883353 2.272111 1.237669 7.657366 1749.202820
+TGATCG 66.848856 1.825745 2.123164 1.117984 7.657366 2131.904301
+TGATCT 66.638114 2.027472 2.197665 1.177341 7.657366 1351.753659
+TGATGA 71.022600 1.633549 1.815795 0.884220 7.657366 3542.534255
+TGATGC 68.379285 1.593187 1.908214 0.952579 7.657366 3164.972129
+TGATGG 70.069165 1.511176 1.783137 0.860473 7.657366 2959.025904
+TGATGT 69.195677 1.728797 1.979369 1.006354 7.657366 2173.691294
+TGATTA 62.530246 1.968661 2.252764 1.221895 7.657366 2114.194071
+TGATTC 59.828776 1.846215 2.301897 1.262087 7.657366 1293.925709
+TGATTG 61.893429 1.940820 2.187155 1.168906 7.657366 2497.650892
+TGATTT 61.193181 1.981448 2.226683 1.200737 7.657366 2896.089615
+TGCAAA 100.387351 1.910189 1.933365 0.971474 7.657366 2162.881662
+TGCAAC 97.653767 1.729443 1.936269 0.973663 7.657366 1905.742854
+TGCAAG 99.383034 1.922886 1.972549 1.001157 7.657366 914.351936
+TGCAAT 98.839114 1.886802 2.105611 1.104149 7.657366 1579.708557
+TGCACA 91.193560 1.718056 1.474843 0.647260 7.657366 1094.934975
+TGCACC 90.246514 1.705239 1.473813 0.646583 7.657366 1588.475416
+TGCACG 91.237184 1.604150 1.449453 0.630618 7.657366 1262.321766
+TGCACT 91.419793 1.568864 1.575122 0.714384 7.657366 1062.800091
+TGCAGA 97.216958 1.920830 1.942441 0.978323 7.657366 1736.665860
+TGCAGC 94.849428 1.717621 1.795129 0.869168 7.657366 2417.313519
+TGCAGG 96.295201 1.838139 1.879289 0.931002 7.657366 2411.448123
+TGCAGT 96.114064 1.861665 1.970674 0.999729 7.657366 1382.221538
+TGCATA 90.829568 1.600712 1.633033 0.754142 7.657366 1203.075885
+TGCATC 89.562252 1.647898 1.729622 0.822029 7.657366 1726.293773
+TGCATG 90.290034 1.637359 1.678740 0.786023 7.657366 1061.862360
+TGCATT 90.847472 1.790663 1.811597 0.881156 7.657366 1584.629661
+TGCCAA 100.587547 2.079155 1.566558 0.708565 7.657366 1794.113293
+TGCCAC 98.241983 1.628904 1.615922 0.742320 7.657366 1896.097963
+TGCCAG 99.914392 1.924215 1.604738 0.734626 7.657366 4324.222702
+TGCCAT 99.286007 1.866007 1.766124 0.848188 7.657366 2374.522393
+TGCCCA 96.524731 1.631529 1.433494 0.620232 7.657366 1578.157419
+TGCCCC 95.048672 1.649825 1.437809 0.623034 7.657366 1086.234708
+TGCCCG 96.579345 1.904997 1.497298 0.662099 7.657366 2224.658147
+TGCCCT 96.396217 1.730682 1.505930 0.667832 7.657366 1120.510551
+TGCCGA 99.327163 1.758531 1.681388 0.787884 7.657366 3152.642340
+TGCCGC 97.129756 1.659266 1.596437 0.728933 7.657366 4407.703465
+TGCCGG 98.420711 1.788298 1.618045 0.743783 7.657366 3276.909650
+TGCCGT 97.997881 1.724979 1.729323 0.821816 7.657366 2775.832595
+TGCCTA 96.133070 1.946215 1.611737 0.739438 7.657366 462.161826
+TGCCTC 93.995471 1.640626 1.589784 0.724382 7.657366 685.477654
+TGCCTG 95.499175 1.751591 1.610740 0.738752 7.657366 2781.531860
+TGCCTT 94.841513 1.832240 1.823054 0.889528 7.657366 1574.657918
+TGCGAA 95.898326 1.958523 1.818971 0.886541 7.657366 1360.511366
+TGCGAC 91.931233 1.700086 1.734003 0.825154 7.657366 1423.660705
+TGCGAG 94.716923 2.015089 1.877062 0.929348 7.657366 710.812735
+TGCGAT 93.573351 1.677951 1.877258 0.929493 7.657366 1704.022721
+TGCGCA 90.835618 1.589345 1.556743 0.701917 7.657366 2705.073928
+TGCGCC 89.760836 1.616935 1.558854 0.703345 7.657366 3749.899923
+TGCGCG 90.834083 1.523426 1.562691 0.705943 7.657366 3070.107214
+TGCGCT 90.895972 1.601708 1.647556 0.764224 7.657366 2756.265425
+TGCGGA 95.675594 1.974136 1.844088 0.904967 7.657366 1999.761714
+TGCGGC 92.861716 1.648444 1.714444 0.811232 7.657366 2910.215120
+TGCGGG 94.760044 1.859997 1.814701 0.883422 7.657366 1846.294041
+TGCGGT 94.362373 1.759441 1.869393 0.923658 7.657366 2141.483827
+TGCGTA 90.507964 1.806767 1.756823 0.841497 7.657366 1525.687810
+TGCGTC 89.129272 1.732501 1.794076 0.868403 7.657366 1653.343588
+TGCGTG 90.163375 1.607163 1.732157 0.823837 7.657366 1751.874542
+TGCGTT 90.834058 1.875481 1.940491 0.976849 7.657366 2182.994852
+TGCTAA 94.217931 2.007187 1.796695 0.870306 7.657366 887.675663
+TGCTAC 90.913722 1.702051 1.682459 0.788637 7.657366 795.821353
+TGCTAG 92.665903 2.187024 1.815372 0.883912 7.657366 99.464172
+TGCTAT 92.099410 1.936637 1.719134 0.814564 7.657366 814.026870
+TGCTCA 86.104506 1.659611 1.813358 0.882441 7.657366 1545.555053
+TGCTCC 84.609450 1.665635 1.774116 0.853952 7.657366 888.408401
+TGCTCG 86.326483 1.782881 1.790187 0.865582 7.657366 1523.228979
+TGCTCT 85.859778 1.919043 1.843266 0.904362 7.657366 1074.229875
+TGCTGA 91.400930 2.035549 1.677128 0.784891 7.657366 3769.824527
+TGCTGC 88.142662 1.612893 1.613346 0.740545 7.657366 4192.659543
+TGCTGG 90.112226 1.901727 1.676842 0.784691 7.657366 5113.483458
+TGCTGT 89.003916 1.676632 1.634220 0.754964 7.657366 2586.439304
+TGCTTA 82.085903 1.602860 1.568863 0.710130 7.657366 1071.629413
+TGCTTC 79.206171 1.599542 1.645357 0.762694 7.657366 1560.122755
+TGCTTG 81.304800 1.628871 1.526836 0.681787 7.657366 962.741680
+TGCTTT 80.528525 1.569198 1.490380 0.657515 7.657366 2158.599143
+TGGAAA 70.586572 1.701015 1.860446 0.917035 7.657366 2435.127386
+TGGAAC 68.171930 1.534334 2.097242 1.097572 7.657366 1373.067589
+TGGAAG 69.271378 1.787537 1.887403 0.937038 7.657366 2294.908773
+TGGAAT 68.655100 1.677886 2.065379 1.072655 7.657366 1207.596360
+TGGACA 58.783235 1.845560 2.472806 1.405223 7.657366 693.069824
+TGGACC 58.044044 1.987160 2.561053 1.481112 7.657366 488.566635
+TGGACG 58.842114 2.068225 2.429937 1.368841 7.657366 1129.397390
+TGGACT 58.923001 1.842892 2.306648 1.265996 7.657366 574.531109
+TGGAGA 68.527175 1.691806 1.889486 0.938590 7.657366 1070.660918
+TGGAGC 67.068026 1.758435 1.934290 0.972171 7.657366 1008.801752
+TGGAGG 67.338849 1.798639 1.902153 0.948044 7.657366 657.838694
+TGGAGT 67.135853 1.647766 1.970587 0.999663 7.657366 828.987058
+TGGATA 59.046023 1.928961 2.278586 1.242964 7.657366 1496.856428
+TGGATC 57.681317 1.700988 2.062706 1.070573 7.657366 1329.088646
+TGGATG 58.316432 1.868801 2.206619 1.184545 7.657366 1902.991409
+TGGATT 58.317057 1.432740 1.811281 0.880925 7.657366 1691.245382
+TGGCAA 80.892345 1.587152 1.639521 0.758640 7.657366 3052.124009
+TGGCAC 78.942231 1.397536 1.450491 0.631295 7.657366 1816.346584
+TGGCAG 79.885052 1.444109 1.548202 0.696148 7.657366 2994.455215
+TGGCAT 79.546453 1.402864 1.466538 0.641801 7.657366 2255.342327
+TGGCCA 77.025666 1.295076 1.635783 0.756047 7.657366 674.581833
+TGGCCC 75.854773 1.328606 1.547145 0.695435 7.657366 995.962403
+TGGCCG 76.979064 1.368544 1.526430 0.681515 7.657366 1451.676816
+TGGCCT 76.780510 1.370266 1.500510 0.664230 7.657366 1620.398797
+TGGCGA 79.499106 1.445290 1.534313 0.686802 7.657366 4589.593684
+TGGCGC 77.332090 1.324857 1.455718 0.634711 7.657366 4484.070278
+TGGCGG 78.536122 1.409741 1.515836 0.674433 7.657366 4588.124364
+TGGCGT 77.845111 1.427729 1.511409 0.671480 7.657366 3535.790426
+TGGCTA 76.060076 1.461560 1.543713 0.693123 7.657366 1184.374460
+TGGCTC 74.243014 1.477717 1.557069 0.702137 7.657366 1008.070391
+TGGCTG 75.509409 1.420900 1.522160 0.678658 7.657366 3768.207589
+TGGCTT 75.106217 1.416165 1.355767 0.570477 7.657366 2044.717861
+TGGGAA 73.937847 1.618208 1.793048 0.867657 7.657366 993.153817
+TGGGAC 69.409307 1.596116 2.036552 1.050276 7.657366 409.994102
+TGGGAG 72.822376 1.766557 1.897835 0.944817 7.657366 501.127025
+TGGGAT 70.784027 1.564846 2.074586 1.079835 7.657366 1044.028012
+TGGGCA 67.430029 1.814239 2.083617 1.086894 7.657366 1373.610180
+TGGGCC 66.250209 1.663166 2.053507 1.063420 7.657366 696.029291
+TGGGCG 67.473071 1.915452 2.127433 1.121358 7.657366 2224.712868
+TGGGCT 67.111588 1.570042 1.927092 0.966750 7.657366 959.449814
+TGGGGA 74.062506 1.431113 1.654369 0.768969 7.657366 1035.981144
+TGGGGC 71.935942 1.412716 1.770456 0.851310 7.657366 1240.936025
+TGGGGG 72.992586 1.487703 1.621358 0.746068 7.657366 604.024484
+TGGGGT 72.677268 1.526132 1.860275 0.916909 7.657366 696.002771
+TGGGTA 66.676902 1.492023 1.679888 0.786830 7.657366 1217.438037
+TGGGTC 64.968011 1.615098 1.967686 0.997456 7.657366 610.437038
+TGGGTG 66.142930 1.586598 1.851805 0.910654 7.657366 1251.611739
+TGGGTT 66.287716 1.657332 2.012598 1.031801 7.657366 1166.984829
+TGGTAA 71.315618 1.906175 1.926509 0.966311 7.657366 1591.974852
+TGGTAC 68.300288 1.868973 2.183697 1.166135 7.657366 1099.667456
+TGGTAG 70.624427 1.915576 1.982893 1.009042 7.657366 900.635343
+TGGTAT 69.496032 2.092410 2.192887 1.173504 7.657366 1552.190809
+TGGTCA 63.969489 2.018902 2.572314 1.490892 7.657366 2241.519035
+TGGTCC 62.549926 2.157408 2.502891 1.430946 7.657366 576.501792
+TGGTCG 64.083444 2.069109 2.554963 1.475833 7.657366 2038.593460
+TGGTCT 63.603393 2.214084 2.590076 1.506361 7.657366 1007.570989
+TGGTGA 68.098701 1.835368 2.083926 1.087136 7.657366 2779.367478
+TGGTGC 64.656386 1.841569 2.319691 1.276749 7.657366 2642.467100
+TGGTGG 67.078881 1.894086 2.163543 1.150029 7.657366 3113.416188
+TGGTGT 65.657062 1.969662 2.332604 1.287425 7.657366 1944.399278
+TGGTTA 59.366699 2.029285 2.516954 1.443023 7.657366 1773.246028
+TGGTTC 56.516298 1.843908 2.543850 1.466214 7.657366 1188.032951
+TGGTTG 58.817652 1.926593 2.489084 1.419121 7.657366 2038.463780
+TGGTTT 57.731314 1.857696 2.353622 1.304864 7.657366 2675.229123
+TGTAAA 102.877477 2.227141 2.505609 1.433278 7.657366 1126.379954
+TGTAAC 100.480513 2.334214 2.550731 1.472167 7.657366 864.066503
+TGTAAG 101.673151 2.221597 2.447684 1.383864 7.657366 507.683061
+TGTAAT 102.123895 2.372119 2.709217 1.611484 7.657366 1180.872446
+TGTACA 96.563002 2.134097 2.182648 1.165295 7.657366 736.459015
+TGTACC 96.155835 2.172762 2.374001 1.321849 7.657366 863.485178
+TGTACG 96.098333 2.063128 2.238444 1.210263 7.657366 912.176107
+TGTACT 97.612472 2.273557 2.468665 1.401695 7.657366 799.417956
+TGTAGA 99.903354 2.312238 2.386941 1.332670 7.657366 645.711215
+TGTAGC 98.499526 2.170642 2.352712 1.304107 7.657366 818.809282
+TGTAGG 99.410182 2.104379 2.278149 1.242606 7.657366 614.524396
+TGTAGT 100.098084 2.696882 2.646828 1.556140 7.657366 672.508764
+TGTATA 97.470358 2.510415 2.566098 1.485492 7.657366 620.603868
+TGTATC 98.633387 2.588735 2.863323 1.750918 7.657366 791.653240
+TGTATG 97.056305 2.570293 2.793156 1.686953 7.657366 885.213584
+TGTATT 100.447195 2.629866 2.962663 1.842823 7.657366 1072.425617
+TGTCAA 110.648973 2.366465 2.440338 1.377638 7.657366 687.071104
+TGTCAC 109.070351 2.358733 2.405824 1.348516 7.657366 981.638262
+TGTCAG 110.122969 2.182488 2.369279 1.317906 7.657366 1336.312901
+TGTCAT 109.934734 2.247078 2.530309 1.454523 7.657366 1313.957692
+TGTCCA 109.392328 2.359325 2.472485 1.404950 7.657366 844.252057
+TGTCCC 107.847719 2.302927 2.335237 1.289605 7.657366 426.953903
+TGTCCG 109.327352 2.414832 2.442516 1.379483 7.657366 948.084806
+TGTCCT 108.798124 2.432198 2.513891 1.440390 7.657366 480.759822
+TGTCGA 110.356683 2.517194 2.564982 1.484522 7.657366 1617.590772
+TGTCGC 107.916290 2.556850 2.570245 1.489094 7.657366 1837.670143
+TGTCGG 109.619869 2.657784 2.589747 1.506074 7.657366 1408.026753
+TGTCGT 109.503790 2.584931 2.698992 1.602370 7.657366 1066.676635
+TGTCTA 110.395388 3.627745 2.418483 1.359173 7.657366 444.244869
+TGTCTC 108.458759 2.456359 2.695669 1.599411 7.657366 524.881061
+TGTCTG 109.536064 2.659104 2.541311 1.464020 7.657366 1236.568051
+TGTCTT 109.666933 2.233173 2.622809 1.535007 7.657366 865.443379
+TGTGAA 99.646442 2.613108 2.457864 1.392506 7.657366 1172.621482
+TGTGAC 96.500517 2.465020 2.474779 1.406905 7.657366 779.097424
+TGTGAG 98.841511 2.677743 2.515691 1.441937 7.657366 392.540413
+TGTGAT 98.753650 2.533490 2.503068 1.431098 7.657366 1214.044178
+TGTGCA 98.071276 2.606161 2.296197 1.257402 7.657366 1182.589473
+TGTGCC 97.762397 2.472225 2.431210 1.369916 7.657366 1259.464336
+TGTGCG 98.108310 2.440988 2.394129 1.338695 7.657366 1358.708220
+TGTGCT 99.559308 2.611271 2.563012 1.482812 7.657366 1093.188569
+TGTGGA 100.100933 2.816418 2.499080 1.427679 7.657366 1137.754209
+TGTGGC 98.461928 2.642642 2.422493 1.362555 7.657366 1607.390253
+TGTGGG 99.477695 2.561990 2.476037 1.407979 7.657366 783.920437
+TGTGGT 100.356677 2.672960 2.653540 1.562064 7.657366 1329.241064
+TGTGTA 100.713293 2.278672 2.480017 1.411375 7.657366 521.161009
+TGTGTC 100.991885 3.248862 2.767336 1.663616 7.657366 624.638413
+TGTGTG 100.286665 2.914332 2.761993 1.658801 7.657366 667.530899
+TGTGTT 103.239579 2.938655 3.167654 2.037356 7.657366 889.797933
+TGTTAA 98.567551 2.233113 2.044368 1.056328 7.657366 1361.710758
+TGTTAC 94.918393 2.016102 1.909154 0.953283 7.657366 1047.505064
+TGTTAG 97.618004 2.226036 2.049417 1.060244 7.657366 639.224075
+TGTTAT 96.186514 1.911511 1.992932 1.016715 7.657366 1316.533713
+TGTTCA 92.233894 2.148211 2.128631 1.122305 7.657366 1939.809230
+TGTTCC 90.963044 2.114278 2.110475 1.107977 7.657366 1601.954493
+TGTTCG 92.379774 2.154613 2.103100 1.102174 7.657366 1636.865340
+TGTTCT 92.019219 2.082235 2.188425 1.169924 7.657366 1379.259613
+TGTTGA 96.056653 2.459756 2.003821 1.025059 7.657366 2208.416305
+TGTTGC 92.589989 1.944307 1.885295 0.935469 7.657366 2474.075044
+TGTTGG 95.145757 2.026766 1.960226 0.991789 7.657366 1460.575620
+TGTTGT 94.007832 2.033321 1.952445 0.985890 7.657366 1566.235523
+TGTTTA 88.831353 1.886066 1.893170 0.941336 7.657366 1817.626619
+TGTTTC 86.773451 1.691262 1.774881 0.854504 7.657366 1442.116344
+TGTTTG 88.429361 1.944864 1.827448 0.892746 7.657366 2039.480776
+TGTTTT 87.968469 1.902231 1.770673 0.851467 7.657366 2563.749639
+TTAAAA 89.857248 1.963449 1.859862 0.916603 7.657366 2189.197577
+TTAAAC 87.603428 1.970355 2.089196 1.091262 7.657366 1484.510936
+TTAAAG 89.015713 1.975209 1.898916 0.945625 7.657366 1782.887840
+TTAAAT 88.257385 1.976996 2.051610 1.061946 7.657366 1661.688356
+TTAACA 81.052991 2.119848 1.964171 0.994785 7.657366 1472.881292
+TTAACC 80.036784 2.031287 1.924190 0.964566 7.657366 1934.663699
+TTAACG 80.726396 2.018714 1.920924 0.962112 7.657366 2458.431519
+TTAACT 80.817519 2.012132 1.819893 0.887215 7.657366 1330.607479
+TTAAGA 88.187614 1.980693 1.901941 0.947886 7.657366 726.511106
+TTAAGC 86.700552 2.031369 2.015885 1.034329 7.657366 1428.476835
+TTAAGG 87.185532 1.985837 1.917571 0.959594 7.657366 835.432587
+TTAAGT 87.010701 2.309257 2.150285 1.139474 7.657366 917.956863
+TTAATA 79.940895 2.063397 1.942695 0.978515 7.657366 1410.297584
+TTAATC 78.572050 2.042478 2.287571 1.250323 7.657366 1800.287403
+TTAATG 79.289923 2.045150 2.067453 1.074270 7.657366 2182.612255
+TTAATT 79.466504 2.117578 2.361843 1.311707 7.657366 1776.290893
+TTACAA 91.069508 1.460580 1.438695 0.623610 7.657366 917.490170
+TTACAC 89.150667 1.323063 1.325478 0.551466 7.657366 936.584795
+TTACAG 90.380022 1.484434 1.428316 0.616874 7.657366 1339.733074
+TTACAT 89.876143 1.359828 1.340533 0.560888 7.657366 847.976448
+TTACCA 88.457845 1.474311 1.544849 0.693888 7.657366 1739.161781
+TTACCC 87.339972 1.551168 1.566577 0.708578 7.657366 1692.699000
+TTACCG 88.292519 1.507275 1.523206 0.679357 7.657366 3227.052476
+TTACCT 88.087655 1.453201 1.490295 0.657459 7.657366 1529.288306
+TTACGA 90.252231 1.373432 1.411554 0.606047 7.657366 934.474365
+TTACGC 88.319464 1.443067 1.441630 0.625520 7.657366 2589.452776
+TTACGG 89.384837 1.420304 1.435875 0.621778 7.657366 1433.971758
+TTACGT 88.705809 1.436070 1.440712 0.624922 7.657366 1214.497488
+TTACTA 87.418752 1.529340 1.544428 0.693604 7.657366 573.986671
+TTACTC 85.528291 1.414909 1.468495 0.643086 7.657366 940.148397
+TTACTG 86.683493 1.520500 1.493738 0.659739 7.657366 2238.834230
+TTACTT 86.369267 1.486386 1.485540 0.654315 7.657366 1102.266109
+TTAGAA 87.573457 1.836602 1.697783 0.799436 7.657366 422.443178
+TTAGAC 83.271399 1.801722 2.131310 1.124424 7.657366 248.814014
+TTAGAG 86.685232 1.753524 1.880547 0.931937 7.657366 291.792450
+TTAGAT 84.908565 1.839247 2.149040 1.138484 7.657366 377.393095
+TTAGCA 82.430620 2.133828 1.672616 0.781726 7.657366 852.783458
+TTAGCC 81.405219 2.037108 1.857251 0.914674 7.657366 1010.154764
+TTAGCG 82.405656 1.964150 1.849177 0.908715 7.657366 1410.961509
+TTAGCT 82.259688 1.962311 1.903390 0.948969 7.657366 606.393306
+TTAGGA 87.444471 1.796508 1.851528 0.910449 7.657366 187.788622
+TTAGGC 84.575991 1.891069 1.950068 0.984090 7.657366 405.591553
+TTAGGG 86.105697 1.561369 1.694683 0.797247 7.657366 311.048373
+TTAGGT 85.722957 1.686550 1.955748 0.988393 7.657366 428.907575
+TTAGTA 81.234559 1.740103 2.048468 1.059507 7.657366 357.759445
+TTAGTC 79.148045 2.223472 2.293599 1.255269 7.657366 468.895438
+TTAGTG 80.110638 2.038894 2.180402 1.163496 7.657366 692.800478
+TTAGTT 80.227856 2.048180 2.272985 1.238383 7.657366 559.904375
+TTATAA 86.228499 1.816726 1.858591 0.915664 7.657366 1032.938803
+TTATAC 83.548609 1.933809 2.011842 1.031219 7.657366 715.861841
+TTATAG 85.586818 1.982595 1.859147 0.916075 7.657366 674.349836
+TTATAT 84.760204 1.869776 1.976445 1.004124 7.657366 1101.033606
+TTATCA 80.995212 2.029685 2.408289 1.350589 7.657366 2463.328991
+TTATCC 80.091540 2.208105 2.472295 1.404787 7.657366 1981.323926
+TTATCG 81.074069 2.179479 2.389471 1.334790 7.657366 2826.388901
+TTATCT 80.675009 2.121255 2.419425 1.359968 7.657366 2019.565249
+TTATGA 84.433593 1.978009 2.068785 1.075309 7.657366 1357.161901
+TTATGC 81.407462 1.988983 2.147120 1.136959 7.657366 1774.117372
+TTATGG 83.329665 2.025690 2.053864 1.063697 7.657366 1703.597584
+TTATGT 82.129668 1.965899 2.137036 1.128958 7.657366 990.287312
+TTATTA 76.562346 2.317661 2.584379 1.501394 7.657366 1824.168268
+TTATTC 73.784251 2.303124 2.647578 1.556802 7.657366 1924.685149
+TTATTG 75.644054 2.310052 2.498608 1.427274 7.657366 2241.947462
+TTATTT 75.007530 2.316655 2.484809 1.415467 7.657366 2300.314204
+TTCAAA 99.064405 1.789124 1.669848 0.779787 7.657366 1745.327888
+TTCAAC 97.069044 1.607501 1.782874 0.860283 7.657366 2014.401272
+TTCAAG 98.195890 1.753073 1.654599 0.769130 7.657366 647.636787
+TTCAAT 97.885507 1.701297 1.867944 0.922584 7.657366 2061.323176
+TTCACA 92.625340 1.707399 1.593502 0.726924 7.657366 1169.668588
+TTCACC 91.698263 1.801378 1.724007 0.818029 7.657366 2867.386347
+TTCACG 92.364429 1.771353 1.614238 0.741160 7.657366 1989.353779
+TTCACT 92.287656 1.761713 1.728270 0.821065 7.657366 1574.364014
+TTCAGA 97.328956 1.813051 1.789079 0.864778 7.657366 1681.904577
+TTCAGC 95.531267 1.789545 1.841483 0.903050 7.657366 3617.981151
+TTCAGG 96.437279 1.711562 1.784127 0.861190 7.657366 2586.498866
+TTCAGT 96.291425 1.803700 1.925414 0.965487 7.657366 1656.875510
+TTCATA 91.250813 1.736876 1.854258 0.912463 7.657366 1363.770675
+TTCATC 89.974915 1.896032 2.005278 1.026177 7.657366 3002.993762
+TTCATG 90.646856 1.768706 1.877274 0.929505 7.657366 1574.228697
+TTCATT 90.939699 1.882916 2.065678 1.072887 7.657366 1902.030765
+TTCCAA 97.498349 1.581975 1.389592 0.591959 7.657366 418.513802
+TTCCAC 95.651708 1.619782 1.512185 0.671997 7.657366 1660.095595
+TTCCAG 97.078743 1.785010 1.404288 0.601374 7.657366 3677.466423
+TTCCAT 96.501455 1.566570 1.552642 0.699145 7.657366 1757.215334
+TTCCCA 94.613191 1.466607 1.354384 0.569604 7.657366 1078.304908
+TTCCCC 93.235063 1.408952 1.339824 0.560443 7.657366 1236.804407
+TTCCCG 94.367755 1.534342 1.330636 0.554689 7.657366 1943.572337
+TTCCCT 94.098019 1.579121 1.439299 0.624003 7.657366 861.140422
+TTCCGA 96.970795 1.952305 1.478060 0.649379 7.657366 866.079825
+TTCCGC 94.822678 1.782930 1.591472 0.725536 7.657366 2515.163690
+TTCCGG 96.094892 1.873042 1.499185 0.663351 7.657366 2895.696624
+TTCCGT 95.486527 1.680947 1.650914 0.766561 7.657366 1370.175782
+TTCCTA 93.939643 1.578837 1.550541 0.697727 7.657366 208.401573
+TTCCTC 92.049032 1.665661 1.690398 0.794225 7.657366 945.569807
+TTCCTG 93.291759 1.760115 1.593653 0.727027 7.657366 2696.494257
+TTCCTT 92.746087 1.692349 1.810225 0.880155 7.657366 1014.264684
+TTCGAA 96.064695 1.790225 1.772189 0.852561 7.657366 995.678513
+TTCGAC 92.263159 1.827176 2.033254 1.047726 7.657366 1115.307392
+TTCGAG 94.754472 1.811191 1.808478 0.878881 7.657366 614.065318
+TTCGAT 93.652695 1.747674 2.007829 1.028135 7.657366 1650.139916
+TTCGCA 91.720299 1.801086 1.686162 0.791242 7.657366 1320.761147
+TTCGCC 90.516637 1.785849 1.800577 0.873128 7.657366 3620.371548
+TTCGCG 91.563503 1.964889 1.780450 0.858529 7.657366 2379.124971
+TTCGCT 91.428300 1.718376 1.837892 0.900410 7.657366 2081.341137
+TTCGGA 95.866405 1.828561 1.812292 0.881663 7.657366 707.825887
+TTCGGC 93.125526 1.830574 1.906026 0.950941 7.657366 2083.679649
+TTCGGG 94.925293 1.729650 1.819667 0.887050 7.657366 991.694336
+TTCGGT 94.521103 1.881065 1.967011 0.996943 7.657366 1659.696978
+TTCGTA 90.455956 1.907018 1.908888 0.953083 7.657366 889.883051
+TTCGTC 88.598703 2.028445 2.152704 1.141397 7.657366 1578.428296
+TTCGTG 89.843955 1.880665 2.039824 1.052809 7.657366 1014.021599
+TTCGTT 89.927872 2.017723 2.246086 1.216466 7.657366 1547.610429
+TTCTAA 93.373803 1.673565 1.712370 0.809761 7.657366 418.760275
+TTCTAC 90.705791 1.839928 1.920139 0.961522 7.657366 972.670506
+TTCTAG 93.296608 1.935304 1.806201 0.877222 7.657366 56.033738
+TTCTAT 91.649391 1.768882 1.894885 0.942615 7.657366 661.480335
+TTCTCA 87.866163 2.138037 2.178146 1.161691 7.657366 1184.460703
+TTCTCC 86.474457 2.094220 2.272042 1.237613 7.657366 1254.942039
+TTCTCG 87.862923 1.961108 2.170373 1.155478 7.657366 1358.830087
+TTCTCT 87.512015 2.204280 2.254753 1.223513 7.657366 1391.306464
+TTCTGA 91.461535 1.774660 1.799608 0.872423 7.657366 1712.301937
+TTCTGC 88.511633 1.869118 1.967653 0.997431 7.657366 2699.973625
+TTCTGG 90.373516 1.948486 1.898248 0.945126 7.657366 2832.602240
+TTCTGT 89.253124 1.789762 1.959256 0.991053 7.657366 1271.088821
+TTCTTA 83.680663 2.012995 2.267779 1.234131 7.657366 808.938023
+TTCTTC 80.821485 2.156866 2.314604 1.272551 7.657366 2763.658448
+TTCTTG 83.091777 2.500140 2.313396 1.271556 7.657366 1096.548243
+TTCTTT 81.884651 2.059591 2.103887 1.102793 7.657366 2336.301104
+TTGAAA 84.756017 4.963277 2.054037 1.063831 7.657366 7014.432743
+TTGAAC 80.049730 2.328853 2.449813 1.385669 7.657366 1504.898386
+TTGAAG 81.713635 2.225479 2.250529 1.220077 7.657366 1817.689308
+TTGAAT 80.749710 2.109599 2.335659 1.289954 7.657366 1126.013872
+TTGACA 73.219131 2.122051 2.125809 1.120074 7.657366 652.597225
+TTGACC 72.326217 2.171258 2.102281 1.101531 7.657366 1254.635906
+TTGACG 72.913690 2.146087 2.156197 1.144176 7.657366 1548.758400
+TTGACT 72.909982 1.854706 1.890186 0.939111 7.657366 694.737846
+TTGAGA 81.970375 2.437294 2.187789 1.169415 7.657366 939.997173
+TTGAGC 80.071861 2.410640 2.378180 1.325340 7.657366 1584.156870
+TTGAGG 80.699517 2.367779 2.134575 1.127009 7.657366 890.913107
+TTGAGT 80.369202 2.104801 2.256042 1.224562 7.657366 856.047512
+TTGATA 71.975408 2.018916 2.034767 1.048896 7.657366 1898.797311
+TTGATC 71.191863 2.254100 2.468279 1.401366 7.657366 1658.336401
+TTGATG 71.400761 2.194058 2.198464 1.177984 7.657366 2812.933867
+TTGATT 71.777949 2.394555 2.463554 1.397344 7.657366 2037.921051
+TTGCAA 87.867448 1.511880 1.528001 0.682568 7.657366 1317.764770
+TTGCAC 85.812626 1.518590 1.461050 0.638202 7.657366 1200.041253
+TTGCAG 87.028525 1.532848 1.566986 0.708856 7.657366 2774.892602
+TTGCAT 86.424877 1.442913 1.451331 0.631844 7.657366 1280.392055
+TTGCCA 84.894480 1.649060 1.715181 0.811756 7.657366 3355.407338
+TTGCCC 83.814552 1.571110 1.698522 0.799958 7.657366 2161.213222
+TTGCCG 84.708261 1.634963 1.723389 0.817589 7.657366 4267.364059
+TTGCCT 84.477093 1.588630 1.640914 0.759607 7.657366 1716.066834
+TTGCGA 86.821088 1.531022 1.521715 0.678360 7.657366 1434.799150
+TTGCGC 84.623271 1.649216 1.581438 0.718685 7.657366 3249.401468
+TTGCGG 85.870718 1.571667 1.615585 0.742088 7.657366 2375.645389
+TTGCGT 85.152295 1.648433 1.600043 0.731405 7.657366 1554.298067
+TTGCTA 83.598658 1.592268 1.719142 0.814569 7.657366 961.106140
+TTGCTC 81.894261 1.577050 1.625144 0.748683 7.657366 1784.852233
+TTGCTG 83.085053 1.628740 1.692635 0.795802 7.657366 4545.729355
+TTGCTT 82.524841 1.900213 1.691080 0.794706 7.657366 1416.816388
+TTGGAA 84.876239 1.935752 2.088231 1.090506 7.657366 394.041188
+TTGGAC 80.757305 2.238019 2.350322 1.302121 7.657366 256.370743
+TTGGAG 83.903601 1.944675 2.038632 1.051886 7.657366 373.877185
+TTGGAT 82.031756 2.224003 2.447457 1.383671 7.657366 479.298072
+TTGGCA 79.708265 2.286970 1.875475 0.928169 7.657366 1732.461697
+TTGGCC 78.680145 2.732625 2.064263 1.071785 7.657366 922.310036
+TTGGCG 79.514988 2.233358 1.950420 0.984356 7.657366 2823.120473
+TTGGCT 79.554976 2.245393 2.087465 1.089906 7.657366 1151.266691
+TTGGGA 85.159014 1.801969 2.031379 1.046277 7.657366 503.057930
+TTGGGC 82.792448 2.137299 2.083124 1.086508 7.657366 789.593933
+TTGGGG 84.227414 1.993609 1.944870 0.980158 7.657366 582.255052
+TTGGGT 83.945011 2.385338 2.238303 1.210148 7.657366 817.915079
+TTGGTA 78.907517 2.240626 2.298398 1.259210 7.657366 1094.621511
+TTGGTC 77.213956 2.471317 2.584121 1.501169 7.657366 1274.231344
+TTGGTG 78.144163 2.358420 2.357448 1.308047 7.657366 1920.847347
+TTGGTT 78.440020 2.524975 2.616133 1.529150 7.657366 1238.232771
+TTGTAA 81.598889 1.909125 2.096338 1.096862 7.657366 945.469652
+TTGTAC 78.449790 1.996765 2.261639 1.229123 7.657366 826.837406
+TTGTAG 80.794649 1.835625 2.258901 1.226891 7.657366 831.578586
+TTGTAT 79.494190 2.060989 2.274466 1.239594 7.657366 719.182930
+TTGTCA 76.635676 2.440620 2.621213 1.533606 7.657366 1411.271851
+TTGTCC 75.206621 2.612111 2.710858 1.612948 7.657366 860.497775
+TTGTCG 76.385531 2.437808 2.714943 1.616596 7.657366 1932.619577
+TTGTCT 75.920761 2.193673 2.587948 1.504505 7.657366 853.573525
+TTGTGA 79.394071 2.247929 2.272827 1.238254 7.657366 1061.372408
+TTGTGC 76.005267 2.433894 2.559431 1.479706 7.657366 1523.801540
+TTGTGG 78.274274 2.287370 2.378928 1.325966 7.657366 1322.598804
+TTGTGT 76.726042 2.151031 2.503200 1.431211 7.657366 746.325123
+TTGTTA 72.092701 2.447314 2.782524 1.677331 7.657366 1408.043560
+TTGTTC 68.843833 2.570899 2.782646 1.677441 7.657366 1859.642412
+TTGTTG 71.037506 2.525343 2.741649 1.640507 7.657366 2563.465690
+TTGTTT 69.981087 2.543187 2.704925 1.607656 7.657366 1851.551854
+TTTAAA 101.566823 1.820704 1.706847 0.805846 7.657366 2020.790250
+TTTAAC 99.854844 1.786191 1.693412 0.796351 7.657366 1998.353441
+TTTAAG 100.660573 1.768121 1.656540 0.770483 7.657366 963.195643
+TTTAAT 101.083515 1.832170 1.858431 0.915546 7.657366 2281.723717
+TTTACA 97.031308 1.677961 1.537612 0.689017 7.657366 1266.952692
+TTTACC 96.706423 1.774928 1.582224 0.719221 7.657366 3015.063183
+TTTACG 96.824547 1.705017 1.524444 0.680186 7.657366 2034.955274
+TTTACT 97.784421 2.162126 1.952487 0.985922 7.657366 1909.749907
+TTTAGA 99.799985 1.998398 1.788800 0.864575 7.657366 422.763343
+TTTAGC 98.310329 1.711668 1.608470 0.737191 7.657366 1320.746317
+TTTAGG 98.950826 1.757734 1.646984 0.763826 7.657366 430.240100
+TTTAGT 99.567587 1.736728 1.773352 0.853400 7.657366 741.754676
+TTTATA 97.332878 1.983467 1.794490 0.868704 7.657366 1327.242826
+TTTATC 97.487352 1.906160 1.877205 0.929454 7.657366 3258.058740
+TTTATG 96.887699 1.984195 1.793040 0.867651 7.657366 2077.131351
+TTTATT 99.108167 2.107548 2.014730 1.033441 7.657366 3097.820185
+TTTCAA 105.570428 2.234273 1.814699 0.883420 7.657366 1896.512603
+TTTCAC 103.672397 2.252946 1.843448 0.904496 7.657366 2676.202183
+TTTCAG 104.844955 2.243491 1.829823 0.894487 7.657366 3304.319381
+TTTCAT 104.427343 2.242314 1.913154 0.956280 7.657366 2504.609122
+TTTCCA 103.864998 2.257077 1.737650 0.827758 7.657366 2721.425613
+TTTCCC 102.439406 2.005009 1.687005 0.791835 7.657366 1768.903958
+TTTCCG 103.668995 2.306021 1.790458 0.865778 7.657366 2322.789563
+TTTCCT 103.238929 2.162385 1.828037 0.893177 7.657366 1563.833754
+TTTCGA 104.679029 2.338456 1.940220 0.976645 7.657366 1517.343344
+TTTCGC 102.842674 2.293407 1.782202 0.859796 7.657366 3397.874087
+TTTCGG 104.244852 2.321910 1.871134 0.924949 7.657366 1824.027949
+TTTCGT 104.064418 2.639845 1.942032 0.978013 7.657366 1542.172565
+TTTCTA 103.956748 2.161903 1.869439 0.923692 7.657366 753.242630
+TTTCTC 103.273068 2.243523 1.980554 1.007257 7.657366 2050.303138
+TTTCTG 103.952680 2.211184 1.841361 0.902960 7.657366 2933.806785
+TTTCTT 104.110806 2.346291 2.036064 1.049899 7.657366 2420.474442
+TTTGAA 98.195203 1.821762 1.634854 0.755403 7.657366 1605.198094
+TTTGAC 95.285645 1.655560 1.653199 0.768154 7.657366 1314.413915
+TTTGAG 97.184240 1.778115 1.648516 0.764892 7.657366 885.937089
+TTTGAT 96.873215 1.780612 1.727982 0.820860 7.657366 2462.538328
+TTTGCA 95.796997 1.830892 1.537395 0.688872 7.657366 2092.598418
+TTTGCC 95.371668 1.834686 1.570980 0.711568 7.657366 4076.661810
+TTTGCG 95.947938 1.867489 1.553120 0.699468 7.657366 2818.563101
+TTTGCT 96.736898 1.829856 1.646773 0.763679 7.657366 2937.971765
+TTTGGA 98.222590 2.088881 1.753574 0.839164 7.657366 410.257948
+TTTGGC 96.327253 1.774386 1.576664 0.715433 7.657366 2274.236422
+TTTGGG 97.594285 1.864593 1.606801 0.736043 7.657366 755.994877
+TTTGGT 98.170704 2.037933 1.707004 0.805957 7.657366 1990.067802
+TTTGTA 96.720637 1.894769 1.798128 0.871347 7.657366 1392.993410
+TTTGTC 96.864608 2.119666 1.878686 0.930554 7.657366 1831.797975
+TTTGTG 96.605245 2.058672 1.798217 0.871412 7.657366 1694.007311
+TTTGTT 98.646527 2.169676 2.070636 1.076753 7.657366 2421.767559
+TTTTAA 96.021950 1.909439 1.474878 0.647283 7.657366 2066.000096
+TTTTAC 92.912905 1.904288 1.364313 0.575879 7.657366 2565.648934
+TTTTAG 95.290028 1.886366 1.472878 0.645967 7.657366 869.300390
+TTTTAT 93.959501 1.821118 1.420913 0.612085 7.657366 2761.265733
+TTTTCA 91.043207 1.662321 1.500668 0.664335 7.657366 3167.468044
+TTTTCC 89.695233 1.589440 1.458543 0.636560 7.657366 2773.364459
+TTTTCG 91.099504 1.686789 1.497260 0.662073 7.657366 2233.388555
+TTTTCT 90.935756 1.732010 1.523306 0.679424 7.657366 2779.099142
+TTTTGA 94.125513 1.957212 1.425667 0.615159 7.657366 1954.525006
+TTTTGC 91.302041 1.759910 1.367492 0.577893 7.657366 4240.706276
+TTTTGG 93.589030 1.809573 1.404695 0.601636 7.657366 1482.029244
+TTTTGT 92.249556 1.771851 1.435917 0.621805 7.657366 2298.042509
+TTTTTA 88.111000 1.508968 1.355124 0.570071 7.657366 2939.046221
+TTTTTC 86.588569 1.605815 1.319500 0.547740 7.657366 4051.654065
+TTTTTG 87.844591 1.541016 1.316434 0.545832 7.657366 3164.686502
+TTTTTT 87.762283 1.653769 1.259599 0.510868 7.657366 3907.377778
diff --git a/scripts/consensus.make b/scripts/consensus.make
index ad818e2..b55a14a 100755
--- a/scripts/consensus.make
+++ b/scripts/consensus.make
@@ -12,7 +12,7 @@ READS_BASE=$(basename $(READS))
#
# A pipeline to recompute a consensus sequence for an assembly
#
-all: $(READS_BASE).pp.sorted.bam $(READS_BASE).pp.sorted.bam.bai
+all: $(READS_BASE).pp.sorted.bam $(READS_BASE).pp.sorted.bam.bai $(ASSEMBLY).fai
#
# Preprocess the reads to make a name map
@@ -48,3 +48,9 @@ $(ASSEMBLY).bwt: $(ASSEMBLY)
#
%.sorted.bam.bai: %.sorted.bam
samtools index $^
+
+#
+# Index assembly
+#
+$(ASSEMBLY).fai: $(ASSEMBLY)
+ samtools faidx $<
diff --git a/scripts/dropmodel.py b/scripts/dropmodel.py
new file mode 100644
index 0000000..918b5a4
--- /dev/null
+++ b/scripts/dropmodel.py
@@ -0,0 +1,73 @@
+#! /usr/bin/env python
+# Generate a new model with a lower k-mer size than the input model
+
+import sys
+import argparse
+import itertools
+import numpy
+
+alphabet = [ 'A', 'C', 'G','T' ]
+def make_all_mers(k):
+ return [ "".join(x) for x in itertools.product(alphabet, repeat=k) ]
+
+parser = argparse.ArgumentParser( description='Reduce a 6-mer model to a 5-mer model')
+parser.add_argument('-i', '--input', type=str, required=True)
+parser.add_argument('-t', '--type', type=str, required=False, default="dropmodel")
+args = parser.parse_args()
+
+# Read the initial model from a file
+f = open(args.input)
+
+K = 0
+model = dict()
+
+header_lines_to_copy = { "#strand", "#kit" }
+header_lines = list()
+input_model_name = ""
+
+for line in f:
+ line = line.rstrip()
+ fields = line.split()
+
+ # copy then skip header lines
+ if line[0] == '#' or line.find("kmer") == 0:
+ if fields[0] in header_lines_to_copy:
+ header_lines.append(line)
+ if fields[0] == "#model_name":
+ input_model_name = fields[1]
+ else:
+ # store the k-mer size
+ if K == 0:
+ K = len(fields[0])
+ else:
+ assert len(fields[0]) == K
+
+ # store values
+ model[fields[0]] = tuple(fields[1:6])
+
+# reduce the k-mer size by 1 and output the new model
+P = K - 1
+pmers = make_all_mers(P)
+print("#model_name\t" + args.input + ".dropmodel")
+print("#type\t" + args.type)
+print("\n".join(header_lines))
+print("#derived_from\t" + input_model_name)
+
+print "\t".join(["kmer", "level_mean", "level_stdv", "sd_mean", "sd_stdv"])
+
+num_samples_per_kmer = 1000
+
+for pmer in pmers:
+
+ kmers_with_pmer = [ pmer + a for a in alphabet ]
+ samples = list()
+
+ for kmer in kmers_with_pmer:
+
+ # sample values from this gaussian
+ samples += list(numpy.random.normal(model[kmer][0], model[kmer][1], num_samples_per_kmer))
+
+ m = numpy.mean(samples)
+ s = numpy.std(samples)
+ out = [m, s, 0.0, 0.0, 0.0]
+ print "\t".join([pmer] + [str(x) for x in out])
diff --git a/scripts/nanopolish_makerange.py b/scripts/nanopolish_makerange.py
index 72737aa..1fab730 100644
--- a/scripts/nanopolish_makerange.py
+++ b/scripts/nanopolish_makerange.py
@@ -3,17 +3,14 @@ from Bio import SeqIO
recs = [ (rec.name, len(rec.seq)) for rec in SeqIO.parse(open(sys.argv[1]), "fasta")]
-# Do not change, must match nanopolish segment lengths
SEGMENT_LENGTH = 10000
-
-# Ok to change this
-SEGMENTS_PER_BATCH = 10
+OVERLAP_LENGTH = 200
for name, length in recs:
n_segments = (length / SEGMENT_LENGTH) + 1
- for n in xrange(0, n_segments, SEGMENTS_PER_BATCH):
- if ( n + SEGMENTS_PER_BATCH) > n_segments:
- print "%s:%d-%d" % (name, n, n_segments)
+ for n in xrange(0, length, SEGMENT_LENGTH):
+ if ( n + SEGMENT_LENGTH) > length:
+ print "%s:%d-%d" % (name, n, length - 1)
else:
- print "%s:%d-%d" % (name, n, n + SEGMENTS_PER_BATCH)
+ print "%s:%d-%d" % (name, n, n + SEGMENT_LENGTH + OVERLAP_LENGTH)
diff --git a/scripts/nanopolish_merge.py b/scripts/nanopolish_merge.py
index 0440aca..c603e2b 100644
--- a/scripts/nanopolish_merge.py
+++ b/scripts/nanopolish_merge.py
@@ -59,49 +59,39 @@ def merge_into_consensus(consensus, incoming, overlap_length):
return merged
-# Make placeholder segments using the original assembly as a guide
-original_assembly = sys.argv[1]
-recs = [ (rec.name, len(rec.seq)) for rec in SeqIO.parse(open(original_assembly), "fasta")]
-
-# Do not change, must match nanopolish segment lengths
+# Do not change, must match nanopolish_makerange.py segment lengths
SEGMENT_LENGTH = 10000
OVERLAP_LENGTH = 200
segments_by_name = dict()
-for name, length in recs:
-
- n_segments = (length / SEGMENT_LENGTH) + 1
- segments_by_name[name] = [""] * n_segments
-for fn in sys.argv[2:]:
+# Load the polished segments into a dictionary keyed by the start coordinate
+for fn in sys.argv[1:]:
for rec in SeqIO.parse(open(fn), "fasta"):
- (contig, segment) = rec.name.split(":")
- segments_by_name[contig][int(segment)] = str(rec.seq)
-
-# Confirm all segments are present
-segment_not_found = False
-for contig_name in sorted(segments_by_name.keys()):
- for (segment_id, sequence) in enumerate(segments_by_name[contig_name]):
+ (contig, segment_range) = rec.name.split(":")
- if sequence is "":
- sys.stderr.write("ERROR_MISSING %s %d\n" % (contig_name, segment_id))
- segment_not_found = True
+ if contig not in segments_by_name:
+ segments_by_name[contig] = dict()
+
+ segment_start, segment_end = segment_range.split("-")
-if segment_not_found:
- sys.exit(1)
+ sys.stderr.write('Insert %s %s\n' % (contig, segment_start))
+ segments_by_name[contig][int(segment_start)] = str(rec.seq)
# Assemble while making sure every segment is present
for contig_name in sorted(segments_by_name.keys()):
assembly = ""
- for (segment_id, sequence) in enumerate(segments_by_name[contig_name]):
-
- if sequence is "":
- print "ERROR, segment %d of contig %s is missing" % (segment_id, contig_name)
- sys.exit(1)
+ prev_segment = None
+ for segment_start in sorted(segments_by_name[contig_name]):
- sys.stderr.write('Merging %s %d\n' % (contig_name, segment_id))
+ sys.stderr.write('Merging %s %d\n' % (contig_name, segment_start))
+ # Ensure the segments overlap
+ assert(prev_segment is None or prev_segment + SEGMENT_LENGTH + OVERLAP_LENGTH > segment_start)
+
+ sequence = segments_by_name[contig_name][segment_start]
assembly = merge_into_consensus(assembly, sequence, OVERLAP_LENGTH)
-
+ prev_segment = segment_start
+
# Write final assembly
- print(">%s\n%s\n" % (contig_name, assembly))
+ print(">%s\n%s" % (contig_name, assembly))
diff --git a/src/alignment/nanopolish_alignment_db.cpp b/src/alignment/nanopolish_alignment_db.cpp
index 1b7bd62..39fb3e1 100644
--- a/src/alignment/nanopolish_alignment_db.cpp
+++ b/src/alignment/nanopolish_alignment_db.cpp
@@ -12,6 +12,7 @@
#include "htslib/faidx.h"
#include "htslib/hts.h"
#include "htslib/sam.h"
+#include "nanopolish_methyltrain.h"
// Various file handle and structures
// needed to traverse a bam file
@@ -25,11 +26,13 @@ struct BamHandles
AlignmentDB::AlignmentDB(const std::string& reads_file,
const std::string& reference_file,
const std::string& sequence_bam,
- const std::string& event_bam) :
- m_fast5_name_map(reads_file),
+ const std::string& event_bam,
+ bool calibrate_reads) :
m_reference_file(reference_file),
m_sequence_bam(sequence_bam),
- m_event_bam(event_bam)
+ m_event_bam(event_bam),
+ m_fast5_name_map(reads_file),
+ m_calibrate_on_load(calibrate_reads)
{
_clear_region();
}
@@ -44,8 +47,12 @@ std::string AlignmentDB::get_reference_substring(const std::string& contig,
int stop_position) const
{
assert(m_region_contig == contig);
- assert(m_region_start <= start_position);
- assert(m_region_end >= stop_position);
+ if(m_region_start > start_position || m_region_end < stop_position) {
+ fprintf(stderr, "[alignmentdb] error: requested coordinates "
+ "[%d %d] is outside of region boundary [%d %d]\n",
+ start_position, stop_position, m_region_start, m_region_end);
+ exit(EXIT_FAILURE);
+ }
return m_region_ref_sequence.substr(start_position - m_region_start, stop_position - start_position + 1);
}
@@ -64,14 +71,14 @@ std::vector<std::string> AlignmentDB::get_read_substrings(const std::string& con
const SequenceAlignmentRecord& record = m_sequence_records[i];
if(record.aligned_bases.empty())
continue;
-
+
int r1, r2;
- bool bounded = _find_by_ref_bounds(record.aligned_bases,
- start_position,
+ bool bounded = _find_by_ref_bounds(record.aligned_bases,
+ start_position,
stop_position,
r1,
r2);
-
+
if(bounded) {
out.push_back(record.sequence.substr(r1, r2 - r1 + 1));
}
@@ -90,8 +97,13 @@ std::vector<HMMInputData> AlignmentDB::get_event_subsequences(const std::string&
std::vector<HMMInputData> out;
for(size_t i = 0; i < m_event_records.size(); ++i) {
const EventAlignmentRecord& record = m_event_records[i];
- if(record.aligned_events.empty())
+ if(record.aligned_events.empty()) {
continue;
+ }
+
+ if(!record.sr->has_events_for_strand(record.strand)) {
+ continue;
+ }
HMMInputData data;
data.read = record.sr;
@@ -128,8 +140,13 @@ std::vector<HMMInputData> AlignmentDB::get_events_aligned_to(const std::string&
std::vector<HMMInputData> out;
for(size_t i = 0; i < m_event_records.size(); ++i) {
const EventAlignmentRecord& record = m_event_records[i];
- if(record.aligned_events.empty())
+ if(record.aligned_events.empty()) {
continue;
+ }
+
+ if(!record.sr->has_events_for_strand(record.strand)) {
+ continue;
+ }
HMMInputData data;
data.read = record.sr;
@@ -160,8 +177,6 @@ std::vector<Variant> AlignmentDB::get_variants_in_region(const std::string& cont
std::map<std::string, std::pair<Variant, int>> map;
std::vector<int> depth(stop_position - start_position + 1, 0);
- size_t num_aligned_reads = 0;
-
for(size_t i = 0; i < m_sequence_records.size(); ++i) {
const SequenceAlignmentRecord& record = m_sequence_records[i];
if(record.aligned_bases.empty())
@@ -170,28 +185,64 @@ std::vector<Variant> AlignmentDB::get_variants_in_region(const std::string& cont
AlignedPairConstIter start_iter;
AlignedPairConstIter stop_iter;
_find_iter_by_ref_bounds(record.aligned_bases, start_position, stop_position, start_iter, stop_iter);
-
- //printf("[%zu] iter: [%d %d] [%d %d] first: %d last: %d\n", i, start_iter->ref_pos, start_iter->read_pos, stop_iter->ref_pos, stop_iter->read_pos,
+
+ // Increment the depth over this region
+ int depth_start = start_iter->ref_pos;
+ int depth_end = stop_iter == record.aligned_bases.end() ?
+ record.aligned_bases.back().ref_pos : stop_iter->ref_pos;
+
+ // clamp
+ depth_start = std::max(depth_start, start_position);
+ depth_end = std::min(depth_end, stop_position);
+
+ for(; depth_start < depth_end; ++depth_start) {
+ assert(depth_start >= start_position);
+ assert(depth_start - start_position < depth.size());
+ depth[depth_start - start_position]++;
+ }
+
+ //printf("[%zu] iter: [%d %d] [%d %d] first: %d last: %d\n", i, start_iter->ref_pos, start_iter->read_pos, stop_iter->ref_pos, stop_iter->read_pos,
// record.aligned_bases.front().ref_pos, record.aligned_bases.back().ref_pos);
- for(; start_iter != stop_iter; ++start_iter) {
-
+
+ // Find the boundaries of a matching region
+ while(start_iter != stop_iter) {
+ // skip out-of-range
int rp = start_iter->ref_pos;
+ if(rp < start_position || rp > stop_position) {
+ continue;
+ }
+
char rb = m_region_ref_sequence[start_iter->ref_pos - m_region_start];
char ab = record.sequence[start_iter->read_pos];
- if(rp < start_position || rp > stop_position) {
- continue;
+ bool is_mismatch = rb != ab;
+ auto next_iter = start_iter + 1;
+
+ bool is_gap = next_iter != stop_iter &&
+ (next_iter->ref_pos != start_iter->ref_pos + 1 ||
+ next_iter->read_pos != start_iter->read_pos + 1);
+
+ if(is_gap) {
+ // advance the next iterator until a match is found
+ while(next_iter != stop_iter) {
+ char n_rb = m_region_ref_sequence[next_iter->ref_pos - m_region_start];
+ char n_ab = record.sequence[next_iter->read_pos];
+ if(n_rb == n_ab) {
+ break;
+ }
+ ++next_iter;
+ }
}
-
- // Increment depth
- depth[rp - start_position]++;
- if(rb != ab) {
+ if(next_iter != stop_iter && (is_mismatch || is_gap)) {
Variant v;
v.ref_name = contig;
v.ref_position = start_iter->ref_pos;
- v.ref_seq = rb;
- v.alt_seq = ab;
+
+ size_t ref_sub_start = start_iter->ref_pos - m_region_start;
+ size_t ref_sub_end = next_iter->ref_pos - m_region_start;
+ v.ref_seq = m_region_ref_sequence.substr(ref_sub_start, ref_sub_end - ref_sub_start);
+ v.alt_seq = record.sequence.substr(start_iter->read_pos, next_iter->read_pos - start_iter->read_pos);
std::string key = v.key();
auto iter = map.find(key);
@@ -201,6 +252,7 @@ std::vector<Variant> AlignmentDB::get_variants_in_region(const std::string& cont
iter->second.second += 1;
}
}
+ start_iter = next_iter;
}
}
@@ -209,7 +261,7 @@ std::vector<Variant> AlignmentDB::get_variants_in_region(const std::string& cont
size_t count = iter->second.second;
size_t d = depth[v.ref_position - start_position];
double f = (double)count / d;
- if(f >= min_frequency && d >= min_depth) {
+ if(f >= min_frequency && (int)d >= min_depth) {
v.add_info("BaseCalledReadsWithVariant", count);
v.add_info("BaseCalledFrequency", f);
variants.push_back(v);
@@ -219,7 +271,7 @@ std::vector<Variant> AlignmentDB::get_variants_in_region(const std::string& cont
std::sort(variants.begin(), variants.end(), sortByPosition);
return variants;
}
-
+
void AlignmentDB::load_region(const std::string& contig,
int start_position,
int stop_position)
@@ -364,8 +416,15 @@ void AlignmentDB::_load_events_by_region()
// Do we need to load this fast5 file?
if(m_squiggle_read_map.find(read_name) == m_squiggle_read_map.end()) {
- m_squiggle_read_map[read_name] = new SquiggleRead(read_name, fast5_path);
+ SquiggleRead* sr = new SquiggleRead(read_name, fast5_path);
+ // Switch the read to use an alternative kmer model
+ if(!m_model_type_string.empty()) {
+ sr->replace_models(m_model_type_string);
+ }
+
+ m_squiggle_read_map[read_name] = sr;
}
+
event_record.sr = m_squiggle_read_map[read_name];
// extract the event stride tag which tells us whether the
@@ -381,6 +440,15 @@ void AlignmentDB::_load_events_by_region()
event_record.strand = is_template ? T_IDX : C_IDX;
m_event_records.push_back(event_record);
+ if(m_calibrate_on_load) {
+ std::vector<EventAlignment> event_alignment = _build_event_alignment(event_record);
+ fprintf(stderr, "Rescale for %s strand: %d rc: %d\n", event_record.sr->read_name.c_str(), event_record.strand, event_record.rc);
+ event_record.sr->print_scaling_parameters(stderr, event_record.strand);
+ fprintf(stderr, "recal events: %zu\n", event_alignment.size());
+ recalibrate_model(*event_record.sr, event_record.strand, event_alignment, &gDNAAlphabet, true, false);
+ event_record.sr->print_scaling_parameters(stderr, event_record.strand);
+ }
+
/*
printf("event_record[%zu] name: %s stride: %d align bounds [%d %d] [%d %d]\n",
m_event_records.size() - 1,
@@ -399,6 +467,40 @@ void AlignmentDB::_load_events_by_region()
sam_close(handles.bam_fh);
}
+std::vector<EventAlignment> AlignmentDB::_build_event_alignment(const EventAlignmentRecord& event_record) const
+{
+ std::vector<EventAlignment> alignment;
+ const SquiggleRead* sr = event_record.sr;
+ const Alphabet* alphabet = sr->pore_model[event_record.strand].pmalphabet;
+ size_t k = sr->pore_model[event_record.strand].k;
+
+ for(const auto& ap : event_record.aligned_events) {
+
+ EventAlignment ea;
+ ea.ref_position = ap.ref_pos;
+ if(ea.ref_position < m_region_start || ea.ref_position >= m_region_end - k) {
+ continue;
+ }
+
+ ea.event_idx = ap.read_pos;
+
+ std::string kmer = get_reference_substring(m_region_contig, ea.ref_position, ea.ref_position + k - 1);
+ assert(kmer.size() == k);
+
+ // ref data
+ ea.ref_name = "read"; // not needed
+ ea.read_idx = -1; // not needed
+ ea.ref_kmer = kmer;
+ ea.strand_idx = event_record.strand;
+ ea.rc = event_record.rc;
+ ea.model_kmer = kmer;
+ ea.hmm_state = 'M';
+ alignment.push_back(ea);
+ }
+
+ return alignment;
+}
+
bool AlignmentDB::_find_iter_by_ref_bounds(const std::vector<AlignedPair>& pairs,
int ref_start,
int ref_stop,
diff --git a/src/alignment/nanopolish_alignment_db.h b/src/alignment/nanopolish_alignment_db.h
index c6179f4..8a57614 100644
--- a/src/alignment/nanopolish_alignment_db.h
+++ b/src/alignment/nanopolish_alignment_db.h
@@ -40,7 +40,8 @@ class AlignmentDB
AlignmentDB(const std::string& reads_file,
const std::string& reference_file,
const std::string& sequence_bam,
- const std::string& event_bam);
+ const std::string& event_bam,
+ const bool calibrate_reads = false);
~AlignmentDB();
@@ -70,8 +71,14 @@ class AlignmentDB
double min_frequency,
int min_depth) const;
+ const std::vector<EventAlignmentRecord>& get_eventalignment_records() const { return m_event_records; }
+
+ // reference metadata
+ std::string get_region_contig() const { return m_region_contig; }
int get_region_start() const { return m_region_start; }
int get_region_end() const { return m_region_end; }
+
+ void set_alternative_model_type(const std::string model_type_string) { m_model_type_string = model_type_string; }
private:
@@ -79,6 +86,8 @@ class AlignmentDB
void _load_events_by_region();
void _clear_region();
+ std::vector<EventAlignment> _build_event_alignment(const EventAlignmentRecord& event_record) const;
+
// Search the vector of AlignedPairs using lower_bound/upper_bound
// and the input reference coordinates. If the search succeeds,
// set read_start/read_stop to be the read_pos of the bounding elements
@@ -102,6 +111,9 @@ class AlignmentDB
std::string m_sequence_bam;
std::string m_event_bam;
+ // parameters
+ bool m_calibrate_on_load;
+
// loaded region
std::string m_region_ref_sequence;
std::string m_region_contig;
@@ -113,6 +125,7 @@ class AlignmentDB
std::vector<SequenceAlignmentRecord> m_sequence_records;
std::vector<EventAlignmentRecord> m_event_records;
SquiggleReadMap m_squiggle_read_map;
+ std::string m_model_type_string;
};
#endif
diff --git a/src/alignment/nanopolish_anchor.cpp b/src/alignment/nanopolish_anchor.cpp
index be0bb35..1397f5d 100644
--- a/src/alignment/nanopolish_anchor.cpp
+++ b/src/alignment/nanopolish_anchor.cpp
@@ -13,15 +13,18 @@
#include "htslib/faidx.h"
#include "nanopolish_common.h"
#include "nanopolish_anchor.h"
+#include "nanopolish_scorereads.h"
+#include "nanopolish_methyltrain.h"
#include "nanopolish_squiggle_read.h"
-HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
- const std::string& ref_filename,
- const Fast5Map& read_name_map,
+HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
+ const std::string& ref_filename,
+ const Fast5Map& read_name_map,
const std::string& contig_name,
- int start,
- int end,
- int stride)
+ int start,
+ int end,
+ int stride,
+ const std::string& alternative_model_type)
{
// Initialize return data
HMMRealignmentInput ret;
@@ -38,7 +41,7 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
// read the bam header
bam_hdr_t* hdr = sam_hdr_read(bam_fh);
int contig_id = bam_name2id(hdr, contig_name.c_str());
-
+
// load reference fai file
faidx_t *fai = fai_load(ref_filename.c_str());
@@ -48,18 +51,21 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
// load the reference sequence for this region
int fetched_len = 0;
char* ref_segment = faidx_fetch_seq(fai, contig_name.c_str(), start, end, &fetched_len);
+ if(ref_segment == NULL) {
+ exit(EXIT_FAILURE);
+ }
ret.original_sequence = ref_segment;
// Initialize iteration
bam1_t* record = bam_init1();
hts_itr_t* itr = sam_itr_queryi(bam_idx, contig_id, start, end);
-
+
// Iterate over reads aligned here
std::vector<HMMReadAnchorSet> read_anchors;
std::vector<std::vector<std::string>> read_substrings;
// kmer size of pore model
- uint32_t k;
+ uint32_t k = 0;
// Load the SquiggleReads aligned to this region and the bases
// that are mapped to our reference anchoring positions
@@ -72,7 +78,23 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
// load read
ret.reads.push_back(std::unique_ptr<SquiggleRead>(new SquiggleRead(read_name, fast5_path)));
- const SquiggleRead& sr = *ret.reads.back();
+ SquiggleRead& sr = *ret.reads.back();
+ if(!alternative_model_type.empty()) {
+ sr.replace_models(alternative_model_type);
+ }
+
+ // Recalibrate each strand
+ for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; strand_idx++) {
+ if(!sr.has_events_for_strand(strand_idx)) {
+ continue;
+ }
+
+ std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, -1,
+ "", fai, hdr,
+ record, -1, -1);
+ recalibrate_model(sr, strand_idx, ao, &gDNAAlphabet, true, true);
+ }
+
k = sr.pore_model[T_IDX].k;
// parse alignments to reference
@@ -109,20 +131,20 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
read_kidx = sr.flip_k_strand(read_kidx);
// If the aligned base is beyong the start of the last k-mer of the read, skip
- if(read_kidx >= sr.read_sequence.size() - k + 1) {
+ if(read_kidx < 0 || read_kidx >= (int)(sr.read_sequence.size() - k + 1)) {
continue;
}
int template_idx = sr.get_closest_event_to(read_kidx, T_IDX);
- int complement_idx = sr.get_closest_event_to(read_kidx, C_IDX);
+ int complement_idx = sr.has_events_for_strand(C_IDX) ? sr.get_closest_event_to(read_kidx, C_IDX) : -1;
- assert(template_idx != -1 && complement_idx != -1);
- assert(template_idx < sr.events[T_IDX].size());
- assert(complement_idx < sr.events[C_IDX].size());
+ assert(template_idx != -1 && (!sr.has_events_for_strand(C_IDX) || complement_idx != -1));
+ assert(template_idx < (int)sr.events[T_IDX].size());
+ assert(sr.events[C_IDX].empty() || complement_idx < (int)sr.events[C_IDX].size());
event_anchors.strand_anchors[T_IDX][ai] = { template_idx, template_rc };
event_anchors.strand_anchors[C_IDX][ai] = { complement_idx, complement_rc };
-
+
// If this is not the last anchor, extract the sequence of the read
// from this anchor to the next anchor as an alternative assembly
if(ai < read_bases_for_anchors.size() - 1) {
@@ -134,7 +156,7 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
if(do_base_rc) {
start_kidx = sr.flip_k_strand(start_kidx);
end_kidx = sr.flip_k_strand(end_kidx);
-
+
// swap
int tmp = end_kidx;
end_kidx = start_kidx;
@@ -196,7 +218,7 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
// base, these sequences need to overlap by k - 1 bases
int base_length = stride + k;
- if(ai * stride + base_length > fetched_len)
+ if((int)ai * stride + base_length > fetched_len)
base_length = fetched_len - ai * stride;
column.base_sequence = std::string(ref_segment + ai * stride, base_length);
@@ -227,8 +249,8 @@ std::vector<AlignedPair> get_aligned_pairs(const bam1_t* record, int read_stride
std::vector<AlignedPair> out;
// This code is derived from bam_fillmd1_core
- uint8_t *ref = NULL;
- uint8_t *seq = bam_get_seq(record);
+ //uint8_t *ref = NULL;
+ //uint8_t *seq = bam_get_seq(record);
uint32_t *cigar = bam_get_cigar(record);
const bam1_core_t *c = &record->core;
@@ -238,7 +260,7 @@ std::vector<AlignedPair> get_aligned_pairs(const bam1_t* record, int read_stride
// query pos is an index in the query string that is recorded in the bam
// we record this as a sanity check
- int query_pos = 0;
+ //int query_pos = 0;
int ref_pos = c->pos;
diff --git a/src/alignment/nanopolish_anchor.h b/src/alignment/nanopolish_anchor.h
index 818635e..5edef3b 100644
--- a/src/alignment/nanopolish_anchor.h
+++ b/src/alignment/nanopolish_anchor.h
@@ -87,9 +87,8 @@ HMMRealignmentInput build_input_for_region(const std::string& bam_filename,
const std::string& contig_name,
int start,
int end,
- int stride);
-
-
+ int stride,
+ const std::string& alternative_model_type);
// Return a vector specifying pairs of bases that have been aligned to each other
// This function can handle an "event cigar" bam record, which requires the ability
diff --git a/src/alignment/nanopolish_eventalign.cpp b/src/alignment/nanopolish_eventalign.cpp
index 465580f..6770d68 100644
--- a/src/alignment/nanopolish_eventalign.cpp
+++ b/src/alignment/nanopolish_eventalign.cpp
@@ -19,6 +19,7 @@
#include <set>
#include <omp.h>
#include <getopt.h>
+#include <iterator>
#include "htslib/faidx.h"
#include "nanopolish_eventalign.h"
#include "nanopolish_iupac.h"
@@ -29,6 +30,7 @@
#include "nanopolish_anchor.h"
#include "nanopolish_fast5_map.h"
#include "nanopolish_hmm_input_sequence.h"
+#include "nanopolish_pore_model_set.h"
#include "H5pubconf.h"
#include "profiler.h"
#include "progress.h"
@@ -61,6 +63,9 @@ static const char *EVENTALIGN_USAGE_MESSAGE =
" --progress print out a progress message\n"
" -n, --print-read-names print read names instead of indexes\n"
" --summary=FILE summarize the alignment of each read/strand in FILE\n"
+" --stdv enable stdv modelling\n"
+" --samples write the raw samples for the event to the tsv output\n"
+" --models-fofn=FILE read alternative k-mer models from FILE\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
@@ -71,17 +76,21 @@ namespace opt
static std::string genome_file;
static std::string region;
static std::string summary_file;
+ static std::string models_fofn;
+ static std::string alternative_model_type = DEFAULT_MODEL_TYPE;
static int output_sam = 0;
static int progress = 0;
static int num_threads = 1;
static int scale_events = 0;
static int batch_size = 128;
static bool print_read_names;
+ static bool full_output;
+ static bool write_samples = false;
}
static const char* shortopts = "r:b:g:t:w:vn";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_PROGRESS, OPT_SAM, OPT_SUMMARY, OPT_SCALE_EVENTS };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_PROGRESS, OPT_SAM, OPT_SUMMARY, OPT_SCALE_EVENTS, OPT_STDV, OPT_MODELS_FOFN, OPT_SAMPLES };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -91,7 +100,10 @@ static const struct option longopts[] = {
{ "window", required_argument, NULL, 'w' },
{ "threads", required_argument, NULL, 't' },
{ "summary", required_argument, NULL, OPT_SUMMARY },
+ { "models-fofn", required_argument, NULL, OPT_MODELS_FOFN },
{ "print-read-names", no_argument, NULL, 'n' },
+ { "stdv", no_argument, NULL, OPT_STDV },
+ { "samples", no_argument, NULL, OPT_SAMPLES },
{ "scale-events", no_argument, NULL, OPT_SCALE_EVENTS },
{ "sam", no_argument, NULL, OPT_SAM },
{ "progress", no_argument, NULL, OPT_PROGRESS },
@@ -166,7 +178,7 @@ void trim_aligned_pairs_to_ref_region(std::vector<AlignedPair>& aligned_pairs, i
// that is not greater than ref_pos_max. It starts the search at pair_idx
int get_end_pair(const std::vector<AlignedPair>& aligned_pairs, int ref_pos_max, int pair_idx)
{
- while(pair_idx < aligned_pairs.size()) {
+ while(pair_idx < (int)aligned_pairs.size()) {
if(aligned_pairs[pair_idx].ref_pos > ref_pos_max)
return pair_idx - 1;
pair_idx += 1;
@@ -200,8 +212,12 @@ void emit_tsv_header(FILE* fp)
fprintf(fp, "%s\t%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer",
(not opt::print_read_names? "read_index" : "read_name"), "strand");
fprintf(fp, "%s\t%s\t%s\t%s\t", "event_index", "event_level_mean", "event_stdv", "event_length");
- fprintf(fp, "%s\t%s\t%s\t%s\n", "model_kmer", "model_mean", "model_stdv");
+ fprintf(fp, "%s\t%s\t%s\t%s", "model_kmer", "model_mean", "model_stdv", "standardized_level");
+ if(opt::write_samples) {
+ fprintf(fp, "\t%s", "samples");
+ }
+ fprintf(fp, "\n");
}
void emit_sam_header(samFile* fp, const bam_hdr_t* hdr)
@@ -289,7 +305,8 @@ void emit_event_alignment_sam(htsFile* fp,
const bam1_t* base_record,
const std::vector<EventAlignment>& alignments)
{
- assert(!alignments.empty());
+ if(alignments.empty())
+ return;
bam1_t* event_record = bam_init1();
// Variable-length data
@@ -381,34 +398,51 @@ void emit_event_alignment_tsv(FILE* fp,
// event information
float event_mean = sr.get_drift_corrected_level(ea.event_idx, ea.strand_idx);
- float event_stdv = sr.get_event_stdv(ea.event_idx, ea.strand_idx);
+ float event_stdv = sr.get_stdv(ea.event_idx, ea.strand_idx);
float event_duration = sr.get_duration(ea.event_idx, ea.strand_idx);
+ uint32_t rank = params.alphabet->kmer_rank(ea.model_kmer.c_str(), k);
+ float model_mean = 0.0;
+ float model_stdv = 0.0;
+
if(opt::scale_events) {
// scale reads to the model
event_mean = (event_mean - sr.pore_model[ea.strand_idx].shift) / sr.pore_model[ea.strand_idx].scale;
- fprintf(fp, "%d\t%.2lf\t%.3lf\t%.3lf\t", ea.event_idx, event_mean, event_stdv, event_duration);
-
- // unscaled parameters
- uint32_t rank = params.alphabet->kmer_rank(ea.model_kmer.c_str(), k);
- PoreModelStateParams model = sr.pore_model[ea.strand_idx].get_parameters(rank);
- fprintf(fp, "%s\t%.2lf\t%.2lf\n", ea.model_kmer.c_str(),
- model.level_mean,
- model.level_stdv);
+ // unscaled model parameters
+ if(ea.hmm_state != 'B') {
+ PoreModelStateParams model = sr.pore_model[ea.strand_idx].get_parameters(rank);
+ model_mean = model.level_mean;
+ model_stdv = model.level_stdv;
+ }
} else {
// scale model to the reads
- float event_mean = sr.get_drift_corrected_level(ea.event_idx, ea.strand_idx);
- float event_duration = sr.get_duration(ea.event_idx, ea.strand_idx);
- fprintf(fp, "%d\t%.2lf\t%.3lf\t%.3lf\t", ea.event_idx, event_mean, event_stdv, event_duration);
+ if(ea.hmm_state != 'B') {
+ GaussianParameters model = sr.pore_model[ea.strand_idx].get_scaled_parameters(rank);
+ model_mean = model.mean;
+ model_stdv = model.stdv;
+ }
+ }
- uint32_t rank = params.alphabet->kmer_rank(ea.model_kmer.c_str(), k);
- GaussianParameters model = sr.pore_model[ea.strand_idx].get_scaled_parameters(rank);
- fprintf(fp, "%s\t%.2lf\t%.2lf\n", ea.model_kmer.c_str(),
- model.mean,
- model.stdv);
+ float standard_level = (event_mean - model_mean) / (sqrt(sr.pore_model[ea.strand_idx].var) * model_stdv);
+ fprintf(fp, "%d\t%.2lf\t%.3lf\t%.5lf\t", ea.event_idx, event_mean, event_stdv, event_duration);
+ fprintf(fp, "%s\t%.2lf\t%.2lf\t%.2lf", ea.model_kmer.c_str(),
+ model_mean,
+ model_stdv,
+ standard_level);
+
+ if(opt::write_samples) {
+ std::vector<float> samples = sr.get_scaled_samples_for_event(ea.strand_idx, ea.event_idx);
+ std::stringstream sample_ss;
+ std::copy(samples.begin(), samples.end(), std::ostream_iterator<float>(sample_ss, ","));
+
+ // remove trailing comma
+ std::string sample_str = sample_ss.str();
+ sample_str.resize(sample_str.size() - 1);
+ fprintf(fp, "\t%s", sample_str.c_str());
}
+ fprintf(fp, "\n");
}
}
@@ -422,9 +456,9 @@ EventalignSummary summarize_alignment(const SquiggleRead& sr,
uint32_t k = sr.pore_model[strand_idx].k;
size_t prev_ref_pos = std::string::npos;
-
+
// the number of unique reference positions seen in the alignment
- size_t num_unique_ref_pos = 0;
+ //size_t num_unique_ref_pos = 0;
for(size_t i = 0; i < alignments.size(); ++i) {
@@ -480,14 +514,24 @@ void realign_read(EventalignWriter writer,
std::string fast5_path = name_map.get_path(read_name);
// load read
- SquiggleRead sr(read_name, fast5_path);
-
+ SquiggleRead sr(read_name, fast5_path, opt::write_samples ? SRF_LOAD_RAW_SAMPLES : 0);
+
+ if(!opt::alternative_model_type.empty()) {
+ sr.replace_models(opt::alternative_model_type);
+ }
+
if(opt::verbose > 1) {
fprintf(stderr, "Realigning %s [%zu %zu]\n",
read_name.c_str(), sr.events[0].size(), sr.events[1].size());
}
for(int strand_idx = 0; strand_idx < 2; ++strand_idx) {
+
+ // Do not align this strand if it was not sequenced
+ if(!sr.has_events_for_strand(strand_idx)) {
+ continue;
+ }
+
EventAlignmentParameters params;
params.sr = &sr;
params.fai = fai;
@@ -516,10 +560,12 @@ void realign_read(EventalignWriter writer,
}
if(writer.summary_fp != NULL && summary.num_events > 0) {
+
+ PoreModel& pore_model = sr.pore_model[strand_idx];
fprintf(writer.summary_fp, "%zu\t%s\t%s\t", read_idx, read_name.c_str(), sr.fast5_path.c_str());
- fprintf(writer.summary_fp, "%s\t%s\t", sr.pore_model[strand_idx].name.c_str(), strand_idx == 0 ? "template" : "complement");
+ fprintf(writer.summary_fp, "%s\t%s\t", pore_model.name.c_str(), strand_idx == 0 ? "template" : "complement");
fprintf(writer.summary_fp, "%d\t%d\t%d\t%d\t", summary.num_events, summary.num_matches, summary.num_skips, summary.num_stays);
- fprintf(writer.summary_fp, "%.2lf\n", summary.sum_duration);
+ fprintf(writer.summary_fp, "%.2lf\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n", summary.sum_duration, pore_model.shift, pore_model.scale, pore_model.drift, pore_model.var);
}
}
}
@@ -590,7 +636,6 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
int last_event = params.sr->get_closest_event_to(read_kidx_end, params.strand_idx);
bool forward = first_event < last_event;
- int last_event_output = -1;
int curr_start_event = first_event;
int curr_start_ref = aligned_pairs.front().ref_pos;
int curr_pair_idx = 0;
@@ -618,8 +663,8 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
HMMInputSequence hmm_sequence(fwd_subseq, rc_subseq, params.alphabet);
- // Nothing to align to
- if(hmm_sequence.length() < k)
+ // Require a minimum amount of sequence to align to
+ if(hmm_sequence.length() < 2 * k)
break;
// Set up HMM input
@@ -628,6 +673,7 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
input.anchor_index = 0; // not used here
input.event_start_idx = curr_start_event;
input.event_stop_idx = params.sr->get_closest_event_to(curr_end_read, params.strand_idx);
+ //printf("[SEGMENT_START] read: %s event start: %zu event end: %zu\n", params.sr->read_name.c_str(), input.event_start_idx, input.event_stop_idx);
// A limitation of the segment-by-segment alignment is that we can't jump
// over very large deletions wrt to the reference. The effect of this
@@ -639,7 +685,7 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
input.strand = params.strand_idx;
input.event_stride = input.event_start_idx < input.event_stop_idx ? 1 : -1;
input.rc = rc_flags[params.strand_idx];
-
+
std::vector<HMMAlignmentState> event_alignment = profile_hmm_align(hmm_sequence, input);
// Output alignment
@@ -647,7 +693,24 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
size_t event_align_idx = 0;
// If we aligned to the last event, output everything and stop
- bool last_section = end_pair_idx == aligned_pairs.size() - 1;
+ bool last_section = end_pair_idx == (int)aligned_pairs.size() - 1;
+
+ /*
+ // Don't allow the segment to end on an E state or else we get alignment
+ // artifacts at the segment boundary
+ if(!last_section) {
+ size_t last_match_index = event_alignment.size() - 1;
+ while(event_alignment[last_match_index].state != 'M') {
+ last_match_index -= 1;
+ }
+
+ event_alignment.resize(last_match_index + 1);
+ if(event_alignment.empty()) {
+ break;
+ }
+ assert(event_alignment.back().state == 'M');
+ }
+ */
int last_event_output = 0;
int last_ref_kmer_output = 0;
@@ -656,7 +719,7 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
(num_output < output_stride || last_section); event_align_idx++) {
HMMAlignmentState& as = event_alignment[event_align_idx];
- if(as.state != 'K' && as.event_idx != curr_start_event) {
+ if(as.state != 'K' && (int)as.event_idx != curr_start_event) {
EventAlignment ea;
@@ -672,15 +735,21 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
ea.rc = input.rc;
// hmm
- ea.model_kmer = hmm_sequence.get_kmer(as.kmer_idx, k, input.rc);
ea.hmm_state = as.state;
+ if(ea.hmm_state != 'B') {
+ ea.model_kmer = hmm_sequence.get_kmer(as.kmer_idx, k, input.rc);
+ } else {
+ ea.model_kmer = std::string(k, 'N');
+ }
+
// store
alignment_output.push_back(ea);
// update
last_event_output = as.event_idx;
last_ref_kmer_output = curr_start_ref + as.kmer_idx;
+
num_output += 1;
}
}
@@ -688,6 +757,7 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
// Advance the pair iterator to the ref base
curr_start_event = last_event_output;
curr_start_ref = last_ref_kmer_output;
+ //printf("[SEGMENT_END] read: %s last event output: %zu ref pos: %zu (%s)\n", params.sr->read_name.c_str(), last_event_output, last_ref_kmer_output, ref_seq.substr(last_ref_kmer_output - ref_offset, k).c_str());
curr_pair_idx = get_end_pair(aligned_pairs, curr_start_ref, curr_pair_idx);
#if EVENTALIGN_TRAIN
@@ -695,6 +765,10 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
params.sr->parameters[params.strand_idx].add_training_from_alignment(hmm_sequence, input, event_alignment);
global_training[params.strand_idx].add_training_from_alignment(hmm_sequence, input, event_alignment);
#endif
+
+ if(num_output == 0) {
+ break;
+ }
} // for segment
return alignment_output;
@@ -712,7 +786,11 @@ void parse_eventalign_options(int argc, char** argv)
case '?': die = true; break;
case 't': arg >> opt::num_threads; break;
case 'n': opt::print_read_names = true; break;
+ case 'f': opt::full_output = true; break;
+ case OPT_STDV: model_stdv() = true; break;
+ case OPT_SAMPLES: opt::write_samples = true; break;
case 'v': opt::verbose++; break;
+ case OPT_MODELS_FOFN: arg >> opt::models_fofn; break;
case OPT_SCALE_EVENTS: opt::scale_events = true; break;
case OPT_SUMMARY: arg >> opt::summary_file; break;
case OPT_SAM: opt::output_sam = true; break;
@@ -755,6 +833,14 @@ void parse_eventalign_options(int argc, char** argv)
die = true;
}
+ if(opt::models_fofn.empty()) {
+ std::cerr << SUBPROGRAM ": a --models file must be provided\n";
+ die = true;
+ } else {
+ // initialize the model set from the fofn
+ PoreModelSet::initialize(opt::models_fofn);
+ }
+
if (die)
{
std::cout << "\n" << EVENTALIGN_USAGE_MESSAGE;
@@ -768,7 +854,7 @@ int eventalign_main(int argc, char** argv)
omp_set_num_threads(opt::num_threads);
Fast5Map name_map(opt::reads_file);
-
+
// Open the BAM and iterate over reads
// load bam file
@@ -825,7 +911,7 @@ int eventalign_main(int argc, char** argv)
writer.summary_fp = fopen(opt::summary_file.c_str(), "w");
// header
fprintf(writer.summary_fp, "read_index\tread_name\tfast5_path\tmodel_name\tstrand\tnum_events\t");
- fprintf(writer.summary_fp, "num_matches\tnum_skips\tnum_stays\ttotal_duration\n");
+ fprintf(writer.summary_fp, "num_matches\tnum_skips\tnum_stays\ttotal_duration\tshift\tscale\tdrift\tvar\n");
}
// Initialize iteration
diff --git a/src/alignment/nanopolish_eventalign.h b/src/alignment/nanopolish_eventalign.h
index 4364359..5b19ffb 100644
--- a/src/alignment/nanopolish_eventalign.h
+++ b/src/alignment/nanopolish_eventalign.h
@@ -41,7 +41,7 @@ struct EventAlignmentParameters
size_t strand_idx;
// optional
- Alphabet* alphabet;
+ const Alphabet* alphabet;
int read_idx;
int region_start;
int region_end;
diff --git a/src/common/logger.hpp b/src/common/logger.hpp
new file mode 100644
index 0000000..43cc6af
--- /dev/null
+++ b/src/common/logger.hpp
@@ -0,0 +1,260 @@
+//-----------------------------------------------
+// Copyright 2013 Ontario Institute for Cancer Research
+// Written by Matei David (mdavid at oicr.on.ca)
+// Released under the MIT license
+//-----------------------------------------------
+
+// Logger mechanism
+//
+// Properties:
+// - thread-safe, non-garbled output (uses c++11's thread_local)
+// - customizable ostream sink. by default, uses std::clog
+//
+// Main exports:
+// - "Logger" class
+// - "level_wrapper" namespace
+// - "LOG" macro
+//
+// To use:
+// - In source code, use:
+//
+// LOG(info) << "hello" << endl;
+// // or
+// LOG("main", info) << "hello" << endl;
+// // or
+// LOG("main", info, sink_os) << "hello" << endl;
+//
+// Here, "main" is the facility (a string) and info is the message level.
+// Note that "LOG" is a macro which knows how to look up the name info
+// inside level_wrapper. The macro introduces C++ code equivalent to:
+//
+// if (...message should be ignored...) then; else sink_os
+//
+// NOTE: As with assert(), the code in the output stream following the
+// LOG() macro will ***not be executed*** if the level of the message is
+// numerically greater (i.e. more verbose) than the log level of the facility.
+//
+// - To set the default log level (for unspecified facilities):
+//
+// Logger::set_default_level(s); // s is a level name or number, e.g. "info" or "2"
+//
+// - To set the log level for the "main" facility:
+//
+// Logger::set_facility_level("main", s); // s is a level name or number
+//
+// - By using these functions, one can set log levels using command-line
+// parameters and achieve dynamic log level settings without recompiling.
+
+
+#ifndef __LOGGER_HPP
+#define __LOGGER_HPP
+
+#include <string>
+#include <vector>
+#include <map>
+#include <sstream>
+#include <iostream>
+#include <mutex>
+
+namespace level_wrapper
+{
+    // Log levels, numbered from 0 (error) upwards. The LOG macro emits a
+    // message only when its level is numerically less than or equal to the
+    // level configured for the facility, so larger values are more verbose.
+    enum level
+    {
+        error = 0,
+        warning,
+        info,
+        debug,
+        debug1,
+        debug2
+    };
+}
+
+// One Logger object represents one log record.
+//
+// The constructor writes a header into the object's own string buffer, the
+// caller streams the message into it, and the destructor hands the finished
+// record to the sink with a single write() call.
+//
+// The class also owns the process-wide level configuration (a default level
+// plus optional per-facility overrides). All reads and writes of that
+// configuration go through one shared mutex.
+class Logger
+    : public std::ostringstream
+{
+public:
+    typedef level_wrapper::level level;
+
+    // Constructor: initialize buffer with the record header
+    //   "= <facility>.<numeric level> <file>:<line> <function> "
+    // `os` is the sink written to on destruction; it is stored by pointer and
+    // must therefore outlive this object.
+    Logger(const std::string& facility, level msg_level,
+           const std::string& file_name, unsigned line_num, const std::string& func_name,
+           std::ostream& os = std::clog)
+        : _os_p(&os)
+    {
+        *this << "= " << facility << "." << int(msg_level)
+              << " " << file_name << ":" << line_num
+              << " " << func_name << " ";
+    }
+    // Destructor: dump buffer to output.
+    ~Logger()
+    {
+        // str() returns a copy of the buffer, so take it once instead of twice
+        const std::string record = this->str();
+        _os_p->write(record.data(), record.size());
+    }
+    // Produce l-value for output chaining.
+    std::ostream& l_value() { return *this; }
+
+    // static methods for setting and getting facility log levels.
+    static level get_default_level()
+    {
+        std::lock_guard< std::mutex > lg(level_mutex());
+        return default_level();
+    }
+    static void set_default_level(level l)
+    {
+        std::lock_guard< std::mutex > lg(level_mutex());
+        default_level() = l;
+    }
+    static void set_default_level(int l)
+    {
+        set_default_level(get_level(l));
+    }
+    static void set_default_level(const std::string& s)
+    {
+        set_default_level(get_level(s));
+    }
+    // Level configured for `facility`, or the default level when the facility
+    // has no override.
+    static level get_facility_level(const std::string& facility)
+    {
+        std::lock_guard< std::mutex > lg(level_mutex());
+        const std::map< std::string, level >& levels = facility_level_map();
+        const auto it = levels.find(facility);
+        // Read default_level() directly: get_default_level() would try to lock
+        // the (non-recursive) mutex a second time.
+        return it != levels.end() ? it->second : default_level();
+    }
+    static void set_facility_level(const std::string& facility, level l)
+    {
+        std::lock_guard< std::mutex > lg(level_mutex());
+        facility_level_map()[facility] = l;
+    }
+    static void set_facility_level(const std::string& facility, int l)
+    {
+        set_facility_level(facility, get_level(l));
+    }
+    static void set_facility_level(const std::string& facility, const std::string& s)
+    {
+        set_facility_level(facility, get_level(s));
+    }
+    // static methods for setting log levels from command-line options
+    //
+    // `l` is either "<level>" (sets the default level) or "<facility>:<level>"
+    // (sets the level of one facility), where <level> is a name or a number.
+    // When `os_p` is non-null the resulting numeric level is reported on it.
+    static void set_level_from_option(const std::string& l, std::ostream* os_p = nullptr)
+    {
+        size_t i = l.find(':');
+        if (i == std::string::npos)
+        {
+            set_default_level(l);
+            if (os_p)
+            {
+                (*os_p) << "set default log level to: "
+                        << static_cast< int >(Logger::get_default_level()) << std::endl;
+            }
+        }
+        else
+        {
+            set_facility_level(l.substr(0, i), l.substr(i + 1));
+            if (os_p)
+            {
+                (*os_p) << "set log level of '" << l.substr(0, i) << "' to: "
+                        << static_cast< int >(Logger::get_facility_level(l.substr(0, i))) << std::endl;
+            }
+        }
+    }
+    static void set_levels_from_options(const std::vector< std::string >& v, std::ostream* os_p = nullptr)
+    {
+        for (const auto& l : v)
+        {
+            set_level_from_option(l, os_p);
+        }
+    }
+    // public static utility functions (used by LOG macro)
+    // They normalise a level given as enum, integer or string to `level`.
+    static level get_level(level l) { return l; }
+    static level get_level(int i) { return static_cast< level >(i); }
+    static level get_level(const std::string& s) { return level_from_string(s); }
+    // public static member (used by LOG macro)
+    // Per-thread slot holding the level of the message currently being logged.
+    static level& thread_local_last_level()
+    {
+        static thread_local level _last_level = level_wrapper::error;
+        return _last_level;
+    }
+private:
+    // sink for this Logger object
+    std::ostream* _os_p;
+
+    // private static data members
+    // (function-local statics, so that this header needs no .cpp file)
+    static level& default_level()
+    {
+        static level _default_level = level_wrapper::error;
+        return _default_level;
+    }
+    static std::map< std::string, level >& facility_level_map()
+    {
+        static std::map< std::string, level > _facility_level_map;
+        return _facility_level_map;
+    }
+    // Single mutex guarding both default_level() and facility_level_map(),
+    // shared by the getters and the setters.
+    static std::mutex& level_mutex()
+    {
+        static std::mutex _level_mutex;
+        return _level_mutex;
+    }
+    // private static utility functions
+    // Parse a level given as a number or as one of the names
+    // error/warning/info/debug/debug1/debug2. A numeric value is not
+    // range-checked. Unparseable input prints a message on std::cerr and
+    // terminates the program with exit status 1.
+    static level level_from_string(const std::string& s)
+    {
+        // The appended newline keeps the stream from hitting end-of-file when
+        // the whole string is a number, so good() stays true in that case.
+        std::istringstream iss(s + "\n");
+        int tmp_int = -1;
+        iss >> tmp_int;
+        if (iss.good())
+        {
+            return level(tmp_int);
+        }
+        else
+        {
+            if (s == "error") return level_wrapper::error;
+            else if (s == "warning") return level_wrapper::warning;
+            else if (s == "info") return level_wrapper::info;
+            else if (s == "debug") return level_wrapper::debug;
+            else if (s == "debug1") return level_wrapper::debug1;
+            else if (s == "debug2") return level_wrapper::debug2;
+            else
+            {
+                std::cerr << "could not parse log level: " << s << "\n";
+                std::exit(1);
+            }
+        }
+    }
+}; // class Logger
+
+// Evaluates to a std::string holding __FILE__ with everything up to and
+// including the last '/' removed (the whole of __FILE__ if it has no '/').
+// NOTE(review): this name and the __LOG_* / __NARGS* names below start with a
+// double underscore, which the C++ standard reserves for the implementation.
+#define __FILENAME__ (std::string(__FILE__).find('/') != std::string::npos? std::string(__FILE__).substr(std::string(__FILE__).rfind('/') + 1) : std::string(__FILE__))
+
+/**
+ * LOG macro
+ *
+ * Synopsis:
+ * LOG(facility, level_spec, sink) << message
+ * LOG(facility, level_spec) << message
+ * LOG(level_spec) << message
+ *
+ * `facility` : string
+ * `level_spec` : integer, string, or logger level
+ * `sink` : sink ostream
+ *
+ * Log to `facility` at logger level `level_spec` and dump output to `sink`.
+ * If sink is omitted, it defaults to std::clog.
+ * If `facility` is omitted (LOG is given a single argument), the macro
+ * LOG_FACILITY is used instead.
+ */
+
+// Expansion, in order:
+//  1. a braced block that brings level_wrapper into scope (so a bare `info`
+//     resolves) and stores the message level in the per-thread slot;
+//  2. an if/else: the `if` branch is empty when the message level exceeds the
+//     facility level; otherwise the `else` branch creates a temporary Logger
+//     whose stream receives the `<< message` that follows the macro.
+// NOTE(review): the expansion is two statements (block + if/else), so using
+// LOG as the unbraced body of an `if`/`for`/`while` would guard only the
+// first block. Wrap such uses in braces.
+#define __LOG_3(facility, level_spec, sink) \
+ { using namespace level_wrapper; Logger::thread_local_last_level() = Logger::get_level(level_spec); } \
+ if (Logger::thread_local_last_level() > Logger::get_facility_level(facility)) ; \
+ else Logger(facility, Logger::thread_local_last_level(), __FILENAME__, __LINE__, __func__, sink).l_value()
+
+// Same as __LOG_3 but without a sink argument, so Logger's default sink
+// (std::clog) is used.
+#define __LOG_2(facility, level_spec) \
+ { using namespace level_wrapper; Logger::thread_local_last_level() = Logger::get_level(level_spec); } \
+ if (Logger::thread_local_last_level() > Logger::get_facility_level(facility)) ; \
+ else Logger(facility, Logger::thread_local_last_level(), __FILENAME__, __LINE__, __func__).l_value()
+
+// Single-argument form: the facility is taken from the LOG_FACILITY macro,
+// which the including code must define.
+#define __LOG_1(level_spec) \
+ __LOG_2(LOG_FACILITY, level_spec)
+
+// we need 2-level indirection in order to trigger expansion after token pasting
+// http://stackoverflow.com/questions/1597007/creating-c-macro-with-and-line-token-concatenation-with-positioning-macr
+// http://stackoverflow.com/a/11763196/717706
+#define __LOG_aux2(N, ...) __LOG_ ## N (__VA_ARGS__)
+#define __LOG_aux1(N, ...) __LOG_aux2(N, __VA_ARGS__)
+
+// __NARGS(...) expands to the number of arguments it was given (1 to 9): the
+// arguments shift the descending list 9..1 so that the count lands in the
+// _9 position of __NARGS_AUX.
+#define __NARGS_AUX(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, ...) _9
+#define __NARGS(...) __NARGS_AUX(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0)
+
+// Public entry point: dispatches to __LOG_1, __LOG_2 or __LOG_3 according to
+// the number of arguments.
+#define LOG(...) __LOG_aux1(__NARGS(__VA_ARGS__), __VA_ARGS__)
+
+#endif
diff --git a/src/common/logsum.cpp b/src/common/logsum.cpp
index 0c5b21a..8425d0e 100644
--- a/src/common/logsum.cpp
+++ b/src/common/logsum.cpp
@@ -93,6 +93,9 @@ p7_FLogsumError(float a, float b)
return (exp(approx) - exp(exact));
}
+// Calls p7_FLogsumInit() once while this translation unit's static objects
+// are initialized, so callers do not have to remember to do it themselves.
+// The helper type and object live in an unnamed namespace: nothing outside
+// this file needs them, and their generic names must not clash with (or
+// violate the one-definition rule against) same-named entities elsewhere.
+namespace {
+struct Init_Caller { Init_Caller() { p7_FLogsumInit(); } };
+Init_Caller init_object;
+} // unnamed namespace
+
/*****************************************************************
* 2. Benchmark driver.
*****************************************************************/
diff --git a/src/common/logsumset.hpp b/src/common/logsumset.hpp
new file mode 100644
index 0000000..85fdd54
--- /dev/null
+++ b/src/common/logsumset.hpp
@@ -0,0 +1,80 @@
+#ifndef __LOGSUMSET_HPP
+#define __LOGSUMSET_HPP
+
+#include <cassert>
+#include <cmath>
+#include <set>
+
+#include "logsum.h"
+//#include "logger.hpp"
+
+// Accumulator for a sum of values held in log space, combined with
+// p7_FLogsum().
+//
+// Two modes, selected by `use_set`:
+//  - use_set == false: every add() folds the value into a running total.
+//  - use_set == true : add() only stores the value; val() later combines the
+//    stored values pairwise, always taking the two smallest first
+//    (presumably to limit precision loss; see the optional warning in val()).
+//
+// Float_Type must work with p7_FLogsum(), std::isnan() and operator<.
+template< typename Float_Type >
+class logsumset
+{
+public:
+    // Empty accumulator; its value is -INFINITY, i.e. the log of zero.
+    logsumset(bool use_set) : _val_set(), _val(-INFINITY), _use_set(use_set) {}
+
+    // Accumulator initialised with every element of [it, it_end).
+    template < typename Input_Iterator >
+    logsumset(Input_Iterator it, Input_Iterator it_end, bool use_set)
+        : logsumset(use_set)
+    {
+        // The iterator has to advance on every pass; without the increment
+        // the loop never terminates for a non-empty range.
+        for (; it != it_end; ++it)
+        {
+            add(*it);
+        }
+    }
+
+    // Accumulator initialised with every element of the range `rg`
+    // (anything providing begin() and end()).
+    template < typename Input_Range >
+    logsumset(const Input_Range& rg, bool use_set)
+        : logsumset(rg.begin(), rg.end(), use_set) {}
+
+    // Read / change the accumulation mode.
+    const bool& use_set() const { return _use_set; }
+    bool& use_set() { return _use_set; }
+
+    // Add one log-space value, either deferred (set mode) or immediately.
+    void add(Float_Type v)
+    {
+        if (_use_set)
+        {
+            _val_set.insert(v);
+        }
+        else
+        {
+            _val = p7_FLogsum(_val, v);
+        }
+    }
+
+    // Current log-sum of everything added so far.
+    //
+    // Not const: pending values in the set are reduced into _val and the set
+    // is left empty, so a later call returns the cached result.
+    Float_Type val()
+    {
+        if (not _val_set.empty())
+        {
+            // include the running total so both modes can be mixed
+            _val_set.insert(_val);
+            while (_val_set.size() > 1)
+            {
+                // pop the two smallest values ...
+                Float_Type a = *_val_set.begin();
+                assert(not std::isnan(a));
+                _val_set.erase(_val_set.begin());
+                Float_Type b = *_val_set.begin();
+                assert(not std::isnan(b));
+                _val_set.erase(_val_set.begin());
+#ifdef LOG
+                if (b - a > 15.7 and b > -80)
+                {
+                    LOG("logsumset", warning)
+                        << "precision loss: a=" << a << " b=" << b << std::endl;
+                }
+#endif
+                // ... and put their log-sum back
+                _val_set.insert(p7_FLogsum(a, b));
+            }
+            _val = *_val_set.begin();
+            _val_set.erase(_val_set.begin());
+        }
+        return _val;
+    }
+
+private:
+    std::multiset< Float_Type > _val_set; // values waiting to be combined (set mode)
+    Float_Type _val;                      // running / cached log-sum
+    bool _use_set;                        // true: defer additions into _val_set
+}; // class logsumset
+
+#endif
diff --git a/src/common/nanopolish_alphabet.cpp b/src/common/nanopolish_alphabet.cpp
index 500108c..84e7a65 100644
--- a/src/common/nanopolish_alphabet.cpp
+++ b/src/common/nanopolish_alphabet.cpp
@@ -5,9 +5,14 @@
//
// nanopolish_alphabet -- support for multiple alphabets
//
-#include <assert.h>
+#include <cassert>
+#include <vector>
#include "nanopolish_alphabet.h"
+//
+// DNAAlphabet
+//
+
const uint8_t DNAAlphabet::_rank[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -26,10 +31,14 @@ const uint8_t DNAAlphabet::_rank[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
+const char* DNAAlphabet::_name = "nucleotide";
const char* DNAAlphabet::_base = "ACGT";
const char* DNAAlphabet::_complement = "TGCA";
const uint32_t DNAAlphabet::_size = 4;
+//
+// methyl-cytosine in CG context
+//
const uint8_t MethylCpGAlphabet::_rank[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -48,8 +57,122 @@ const uint8_t MethylCpGAlphabet::_rank[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
+
+// Name of this alphabet.
+const char* MethylCpGAlphabet::_name = "cpg";
const char* MethylCpGAlphabet::_base = "ACGMT";
+// Same length as _base; appears to give, position by position, the
+// complement of each _base symbol ('M' maps to 'G') -- confirm against the
+// class declaration.
+const char* MethylCpGAlphabet::_complement = "TGCGA";
const uint32_t MethylCpGAlphabet::_size = 5;
+// One recognition site of length 2: "CG", written "MG" when methylated.
+// The *_methylated_complement entry is "GM", the reverse of "MG".
+const uint32_t MethylCpGAlphabet::_num_recognition_sites = 1;
+const uint32_t MethylCpGAlphabet::_recognition_length = 2;
+const char* MethylCpGAlphabet::_recognition_sites[] = { "CG" };
+const char* MethylCpGAlphabet::_recognition_sites_methylated[] = { "MG" };
+const char* MethylCpGAlphabet::_recognition_sites_methylated_complement[] = { "GM" };
+
+//
+// Dam methylation: methyl-adenine in GATC context
+//
+
+// Lookup table from character code to rank (the index into _base).
+// The only non-zero entries are 'C' -> 1, 'G' -> 2, 'M' -> 3 and 'T' -> 4;
+// every other character code, including 'A', maps to 0.
+const uint8_t MethylDamAlphabet::_rank[256] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,1,0,0,0,2,0,0,0,0,0,3,0,0,
+ 0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+// _base lists the symbols in rank order; _complement holds, at the same
+// rank, the symbol each one is complemented to (M is complemented to T here)
+const char* MethylDamAlphabet::_name = "dam";
+const char* MethylDamAlphabet::_base = "ACGMT";
+const char* MethylDamAlphabet::_complement = "TGCTA";
+const uint32_t MethylDamAlphabet::_size = 5;
+
+// Recognition site tables: the three arrays are indexed by the same site
+// number and every entry is _recognition_length characters long
+const uint32_t MethylDamAlphabet::_num_recognition_sites = 1;
+const uint32_t MethylDamAlphabet::_recognition_length = 4;
+const char* MethylDamAlphabet::_recognition_sites[] = { "GATC" };
+const char* MethylDamAlphabet::_recognition_sites_methylated[] = { "GMTC" };
+const char* MethylDamAlphabet::_recognition_sites_methylated_complement[] = { "CTMG" };
+
+//
+// Dcm methylation: methyl-cytosine in CCAGG and CCTGG context
+//
+
+// Lookup table from character code to rank (the index into _base).
+// The only non-zero entries are 'C' -> 1, 'G' -> 2, 'M' -> 3 and 'T' -> 4;
+// every other character code, including 'A', maps to 0.
+const uint8_t MethylDcmAlphabet::_rank[256] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,1,0,0,0,2,0,0,0,0,0,3,0,0,
+ 0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+// _base lists the symbols in rank order; _complement holds, at the same
+// rank, the symbol each one is complemented to (M is complemented to G here)
+const char* MethylDcmAlphabet::_name = "dcm";
+const char* MethylDcmAlphabet::_base = "ACGMT";
+const char* MethylDcmAlphabet::_complement = "TGCGA";
+const uint32_t MethylDcmAlphabet::_size = 5;
+
+// Recognition site tables: the three arrays are indexed by the same site
+// number and every entry is _recognition_length characters long
+const uint32_t MethylDcmAlphabet::_num_recognition_sites = 2;
+const uint32_t MethylDcmAlphabet::_recognition_length = 5;
+const char* MethylDcmAlphabet::_recognition_sites[] = { "CCAGG", "CCTGG" };
+const char* MethylDcmAlphabet::_recognition_sites_methylated[] = { "CMAGG", "CMTGG" };
+const char* MethylDcmAlphabet::_recognition_sites_methylated_complement[] = { "GGTMC", "GGAMC" };
+
+// Global objects
DNAAlphabet gDNAAlphabet;
MethylCpGAlphabet gMCpGAlphabet;
+MethylDamAlphabet gMethylDamAlphabet;
+MethylDcmAlphabet gMethylDcmAlphabet;
+
+// Collect pointers to the global alphabet objects. The order matters to
+// callers that take the first match: plain DNA comes first, followed by
+// the CpG, Dam and Dcm methylation alphabets.
+std::vector<const Alphabet*> get_alphabet_list()
+{
+    return { &gDNAAlphabet, &gMCpGAlphabet, &gMethylDamAlphabet, &gMethylDcmAlphabet };
+}
+
+// Return the first alphabet from get_alphabet_list() that contains every
+// character of bases, or nullptr when no alphabet does
+const Alphabet* best_alphabet(const char *bases)
+{
+    const std::vector<const Alphabet*> candidates = get_alphabet_list();
+
+    for (size_t idx = 0; idx < candidates.size(); ++idx) {
+        const Alphabet* candidate = candidates[idx];
+        if (candidate->contains_all(bases)) {
+            return candidate;
+        }
+    }
+
+    return nullptr;
+}
+
+// Look up an alphabet by the name reported by its get_name().
+// An unknown name is fatal: an error is printed to stderr and the process
+// exits with EXIT_FAILURE.
+const Alphabet* get_alphabet_by_name(const std::string& name)
+{
+    const std::vector<const Alphabet*> candidates = get_alphabet_list();
+
+    for (size_t idx = 0; idx < candidates.size(); ++idx) {
+        const Alphabet* candidate = candidates[idx];
+        if (candidate->get_name() == name) {
+            return candidate;
+        }
+    }
+
+    fprintf(stderr, "Error, unknown alphabet name: %s\n", name.c_str());
+    exit(EXIT_FAILURE);
+
+    // not reached, keeps compilers that cannot see exit() never returns quiet
+    return nullptr;
+}
+
diff --git a/src/common/nanopolish_alphabet.h b/src/common/nanopolish_alphabet.h
index 22824dd..a9fefae 100644
--- a/src/common/nanopolish_alphabet.h
+++ b/src/common/nanopolish_alphabet.h
@@ -9,12 +9,50 @@
#define NANOPOLISH_ALPHABET_H
#include <string>
+#include <cstring>
#include <inttypes.h>
#include <assert.h>
#include "nanopolish_iupac.h"
-// A table to map { A, C, G, T } => { 0, 1, 2, 3 }
-extern const uint8_t dna_base_rank[];
+#define METHYLATED_SYMBOL 'M'
+
+// Describes how a string lines up with a recognition site
+struct RecognitionMatch
+{
+    unsigned offset; // the matched position in the recognition site
+    unsigned length; // the length of the match, 0 indicates no match
+    bool covers_methylated_site; // does the match cover an M base?
+};
+
+// Check whether a recognition site starts at position i of str
+//
+//   str         - the sequence being scanned
+//   i           - position in str where the match has to start; callers
+//                 in this file always pass i < str.length()
+//   recognition - the recognition site, NUL-terminated
+//   rl          - the number of characters in recognition
+//
+// Two kinds of match are reported:
+//   1. only for i == 0: all of str occurs inside recognition. offset is the
+//      place where it occurs and length is str.length()
+//   2. str[i..] starts with recognition or, when fewer than rl characters
+//      are left, the rest of str is a prefix of recognition. offset is 0
+// When neither applies the returned length is 0.
+inline RecognitionMatch match_to_site(const std::string& str, size_t i, const char* recognition, size_t rl)
+{
+    RecognitionMatch match;
+    match.length = 0;
+    match.offset = 0;
+    match.covers_methylated_site = false;
+
+    // Case 1: str is a substring of recognition
+    // The result of the search is only used at i == 0, so the scan is
+    // skipped for every other position
+    const char* p = (i == 0) ? strstr(recognition, str.c_str()) : NULL;
+    if(p != NULL) {
+        match.offset = p - recognition;
+        match.length = str.length();
+    } else {
+        // Case 2: the suffix str[i..n] is a prefix of recognition
+        size_t cl = std::min(rl, str.length() - i);
+        if(str.compare(i, cl, recognition, cl) == 0) {
+            match.offset = 0;
+            match.length = cl;
+        }
+    }
+
+    //printf("Match site: %s %s %s %d %d\n", str.c_str(), str.substr(i).c_str(), recognition, match.offset, match.length);
+    if(match.length > 0) {
+        match.covers_methylated_site =
+            str.substr(i, match.length).find_first_of(METHYLATED_SYMBOL) != std::string::npos;
+    }
+
+    return match;
+}
// Abstract base class for alphabets
class Alphabet
@@ -23,7 +61,16 @@ class Alphabet
// basic functions
virtual uint8_t rank(char b) const = 0;
virtual char base(uint8_t r) const = 0;
+ virtual char complement(char b) const = 0;
virtual uint32_t size() const = 0;
+ virtual std::string get_name() const = 0;
+
+ // support for methylated bases with recognition sequences
+ virtual size_t num_recognition_sites() const = 0;
+ virtual size_t recognition_length() const = 0;
+ virtual const char* get_recognition_site(size_t i) const = 0;
+ virtual const char* get_recognition_site_methylated(size_t i) const = 0;
+ virtual const char* get_recognition_site_methylated_complement(size_t i) const = 0;
// return the lexicographic rank of the kmer amongst all strings of
// length k for this alphabet
@@ -64,129 +111,253 @@ class Alphabet
return n;
}
- // reverse complement a string over this alphabet
- virtual std::string reverse_complement(const std::string& seq) const = 0;
+ // reverse-complement a string
+ // when the string contains methylated bases, the methylation
+ // symbol is transferred to the output strand in the appropriate position
+ virtual std::string reverse_complement(const std::string& str) const
+ {
+ std::string out(str.length(), 'A');
+ size_t i = 0; // input
+ int j = str.length() - 1; // output
+ while(i < str.length()) {
+ int recognition_index = -1;
+ RecognitionMatch match;
- // remove ambiguous nucleotides from the string
- virtual std::string disambiguate(const std::string& seq) const = 0;
-};
+ // Does this location (partially) match a methylated recognition site?
+ for(size_t j = 0; j < num_recognition_sites(); ++j) {
+ match = match_to_site(str, i, get_recognition_site_methylated(j), recognition_length());
+ if(match.length > 0 && match.covers_methylated_site) {
+ recognition_index = j;
+ break;
+ }
+ }
-struct DNAAlphabet : public Alphabet
-{
- static const uint8_t _rank[256];
- static const char* _base;
- static const char* _complement;
- static const uint32_t _size;
+ // If this subsequence matched a methylated recognition site,
+ // copy the complement of the site to the output
+ if(recognition_index != -1) {
+ for(size_t k = match.offset; k < match.offset + match.length; ++k) {
+ out[j--] = get_recognition_site_methylated_complement(recognition_index)[k];
+ i += 1;
+ }
+ } else {
+ // complement a single base
+ assert(str[i] != METHYLATED_SYMBOL);
+ out[j--] = complement(str[i++]);
+ }
+ }
+ return out;
+ }
- virtual uint8_t rank(char b) const { return _rank[b]; }
- virtual char base(uint8_t r) const { return _base[r]; }
- virtual uint32_t size() const { return _size; }
+ // return a new copy of the string with IUPAC ambiguity characters changed
+ virtual std::string disambiguate(const std::string& str) const
+ {
+ std::string out(str);
+ size_t i = 0;
+ while(i < out.length()) {
+ size_t stride = 1;
+ bool is_recognition_site = false;
- virtual std::string reverse_complement(const std::string& seq) const
- {
- std::string out(seq.length(), 'A');
- size_t last_pos = seq.length() - 1;
- for(int i = last_pos; i >= 0; --i) {
- out[last_pos - i] = _complement[_rank[seq[i]]];
+ // Does this location (partially) match a methylated recognition site?
+ for(size_t j = 0; j < num_recognition_sites(); ++j) {
+
+ RecognitionMatch match = match_to_site(str, i, get_recognition_site_methylated(j), recognition_length());
+ if(match.length > 0) {
+ stride = match.length; // skip to end of match
+ is_recognition_site = true;
+ break;
+ }
+ }
+
+ // disambiguate if not a recognition site
+ if(!is_recognition_site) {
+ assert(IUPAC::isValid(out[i]));
+ out[i] = IUPAC::getPossibleSymbols(out[i])[0];
+ stride = 1;
+ }
+
+ i += stride;
+ }
+ return out;
}
- return out;
- }
- // return a new copy of the string with ambiguous characters changed
- virtual std::string disambiguate(const std::string& str) const
- {
- std::string out(str);
- for(size_t i = 0; i < str.length(); ++i) {
- assert(IUPAC::isValid(str[i]));
- out[i] = IUPAC::getPossibleSymbols(str[i])[0];
+ // Return a copy of str in which every complete occurrence of a
+ // recognition site is replaced by its methylated form. Alphabets
+ // without recognition sites return str unchanged.
+ virtual std::string methylate(const std::string& str) const
+ {
+     std::string result(str);
+     size_t pos = 0;
+     while(pos < result.length()) {
+         // by default move on by a single character
+         size_t advance = 1;
+
+         // Try each recognition site at this position
+         for(size_t site = 0; site < num_recognition_sites(); ++site) {
+
+             RecognitionMatch hit = match_to_site(str, pos, get_recognition_site(site), recognition_length());
+
+             // Partial matches do not count, the whole site has to be present
+             if(hit.length != recognition_length()) {
+                 continue;
+             }
+
+             // Swap in the methylated site and jump past it
+             result.replace(pos, recognition_length(), get_recognition_site_methylated(site));
+             advance = hit.length;
+             break;
+         }
+
+         pos += advance;
+     }
+     return result;
+ }
- return out;
- }
-};
+ // Remove methylated bases according to the recognition site
+ std::string unmethylate(const std::string& str) const
+ {
+ std::string out(str);
+ size_t i = 0;
+ while(i < out.length()) {
+ size_t stride = 1;
-// DNABaseMap with methyl-cytosine
-struct MethylCpGAlphabet : public Alphabet
-{
- static const uint8_t _rank[256];
- static const char* _base;
- static const char* _complement;
- static const uint32_t _size;
+ // Does this location (partially) match a methylated recognition site?
+ for(size_t j = 0; j < num_recognition_sites(); ++j) {
- virtual uint8_t rank(char b) const { return _rank[b]; }
- virtual char base(uint8_t r) const { return _base[r]; }
- virtual uint32_t size() const { return _size; }
+ RecognitionMatch match = match_to_site(str, i, get_recognition_site_methylated(j), recognition_length());
+ if(match.length > 0) {
- virtual std::string reverse_complement(const std::string& seq) const
- {
- std::string out(seq.length(), 'A');
- size_t i = 0; // input
- int j = seq.length() - 1; // output
- while(i < seq.length()) {
- if(seq[i] == 'M') {
-
- out[j--] = 'G';
- i += 1;
-
- // CpG methylation model requires M to be followed by G
- // (if there is space)
- if(j >= 0) {
- assert(i < seq.length());
- assert(seq[i] == 'G');
- out[j--] = 'M';
- ++i;
+ // Replace by the unmethylated version
+ out.replace(i, match.length, get_recognition_site(j) + match.offset, match.length);
+ stride = match.length; // skip to end of match
+ break;
+ }
}
- } else {
- out[j--] = DNAAlphabet::_complement[DNAAlphabet::_rank[seq[i++]]];
+
+ i += stride;
}
+ return out;
}
- return out;
+
+ // does this alphabet contain all of the nucleotides in bases?
+ virtual inline bool contains_all(const char *bases) const = 0;
+};
+
+// Declares the static data members every alphabet class defines:
+// the character-to-rank lookup table, the alphabet's name, its symbols
+// in rank order, the complement symbol for each rank, and the symbol count.
+#define BASIC_MEMBER_BOILERPLATE \
+ static const uint8_t _rank[256]; \
+ static const char* _name; \
+ static const char* _base; \
+ static const char* _complement; \
+ static const uint32_t _size;
+
+// Defines the basic Alphabet accessors on top of the static members
+// declared by BASIC_MEMBER_BOILERPLATE.
+// The base is converted to uint8_t before it indexes _rank: where char is
+// signed, a plain int conversion of a byte >= 0x80 would be negative and
+// read in front of the 256-entry table.
+#define BASIC_ACCESSOR_BOILERPLATE \
+    virtual std::string get_name() const { return _name; } \
+    virtual uint8_t rank(char b) const { return _rank[static_cast<uint8_t>(b)]; } \
+    virtual char base(uint8_t r) const { return _base[r]; } \
+    virtual char complement(char b) const { return _complement[_rank[static_cast<uint8_t>(b)]]; } \
+    virtual uint32_t size() const { return _size; }
+
+struct DNAAlphabet : public Alphabet
+{
+ // members
+ BASIC_MEMBER_BOILERPLATE
+
+ // functions
+ BASIC_ACCESSOR_BOILERPLATE
+
+ // no methylation in this alphabet
+ virtual size_t num_recognition_sites() const { return 0; }
+ virtual size_t recognition_length() const { return 0; }
+ virtual const char* get_recognition_site(size_t) const { return NULL; }
+ virtual const char* get_recognition_site_methylated(size_t) const { return NULL; }
+ virtual const char* get_recognition_site_methylated_complement(size_t) const {
+ return NULL;
}
- // return a new copy of the string with ambiguous characters changed
- virtual std::string disambiguate(const std::string& str) const
+ // does this alphabet contain all of the nucleotides in bases?
+ virtual inline bool contains_all(const char *bases) const
{
- std::string out(str);
- for(size_t i = 0; i < str.length(); ++i) {
- if(str[i] == 'M' && i != str.length() - 1 && str[i + 1] == 'G') {
- // CpG site, assume its methylated not an ambiguity symbol
- out[i] = 'M';
- } else {
- assert(IUPAC::isValid(str[i]));
- out[i] = IUPAC::getPossibleSymbols(str[i])[0];
- }
- }
- return out;
+ return strspn(bases, _base) == strlen(bases);
}
+};
+
+// Declares the static data members used by alphabets with methylated
+// bases: the number of recognition sites, the length of a site, and three
+// arrays indexed by site number that hold each site, its methylated form
+// and the complement of the methylated form.
+#define METHYLATION_MEMBER_BOILERPLATE \
+ static const uint32_t _num_recognition_sites; \
+ static const uint32_t _recognition_length; \
+ static const char* _recognition_sites[]; \
+ static const char* _recognition_sites_methylated[]; \
+ static const char* _recognition_sites_methylated_complement[];
+
+// Defines the recognition-site accessors of Alphabet by returning the
+// static members declared by METHYLATION_MEMBER_BOILERPLATE.
+// The site index i is used as-is, no range check is performed.
+#define METHYLATION_ACCESSOR_BOILERPLATE \
+ virtual size_t num_recognition_sites() const { return _num_recognition_sites; } \
+ virtual size_t recognition_length() const { return _recognition_length; } \
+ virtual const char* get_recognition_site(size_t i) const { return _recognition_sites[i]; } \
+ virtual const char* get_recognition_site_methylated(size_t i) const { return _recognition_sites_methylated[i]; } \
+ virtual const char* get_recognition_site_methylated_complement(size_t i) const { \
+ return _recognition_sites_methylated_complement[i]; \
+ }
- // Convert CpGs of the sequence to mCpG
- std::string methylate(const std::string& str) const
+//
+// methyl-cytosine in CG context
+//
+struct MethylCpGAlphabet : public Alphabet
+{
+ // member variables, expanded by macros
+ BASIC_MEMBER_BOILERPLATE
+ METHYLATION_MEMBER_BOILERPLATE
+
+ // member functions
+ BASIC_ACCESSOR_BOILERPLATE
+ METHYLATION_ACCESSOR_BOILERPLATE
+
+ // does this alphabet contain all of the nucleotides in bases?
+ virtual inline bool contains_all(const char *bases) const
{
- std::string out(str);
- for(size_t i = 0; i < out.length() - 1; ++i) {
- if(out[i] == 'C' && out[i + 1] == 'G') {
- out[i] = 'M';
- }
- }
- return out;
+ return strspn(bases, _base) == strlen(bases);
}
+};
+
+//
+// Dam methylation: methyl-adenine in GATC context
+//
+struct MethylDamAlphabet : public Alphabet
+{
+ // member variables, expanded by macros
+ BASIC_MEMBER_BOILERPLATE
+ METHYLATION_MEMBER_BOILERPLATE
+
+ // member functions
+ BASIC_ACCESSOR_BOILERPLATE
+ METHYLATION_ACCESSOR_BOILERPLATE
- // Convert methylated bases to C
- std::string unmethylate(const std::string& str) const
+ // does this alphabet contain all of the nucleotides in bases?
+ virtual inline bool contains_all(const char *bases) const
{
- std::string out(str);
- for(size_t i = 0; i < out.length(); ++i) {
- if(out[i] == 'M') {
- out[i] = 'C';
- }
- }
- return out;
+ return strspn(bases, _base) == strlen(bases);
}
+};
+//
+// Dcm methylation: methyl-cytosine in CCAGG and CCTGG context
+//
+struct MethylDcmAlphabet : public Alphabet
+{
+    // static data members, declared through the shared macros
+    BASIC_MEMBER_BOILERPLATE
+    METHYLATION_MEMBER_BOILERPLATE
+
+    // accessor functions, defined through the shared macros
+    BASIC_ACCESSOR_BOILERPLATE
+    METHYLATION_ACCESSOR_BOILERPLATE
+
+    // true when every character of bases is one of this alphabet's symbols
+    virtual inline bool contains_all(const char *bases) const
+    {
+        // strspn stops at the first character that is not in _base, so
+        // landing on the terminator means nothing was rejected
+        return bases[strspn(bases, _base)] == '\0';
+    }
};
// Global alphabet objects that can be re-used
extern DNAAlphabet gDNAAlphabet;
extern MethylCpGAlphabet gMCpGAlphabet;
+extern MethylDamAlphabet gMethylDamAlphabet;
+extern MethylDcmAlphabet gMethylDcmAlphabet;
+
+const Alphabet* best_alphabet(const char *bases);
+const Alphabet* get_alphabet_by_name(const std::string& name);
#endif
diff --git a/src/common/nanopolish_common.cpp b/src/common/nanopolish_common.cpp
index e9c163c..873906f 100644
--- a/src/common/nanopolish_common.cpp
+++ b/src/common/nanopolish_common.cpp
@@ -28,3 +28,31 @@ std::vector<std::string> split(std::string in, char delimiter)
out.push_back(in.substr(lastPos));
return out;
}
+
+// Returns true if str ends with suffix. An empty suffix always matches.
+bool ends_with(const std::string& str, const std::string& suffix)
+{
+    // A suffix longer than the string can never match; checking this first
+    // also keeps the subtraction below from wrapping around
+    if(suffix.size() > str.size()) {
+        return false;
+    }
+
+    // Compare the tail of str directly. Searching with find() would stop at
+    // the first occurrence and miss a suffix that also appears earlier in
+    // the string, e.g. "abab" and "ab".
+    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+// from: http://stackoverflow.com/questions/9330915/number-of-combinations-n-choose-r-in-c
+//
+// Number of ways to choose k items out of n; 0 when k > n.
+// The running product is kept in size_t, the same type as the result, so
+// counts beyond the range of int are no longer truncated. Values that do
+// not fit in size_t still wrap around.
+size_t nChoosek(size_t n, size_t k)
+{
+    if (k > n) return 0;
+
+    // C(n, k) == C(n, n - k), work with the smaller of the two
+    if (k > n - k) k = n - k;
+    if (k == 0) return 1;
+
+    size_t result = n;
+    for( size_t i = 2; i <= k; ++i ) {
+        // result holds C(n, i - 1) here, so the product below is
+        // C(n, i) * i and the division is exact
+        result *= (n-i+1);
+        result /= i;
+    }
+    return result;
+}
+
diff --git a/src/common/nanopolish_common.h b/src/common/nanopolish_common.h
index dae1cc9..85b5bfb 100644
--- a/src/common/nanopolish_common.h
+++ b/src/common/nanopolish_common.h
@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.4.0"
+#define PACKAGE_VERSION "0.5.0"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
@@ -108,4 +108,20 @@ inline double add_logs(const double a, const double b)
// split a string based on a delimiter
std::vector<std::string> split(std::string in, char delimiter);
+// returns true if the provided string ends with the given suffix
+bool ends_with(const std::string& str, const std::string& suffix);
+
+// from: http://stackoverflow.com/questions/9330915/number-of-combinations-n-choose-r-in-c
+size_t nChoosek(size_t n, size_t k);
+
+// print a warning message to stderr a single time
+// this is only for debugging, please don't litter the code with them
+//
+// The expansion is one brace-enclosed statement: the flag is private to
+// each use of the macro, so it can appear several times in the same scope
+// and as the body of an if without braces. x is only evaluated the first
+// time the statement is reached.
+#define WARN_ONCE(x) { static bool _warn_once = true; if(_warn_once) { \
+    fprintf(stderr, "WARNING: [%s]\n", (x)); _warn_once = false; } }
+
+// Build a string from the elements of any container that offers
+// begin() and end(), one character per element
+template<class T>
+std::string array2str(const T& array)
+{
+    std::string text;
+    text.assign(array.begin(), array.end());
+    return text;
+}
#endif
diff --git a/src/common/nanopolish_fast5_map.cpp b/src/common/nanopolish_fast5_map.cpp
index c2530e8..d935219 100644
--- a/src/common/nanopolish_fast5_map.cpp
+++ b/src/common/nanopolish_fast5_map.cpp
@@ -19,7 +19,7 @@
//
#define FOFN_SUFFIX ".fast5.fofn"
-KSEQ_INIT(gzFile, gzread);
+KSEQ_INIT(gzFile, gzread)
Fast5Map::Fast5Map(const std::string& fasta_filename)
{
@@ -29,7 +29,7 @@ Fast5Map::Fast5Map(const std::string& fasta_filename)
struct stat fofn_file_s;
struct stat fasta_file_s;
int fofn_ret = stat(fofn_filename.c_str(), &fofn_file_s);
- int fasta_ret = stat(fasta_filename.c_str(), &fasta_file_s);
+ stat(fasta_filename.c_str(), &fasta_file_s);
// Use the stored fofn if its available and newer than the fasta
if(fofn_ret == 0 && fofn_file_s.st_mtime > fasta_file_s.st_mtime) {
diff --git a/src/common/nanopolish_iupac.h b/src/common/nanopolish_iupac.h
index 4466213..b78faf3 100644
--- a/src/common/nanopolish_iupac.h
+++ b/src/common/nanopolish_iupac.h
@@ -25,6 +25,6 @@ namespace IUPAC
// Returns a string defining the possible unambiguous bases for each symbol
// in the alphabet
std::string getPossibleSymbols(char c);
-};
+}
#endif
diff --git a/src/common/nanopolish_model_names.cpp b/src/common/nanopolish_model_names.cpp
new file mode 100644
index 0000000..19cc69e
--- /dev/null
+++ b/src/common/nanopolish_model_names.cpp
@@ -0,0 +1,66 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_model_names -- Get metadata for ONT model names
+//
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <map>
+#include "nanopolish_model_names.h"
+#include "nanopolish_common.h"
+
+// Name tables. The two strand tables are indexed by ModelMetadata::model_idx
+// and the kit table by ModelMetadata::kit.
+static std::string strand_by_idx[] = { "template", "complement.pop1", "complement.pop2" };
+static std::string short_strand_by_idx[] = { "t", "c.p1", "c.p2" };
+static std::string short_kit_by_idx[] = { "005", "006", "007" };
+
+// Metadata for every model name this program recognizes.
+// Each value lists { strand_idx, model_idx, kit }, in the order the fields
+// are declared in ModelMetadata.
+static std::map< std::string, ModelMetadata > known_models = {
+
+ // SQK005 models
+ { "r7.3_template_median68pA.model", {T_IDX, 0, KV_SQK005 } },
+ { "r7.3_complement_median68pA_pop1.model", {C_IDX, 1, KV_SQK005 } },
+ { "r7.3_complement_median68pA_pop2.model", {C_IDX, 2, KV_SQK005 } },
+
+ // SQK006 models
+ { "r7.3_e6_70bps_6mer_template_median68pA.model", {T_IDX, 0, KV_SQK006 } },
+ { "r7.3_e6_70bps_6mer_complement_median68pA_pop1.model", {C_IDX, 1, KV_SQK006 } },
+ { "r7.3_e6_70bps_6mer_complement_median68pA_pop2.model", {C_IDX, 2, KV_SQK006 } },
+
+ // SQK007 models
+ { "r9.template.model", {T_IDX, 0, KV_SQK007 } },
+ { "r9.template.5mer.base.model", {T_IDX, 0, KV_SQK007 } },
+ { "r9.template.5mer.base.model.trained", {T_IDX, 0, KV_SQK007 } }
+
+};
+
+// Short label made of the short strand name, a dot and the short kit
+// number, for example "t.005"
+std::string ModelMetadata::get_short_name() const
+{
+    assert(this->model_idx < 3);
+    assert(this->kit < NUM_KITS);
+
+    std::string short_name = short_strand_by_idx[this->model_idx];
+    short_name += ".";
+    short_name += short_kit_by_idx[this->kit];
+    return short_name;
+}
+
+// Kit name: "SQK" followed by the short kit number, for example "SQK005"
+std::string ModelMetadata::get_kit_name() const
+{
+    assert(this->kit < NUM_KITS);
+
+    std::string kit_name("SQK");
+    kit_name += short_kit_by_idx[this->kit];
+    return kit_name;
+}
+
+// Long strand name for this model_idx: "template", "complement.pop1" or
+// "complement.pop2"
+std::string ModelMetadata::get_strand_model_name() const
+{
+    assert(this->model_idx < 3);
+
+    const std::string& strand_name = strand_by_idx[this->model_idx];
+    return strand_name;
+}
+
+// Look up the metadata stored for a model name.
+// An unknown name is fatal: an error is written to std::cerr and the
+// process exits with EXIT_FAILURE.
+ModelMetadata get_model_metadata_from_name(const std::string& name)
+{
+    const auto entry = known_models.find(name);
+    if (entry == known_models.end()) {
+        std::cerr << "Error: unknown model: " << name << "\n";
+        exit(EXIT_FAILURE);
+    }
+
+    return entry->second;
+}
diff --git a/src/common/nanopolish_model_names.h b/src/common/nanopolish_model_names.h
new file mode 100644
index 0000000..69955f7
--- /dev/null
+++ b/src/common/nanopolish_model_names.h
@@ -0,0 +1,41 @@
+//---------------------------------------------------------
+// Copyright 2016 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_model_names -- Get metadata for ONT model names
+//
+#ifndef NANOPOLISH_MODEL_NAMES_H
+#define NANOPOLISH_MODEL_NAMES_H
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <math.h>
+
+// Sequencing kit versions.
+// The numeric values are used as array indices by ModelMetadata (see
+// get_kit_name), so they start at 0 and NUM_KITS has to stay last.
+enum KitVersion
+{
+ KV_SQK005 = 0,
+ KV_SQK006,
+ KV_SQK007,
+ NUM_KITS // number of kits, not a kit itself
+};
+
+// Metadata describing a pore model: the strand and model it belongs to
+// and the kit version it was made for
+struct ModelMetadata
+{
+ uint8_t strand_idx; // NOTE(review): presumably template vs complement strand (T_IDX / C_IDX) - confirm
+ uint8_t model_idx; // template = 0, pop1 = 1, pop2 = 2
+ KitVersion kit; // kit version of the model
+
+ // short strand name, a dot, then the short kit number
+ std::string get_short_name() const;
+ // "SQK" followed by the short kit number
+ std::string get_kit_name() const;
+ // long strand name selected by model_idx
+ std::string get_strand_model_name() const;
+};
+
+//
+// Functions
+//
+ModelMetadata get_model_metadata_from_name(const std::string& name);
+
+#endif
diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp
index bcd3d31..6852982 100644
--- a/src/common/nanopolish_variant.cpp
+++ b/src/common/nanopolish_variant.cpp
@@ -7,9 +7,14 @@
//
#include <algorithm>
#include <map>
+#include <iterator>
+#include <iomanip>
#include "nanopolish_profile_hmm.h"
#include "nanopolish_variant.h"
#include "nanopolish_haplotype.h"
+#include "nanopolish_model_names.h"
+
+//#define DEBUG_HAPLOTYPE_SELECTION 1
// return a new copy of the string with gap symbols removed
std::string remove_gaps(const std::string& str)
@@ -176,7 +181,7 @@ std::vector<Variant> select_variants(const std::vector<Variant>& candidate_varia
}
}
- if(best_variant_lp - base_lp > 1.0) {
+ if(best_variant_lp - base_lp > 0.1) {
// move the best variant from the all list to the selected list
Variant& best_variant = all_variants[best_variant_idx];
best_variant.add_info("TotalReads", input.size());
@@ -202,14 +207,34 @@ std::vector<Variant> select_variants(const std::vector<Variant>& candidate_varia
std::vector<Variant> select_variant_set(const std::vector<Variant>& candidate_variants,
Haplotype base_haplotype,
const std::vector<HMMInputData>& input,
+ const int max_haplotypes,
const uint32_t alignment_flags)
{
size_t num_variants = candidate_variants.size();
- size_t num_haplotypes = 1 << num_variants;
+
+ // Determine the maximum number of variants we can jointly test
+ // without exceeding the maximum number of haplotypes
+ size_t sum_num_haplotypes = 0;
+ size_t max_r = 1;
- double base_lp_by_strand[2] = { 0.0f, 0.0f };
- double base_lp_by_rc[2] = { 0.0f, 0.0f };
+ while(max_r <= num_variants) {
+ size_t num_haplotypes_r = nChoosek(num_variants, max_r);
+ if(num_haplotypes_r + sum_num_haplotypes < max_haplotypes) {
+ sum_num_haplotypes += num_haplotypes_r;
+ } else {
+ break;
+ }
+ //printf("n: %zu r: %zu nCr: %zu sum: %zu\n", num_variants, max_r, num_haplotypes_r, sum_num_haplotypes);
+ max_r += 1;
+ }
+ max_r -= 1;
+
+ // Calculate the likelihood of the haplotype with no additional variants added
+ // also do some bookkeeping about per-read/per-model likelihoods
double base_lp = 0.0f;
+ double base_lp_by_model_strand[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+ int read_counts[6] = { 0, 0, 0, 0, 0, 0 };
+
std::vector<double> base_lp_by_read(input.size());
#pragma omp parallel for
@@ -221,78 +246,196 @@ std::vector<Variant> select_variant_set(const std::vector<Variant>& candidate_va
{
base_lp_by_read[j] = tmp;
base_lp += tmp;
- base_lp_by_strand[input[j].strand] += tmp;
- base_lp_by_rc[input[j].rc] += tmp;
+
+ int mid = input[j].read->pore_model[input[j].strand].metadata.model_idx;
+ int cid = 2 * mid + input[j].rc;
+ base_lp_by_model_strand[cid] += tmp;
+ read_counts[cid] += 1;
}
}
double best_lp = -INFINITY;
std::vector<Variant> best_variant_set;
- // The haplotype id is a bitmask indicating which variants
- // to apply to get the haplotype. We skip the empty
- // variant set.
- for(size_t hi = 1; hi < num_haplotypes; ++hi) {
+ // Score haplotypes by adding 1, 2, ..., max_r variant sets to it
+ for(size_t r = 1; r <= max_r; ++r) {
+ // From: http://stackoverflow.com/questions/9430568/generating-combinations-in-c
+ std::vector<bool> variant_selector(num_variants);
+ std::fill(variant_selector.begin(), variant_selector.begin() + r, true);
- Haplotype current_haplotype = base_haplotype;
- std::vector<Variant> current_variant_set;
+ do {
+ Haplotype current_haplotype = base_haplotype;
+ std::vector<Variant> current_variant_set;
+ bool good_haplotype = true;
+
+ for(size_t vi = 0; vi < num_variants; vi++) {
+ if(!variant_selector[vi]) {
+ continue;
+ }
- for(size_t vi = 0; vi < num_variants; vi++) {
- // if bit vi is set in the haplotype id, apply this variant
- if( (hi & (1 << vi)) == 0) {
+ current_variant_set.push_back(candidate_variants[vi]);
+ good_haplotype = good_haplotype && current_haplotype.apply_variant(current_variant_set.back());
+ }
+
+ // skip the haplotype if any of the variants could not be applied to it
+ if(!good_haplotype) {
continue;
}
- current_variant_set.push_back(candidate_variants[vi]);
- current_haplotype.apply_variant(current_variant_set.back());
- }
-
- // score the haplotype
- double current_lp = 0.0f;
- double current_lp_by_strand[2] = { 0.0f, 0.0f };
- double current_lp_by_rc[2] = { 0.0f, 0.0f };
+ // score the haplotype
+ double current_lp = 0.0f;
+ double current_lp_by_model_strand[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+ size_t supporting_reads = 0;
+ std::vector<double> relative_lp_by_read(input.size(), 0.0f);
- size_t supporting_reads = 0;
+ #pragma omp parallel for
+ for(size_t j = 0; j < input.size(); ++j) {
+ double tmp = profile_hmm_score(current_haplotype.get_sequence(), input[j], alignment_flags);
- #pragma omp parallel for
- for(size_t j = 0; j < input.size(); ++j) {
- double tmp = profile_hmm_score(current_haplotype.get_sequence(), input[j], alignment_flags);
- #pragma omp critical
- {
- current_lp += tmp;
- supporting_reads += tmp > base_lp_by_read[j];
- current_lp_by_strand[input[j].strand] += tmp;
- current_lp_by_rc[input[j].rc] += tmp;
+ #pragma omp critical
+ {
+ current_lp += tmp;
+ supporting_reads += tmp > base_lp_by_read[j];
+ int mid = input[j].read->pore_model[input[j].strand].metadata.model_idx;
+ int cid = 2 * mid + input[j].rc;
+ current_lp_by_model_strand[cid] += tmp;
+ relative_lp_by_read[j] = tmp - base_lp_by_read[j];
+ }
}
- }
- if(current_lp > best_lp && current_lp - base_lp > 1.0) {
- best_lp = current_lp;
- best_variant_set = current_variant_set;
-
- // Annotate variants
- for(size_t vi = 0; vi < best_variant_set.size(); ++vi) {
- Variant& v = best_variant_set[vi];
- v.add_info("TotalReads", input.size());
- v.add_info("SupportingReads", supporting_reads);
- v.add_info("SupportFraction", (double)supporting_reads / input.size());
- v.add_info("TemplateQuality", current_lp_by_strand[0] - base_lp_by_strand[0]);
- v.add_info("ComplementQuality", current_lp_by_strand[1] - base_lp_by_strand[1]);
- v.add_info("ForwardQuality", current_lp_by_rc[0] - base_lp_by_rc[0]);
- v.add_info("ReverseQuality", current_lp_by_rc[1] - base_lp_by_rc[1]);
- v.quality = best_lp - base_lp;
+ if(current_lp > best_lp && current_lp - base_lp > 0.1) {
+ best_lp = current_lp;
+ best_variant_set = current_variant_set;
+
+ // Annotate variants
+ for(size_t vi = 0; vi < best_variant_set.size(); ++vi) {
+ Variant& v = best_variant_set[vi];
+ v.add_info("TotalReads", input.size());
+ v.add_info("SupportingReads", supporting_reads);
+ v.add_info("SupportFraction", (double)supporting_reads / input.size());
+
+ // Annotate variants with qualities from the three possible models
+ std::string names[3] = { "Template", "Comp.P1", "Comp.P2" };
+
+ for(int mid = 0; mid < 3; mid++) {
+ int cid = 2 * mid;
+ double s0 = current_lp_by_model_strand[cid] - base_lp_by_model_strand[cid];
+ int c0 = read_counts[cid];
+
+ double s1 = current_lp_by_model_strand[cid + 1] - base_lp_by_model_strand[cid + 1];
+ int c1 = read_counts[cid + 1];
+
+ std::stringstream ss;
+ ss << std::setprecision(4) << s0 / c0 << "," << s1 / c1;
+ v.add_info(names[mid], ss.str());
+ }
+
+ /*
+ v.add_info("TemplateQuality", current_lp_by_strand[0] - base_lp_by_strand[0]);
+ v.add_info("ComplementQuality", current_lp_by_strand[1] - base_lp_by_strand[1]);
+ v.add_info("ForwardQuality", current_lp_by_rc[0] - base_lp_by_rc[0]);
+ v.add_info("ReverseQuality", current_lp_by_rc[1] - base_lp_by_rc[1]);
+ v.add_info("TAvgQuality", (current_lp_by_model[0] - base_lp_by_model[0]) / model_count[0]);
+ v.add_info("C1AvgQuality", (current_lp_by_model[1] - base_lp_by_model[1]) / model_count[1]);
+ v.add_info("C2AvgQuality", (current_lp_by_model[2] - base_lp_by_model[2]) / model_count[2]);
+ */
+
+ std::stringstream counts;
+ std::ostream_iterator<int> rc_out(counts, ",");
+ std::copy(std::begin(read_counts), std::end(read_counts), rc_out);
+ std::string rc_str = counts.str();
+ v.add_info("ReadCounts", rc_str.substr(0, rc_str.size() - 1));
+
+ std::stringstream scores;
+ std::ostream_iterator<float> scores_out(scores, ",");
+ std::copy(std::begin(relative_lp_by_read), std::end(relative_lp_by_read), scores_out);
+ std::string scores_str = scores.str();
+ v.add_info("Scores", scores_str.substr(0, scores_str.size() - 1));
+
+ v.quality = best_lp - base_lp;
+ }
}
- }
-
#ifdef DEBUG_HAPLOTYPE_SELECTION
- std::stringstream ss;
- for(size_t vi = 0; vi < current_variant_set.size(); ++vi) {
- const Variant& v = current_variant_set[vi];
- ss << (vi > 0 ? "," : "") << v.key();
- }
- fprintf(stderr, "haplotype: %zu variants: %s relative score: %.2lf\n", hi, ss.str().c_str(), current_lp - base_lp);
+ std::stringstream ss;
+ for(size_t vi = 0; vi < current_variant_set.size(); ++vi) {
+ const Variant& v = current_variant_set[vi];
+ ss << (vi > 0 ? "," : "") << v.key();
+ }
+ fprintf(stderr, "haplotype: %zu variants: %s relative score: %.2lf\n", hi, ss.str().c_str(), current_lp - base_lp);
#endif
+ } while(std::prev_permutation(variant_selector.begin(), variant_selector.end()));
}
return best_variant_set;
}
+// Score every candidate variant on its own against the base haplotype and
+// return the candidates whose summed per-read HMM score is strictly greater
+// than the score of the unmodified base haplotype.
+//
+// The quality of each selected candidate is overwritten with the score
+// improvement; this is written into candidate_variants itself as well as
+// into the returned copy.
+std::vector<Variant> select_positive_scoring_variants(std::vector<Variant>& candidate_variants,
+                                                      Haplotype base_haplotype,
+                                                      const std::vector<HMMInputData>& input,
+                                                      const uint32_t alignment_flags)
+{
+    // total score of the base haplotype over all reads
+    double reference_total = 0.0f;
+    #pragma omp parallel for
+    for(size_t ri = 0; ri < input.size(); ++ri) {
+        double read_lp = profile_hmm_score(base_haplotype.get_sequence(), input[ri], alignment_flags);
+
+        #pragma omp atomic
+        reference_total += read_lp;
+    }
+
+    std::vector<Variant> passing;
+    for(Variant& candidate : candidate_variants) {
+
+        // haplotype carrying only this candidate
+        Haplotype variant_haplotype = base_haplotype;
+        variant_haplotype.apply_variant(candidate);
+
+        double variant_total = 0.0f;
+        #pragma omp parallel for
+        for(size_t ri = 0; ri < input.size(); ++ri) {
+            double read_lp = profile_hmm_score(variant_haplotype.get_sequence(), input[ri], alignment_flags);
+
+            #pragma omp atomic
+            variant_total += read_lp;
+        }
+
+        // keep only strict improvements over the base haplotype
+        if(!(variant_total > reference_total)) {
+            continue;
+        }
+
+        candidate.quality = variant_total - reference_total;
+        passing.push_back(candidate);
+    }
+
+    return passing;
+}
+
+// Return a copy of input_variant whose quality is the change in total
+// per-read HMM score obtained by applying the variant to base_haplotype
+// (score with the variant minus score without it).
+Variant score_variant(const Variant& input_variant,
+                      Haplotype base_haplotype,
+                      const std::vector<HMMInputData>& input,
+                      const uint32_t alignment_flags)
+{
+    Variant scored = input_variant;
+
+    // score of the haplotype before the variant is applied
+    double total_before = 0.0f;
+    #pragma omp parallel for
+    for(size_t ri = 0; ri < input.size(); ++ri) {
+        double read_lp = profile_hmm_score(base_haplotype.get_sequence(), input[ri], alignment_flags);
+
+        #pragma omp atomic
+        total_before += read_lp;
+    }
+
+    // base_haplotype is a by-value copy, so it can be modified in place
+    base_haplotype.apply_variant(input_variant);
+
+    // score of the haplotype carrying the variant
+    double total_after = 0.0f;
+    #pragma omp parallel for
+    for(size_t ri = 0; ri < input.size(); ++ri) {
+        double read_lp = profile_hmm_score(base_haplotype.get_sequence(), input[ri], alignment_flags);
+
+        #pragma omp atomic
+        total_after += read_lp;
+    }
+
+    scored.quality = total_after - total_before;
+    return scored;
+}
+
diff --git a/src/common/nanopolish_variant.h b/src/common/nanopolish_variant.h
index e99c97d..3eed2e2 100644
--- a/src/common/nanopolish_variant.h
+++ b/src/common/nanopolish_variant.h
@@ -32,7 +32,7 @@ struct Variant
return out.str();
}
- void write_vcf(FILE* fp)
+ void write_vcf(FILE* fp) const
{
fprintf(fp, "%s\t%zu\t%s\t", ref_name.c_str(), ref_position + 1, ".");
fprintf(fp, "%s\t%s\t%.1lf\t", ref_seq.c_str(), alt_seq.c_str(), quality);
@@ -58,7 +58,7 @@ struct Variant
assert(!ref_name.empty());
assert(!ref_seq.empty());
assert(!alt_seq.empty());
- assert(ref_position >= 0);
+ //assert(ref_position >= 0);
assert(quality >= 0.0f);
}
@@ -90,6 +90,15 @@ inline bool sortByPosition(const Variant& a, const Variant& b)
a.ref_name < b.ref_name;
}
+// Comparator ordering variants by their key() string, for use with ordered
+// containers and sorting algorithms.
+class VariantKeyComp
+{
+    public:
+        // const-qualified so the comparator can be called through the const
+        // comparator object held by containers such as std::set / std::map
+        bool operator()(const Variant& a, const Variant& b) const
+        {
+            return a.key() < b.key();
+        }
+};
+
// Determine potential variants between the reference and haplotype string
std::vector<Variant> extract_variants(const std::string& reference,
const std::string& haplotype);
@@ -109,7 +118,20 @@ std::vector<Variant> select_variants(const std::vector<Variant>& candidate_varia
std::vector<Variant> select_variant_set(const std::vector<Variant>& candidate_variants,
Haplotype base_haplotype,
const std::vector<HMMInputData>& input,
+ const int max_haplotypes,
const uint32_t alignment_flags);
+// Select variants that have a positive score wrt the base haplotype
+std::vector<Variant> select_positive_scoring_variants(std::vector<Variant>& candidate_variants,
+ Haplotype base_haplotype,
+ const std::vector<HMMInputData>& input,
+ const uint32_t alignment_flags);
+
+// Score a single variant
+Variant score_variant(const Variant& input_variant,
+ Haplotype base_haplotype,
+ const std::vector<HMMInputData>& input,
+ const uint32_t alignment_flags);
+
#endif
diff --git a/src/common/progress.h b/src/common/progress.h
index e4f3aeb..b8b5cd3 100644
--- a/src/common/progress.h
+++ b/src/common/progress.h
@@ -15,7 +15,7 @@ class Progress
{
public:
- Progress(const std::string message) : m_message(message), m_os(std::cerr)
+ Progress(const std::string message) : m_os(std::cerr), m_message(message)
{
#if HAVE_CLOCK_GETTIME
timespec start;
@@ -31,8 +31,8 @@ class Progress
{
// print
- int max_leader = 40;
- int bar_width = 50;
+ unsigned max_leader = 40;
+ unsigned bar_width = 50;
std::string leader;
if(m_message.size() > max_leader) {
@@ -43,13 +43,13 @@ class Progress
m_os << leader << " [";
- int pos = bar_width * progress;
- for (int i = 0; i < bar_width; ++i) {
+ unsigned pos = bar_width * progress;
+ for (unsigned i = 0; i < bar_width; ++i) {
if (i < pos) m_os << "=";
else if (i == pos) m_os << ">";
else m_os << " ";
}
- m_os << "] " << int(progress * 100.0) << "% in " << get_elapsed_seconds() << "s\r";
+ m_os << "] " << unsigned(progress * 100.0) << "% in " << get_elapsed_seconds() << "s\r";
m_os.flush();
}
diff --git a/src/hmm/invgauss.hpp b/src/hmm/invgauss.hpp
new file mode 100644
index 0000000..140f29d
--- /dev/null
+++ b/src/hmm/invgauss.hpp
@@ -0,0 +1,149 @@
+#ifndef __INVGAUSS_HPP
+#define __INVGAUSS_HPP
+
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <tuple>
+
+// Inverse Gaussian distribution parameterised by a mean (mu) and a shape
+// (lambda), with an interface modelled on the <random> distribution classes.
+template < typename Real_Type >
+class inverse_gaussian_distribution
+{
+public:
+    typedef Real_Type result_type;
+    // parameters are stored as the pair (mu, lambda)
+    typedef std::pair< Real_Type, Real_Type > param_type;
+
+    inverse_gaussian_distribution(const param_type& p) : _p(p) {}
+    inverse_gaussian_distribution(Real_Type mu, Real_Type lambda) : inverse_gaussian_distribution(std::make_pair(mu, lambda)) {}
+    // default: mu = 1, lambda = 1
+    inverse_gaussian_distribution() : inverse_gaussian_distribution(1.0, 1.0) {}
+
+    // draw one sample using the supplied parameters instead of the stored ones
+    template < class Generator >
+    Real_Type operator () (Generator& g, const param_type& p) { return generate(g, p.first, p.second); }
+    // draw one sample using the stored parameters
+    template < class Generator >
+    Real_Type operator () (Generator& g) { return generate(g, _p.first, _p.second); }
+
+    // no state is kept between draws, so there is nothing to reset
+    void reset() const {}
+
+    Real_Type mean() const { return _p.first; }
+    Real_Type shape() const { return _p.second; }
+
+    param_type param() const { return _p; }
+    void param(const param_type& p) { _p = p; }
+
+    Real_Type min() const { return 0.0; }
+    Real_Type max() const { return std::numeric_limits< Real_Type >::max(); }
+
+    // two distributions are equal when their (mu, lambda) parameters are equal
+    // (fixed: previously called a non-existent params() member)
+    friend bool operator == (const inverse_gaussian_distribution& lhs, const inverse_gaussian_distribution& rhs) { return lhs._p == rhs._p; }
+    friend bool operator != (const inverse_gaussian_distribution& lhs, const inverse_gaussian_distribution& rhs) { return not (lhs == rhs); }
+
+    // write "mu lambda"; operator >> reads the same format back
+    // (fixed: previously read a non-existent member `p` instead of `_p`)
+    friend std::ostream& operator << (std::ostream& os, const inverse_gaussian_distribution& d)
+    {
+        os << d._p.first << " " << d._p.second;
+        return os;
+    }
+    friend std::istream& operator >> (std::istream& is, inverse_gaussian_distribution& d)
+    {
+        is >> d._p.first >> d._p.second;
+        return is;
+    }
+
+private:
+    param_type _p;
+
+    // Draw one variate: square a standard normal draw, take the smaller root
+    // x of the resulting quadratic, then return either x or mu^2 / x
+    // depending on a uniform draw compared against mu / (mu + x).
+    template < class Generator >
+    static Real_Type generate(Generator& g, Real_Type mu, Real_Type lambda)
+    {
+        auto v = std::normal_distribution< Real_Type >()(g);
+        auto y = v * v;
+        auto x = (mu + ( mu * mu * y ) / ( lambda * 2.0 )
+                  - ( mu * std::sqrt( mu * y * lambda * 4.0 + mu * mu * y * y ) ) / ( lambda * 2.0 ) );
+        auto z = std::uniform_real_distribution< Real_Type >()(g);
+        if (z <= mu / (mu + x))
+        {
+            return x;
+        }
+        else
+        {
+            return ( mu * mu ) / x;
+        }
+    }
+}; // class inverse_gaussian_distribution
+
+#endif
+
+#ifdef INVGAUSS_SAMPLE
+/*
+
+Compile:
+
+g++ -std=c++11 -D INVGAUSS_SAMPLE -x c++ invgauss.hpp -o invgauss
+
+Run:
+
+./invgauss 1 40 1000 42 >ig.1.40.txt
+
+Visualize with matplotlib:
+
+import numpy as np
+import scipy as sp
+import matplotlib.pyplot as plt
+
+def inv_gauss(x, m, l):
+ return sp.sqrt(l / (2 * sp.pi * x * x * x)) * sp.exp(- l * (x - m) * (x - m) / (2 * m * m * x))
+
+x = np.loadtxt(open('ig.1.40.txt'))
+b = np.linspace(.4, 3, 100)
+cb = b[:-1] + (b[1] - b[0])/2
+y = inv_gauss(cb, 1, 40)
+
+plt.figure(0)
+plt.hist(x, bins=b, normed=1, histtype='step')
+plt.gca().set_color_cycle(None)
+plt.plot(cb, y, '--')
+
+*/
+
+#include <cassert>
+#include <chrono>
+#include <sstream>
+
+using namespace std;
+
+// Sample driver: print n draws from an inverse Gaussian distribution.
+// Usage: invgauss mu lambda n [seed]
+// A seed of 0 (or no seed) selects a clock-derived seed.
+int main(int argc, char* argv[])
+{
+    if (argc < 4)
+    {
+        cerr << "use: " << argv[0] << " mu lambda n [seed]" << endl;
+        return EXIT_FAILURE;
+    }
+    float mu = 0.0;
+    float lambda = 0.0;
+    size_t n = 0;
+    size_t seed = 0;
+    istringstream(argv[1]) >> mu;
+    clog << "mu: " << mu << endl;
+    istringstream(argv[2]) >> lambda;
+    clog << "lambda: " << lambda << endl;
+    // Validate explicitly rather than with assert(): asserts are compiled out
+    // under NDEBUG, and an unparsable argument leaves the value at 0.
+    if (!(mu > 0.0) || !(lambda > 0.0))
+    {
+        cerr << "error: mu and lambda must be positive numbers" << endl;
+        return EXIT_FAILURE;
+    }
+    istringstream(argv[3]) >> n;
+    clog << "n: " << n << endl;
+    if (argc >= 5)
+    {
+        istringstream(argv[4]) >> seed;
+    }
+    if (seed == 0)
+    {
+        seed = chrono::high_resolution_clock::now().time_since_epoch().count();
+    }
+    clog << "seed: " << seed << endl;
+    std::mt19937 rg(seed);
+    inverse_gaussian_distribution< float > ig(mu, lambda);
+    for (size_t i = 0; i < n; ++i)
+    {
+        // '\n' instead of endl: no need to flush after every sample
+        cout << ig(rg) << '\n';
+    }
+    return EXIT_SUCCESS;
+}
+
+#endif
diff --git a/src/hmm/nanopolish_duration_model.cpp b/src/hmm/nanopolish_duration_model.cpp
new file mode 100755
index 0000000..9af7dfc
--- /dev/null
+++ b/src/hmm/nanopolish_duration_model.cpp
@@ -0,0 +1,90 @@
+//---------------------------------------------------------
+// Copyright 2016 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_duration_model -- Model the duration
+// of bases passing through the pore
+//
+#include "nanopolish_duration_model.h"
+#include "nanopolish_profile_hmm.h"
+
+// Default constructor; the body is empty.
+DurationModel::DurationModel()
+{
+
+}
+
+// Align the read's events to `sequence` with profile_hmm_align and return,
+// for every k-mer position of the sequence, the summed duration of the events
+// aligned to it. Alignment entries in state 'K' contribute no duration.
+// Returns an empty vector when the sequence is shorter than one k-mer.
+std::vector<double> DurationModel::generate_aligned_durations(const std::string& sequence,
+                                                              const HMMInputData& data,
+                                                              const uint32_t alignment_flags)
+{
+    // NOTE(review): k is taken from pore_model[0] rather than
+    // pore_model[data.strand] -- confirm k is the same for every strand model
+    size_t k = data.read->pore_model[0].k;
+
+    // guard against size_t underflow in the k-mer count below
+    if(sequence.size() < k) {
+        return std::vector<double>();
+    }
+    size_t num_kmers = sequence.size() - k + 1;
+
+    // initialize the vector of durations
+    std::vector<double> duration_by_kmer_position(num_kmers, 0.0);
+    std::vector<HMMAlignmentState> alignment = profile_hmm_align(sequence, data, alignment_flags);
+    for(size_t ai = 0; ai < alignment.size(); ai++) {
+
+        /*
+        fprintf(stderr, "alignment[%zu]: %s %zu %c %.5lf\n", ai,
+                sequence.substr(alignment[ai].kmer_idx, 6).c_str(),
+                alignment[ai].event_idx,
+                alignment[ai].state,
+                data.read->get_duration(alignment[ai].event_idx, data.strand));
+        */
+        if(alignment[ai].state != 'K') {
+            duration_by_kmer_position[alignment[ai].kmer_idx] += data.read->get_duration(alignment[ai].event_idx, data.strand);
+        }
+    }
+    return duration_by_kmer_position;
+}
+
+// Convenience overload: evaluate log_gamma_sum with the built-in default
+// gamma parameters (shape 2.461964, rate 587.2858).
+double DurationModel::log_gamma_sum(double x, double n)
+{
+    GammaParameters defaults;
+    defaults.shape = 2.461964;
+    defaults.rate = 587.2858;
+    return log_gamma_sum(x, defaults, n);
+}
+
+// Log density at x of a gamma distribution with shape n * params.shape and
+// rate params.rate:
+//   na * log(rate) - lgamma(na) + (na - 1) * log(x) - rate * x,  na = n * shape
+// x must be non-negative (asserted).
+double DurationModel::log_gamma_sum(double x, const GammaParameters& params, double n)
+{
+ assert(x >= 0.0);
+
+ // shape parameter scaled by n
+ double na = n * params.shape;
+ return (na * log(params.rate)) - lgamma(na) + (na - 1) * log(x) - params.rate * x;
+}
+
+// Fit gamma parameters to `input` with the shape fixed at 2.461964; only the
+// rate is estimated, as shape * n / sum(input).
+// NOTE(review): an empty input makes n zero, so sigma is 0/0 and the
+// returned rate is not a number.
+GammaParameters DurationModel::gamma_fit(const std::vector<double>& input)
+{
+ // s is only needed by the commented-out shape estimate below; the call is
+ // kept because gamma_fit_calculate_s also asserts every input is positive
+ double s = gamma_fit_calculate_s(input);
+ //double k = (3 - s + sqrt(pow(s - 3.0, 2.0) + 24.0 * s)) / (12 * s);
+ double k = 2.461964; // use known k
+ double sum = 0;
+ double n = input.size();
+ for(size_t i = 0; i < input.size(); ++i) {
+ sum += input[i];
+ }
+ // sigma is the sample mean divided by the shape
+ double sigma = sum / (k * n);
+
+ GammaParameters params;
+ params.shape = k;
+ params.rate = (1.0 / sigma);
+ return params;
+}
+
+// Return log(mean(input)) - mean(log(input)).
+// Every input value must be strictly positive (asserted).
+double DurationModel::gamma_fit_calculate_s(const std::vector<double>& input)
+{
+    const double count = input.size();
+    double total = 0;
+    double log_total = 0;
+    for(double value : input) {
+        assert(value > 0.0f);
+        total += value;
+        log_total += log(value);
+    }
+
+    return log(total / count) - log_total / count;
+}
diff --git a/src/hmm/nanopolish_duration_model.h b/src/hmm/nanopolish_duration_model.h
new file mode 100755
index 0000000..45f4d36
--- /dev/null
+++ b/src/hmm/nanopolish_duration_model.h
@@ -0,0 +1,57 @@
+//---------------------------------------------------------
+// Copyright 2016 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_duration_model -- Model the duration
+// of bases passing through the pore
+//
+#ifndef NANOPOLISH_DURATION_MODEL_H
+#define NANOPOLISH_DURATION_MODEL_H
+
+#include <stdint.h>
+#include <vector>
+#include <string>
+#include "nanopolish_common.h"
+
+#define MIN_DURATION 0.00025
+#define MAX_INDEX 99
+
+// Parameters of a gamma distribution in shape/rate form.
+struct GammaParameters
+{
+ double shape;
+ double rate;
+};
+
+// Static helpers for modelling event durations with a gamma distribution.
+class DurationModel
+{
+ public:
+ DurationModel();
+
+ //
+ // Sum the durations of the events aligned to each k-mer position of
+ // the sequence
+ //
+ static std::vector<double> generate_aligned_durations(const std::string& sequence,
+ const HMMInputData& data,
+ const uint32_t alignment_flags);
+ //
+ // Log of gamma PDF for the sum of n observations
+ // (the two-argument overload uses built-in default parameters)
+ //
+ static double log_gamma_sum(double x, double n);
+ static double log_gamma_sum(double x, const GammaParameters& params, double n);
+
+ //
+ // Fit the parameters of the gamma distribution for the input data
+ //
+ static GammaParameters gamma_fit(const std::vector<double>& input);
+ static double gamma_fit_calculate_s(const std::vector<double>& input);
+
+ private:
+
+ // singleton accessor function
+ static DurationModel& getInstance()
+ {
+ static DurationModel instance;
+ return instance;
+ }
+};
+
+#endif
diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h
index e90462d..bc315f8 100644
--- a/src/hmm/nanopolish_emissions.h
+++ b/src/hmm/nanopolish_emissions.h
@@ -13,22 +13,20 @@
#include "nanopolish_common.h"
#include "nanopolish_squiggle_read.h"
-//#define MODEL_STDV
//#define DEBUG_HMM_EMISSION 1
// From SO: http://stackoverflow.com/questions/10847007/using-the-gaussian-probability-density-function-in-c
+static const float inv_sqrt_2pi = 0.3989422804014327;
inline float normal_pdf(float x, const GaussianParameters& g)
{
- static const float inv_sqrt_2pi = 0.3989422804014327;
float a = (x - g.mean) / g.stdv;
return inv_sqrt_2pi / g.stdv * exp(-0.5f * a * a);
}
-inline float log_normal_pdf(float x, const GaussianParameters& g)
+inline float normal_pdf(float x, const PoreModelStateParams& s)
{
- static const float log_inv_sqrt_2pi = log(0.3989422804014327);
- float a = (x - g.mean) / g.stdv;
- return log_inv_sqrt_2pi - g.log_stdv + (-0.5f * a * a);
+ float a = (x - s.level_mean) / s.level_stdv;
+ return inv_sqrt_2pi / s.level_stdv * exp(-0.5f * a * a);
}
inline float z_score(const SquiggleRead& read,
@@ -42,56 +40,68 @@ inline float z_score(const SquiggleRead& read,
return (level - model.mean) / model.stdv;
}
+// log(1 / sqrt(2 * pi)), computed once so the log-density functions do not
+// call log() for it on every evaluation
+static const float log_inv_sqrt_2pi = log(0.3989422804014327);
+// Log density at x of a normal distribution with mean s.level_mean and
+// standard deviation s.level_stdv. The log of the standard deviation is read
+// from s.level_log_stdv rather than recomputed here.
+inline float log_normal_pdf(float x, const PoreModelStateParams& s)
+{
+ // standardized deviation of x from the mean
+ float a = (x - s.level_mean) / s.level_stdv;
+ return log_inv_sqrt_2pi - s.level_log_stdv + (-0.5f * a * a);
+}
+
+
+// Log density at x of a normal distribution with mean g.mean and standard
+// deviation g.stdv. The log of the standard deviation is read from
+// g.log_stdv rather than recomputed here.
+inline float log_normal_pdf(float x, const GaussianParameters& g)
+{
+ // standardized deviation of x from the mean
+ float a = (x - g.mean) / g.stdv;
+ return log_inv_sqrt_2pi - g.log_stdv + (-0.5f * a * a);
+}
+
+// Log density at x of an inverse Gaussian distribution with mean s.sd_mean
+// and shape s.sd_lambda:
+//   (log(lambda) - log(2*pi) - 3*log(x) - lambda * ((x - mu) / mu)^2 / x) / 2
+// The caller supplies log_x = log(x), and log(lambda) is read from
+// s.sd_log_lambda, so only log(2*pi) is computed here (once).
+inline float log_invgauss_pdf(float x, float log_x, const PoreModelStateParams& s)
+{
+ static const float log_2pi = log(2 * M_PI);
+ // deviation of x from the mean, relative to the mean
+ float a = (x - s.sd_mean) / s.sd_mean;
+ return (s.sd_log_lambda - log_2pi - 3 * log_x - s.sd_lambda * a * a / x) / 2;
+}
+
+// Accessor for a process-wide switch, initially false. The returned
+// reference may be assigned to in order to change the setting.
+inline bool& model_stdv()
+{
+    static bool enabled = false;
+    return enabled;
+}
+
inline float log_probability_match(const SquiggleRead& read,
uint32_t kmer_rank,
uint32_t event_idx,
- uint8_t strand,
- float state_scale = 1.0f,
- float log_state_scale = 0.0f)
+ uint8_t strand)
{
const PoreModel& pm = read.pore_model[strand];
// event level mean
float level = read.get_drift_corrected_level(event_idx, strand);
-
- GaussianParameters model = pm.get_scaled_parameters(kmer_rank);
-
- // we go to great lengths to avoid calling log() in the inner loop of the HMM
- // for this reason we duplicate data here and require the caller to pass
- // in the scale and log(scale), presumably these are cached
- model.stdv *= state_scale;
- model.log_stdv += log_state_scale;
- float lp = log_normal_pdf(level, model);
-
-#if MODEL_STDV
- // event level stdv
- float stdv = read.events[strand].stdv[event_idx];
- float model_sd_mean = pm.state[kmer_rank].sd_mean * pm.scale_sd;
- float model_sd_stdv = pm.state[kmer_rank].sd_stdv * sqrt(pow(pm.scale_sd, 3.0) / pm.var_sd);
- lp += log_normal_pdf(stdv, model_sd_mean, model_sd_stdv);
-#endif
+ float stdv = read.get_stdv(event_idx, strand);
+ float log_stdv = read.get_log_stdv(event_idx, strand);
+ PoreModelStateParams state = pm.get_scaled_state(kmer_rank);
+
+ // add the event's stdv to the model stdv
+ state.level_stdv = sqrt(pow(state.level_stdv, 2.0) + pow(stdv, 2.0));
+ state.level_log_stdv = 0.5 * add_logs(2.0 * state.level_log_stdv, 2.0 * log_stdv);
+
+ float lp = log_normal_pdf(level, state);
+
+ if(model_stdv())
+ {
+ float lp_stdv = log_invgauss_pdf(stdv, log_stdv, state);
+ lp += lp_stdv;
+ }
#if DEBUG_HMM_EMISSION
- printf("Event[%d] Kmer: %d -- L:%.1lf m: %.1lf s: %.1lf p: %.3lf p_old: %.3lf\n", event_idx, kmer_rank, level, model.mean, model.stdv, exp(lp), normal_pdf(level, model));
+ printf("Event[%d] Kmer: %d -- L:%.1lf m: %.1lf s: %.1lf p: %.3lf p_old: %.3lf\n", event_idx, kmer_rank, level, state.level_mean, state.level_stdv, exp(lp), normal_pdf(level, state));
#endif
return lp;
}
-inline float log_probability_event_insert(const SquiggleRead& read,
- uint32_t kmer_rank,
- uint32_t event_idx,
- uint8_t strand)
-{
- static const float scale = 1.75f;
- static const float log_scale = log(scale);
-
- return log_probability_match(read, kmer_rank, event_idx, strand, scale, log_scale);
-}
-
-inline float log_probability_background(const SquiggleRead& read,
- uint32_t event_idx,
- uint8_t strand)
+inline float log_probability_background(const SquiggleRead&,
+ uint32_t,
+ uint8_t)
{
return -3.0f;
}
diff --git a/src/hmm/nanopolish_hmm_input_sequence.h b/src/hmm/nanopolish_hmm_input_sequence.h
index e121780..b5ea5d4 100644
--- a/src/hmm/nanopolish_hmm_input_sequence.h
+++ b/src/hmm/nanopolish_hmm_input_sequence.h
@@ -23,8 +23,8 @@ class HMMInputSequence
// constructors
HMMInputSequence(const std::string& seq) :
- m_seq(seq),
- m_alphabet(&gDNAAlphabet)
+ m_alphabet(&gDNAAlphabet),
+ m_seq(seq)
{
m_rc_seq = m_alphabet->reverse_complement(seq);
}
@@ -32,9 +32,9 @@ class HMMInputSequence
HMMInputSequence(const std::string& fwd,
const std::string& rc,
const Alphabet* alphabet) :
+ m_alphabet(alphabet),
m_seq(fwd),
- m_rc_seq(rc),
- m_alphabet(alphabet)
+ m_rc_seq(rc)
{
}
@@ -43,6 +43,8 @@ class HMMInputSequence
//
size_t length() const { return m_seq.length(); }
+ // swap sequence and its reverse complement
+ void swap() { m_seq.swap(m_rc_seq); }
// returns the i-th kmer of the sequence
inline std::string get_kmer(uint32_t i, uint32_t k, bool do_rc) const
diff --git a/src/hmm/nanopolish_pore_model_set.cpp b/src/hmm/nanopolish_pore_model_set.cpp
new file mode 100644
index 0000000..147054c
--- /dev/null
+++ b/src/hmm/nanopolish_pore_model_set.cpp
@@ -0,0 +1,100 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_pore_model_set -- A class that maintains
+// a collection of pore models that SquiggleReads
+// can load during initialization.
+//
+#include "nanopolish_pore_model_set.h"
+
+//
+// Destructor; the body is empty.
+PoreModelSet::~PoreModelSet()
+{
+}
+
+//
+// Load every pore model listed in the file-of-filenames (one model path per
+// line) and register it in the singleton under its type and short name.
+// Exits the program if the fofn cannot be opened. Blank lines are skipped.
+void PoreModelSet::initialize(const std::string& fofn_filename)
+{
+    // grab singleton instance
+    PoreModelSet& model_set = getInstance();
+
+    // open the fofn file reader
+    std::ifstream fofn_reader(fofn_filename);
+    if(!fofn_reader.is_open()) {
+        fprintf(stderr, "Error: could not read %s\n", fofn_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    std::string model_filename;
+    while(getline(fofn_reader, model_filename)) {
+
+        // a blank line is not a model path; do not hand it to PoreModel
+        if(model_filename.empty()) {
+            continue;
+        }
+
+        // read the model
+        PoreModel p(model_filename);
+        assert(!p.name.empty());
+        assert(!p.type.empty());
+
+        model_set.model_type_sets[p.type][p.metadata.get_short_name()] = p;
+
+        fprintf(stderr, "registering model %s-%s\n", p.metadata.get_short_name().c_str(), p.type.c_str());
+    }
+}
+
+//
+// True when a model with this short name is registered under this type.
+bool PoreModelSet::has_model(const std::string& type, const std::string& short_name)
+{
+    const auto& sets = getInstance().model_type_sets;
+
+    const auto type_pos = sets.find(type);
+    return type_pos != sets.end() &&
+           type_pos->second.find(short_name) != type_pos->second.end();
+}
+
+//
+// Look up a registered model by type and short name.
+// Prints an error and exits the program if either the type or the model
+// is unknown.
+const PoreModel& PoreModelSet::get_model(const std::string& type, const std::string& short_name)
+{
+    // get_models performs the type lookup and exits with the same
+    // "cannot find model type" error when the type is missing
+    const PoreModelMap& models_of_type = get_models(type);
+
+    auto model_pos = models_of_type.find(short_name);
+    if(model_pos != models_of_type.end()) {
+        return model_pos->second;
+    }
+
+    fprintf(stderr, "Error: cannot find model %s for type %s\n", short_name.c_str(), type.c_str());
+    exit(EXIT_FAILURE);
+}
+
+//
+// Return the map of all models registered under this type.
+// Prints an error and exits the program if the type is unknown.
+const PoreModelMap& PoreModelSet::get_models(const std::string& type)
+{
+    const auto& sets = getInstance().model_type_sets;
+
+    auto type_pos = sets.find(type);
+    if(type_pos != sets.end()) {
+        return type_pos->second;
+    }
+
+    fprintf(stderr, "Error: cannot find model type %s\n", type.c_str());
+    exit(EXIT_FAILURE);
+}
+
+// Register (or replace) a model under the given type, keyed by the model's
+// short name. The update runs inside an OpenMP critical section.
+void PoreModelSet::insert_model(const std::string& type, const PoreModel& model)
+{
+    #pragma omp critical
+    {
+        PoreModelSet& registry = getInstance();
+        std::string short_name = model.metadata.get_short_name();
+        registry.model_type_sets[type][short_name] = model;
+    }
+}
diff --git a/src/hmm/nanopolish_pore_model_set.h b/src/hmm/nanopolish_pore_model_set.h
new file mode 100644
index 0000000..c50e953
--- /dev/null
+++ b/src/hmm/nanopolish_pore_model_set.h
@@ -0,0 +1,72 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_pore_model_set -- A class that maintains
+// a collection of pore models that SquiggleReads
+// can load during initialization.
+//
+#ifndef NANOPOLISH_PORE_MODEL_SET_H
+#define NANOPOLISH_PORE_MODEL_SET_H
+
+#include <map>
+#include "nanopolish_poremodel.h"
+
+#define DEFAULT_MODEL_TYPE "ONT"
+
+typedef std::map<std::string, PoreModel> PoreModelMap;
+
+// Singleton registry of pore models, organised by model type and then by
+// the model's short name. All access goes through the static functions.
+class PoreModelSet
+{
+ public:
+
+ //
+ // initialize the model set from a .fofn file
+ //
+ static void initialize(const std::string& fofn_filename);
+
+ //
+ // check if a model with this type and short name exists
+ //
+ static bool has_model(const std::string& type, const std::string& short_name);
+
+ //
+ // get a model from the set using its type and short name
+ //
+ static const PoreModel& get_model(const std::string& type, const std::string& short_name);
+
+ //
+ // get all the models for this type
+ //
+ static const PoreModelMap& get_models(const std::string& type);
+
+ //
+ // insert the new model into the specified type
+ //
+ static void insert_model(const std::string& type, const PoreModel& model);
+
+ // destructor
+ ~PoreModelSet();
+
+ private:
+
+ // singleton accessor function
+ static PoreModelSet& getInstance()
+ {
+ static PoreModelSet instance;
+ return instance;
+ }
+
+ // do not allow copies of this class
+ PoreModelSet(PoreModelSet const&) = delete;
+ void operator=(PoreModelSet const&) = delete;
+ PoreModelSet() {}; // private: instances are only created by getInstance()
+
+ // this is a map from a pore model type (like "base" or "derived")
+ // to a map of models indexed by their short name
+ // for example model_type_sets["base"]["t.007"]
+ std::map<std::string, PoreModelMap> model_type_sets;
+};
+
+#endif
diff --git a/src/hmm/nanopolish_profile_hmm.cpp b/src/hmm/nanopolish_profile_hmm.cpp
index f51c4b1..23b3e14 100644
--- a/src/hmm/nanopolish_profile_hmm.cpp
+++ b/src/hmm/nanopolish_profile_hmm.cpp
@@ -20,7 +20,7 @@ void profile_hmm_forward_initialize(FloatMatrix& fm)
for(uint32_t ri = 0; ri < fm.n_rows; ri++) {
set(fm, ri, PS_KMER_SKIP, -INFINITY);
- set(fm, ri, PS_EVENT_SPLIT, -INFINITY);
+ set(fm, ri, PS_BAD_EVENT, -INFINITY);
set(fm, ri, PS_MATCH, -INFINITY);
}
}
@@ -28,9 +28,9 @@ void profile_hmm_forward_initialize(FloatMatrix& fm)
// Terminate the forward algorithm by calculating
// the probability of transitioning to the end state
// for all columns and a given row
-float profile_hmm_forward_terminate(const FloatMatrix& fm,
- const FloatMatrix& tm,
- uint32_t row)
+float profile_hmm_forward_terminate(const FloatMatrix&,
+ const FloatMatrix&,
+ uint32_t)
{
assert(false);
return -INFINITY;
@@ -128,6 +128,16 @@ std::vector<HMMAlignmentState> profile_hmm_align(const HMMInputSequence& sequenc
profile_hmm_fill_generic(sequence, data, e_start, flags, output);
// Traverse the backtrack matrix to compute the results
+ int traversal_stride = data.event_stride;
+
+#if HMM_REVERSE_FIX
+ // Hack to support the fixed HMM
+ // TODO: clean up
+ traversal_stride = 1;
+ if(data.event_stride == -1) {
+ e_start = data.event_stop_idx;
+ }
+#endif
// start from the last event matched to the last kmer
uint32_t row = n_rows - 1;
@@ -135,15 +145,18 @@ std::vector<HMMAlignmentState> profile_hmm_align(const HMMInputSequence& sequenc
while(row > 0) {
- uint32_t event_idx = e_start + (row - 1) * data.event_stride;
+ uint32_t event_idx = e_start + (row - 1) * traversal_stride;
uint32_t block = col / PS_NUM_STATES;
- assert(block > 0);
- assert(get(vm, row, col) != -INFINITY);
-
uint32_t kmer_idx = block - 1;
-
ProfileState curr_ps = (ProfileState) (col % PS_NUM_STATES);
+#if DEBUG_BACKTRACK
+ printf("backtrace %zu %zu coord: (%zu, %zu, %zu) state: %d\n", event_idx, kmer_idx, row, col, block, curr_ps);
+#endif
+
+ assert(block > 0);
+ assert(get(vm, row, col) != -INFINITY);
+
HMMAlignmentState as;
as.event_idx = event_idx;
as.kmer_idx = kmer_idx;
@@ -153,36 +166,61 @@ std::vector<HMMAlignmentState> profile_hmm_align(const HMMInputSequence& sequenc
as.state = ps2char(curr_ps);
alignment.push_back(as);
- // Update the event (row) and k-mer using the current state
- // The next state is encoded in the backtrack matrix for the current cell
- ProfileState next_ps = (ProfileState)get(bm, row, col);
-
- // If we hit the softclip state we are done aligning
- if(next_ps == PS_PRE_SOFT) {
+ // Update the event (row) and k-mer using the backtrack matrix
+ HMMMovementType movement = (HMMMovementType)get(bm, row, col);
+ if(movement == HMT_FROM_SOFT) {
break;
}
+
+ // update kmer_idx and state
+ ProfileState next_ps;
+ switch(movement) {
+ case HMT_FROM_SAME_M:
+ next_ps = PS_MATCH;
+ break;
+ case HMT_FROM_PREV_M:
+ kmer_idx -= 1;
+ next_ps = PS_MATCH;
+ break;
+ case HMT_FROM_SAME_B:
+ next_ps = PS_BAD_EVENT;
+ break;
+ case HMT_FROM_PREV_B:
+ kmer_idx -= 1;
+ next_ps = PS_BAD_EVENT;
+ break;
+ case HMT_FROM_PREV_K:
+ kmer_idx -= 1;
+ next_ps = PS_KMER_SKIP;
+ break;
+ case HMT_FROM_SOFT:
+ assert(false);
+ break;
+ }
-#if DEBUG_BACKTRACK
- printf("Backtrack [%zu %zu] k: %zu block: %zu curr_ps: %c next_ps: %c\n", row, col, kmer_idx, block, ps2char(curr_ps), ps2char(next_ps));
-#endif
-
- if(curr_ps == PS_MATCH) {
+ // update row (event) idx only if this isn't a kmer skip, which is silent
+ if(curr_ps != PS_KMER_SKIP) {
row -= 1;
- kmer_idx -= 1;
- } else if(curr_ps == PS_EVENT_SPLIT) {
- row -= 1;
- // kmer stays the same
- } else {
- assert(curr_ps == PS_KMER_SKIP);
- // row stays the same
- kmer_idx -= 1;
}
col = PS_NUM_STATES * (kmer_idx + 1) + next_ps;
}
+
+#if HMM_REVERSE_FIX
+ // change the strand of the kmer indices if we aligned to the reverse strand
+ if(data.event_stride == -1) {
+ for(size_t ai = 0; ai < alignment.size(); ++ai) {
+ size_t k_idx = alignment[ai].kmer_idx;
+ alignment[ai].kmer_idx = sequence.length() - k_idx - k;
+ }
+ } else {
+ std::reverse(alignment.begin(), alignment.end());
+ }
+#else
//
std::reverse(alignment.begin(), alignment.end());
+#endif
//
free_matrix(vm);
diff --git a/src/hmm/nanopolish_profile_hmm.h b/src/hmm/nanopolish_profile_hmm.h
index dc0afe0..c2bdf70 100644
--- a/src/hmm/nanopolish_profile_hmm.h
+++ b/src/hmm/nanopolish_profile_hmm.h
@@ -16,6 +16,9 @@
#include "nanopolish_emissions.h"
#include "nanopolish_hmm_input_sequence.h"
+//#define HMM_REVERSE_FIX 1
+//#define DEBUG_FILL 1
+
//
// High level algorithms
//
@@ -63,12 +66,24 @@ void profile_hmm_update_training(const HMMInputSequence& sequence,
enum ProfileState
{
PS_KMER_SKIP = 0,
- PS_EVENT_SPLIT,
+ PS_BAD_EVENT,
PS_MATCH,
PS_NUM_STATES = 3,
PS_PRE_SOFT // intentionally after PS_NUM_STATES
};
+// Movement recorded in the backtrack matrix for each cell: which state, in
+// the same or the previous k-mer block, the cell was reached from.
+// profile_hmm_align decodes these during traceback.
+enum HMMMovementType
+{
+ HMT_FROM_SAME_M = 0, // from the match state of the same k-mer
+ HMT_FROM_PREV_M, // from the match state of the previous k-mer
+ HMT_FROM_SAME_B, // from the bad-event state of the same k-mer
+ HMT_FROM_PREV_B, // from the bad-event state of the previous k-mer
+ HMT_FROM_PREV_K, // from the k-mer skip state of the previous k-mer
+ HMT_FROM_SOFT, // from the soft-clip state; traceback stops here
+ HMT_NUM_MOVEMENT_TYPES
+};
+// one float score per movement type, indexed by HMMMovementType
+typedef struct { float x[HMT_NUM_MOVEMENT_TYPES]; } HMMUpdateScores;
+
// Flags to modify the behaviour of the HMM
enum HMMAlignmentFlags
{
@@ -77,22 +92,25 @@ enum HMMAlignmentFlags
};
// Convert an enumerated state into a symbol
-inline char ps2char(ProfileState ps) { return "KEMNS"[ps]; }
+inline char ps2char(ProfileState ps) { return "KBMNS"[ps]; }
// Pre-computed transitions from the previous block
// into the current block of states. Log-scaled.
struct BlockTransitions
{
- // Transition from m state
- float lp_me;
+ // Transition from m state (match event to k-mer)
+ float lp_mm_self;
+ float lp_mb;
float lp_mk;
- float lp_mm;
+ float lp_mm_next;
- // Transitions from e state
- float lp_ee;
- float lp_em;
+ // Transitions from b state (bad event that should be ignored)
+ float lp_bb;
+ float lp_bk;
+ float lp_bm_next; // movement to next k-mer
+ float lp_bm_self; // movement to k-mer that we came from
- // Transitions from k state
+ // Transitions from k state (no observation from k-mer)
float lp_kk;
float lp_km;
};
diff --git a/src/hmm/nanopolish_profile_hmm.inl b/src/hmm/nanopolish_profile_hmm.inl
index 38a7449..84948ec 100644
--- a/src/hmm/nanopolish_profile_hmm.inl
+++ b/src/hmm/nanopolish_profile_hmm.inl
@@ -22,6 +22,8 @@ inline float calculate_skip_probability(const HMMInputSequence& sequence,
return parameters.get_skip_probability(level_i.mean, level_j.mean);
}
+//#define USE_EXTERNAL_PARAMS 1
+
inline std::vector<BlockTransitions> calculate_transitions(uint32_t num_kmers, const HMMInputSequence& sequence, const HMMInputData& data)
{
const TransitionParameters& parameters = data.read->parameters[data.strand];
@@ -31,32 +33,48 @@ inline std::vector<BlockTransitions> calculate_transitions(uint32_t num_kmers, c
for(uint32_t ki = 0; ki < num_kmers; ++ki) {
// probability of skipping k_i from k_(i - 1)
- float p_skip = ki > 0 ? calculate_skip_probability(sequence, data, ki - 1, ki) : 0.0f;
+ float p_stay = 0.4;
+#ifndef USE_EXTERNAL_PARAMS
+ float p_skip = 0.0025; //ki > 0 ? calculate_skip_probability(sequence, data, ki - 1, ki) : 0.0f;
+ float p_bad = 0.001;
+ float p_bad_self = p_bad;
+ float p_skip_self = 0.3;
+#else
+ extern float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self;
+ float p_skip = g_p_skip;
+ float p_skip_self = g_p_skip_self;
+ float p_bad = g_p_bad;
+ float p_bad_self = g_p_bad_self;
+#endif
// transitions from match state in previous block
- float p_mk = p_skip;
- float p_me = (1 - p_skip) * parameters.trans_m_to_e_not_k;
- float p_mm = 1.0f - p_me - p_mk;
+ float p_mk = p_skip; // probability of not observing an event at all
+ float p_mb = p_bad; // probability of observing a bad event
+ float p_mm_self = p_stay; // probability of observing additional events from this k-mer
+ float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state
// transitions from event split state in previous block
- float p_ee = parameters.trans_e_to_e;
- float p_em = 1.0f - p_ee;
- // p_ie not allowed
+ float p_bb = p_bad_self;
+ float p_bk, p_bm_next, p_bm_self;
+ p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3;
// transitions from kmer skip state in previous block
- float p_kk = p_skip;
- float p_km = 1 - p_skip;
- // p_ei not allowed
+ float p_kk = p_skip_self;
+ float p_km = 1.0f - p_kk;
+ // p_kb not needed, equivalent to B->K
// log-transform and store
BlockTransitions& bt = transitions[ki];
- bt.lp_me = log(p_me);
bt.lp_mk = log(p_mk);
- bt.lp_mm = log(p_mm);
-
- bt.lp_ee = log(p_ee);
- bt.lp_em = log(p_em);
+ bt.lp_mb = log(p_mb);
+ bt.lp_mm_self = log(p_mm_self);
+ bt.lp_mm_next = log(p_mm_next);
+
+ bt.lp_bb = log(p_bb);
+ bt.lp_bk = log(p_bk);
+ bt.lp_bm_next = log(p_bm_next);
+ bt.lp_bm_self = log(p_bm_self);
bt.lp_kk = log(p_kk);
bt.lp_km = log(p_km);
@@ -72,16 +90,18 @@ class ProfileHMMForwardOutput
ProfileHMMForwardOutput(FloatMatrix* p) : p_fm(p), lp_end(-INFINITY) {}
//
- inline void update_4(uint32_t row, uint32_t col, float m, float e, float k, float s, float lp_emission)
+ inline void update_cell(uint32_t row, uint32_t col, const HMMUpdateScores& scores, float lp_emission)
{
- float sum_1 = add_logs(m, e);
- float sum_2 = add_logs(k, s);
- float sum = add_logs(sum_1, sum_2) + lp_emission;
+ float sum = scores.x[0];
+ for(auto i = 1; i < HMT_NUM_MOVEMENT_TYPES; ++i) {
+ sum = add_logs(sum, scores.x[i]);
+ }
+ sum += lp_emission;
set(*p_fm, row, col, sum);
}
// add in the probability of ending the alignment at row,col
- inline void update_end(float v, uint32_t row, uint32_t col)
+ inline void update_end(float v, uint32_t, uint32_t)
{
lp_end = add_logs(lp_end, v);
}
@@ -120,24 +140,17 @@ class ProfileHMMViterbiOutput
public:
ProfileHMMViterbiOutput(FloatMatrix* pf, UInt8Matrix* pb) : p_fm(pf), p_bm(pb), lp_end(-INFINITY) {}
- inline void update_4(uint32_t row, uint32_t col, float m, float e, float k, float s, float lp_emission)
+ inline void update_cell(uint32_t row, uint32_t col, const HMMUpdateScores& scores, float lp_emission)
{
// probability update
- float max = std::max(std::max(m, e),
- std::max(k, s));
+ float max = scores.x[0];
+ uint8_t from = 0;
+ for(auto i = 1; i < HMT_NUM_MOVEMENT_TYPES; ++i) {
+ max = scores.x[i] > max ? scores.x[i] : max;
+ from = max == scores.x[i] ? i : from;
+ }
set(*p_fm, row, col, max + lp_emission);
-
- // backtrack update
- uint8_t from;
- if(max == m)
- from = PS_MATCH;
- else if(max == e)
- from = PS_EVENT_SPLIT;
- else if(max == k)
- from = PS_KMER_SKIP;
- else if(max == s)
- from = PS_PRE_SOFT;
set(*p_bm, row, col, from);
}
@@ -260,19 +273,34 @@ inline std::vector<float> make_post_flanking(const HMMInputData& data,
// The templated ProfileHMMOutput class allows one to run either Viterbi
// or the Forward algorithm.
template<class ProfileHMMOutput>
-inline float profile_hmm_fill_generic(const HMMInputSequence& sequence,
- const HMMInputData& data,
- const uint32_t e_start,
+inline float profile_hmm_fill_generic(const HMMInputSequence& _sequence,
+ const HMMInputData& _data,
+ const uint32_t,
uint32_t flags,
ProfileHMMOutput& output)
{
PROFILE_FUNC("profile_hmm_fill_generic")
+ HMMInputSequence sequence = _sequence;
+ HMMInputData data = _data;
+ assert( (data.rc && data.event_stride == -1) || (!data.rc && data.event_stride == 1));
+
+#if HMM_REVERSE_FIX
+ if(data.event_stride == -1) {
+ sequence.swap();
+ uint32_t tmp = data.event_stop_idx;
+ data.event_stop_idx = data.event_start_idx;
+ data.event_start_idx = tmp;
+ data.event_stride = 1;
+ data.rc = false;
+ }
+#endif
+ uint32_t e_start = data.event_start_idx;
+
const TransitionParameters& parameters = data.read->parameters[data.strand];
// Calculate number of blocks
- // A block of the HMM is a set of PS_KMER_SKIP, PS_EVENT_SPLIT, PS_MATCH
- // events for one kmer
+ // A block of the HMM is a set of states for one kmer
uint32_t num_blocks = output.get_num_columns() / PS_NUM_STATES;
uint32_t last_event_row_idx = output.get_num_rows() - 1;
@@ -304,6 +332,9 @@ inline float profile_hmm_fill_generic(const HMMInputSequence& sequence,
float lp_sm, lp_ms;
lp_sm = lp_ms = 0.0f;
+ // the penalty is controlled by the transition probability
+ float BAD_EVENT_PENALTY = 0.0f;
+
// Fill in matrix
for(uint32_t row = 1; row < output.get_num_rows(); row++) {
@@ -323,44 +354,56 @@ inline float profile_hmm_fill_generic(const HMMInputSequence& sequence,
uint32_t event_idx = e_start + (row - 1) * data.event_stride;
uint32_t rank = kmer_ranks[kmer_idx];
float lp_emission_m = log_probability_match(*data.read, rank, event_idx, data.strand);
- float lp_emission_e = log_probability_event_insert(*data.read, rank, event_idx, data.strand);
+ float lp_emission_b = BAD_EVENT_PENALTY;
+ HMMUpdateScores scores;
+
// state PS_MATCH
- float m_m = bt.lp_mm + output.get(row - 1, prev_block_offset + PS_MATCH);
- float m_e = bt.lp_em + output.get(row - 1, prev_block_offset + PS_EVENT_SPLIT);
- float m_k = bt.lp_km + output.get(row - 1, prev_block_offset + PS_KMER_SKIP);
+ scores.x[HMT_FROM_SAME_M] = bt.lp_mm_self + output.get(row - 1, curr_block_offset + PS_MATCH);
+ scores.x[HMT_FROM_PREV_M] = bt.lp_mm_next + output.get(row - 1, prev_block_offset + PS_MATCH);
+ scores.x[HMT_FROM_SAME_B] = bt.lp_bm_self + output.get(row - 1, curr_block_offset + PS_BAD_EVENT);
+ scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PS_BAD_EVENT);
+ scores.x[HMT_FROM_PREV_K] = bt.lp_km + output.get(row - 1, prev_block_offset + PS_KMER_SKIP);
// m_s is the probability of going from the start state
// to this kmer. The start state is (currently) only
// allowed to go to the first kmer. If ALLOW_PRE_CLIP
// is defined, we allow all events before this one to be skipped,
// with a penalty;
- float m_s = (kmer_idx == 0 &&
- (event_idx == e_start ||
- (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY;
+ scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 &&
+ (event_idx == e_start ||
+ (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY;
- output.update_4(row, curr_block_offset + PS_MATCH, m_m, m_e, m_k, m_s, lp_emission_m);
+ output.update_cell(row, curr_block_offset + PS_MATCH, scores, lp_emission_m);
- // state PS_EVENT_SPLIT
- float e_m = bt.lp_me + output.get(row - 1, curr_block_offset + PS_MATCH);
- float e_e = bt.lp_ee + output.get(row - 1, curr_block_offset + PS_EVENT_SPLIT);
- output.update_4(row, curr_block_offset + PS_EVENT_SPLIT, e_m, e_e, -INFINITY, -INFINITY, lp_emission_e);
+ // state PS_BAD_EVENT
+ scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PS_MATCH);
+ scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed
+ scores.x[HMT_FROM_SAME_B] = bt.lp_bb + output.get(row - 1, curr_block_offset + PS_BAD_EVENT);
+ scores.x[HMT_FROM_PREV_B] = -INFINITY;
+ scores.x[HMT_FROM_PREV_K] = -INFINITY;
+ scores.x[HMT_FROM_SOFT] = -INFINITY;
+ output.update_cell(row, curr_block_offset + PS_BAD_EVENT, scores, lp_emission_b);
// state PS_KMER_SKIP
- float k_m = bt.lp_mk + output.get(row, prev_block_offset + PS_MATCH);
- float k_k = bt.lp_kk + output.get(row, prev_block_offset + PS_KMER_SKIP);
- output.update_4(row, curr_block_offset + PS_KMER_SKIP, k_m, -INFINITY, k_k, -INFINITY, 0.0f); // no emission
+ scores.x[HMT_FROM_SAME_M] = -INFINITY;
+ scores.x[HMT_FROM_PREV_M] = bt.lp_mk + output.get(row, prev_block_offset + PS_MATCH);
+ scores.x[HMT_FROM_SAME_B] = -INFINITY;
+ scores.x[HMT_FROM_PREV_B] = bt.lp_bk + output.get(row, prev_block_offset + PS_BAD_EVENT);
+ scores.x[HMT_FROM_PREV_K] = bt.lp_kk + output.get(row, prev_block_offset + PS_KMER_SKIP);
+ scores.x[HMT_FROM_SOFT] = -INFINITY;
+ output.update_cell(row, curr_block_offset + PS_KMER_SKIP, scores, 0.0f); // no emission
// If POST_CLIP is enabled we allow the last kmer to transition directly
// to the end after any event. Otherwise we only allow it from the
// last kmer/event match.
if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) {
float lp1 = lp_ms + output.get(row, curr_block_offset + PS_MATCH) + post_flank[row - 1];
- float lp2 = lp_ms + output.get(row, curr_block_offset + PS_EVENT_SPLIT) + post_flank[row - 1];
+ float lp2 = lp_ms + output.get(row, curr_block_offset + PS_BAD_EVENT) + post_flank[row - 1];
float lp3 = lp_ms + output.get(row, curr_block_offset + PS_KMER_SKIP) + post_flank[row - 1];
output.update_end(lp1, row, curr_block_offset + PS_MATCH);
- output.update_end(lp2, row, curr_block_offset + PS_EVENT_SPLIT);
+ output.update_end(lp2, row, curr_block_offset + PS_BAD_EVENT);
output.update_end(lp3, row, curr_block_offset + PS_KMER_SKIP);
}
@@ -371,29 +414,29 @@ inline float profile_hmm_fill_generic(const HMMInputSequence& sequence,
#ifdef DEBUG_FILL
printf("Row %u block %u\n", row, block);
- printf("\tTransitions: p_mx [%.3lf %.3lf %.3lf]\n", bt.lp_mm, bt.lp_me, bt.lp_mk);
- printf("\t p_ex [%.3lf %.3lf %.3lf]\n", bt.lp_em, bt.lp_ee, 0.0f);
- printf("\t p_lx [%.3lf %.3lf %.3lf]\n", bt.lp_km, 0.0, bt.lp_kk);
- printf("\tPS_MATCH -- Transitions: [%.3lf %.3lf %.3lf] Prev: [%.2lf %.2lf %.2lf] sum: %.2lf\n",
- bt.lp_mm, bt.lp_em, bt.lp_km,
+ printf("\tPS_MATCH -- Transitions: [%.3lf %.3lf %.3lf %.3lf %.3lf] Prev: [%.2lf %.2lf %.2lf %.2lf %.2lf] out: %.2lf\n",
+ bt.lp_mm_self, bt.lp_mm_next, bt.lp_bm_self, bt.lp_bm_next, bt.lp_km,
output.get(row - 1, prev_block_offset + PS_MATCH),
- output.get(row - 1, prev_block_offset + PS_EVENT_SPLIT),
+ output.get(row - 1, curr_block_offset + PS_MATCH),
+ output.get(row - 1, prev_block_offset + PS_BAD_EVENT),
+ output.get(row - 1, curr_block_offset + PS_BAD_EVENT),
output.get(row - 1, prev_block_offset + PS_KMER_SKIP),
- 0.0f);
- printf("\tPS_EVENT_SPLIT -- Transitions: [%.3lf %.3lf] Prev: [%.2lf %.2lf] sum: %.2lf\n",
- bt.lp_me, bt.lp_ee,
+ output.get(row, curr_block_offset + PS_MATCH));
+ printf("\tPS_BAD_EVENT -- Transitions: [%.3lf %.3lf] Prev: [%.2lf %.2lf] out: %.2lf\n",
+ bt.lp_mb, bt.lp_bb,
output.get(row - 1, curr_block_offset + PS_MATCH),
- output.get(row - 1, curr_block_offset + PS_EVENT_SPLIT),
- 0.0f);
+ output.get(row - 1, curr_block_offset + PS_BAD_EVENT),
+ output.get(row, curr_block_offset + PS_BAD_EVENT));
- printf("\tPS_KMER_SKIP -- Transitions: [%.3lf %.3lf] Prev: [%.2lf %.2lf] sum: %.2lf\n",
- bt.lp_mk, bt.lp_kk,
+ printf("\tPS_KMER_SKIP -- Transitions: [%.3lf %.3lf %.3lf] Prev: [%.2lf %.2lf %.2lf] sum: %.2lf\n",
+ bt.lp_mk, bt.lp_bk, bt.lp_kk,
output.get(row, prev_block_offset + PS_MATCH),
+ output.get(row, prev_block_offset + PS_BAD_EVENT),
output.get(row, prev_block_offset + PS_KMER_SKIP),
- 0.0f);
+ output.get(row, curr_block_offset + PS_KMER_SKIP));
- printf("\tEMISSION: %.2lf %.2lf\n", lp_emission_m, lp_emission_e);
+ printf("\tEMISSION: %.2lf %.2lf\n", lp_emission_m, lp_emission_b);
#endif
}
}
diff --git a/src/hmm/nanopolish_transition_parameters.cpp b/src/hmm/nanopolish_transition_parameters.cpp
index 8999d62..ac52765 100644
--- a/src/hmm/nanopolish_transition_parameters.cpp
+++ b/src/hmm/nanopolish_transition_parameters.cpp
@@ -12,6 +12,9 @@
#include "nanopolish_poremodel.h"
#include "nanopolish_squiggle_read.h"
+//#define PRINT_TRAINING_MESSAGES 1
+//#define SHOW_TRAINING_RESULT 1
+
TransitionParameters::TransitionParameters()
{
// initialize training data
@@ -21,9 +24,9 @@ TransitionParameters::TransitionParameters()
td.n_skips = 0;
//
- allocate_matrix(td.state_transitions, 3, 3);
- for(int i = 0; i < td.state_transitions.n_rows; ++i) {
- for(int j = 0; j < td.state_transitions.n_cols; ++j) {
+ allocate_matrix(td.state_transitions, 3, 6);
+ for(unsigned i = 0; i < td.state_transitions.n_rows; ++i) {
+ for(unsigned j = 0; j < td.state_transitions.n_cols; ++j) {
set(td.state_transitions, i, j, 0);
}
}
@@ -46,27 +49,27 @@ TransitionParameters::~TransitionParameters()
free_matrix(training_data.state_transitions);
}
-void TransitionParameters::initialize(const std::string& model_name)
+void TransitionParameters::initialize(const ModelMetadata& metadata)
{
is_initialized = true;
- if(model_name == "r7.3_template_median68pA.model" ||
- model_name == "r7.3_complement_median68pA_pop1.model" ||
- model_name == "r7.3_complement_median68pA_pop2.model")
- {
+ if(metadata.kit == KV_SQK005) {
initialize_sqkmap005();
- }
- else if(model_name == "r7.3_e6_70bps_6mer_template_median68pA.model")
- {
- initialize_sqkmap006_template();
- }
- else if(model_name == "r7.3_e6_70bps_6mer_complement_median68pA_pop1.model" ||
- model_name == "r7.3_e6_70bps_6mer_complement_median68pA_pop2.model")
- {
- initialize_sqkmap006_complement();
+ } else if(metadata.kit == KV_SQK006) {
+ if(metadata.strand_idx == T_IDX) {
+ initialize_sqkmap006_template();
+ } else {
+ initialize_sqkmap006_complement();
+ }
+ } else if(metadata.kit == KV_SQK007) {
+ if(metadata.strand_idx == T_IDX) {
+ initialize_sqkmap007_template();
+ } else {
+ initialize_sqkmap007_complement();
+ }
} else {
- printf("Error: unknown model: %s\n", model_name.c_str());
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Warning: unknown model kit: %d\n", metadata.kit);
+ initialize_sqkmap005();
}
}
@@ -185,6 +188,78 @@ void TransitionParameters::initialize_sqkmap006_complement()
skip_probabilities[29] = 0.119;
}
+void TransitionParameters::initialize_sqkmap007_template()
+{
+ trans_m_to_e_not_k = 0.310;
+ trans_e_to_e = 0.637;
+ skip_probabilities[0] = 0.054;
+ skip_probabilities[1] = 0.055;
+ skip_probabilities[2] = 0.050;
+ skip_probabilities[3] = 0.035;
+ skip_probabilities[4] = 0.035;
+ skip_probabilities[5] = 0.026;
+ skip_probabilities[6] = 0.020;
+ skip_probabilities[7] = 0.019;
+ skip_probabilities[8] = 0.014;
+ skip_probabilities[9] = 0.013;
+ skip_probabilities[10] = 0.010;
+ skip_probabilities[11] = 0.009;
+ skip_probabilities[12] = 0.008;
+ skip_probabilities[13] = 0.008;
+ skip_probabilities[14] = 0.007;
+ skip_probabilities[15] = 0.007;
+ skip_probabilities[16] = 0.007;
+ skip_probabilities[17] = 0.007;
+ skip_probabilities[18] = 0.006;
+ skip_probabilities[19] = 0.006;
+ skip_probabilities[20] = 0.006;
+ skip_probabilities[21] = 0.006;
+ skip_probabilities[22] = 0.005;
+ skip_probabilities[23] = 0.006;
+ skip_probabilities[24] = 0.006;
+ skip_probabilities[25] = 0.006;
+ skip_probabilities[26] = 0.007;
+ skip_probabilities[27] = 0.007;
+ skip_probabilities[28] = 0.007;
+ skip_probabilities[29] = 0.008;
+}
+
+void TransitionParameters::initialize_sqkmap007_complement()
+{
+ trans_m_to_e_not_k = 0.211;
+ trans_e_to_e = 0.670;
+ skip_probabilities[0] = 0.096;
+ skip_probabilities[1] = 0.092;
+ skip_probabilities[2] = 0.074;
+ skip_probabilities[3] = 0.048;
+ skip_probabilities[4] = 0.037;
+ skip_probabilities[5] = 0.026;
+ skip_probabilities[6] = 0.018;
+ skip_probabilities[7] = 0.016;
+ skip_probabilities[8] = 0.013;
+ skip_probabilities[9] = 0.011;
+ skip_probabilities[10] = 0.009;
+ skip_probabilities[11] = 0.008;
+ skip_probabilities[12] = 0.007;
+ skip_probabilities[13] = 0.007;
+ skip_probabilities[14] = 0.006;
+ skip_probabilities[15] = 0.006;
+ skip_probabilities[16] = 0.007;
+ skip_probabilities[17] = 0.007;
+ skip_probabilities[18] = 0.005;
+ skip_probabilities[19] = 0.006;
+ skip_probabilities[20] = 0.006;
+ skip_probabilities[21] = 0.005;
+ skip_probabilities[22] = 0.006;
+ skip_probabilities[23] = 0.005;
+ skip_probabilities[24] = 0.005;
+ skip_probabilities[25] = 0.006;
+ skip_probabilities[26] = 0.006;
+ skip_probabilities[27] = 0.007;
+ skip_probabilities[28] = 0.006;
+ skip_probabilities[29] = 0.009;
+}
+
//
double TransitionParameters::get_skip_probability(double k_level1, double k_level2) const
{
@@ -199,7 +274,7 @@ int statechar2index(char s)
{
switch(s) {
case 'M': return 0;
- case 'E': return 1;
+ case 'B': return 1;
case 'K': return 2;
}
assert(false);
@@ -207,10 +282,10 @@ int statechar2index(char s)
}
//
-void TransitionParameters::add_transition_observation(char state_from, char state_to)
+void TransitionParameters::add_transition_observation(char state_from, char state_to, bool kmer_move)
{
int f_idx = statechar2index(state_from);
- int t_idx = statechar2index(state_to);
+ int t_idx = 2 * statechar2index(state_to) + kmer_move;
int count = get(training_data.state_transitions, f_idx, t_idx);
set(training_data.state_transitions, f_idx, t_idx, count + 1);
@@ -230,15 +305,21 @@ void TransitionParameters::add_training_from_alignment(const HMMInputSequence& s
const uint32_t k = pm.k;
size_t n_kmers = sequence.length() - k + 1;
+#ifdef PRINT_TRAINING_MESSAGES
uint32_t strand_idx = 0;
+#endif
char prev_s = 'M';
for(size_t pi = 0; pi < alignment.size(); ++pi) {
uint32_t ei = alignment[pi].event_idx;
uint32_t ki = alignment[pi].kmer_idx;
+
+ bool kmer_move = pi == 0 || alignment[pi - 1].kmer_idx != ki;
+ bool event_move = pi == 0 || alignment[pi - 1].event_idx != ei;
char s = alignment[pi].state;
-
+ add_transition_observation(prev_s, s, kmer_move);
+
// Record transition observations
// We do not record observations for merge states as there was no kmer transitions
// We also do not record observations for the beginning of the matches as the
@@ -247,13 +328,13 @@ void TransitionParameters::add_training_from_alignment(const HMMInputSequence& s
// skip transition training data
// we do not process the E state here as no k-mer move was made
- if(s != 'E') {
+ if(s != 'B') {
uint32_t transition_kmer_from = alignment[pi - 1].kmer_idx;
uint32_t transition_kmer_to = alignment[pi].kmer_idx;
// Specially handle skips
// We only want to record the first k-mer skipped if multiple were skipped
- if(s == 'K') {
+ if(s == 'K' && prev_s == 'M') {
transition_kmer_from = alignment[pi - 1].kmer_idx;
transition_kmer_to = transition_kmer_from + 1;
}
@@ -274,24 +355,11 @@ void TransitionParameters::add_training_from_alignment(const HMMInputSequence& s
}
// State-to-state transition
- add_transition_observation(prev_s, s);
-
- // emission
- float level = data.read->get_drift_corrected_level(ei, data.strand);
- float sd = data.read->events[data.strand][ei].stdv;
- float duration = data.read->get_duration(ei, data.strand);
- if(ki >= n_kmers)
- printf("%zu %d %d %zu %.2lf %c\n", pi, ei, ki, n_kmers, alignment[pi].l_fm, s);
-
+ add_transition_observation(prev_s, s, kmer_move);
assert(ki < n_kmers);
- uint32_t rank = sequence.get_kmer_rank(ki, k, data.rc);
-
- GaussianParameters model = pm.get_scaled_parameters(rank);
- float norm_level = (level - model.mean) / model.stdv;
-
- prev_s = s;
}
+ prev_s = s;
// summary
training_data.n_matches += (s == 'M');
training_data.n_merges += (s == 'E');
@@ -307,40 +375,32 @@ void TransitionParameters::train()
// Profile HMM transitions
//
- size_t sum_m_not_k = get(td.state_transitions, statechar2index('M'), statechar2index('M')) +
- get(td.state_transitions, statechar2index('M'), statechar2index('E'));
-
- size_t me = get(td.state_transitions, statechar2index('M'), statechar2index('E'));
- double p_me_not_k = (double)me / sum_m_not_k;
-
- size_t sum_e = 0;
- for(int j = 0; j < td.state_transitions.n_cols; ++j) {
- sum_e += get(td.state_transitions, statechar2index('E'), j);
- }
-
- size_t ee = get(td.state_transitions, statechar2index('E'), statechar2index('E'));
- double p_ee = (double)ee / sum_e;
-
#ifdef SHOW_TRAINING_RESULT
fprintf(stderr, "TRANSITIONS\n");
- fprintf(stderr, "M->E|not_k: %lf\n", p_me_not_k);
- fprintf(stderr, "E->E: %lf\n", p_ee);
+ //fprintf(stderr, "M->E|not_k: %lf\n", p_me_not_k);
+ //fprintf(stderr, "E->E: %lf\n", p_ee);
for(int i = 0; i < td.state_transitions.n_rows; ++i) {
- fprintf(stderr, "\t%c: ", "MEK"[i]);
+ fprintf(stderr, "\t%c: ", "MBK"[i]);
for(int j = 0; j < td.state_transitions.n_cols; ++j) {
fprintf(stderr, "%d ", get(td.state_transitions, i, j));
}
fprintf(stderr, "\n");
}
-#endif
- if(sum_e == 0 || sum_m_not_k == 0) {
- // insufficient data to train, use defaults
- return;
- }
+ for(int i = 0; i < td.state_transitions.n_rows; ++i) {
+ fprintf(stderr, "\t%c: ", "MBK"[i]);
+ size_t col_sum = 0;
+ for(int j = 0; j < td.state_transitions.n_cols; ++j) {
+ col_sum += get(td.state_transitions, i, j);
+ }
- trans_m_to_e_not_k = p_me_not_k;
- trans_e_to_e = p_ee;
+ for(int j = 0; j < td.state_transitions.n_cols; ++j) {
+ double p = get(td.state_transitions, i, j) / (double)col_sum;
+ fprintf(stderr, "%04.3lf ", p);
+ }
+ fprintf(stderr, "\n");
+ }
+#endif
//
// Signal-dependent skip probability
@@ -374,3 +434,15 @@ void TransitionParameters::train()
#endif
}
}
+
+void TransitionParameters::print() const
+{
+ /*
+ fprintf(stderr, "TRANSITIONS\n");
+ fprintf(stderr, "trans_m_to_e_not_k = %.3lf;\n", trans_m_to_e_not_k);
+ fprintf(stderr, "trans_e_to_e = %.3lf;\n", trans_e_to_e);
+ */
+ for(size_t bin = 0; bin < skip_probabilities.size(); bin++) {
+ fprintf(stderr, "skip_probabilities[%zu] = %.3lf;\n", bin, skip_probabilities[bin]);
+ }
+}
diff --git a/src/hmm/nanopolish_transition_parameters.h b/src/hmm/nanopolish_transition_parameters.h
index f4b9bed..17e7f2f 100644
--- a/src/hmm/nanopolish_transition_parameters.h
+++ b/src/hmm/nanopolish_transition_parameters.h
@@ -13,6 +13,7 @@
#include <stdint.h>
#include "nanopolish_matrix.h"
#include "nanopolish_hmm_input_sequence.h"
+#include "nanopolish_model_names.h"
//
struct KmerTransitionObservation
@@ -44,7 +45,7 @@ class TransitionParameters
TransitionParameters();
~TransitionParameters();
- void initialize(const std::string& model_name);
+ void initialize(const ModelMetadata& metadata);
// update transition parameters from training data
void train();
@@ -53,7 +54,7 @@ class TransitionParameters
double get_skip_probability(double k_level1, double k_level2) const;
// add an observation of a state transition to the training data
- void add_transition_observation(char hmm_state_from, char hmm_state_to);
+ void add_transition_observation(char hmm_state_from, char hmm_state_to, bool kmer_move);
// update the training data using the alignment
void add_training_from_alignment(const HMMInputSequence& sequence,
@@ -61,6 +62,8 @@ class TransitionParameters
const std::vector<HMMAlignmentState>& alignment,
size_t ignore_edge_length = 5);
+ void print() const;
+
//
// data
//
@@ -82,16 +85,17 @@ class TransitionParameters
// Data used to train the model
TransitionTrainingData training_data;
-
private:
// Model-specific transition initialization
void initialize_sqkmap005();
void initialize_sqkmap006_template();
void initialize_sqkmap006_complement();
+ void initialize_sqkmap007_template();
+ void initialize_sqkmap007_complement();
// Not allowed
- TransitionParameters(const TransitionParameters& other) {}
+ TransitionParameters(const TransitionParameters&) {}
// Calculate which bin of the skip probability table this level difference falls in
inline size_t get_skip_bin(double k_level1, double k_level2) const
diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp
index 8c99379..577d0e3 100644
--- a/src/main/nanopolish.cpp
+++ b/src/main/nanopolish.cpp
@@ -5,8 +5,9 @@
//
// nanopolish.cpp -- main driver program
//
-#include <stdio.h>
#include <string>
+#include <map>
+#include <functional>
#include "logsum.h"
#include "nanopolish_call_variants.h"
#include "nanopolish_consensus.h"
@@ -14,50 +15,47 @@
#include "nanopolish_getmodel.h"
#include "nanopolish_methyltrain.h"
#include "nanopolish_methyltest.h"
+#include "nanopolish_scorereads.h"
+#include "nanopolish_train_poremodel_from_basecalls.h"
-// This code needs to be run before any of the program logic
-// It sets up pre-computed values and caches
-void initialize()
-{
- p7_FLogsumInit();
-}
+int print_usage(int argc, char **argv);
-void print_usage()
+static std::map< std::string, std::function<int(int, char**)> > programs = {
+ {"help", print_usage},
+ {"--help", print_usage},
+ {"consensus", consensus_main},
+ {"eventalign", eventalign_main},
+ {"getmodel", getmodel_main},
+ {"variants", call_variants_main},
+ {"methyltrain", methyltrain_main},
+ {"methyltest", methyltest_main},
+ {"scorereads", scorereads_main} ,
+ {"train-poremodel-from-basecalls", train_poremodel_from_basecalls_main}
+};
+
+int print_usage(int, char **)
{
- printf("usage: nanopolish [command] [options]\n");
+ std::cout << "usage: nanopolish [command] [options]" << std::endl;
+ std::cout << " valid commands: " << std::endl;
+ for (const auto &item : programs){
+ std::cout << " " << item.first << std::endl;
+ }
+ std::cout << " for help on given command, type nanopolish command --help" << std::endl;
+ return 0;
}
int main(int argc, char** argv)
{
- initialize();
-
if(argc <= 1) {
printf("error: no command provided\n");
- print_usage();
+ print_usage(argc - 1 , argv + 1);
return 0;
} else {
std::string command(argv[1]);
- if(command == "help" || command == "--help") {
- print_usage();
- return 0;
- } else if(command == "consensus") {
- consensus_main(argc - 1, argv + 1);
- return 0;
- } else if(command == "eventalign") {
- eventalign_main(argc - 1, argv + 1);
- return 0;
- } else if(command == "getmodel") {
- getmodel_main(argc - 1, argv + 1);
- return 0;
- } else if(command == "variants") {
- call_variants_main(argc - 1, argv + 1);
- return 0;
- } else if(command == "methyltrain") {
- methyltrain_main(argc - 1, argv + 1);
- return 0;
- } else if(command == "methyltest") {
- methyltest_main(argc - 1, argv + 1);
- return 0;
- }
+ auto iter = programs.find(command);
+ if (iter != programs.end())
+ return iter->second( argc - 1, argv + 1);
+ else
+ return print_usage( argc - 1, argv + 1);
}
}
diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp
index e7b2515..d0f5804 100644
--- a/src/nanopolish_call_variants.cpp
+++ b/src/nanopolish_call_variants.cpp
@@ -21,6 +21,7 @@
#include <set>
#include <omp.h>
#include <getopt.h>
+#include <iterator>
#include "htslib/faidx.h"
#include "nanopolish_poremodel.h"
#include "nanopolish_transition_parameters.h"
@@ -32,6 +33,8 @@
#include "nanopolish_fast5_map.h"
#include "nanopolish_variant.h"
#include "nanopolish_haplotype.h"
+#include "nanopolish_pore_model_set.h"
+#include "nanopolish_duration_model.h"
#include "profiler.h"
#include "progress.h"
#include "stdaln.h"
@@ -50,6 +53,9 @@
//#define DEBUG_SEGMENT_ID 193
//#define DEBUG_BENCHMARK 1
+// Hack hack hack: globals read via extern by calculate_transitions() when USE_EXTERNAL_PARAMS is defined
+float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self;
+
//
// Getopt
//
@@ -69,6 +75,8 @@ static const char *CONSENSUS_USAGE_MESSAGE =
" --version display version\n"
" --help display this help and exit\n"
" --snps only call SNPs\n"
+" --consensus run in consensus calling mode\n"
+" --fix-homopolymers run the experimental homopolymer caller\n"
" -w, --window=STR find variants in window STR (format: ctg:start-end)\n"
" -r, --reads=FILE the 2D ONT reads are in fasta FILE\n"
" -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n"
@@ -79,6 +87,7 @@ static const char *CONSENSUS_USAGE_MESSAGE =
" -m, --min-candidate-frequency=F alternative bases in F proporation of aligned reads are candidate variants (default 0.2)\n"
" -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n"
" --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n"
+" --models-fofn=FILE read alternative k-mer models from FILE\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
@@ -90,17 +99,39 @@ namespace opt
static std::string genome_file;
static std::string output_file;
static std::string candidates_file;
+ static std::string models_fofn;
static std::string window;
+ static std::string consensus_output;
+ static std::string alternative_model_type = DEFAULT_MODEL_TYPE;
static double min_candidate_frequency = 0.2f;
static int calculate_all_support = false;
static int snps_only = 0;
static int show_progress = 0;
static int num_threads = 1;
+ static int calibrate = 0;
+ static int consensus_mode = 0;
+ static int fix_homopolymers = 0;
+ static int min_distance_between_variants = 10;
+ static int min_flanking_sequence = 30;
+ static int max_haplotypes = 1000;
+ static int debug_alignments = 0;
}
static const char* shortopts = "r:b:g:t:w:o:e:m:c:v";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_VCF, OPT_PROGRESS, OPT_SNPS_ONLY, OPT_CALC_ALL_SUPPORT };
+enum { OPT_HELP = 1,
+ OPT_VERSION,
+ OPT_VCF,
+ OPT_PROGRESS,
+ OPT_SNPS_ONLY,
+ OPT_CALC_ALL_SUPPORT,
+ OPT_CONSENSUS,
+ OPT_FIX_HOMOPOLYMERS,
+ OPT_MODELS_FOFN,
+ OPT_P_SKIP,
+ OPT_P_SKIP_SELF,
+ OPT_P_BAD,
+ OPT_P_BAD_SELF };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -113,6 +144,13 @@ static const struct option longopts[] = {
{ "threads", required_argument, NULL, 't' },
{ "min-candidate-frequency", required_argument, NULL, 'm' },
{ "candidates", required_argument, NULL, 'c' },
+ { "models-fofn", required_argument, NULL, OPT_MODELS_FOFN },
+ { "p-skip", required_argument, NULL, OPT_P_SKIP },
+ { "p-skip-self", required_argument, NULL, OPT_P_SKIP_SELF },
+ { "p-bad", required_argument, NULL, OPT_P_BAD },
+ { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF },
+ { "consensus", required_argument, NULL, OPT_CONSENSUS },
+ { "fix-homopolymers", no_argument, NULL, OPT_FIX_HOMOPOLYMERS },
{ "calculate-all-support", no_argument, NULL, OPT_CALC_ALL_SUPPORT },
{ "snps", no_argument, NULL, OPT_SNPS_ONLY },
{ "progress", no_argument, NULL, OPT_PROGRESS },
@@ -136,7 +174,7 @@ void annotate_with_all_support(std::vector<Variant>& variants,
{
for(size_t vi = 0; vi < variants.size(); vi++) {
-
+
// Generate a haplotype containing every variant in the set except for vi
Haplotype test_haplotype = base_haplotype;
for(size_t vj = 0; vj < variants.size(); vj++) {
@@ -147,7 +185,7 @@ void annotate_with_all_support(std::vector<Variant>& variants,
}
test_haplotype.apply_variant(variants[vj]);
}
-
+
// Make a vector of four haplotypes, one per base
std::vector<Haplotype> curr_haplotypes;
Variant tmp_variant = variants[vi];
@@ -185,7 +223,128 @@ void annotate_with_all_support(std::vector<Variant>& variants,
}
}
-std::vector<Variant> get_variants_from_vcf(const std::string& filename,
+// Enumerate single-base edits of the reference for every position i in
+// [region_start, region_end).
+//
+// For each position the following candidates are emitted, in this order:
+//   - for each base b of A,C,G,T: the substitution ref -> b (when b differs from the
+//     reference base) followed by the insertion ref -> ref+b (skipped when it would
+//     only repeat the reference base, e.g. "A" -> "AA", which is redundant)
+//   - one deletion anchored at i - 1 that removes the base at i (skipped when both
+//     bases are equal, e.g. "AA" -> "A", which is redundant)
+//
+// The candidates are NOT scored here; use screen_variants_by_score to discard the
+// ones that do not improve the haplotype score.
+//
+// alignment_flags is not used by this function.
+// NOTE(review): position i - 1 is read for the deletion, so callers are presumably
+// expected to pass region_start >= 1 -- confirm against the caller.
+std::vector<Variant> generate_candidate_single_base_edits(const AlignmentDB& alignments,
+                                                          int region_start,
+                                                          int region_end,
+                                                          uint32_t alignment_flags)
+{
+    std::vector<Variant> out_variants;
+
+    std::string contig = alignments.get_region_contig();
+
+    for(size_t i = region_start; i < region_end; ++i) {
+
+        // The reference base does not depend on the alternative base, so fetch it
+        // once per position instead of once per alternative
+        const std::string ref_base = alignments.get_reference_substring(contig, i, i);
+
+        for(size_t j = 0; j < 4; ++j) {
+            // Substitutions
+            Variant v;
+            v.ref_name = contig;
+            v.ref_position = i;
+            v.ref_seq = ref_base;
+            v.alt_seq = "ACGT"[j];
+
+            if(v.ref_seq != v.alt_seq) {
+                out_variants.push_back(v);
+            }
+
+            // Insertions
+            v.alt_seq = v.ref_seq + "ACGT"[j];
+            // ignore insertions of the type "A" -> "AA" as these are redundant
+            if(v.alt_seq[1] != v.ref_seq[0]) {
+                out_variants.push_back(v);
+            }
+        }
+
+        // deletion
+        Variant del;
+        del.ref_name = contig;
+        del.ref_position = i - 1;
+        del.ref_seq = alignments.get_reference_substring(contig, i - 1, i);
+        del.alt_seq = del.ref_seq[0];
+
+        // ignore deletions of the type "AA" -> "A" as these are redundant
+        if(del.alt_seq[0] != del.ref_seq[1]) {
+            out_variants.push_back(del);
+        }
+    }
+    return out_variants;
+}
+
+// Keep only the candidates that improve the score when applied on their own.
+//
+// Each candidate is scored with score_variant against the reference sequence in a
+// window of opt::min_flanking_sequence bases on either side of the variant, using
+// the event data that AlignmentDB returns for the same window. The returned variants
+// are the scored copies (info field blanked) whose quality is greater than zero.
+//
+// At verbosity > 3 the accepted variants are echoed to stderr in VCF form; at
+// verbosity > 5 every scored variant is echoed.
+std::vector<Variant> screen_variants_by_score(const AlignmentDB& alignments,
+                                              const std::vector<Variant>& candidate_variants,
+                                              uint32_t alignment_flags)
+{
+    if(opt::verbose > 3) {
+        fprintf(stderr, "==== Starting variant screening =====\n");
+    }
+
+    std::vector<Variant> passing;
+    std::string contig = alignments.get_region_contig();
+
+    for(std::vector<Variant>::const_iterator it = candidate_variants.begin();
+        it != candidate_variants.end(); ++it) {
+
+        const Variant& candidate = *it;
+
+        // window of reference sequence used to evaluate this candidate in isolation
+        int window_start = candidate.ref_position - opt::min_flanking_sequence;
+        int window_end = candidate.ref_position + candidate.ref_seq.size() + opt::min_flanking_sequence;
+
+        Haplotype window_haplotype(contig,
+                                   window_start,
+                                   alignments.get_reference_substring(contig, window_start, window_end));
+
+        std::vector<HMMInputData> window_events =
+            alignments.get_event_subsequences(contig, window_start, window_end);
+
+        Variant scored = score_variant(candidate, window_haplotype, window_events, alignment_flags);
+        scored.info = "";
+
+        const bool improves_score = scored.quality > 0;
+        if(improves_score) {
+            passing.push_back(scored);
+        }
+
+        if(opt::verbose > 5 || (improves_score && opt::verbose > 3)) {
+            scored.write_vcf(stderr);
+        }
+    }
+    return passing;
+}
+
+// Grow a set of called variants with nearby indel alternatives.
+//
+// Every input variant is copied to the output unchanged. Substitutions (one reference
+// base replaced by one alternative base) get nothing more. For every other variant
+// five extra candidates are appended:
+//   - a copy whose ref_seq is re-read from the reference up to
+//     ref_position + ref_seq.size(); the asserts require it to be a strict extension
+//     of the original ref_seq, i.e. a longer deletion
+//   - four copies whose alt_seq is extended by one of A, C, G, T, i.e. longer insertions
+//
+// region_start, region_end and alignment_flags are not used by this function.
+std::vector<Variant> expand_variants(const AlignmentDB& alignments,
+                                     const std::vector<Variant>& candidate_variants,
+                                     int region_start,
+                                     int region_end,
+                                     uint32_t alignment_flags)
+{
+    std::vector<Variant> expanded;
+
+    // not read below; kept so the calls made on the AlignmentDB are unchanged
+    std::string contig = alignments.get_region_contig();
+
+    const char* const bases = "ACGT";
+
+    for(std::vector<Variant>::const_iterator it = candidate_variants.begin();
+        it != candidate_variants.end(); ++it) {
+
+        const Variant& original = *it;
+
+        // the variant itself is always retained
+        expanded.push_back(original);
+
+        const bool is_substitution = original.ref_seq.size() == 1 && original.alt_seq.size() == 1;
+        if(!is_substitution) {
+
+            // longer deletion
+            Variant longer_deletion = original;
+            longer_deletion.ref_seq = alignments.get_reference_substring(original.ref_name,
+                                                                         original.ref_position,
+                                                                         original.ref_position + original.ref_seq.size());
+            assert(longer_deletion.ref_seq != original.ref_seq);
+            assert(longer_deletion.ref_seq.substr(0, original.ref_seq.size()) == original.ref_seq);
+            expanded.push_back(longer_deletion);
+
+            // longer insertions, one per base
+            for(const char* base = bases; *base != '\0'; ++base) {
+                Variant longer_insertion = original;
+                longer_insertion.alt_seq.append(1, *base);
+                expanded.push_back(longer_insertion);
+            }
+        }
+    }
+    return expanded;
+}
+
+std::vector<Variant> get_variants_from_vcf(const std::string& filename,
const std::string& contig,
int region_start,
int region_end)
@@ -194,18 +353,18 @@ std::vector<Variant> get_variants_from_vcf(const std::string& filename,
std::ifstream infile(filename);
std::string line;
while(getline(infile, line)) {
-
+
// skip headers
if(line[0] == '#') {
continue;
}
-
+
// parse variant
Variant v(line);
if(v.ref_name == contig &&
- v.ref_position >= region_start &&
- v.ref_position <= region_end)
+ (int)v.ref_position >= region_start &&
+ (int)v.ref_position <= region_end)
{
out.push_back(v);
}
@@ -213,72 +372,376 @@ std::vector<Variant> get_variants_from_vcf(const std::string& filename,
return out;
}
-Haplotype call_variants_for_region(const std::string& contig, int region_start, int region_end)
+// Write debugging output that compares how well each read's events fit
+// base_haplotype versus called_haplotype.
+//
+// Two files are created, named
+//   variant.debug.<contig>:<start_position>-<stop_position>.stats.out
+//   variant.debug.<contig>:<start_position>-<stop_position>.alignment.out
+// The stats file gets one line per read (total and per-event scores under both
+// haplotypes plus the read's pore model scaling parameters). The alignment file gets
+// one line per event that is aligned under both haplotypes, showing the k-mer and
+// model level it was aligned to in each.
+//
+// If either file cannot be opened a warning is printed and nothing is written.
+void print_debug_stats(const std::string& contig,
+                       const int start_position,
+                       const int stop_position,
+                       const Haplotype& base_haplotype,
+                       const Haplotype& called_haplotype,
+                       const std::vector<HMMInputData>& event_sequences,
+                       uint32_t alignment_flags)
{
- const int BUFFER = 20;
- uint32_t alignment_flags = HAF_ALLOW_PRE_CLIP | HAF_ALLOW_POST_CLIP;
- if(region_start < BUFFER)
- region_start = BUFFER;
+    std::stringstream prefix_ss;
+    prefix_ss << "variant.debug." << contig << ":" << start_position << "-" << stop_position;
+    std::string stats_fn = prefix_ss.str() + ".stats.out";
+    std::string alignment_fn = prefix_ss.str() + ".alignment.out";
+
+    FILE* stats_out = fopen(stats_fn.c_str(), "w");
+    FILE* alignment_out = fopen(alignment_fn.c_str(), "w");
+
+    // fopen returns NULL on failure; writing to a NULL stream would crash
+    if(stats_out == NULL || alignment_out == NULL) {
+        fprintf(stderr, "Warning: could not open debug output files with prefix %s\n", prefix_ss.str().c_str());
+        if(stats_out != NULL) {
+            fclose(stats_out);
+        }
+        if(alignment_out != NULL) {
+            fclose(alignment_out);
+        }
+        return;
+    }
+
+    for(size_t i = 0; i < event_sequences.size(); i++) {
+        const HMMInputData& data = event_sequences[i];
+
+        // summarize score
+        double num_events = abs(data.event_start_idx - data.event_stop_idx) + 1;
+        double base_score = profile_hmm_score(base_haplotype.get_sequence(), data, alignment_flags);
+        double called_score = profile_hmm_score(called_haplotype.get_sequence(), data, alignment_flags);
+        double base_avg = base_score / num_events;
+        double called_avg = called_score / num_events;
+        const PoreModel& pm = data.read->pore_model[data.strand];
+        fprintf(stats_out, "%s\t%zu\t%zu\t", data.read->read_name.c_str(), data.strand, data.rc);
+        fprintf(stats_out, "%.2lf\t%.2lf\t\t%.2lf\t%.2lf\t%.2lf\t", base_score, called_score, base_avg, called_avg, called_score - base_score);
+        fprintf(stats_out, "%.2lf\t%.2lf\t%.4lf\t%.2lf\n", pm.shift, pm.scale, pm.drift, pm.var);
+
+        // print paired alignment
+        std::vector<HMMAlignmentState> base_align = profile_hmm_align(base_haplotype.get_sequence(), data, alignment_flags);
+        std::vector<HMMAlignmentState> called_align = profile_hmm_align(called_haplotype.get_sequence(), data, alignment_flags);
+
+        // there is nothing to pair up if either alignment is empty
+        if(base_align.empty() || called_align.empty()) {
+            continue;
+        }
+
+        size_t k = pm.k;
+        size_t bi = 0;
+        size_t ci = 0;
+
+        // Find the first event aligned in both
+        size_t max_event = std::max(base_align[0].event_idx, called_align[0].event_idx);
+        while(bi < base_align.size() && base_align[bi].event_idx != max_event) bi++;
+        while(ci < called_align.size() && called_align[ci].event_idx != max_event) ci++;
+
+        GaussianParameters standard_normal(0, 1.0);
+
+        double sum_base_abs_sl = 0.0f;
+        double sum_called_abs_sl = 0.0f;
+        while(bi < base_align.size() && ci < called_align.size()) {
+            size_t event_idx = base_align[bi].event_idx;
+            assert(called_align[ci].event_idx == event_idx);
+
+            double event_mean = data.read->get_fully_scaled_level(event_idx, data.strand);
+            double event_stdv = data.read->get_stdv(event_idx, data.strand);
+            double event_duration = data.read->get_duration(event_idx, data.strand);
+
+            std::string base_kmer = base_haplotype.get_sequence().substr(base_align[bi].kmer_idx, k);
+            std::string called_kmer = called_haplotype.get_sequence().substr(called_align[ci].kmer_idx, k);
+            if(data.rc) {
+                base_kmer = gDNAAlphabet.reverse_complement(base_kmer);
+                called_kmer = gDNAAlphabet.reverse_complement(called_kmer);
+            }
- // load the region, accounting for the buffering
- AlignmentDB alignments(opt::reads_file, opt::genome_file, opt::bam_file, opt::event_bam_file);
- alignments.load_region(contig, region_start - BUFFER, region_end + BUFFER);
- Haplotype derived_haplotype(contig,
- alignments.get_region_start(),
- alignments.get_reference());
+            PoreModelStateParams base_model = pm.states[pm.pmalphabet->kmer_rank(base_kmer.c_str(), k)];
+            PoreModelStateParams called_model = pm.states[pm.pmalphabet->kmer_rank(called_kmer.c_str(), k)];
- // Step 1. Discover putative variants across the whole region
- std::vector<Variant> candidate_variants;
- if(opt::candidates_file.empty()) {
- candidate_variants = alignments.get_variants_in_region(contig, region_start, region_end, opt::min_candidate_frequency, 20);
- } else {
- candidate_variants = get_variants_from_vcf(opt::candidates_file, contig, region_start, region_end);
+            // standardized distance of the event from each model level; only
+            // meaningful for match states, other states are reported as INFINITY
+            float base_standard_level = (event_mean - base_model.level_mean) / (sqrt(pm.var) * base_model.level_stdv);
+            float called_standard_level = (event_mean - called_model.level_mean) / (sqrt(pm.var) * called_model.level_stdv);
+            base_standard_level = base_align[bi].state == 'M' ? base_standard_level : INFINITY;
+            called_standard_level = called_align[ci].state == 'M' ? called_standard_level : INFINITY;
+
+            // each alignment is read through its own cursor (bi for base, ci for called)
+            sum_base_abs_sl = base_align[bi].l_fm;
+            sum_called_abs_sl = called_align[ci].l_fm;
+
+            char diff = base_kmer != called_kmer ? 'D' : ' ';
+            fprintf(alignment_out, "%s\t%zu\t%.2lf\t%.2lf\t%.4lf\t", data.read->read_name.c_str(), event_idx, event_mean, event_stdv, event_duration);
+            fprintf(alignment_out, "%c\t%c\t%zu\t%zu\t\t", base_align[bi].state, called_align[ci].state, base_align[bi].kmer_idx, called_align[ci].kmer_idx);
+            fprintf(alignment_out, "%s\t%.2lf\t%s\t%.2lf\t", base_kmer.c_str(), base_model.level_mean, called_kmer.c_str(), called_model.level_mean);
+            fprintf(alignment_out, "%.2lf\t%.2lf\t%c\t%.2lf\n", base_standard_level, called_standard_level, diff, sum_called_abs_sl - sum_base_abs_sl);
+
+            // Go to the next event, stopping at the end of each alignment
+            while(bi < base_align.size() && base_align[bi].event_idx == event_idx) bi++;
+            while(ci < called_align.size() && called_align[ci].event_idx == event_idx) ci++;
+        }
}
- // Step 2. Add variants to the haplotypes
- size_t calling_span = 10;
+    fclose(stats_out);
+    fclose(alignment_out);
+}
+
+// Re-call the length of homopolymer runs in input_haplotype.
+//
+// The haplotype is scanned for runs of MIN_HP_LENGTH..MAX_HP_LENGTH identical bases
+// that have at least opt::min_flanking_sequence bases on both sides. For each run,
+// every run length in that range is scored over all reads covering it as the sum of
+//   - the profile HMM score of the sequence with the run set to that length, and
+//   - a gamma-based likelihood (DurationModel::log_gamma_sum) of the total event
+//     duration aligned to the run plus a small window around it.
+// When the best length beats the current length by at least CALL_THRESHOLD a single
+// 1bp insertion or deletion is applied, even if the best length differs by more.
+//
+// Returns a copy of input_haplotype with the accepted edits applied.
+Haplotype fix_homopolymers(const Haplotype& input_haplotype,
+                           const AlignmentDB& alignments)
+{
+    uint32_t alignment_flags = 0;
+    Haplotype fixed_haplotype = input_haplotype;
+    const std::string& haplotype_sequence = input_haplotype.get_sequence();
+    size_t kmer_size = 6;
+    size_t MIN_HP_LENGTH = 3;
+    size_t MAX_HP_LENGTH = 9;
+    double CALL_THRESHOLD = 10;
+
+    // scan for homopolymers
+    size_t i = 0;
+    while(i < haplotype_sequence.size()) {
+        // start a new homopolymer
+        char hp_base = haplotype_sequence[i];
+        size_t hap_hp_start = i;
+        size_t ref_hp_start = hap_hp_start + input_haplotype.get_reference_position();
+        while(i < haplotype_sequence.size() && haplotype_sequence[i] == hp_base) i++;
+
+        // a run that reaches the end of the haplotype has no right flank
+        if(i >= haplotype_sequence.size()) {
+            break;
+        }
+
+        size_t hap_hp_end = i;
+        size_t hp_length = hap_hp_end - hap_hp_start;
+
+        if(hp_length < MIN_HP_LENGTH || hp_length > MAX_HP_LENGTH)
+            continue;
+
+        // Set the calling range based on the *reference* (not haplotype) coordinates
+        // of the region surrounding the homopolymer. This is so we can extract the alignments
+        // from the alignment DB using reference coordinates. NB get_enclosing... may change
+        // hap_calling_start/end
+        if(hap_hp_start < opt::min_flanking_sequence)
+            continue;
+        if(hap_hp_end + opt::min_flanking_sequence >= haplotype_sequence.size())
+            continue;
+
+        size_t hap_calling_start = hap_hp_start - opt::min_flanking_sequence;
+        size_t hap_calling_end = hap_hp_end + opt::min_flanking_sequence;
+        size_t ref_calling_start, ref_calling_end;
+        input_haplotype.get_enclosing_reference_range_for_haplotype_range(hap_calling_start,
+                                                                          hap_calling_end,
+                                                                          ref_calling_start,
+                                                                          ref_calling_end);
+
+        if(ref_calling_start == std::string::npos || ref_calling_end == std::string::npos) {
+            continue;
+        }
+
+        if(opt::verbose > 3) {
+            fprintf(stderr, "[fixhp] Found %zu-mer %c at %zu (seq: %s)\n", hp_length, hp_base, hap_hp_start, haplotype_sequence.substr(hap_hp_start - kmer_size - 1, hp_length + 10).c_str());
+        }
+
+        if(ref_calling_start < alignments.get_region_start() || ref_calling_end >= alignments.get_region_end()) {
+            continue;
+        }
+        assert(ref_calling_start <= ref_calling_end);
+
+        if(ref_calling_start < input_haplotype.get_reference_position() ||
+           ref_calling_end >= input_haplotype.get_reference_end()) {
+            continue;
+        }
+
+        Haplotype calling_haplotype =
+            input_haplotype.substr_by_reference(ref_calling_start, ref_calling_end);
+        std::string calling_sequence = calling_haplotype.get_sequence();
+
+        // Get the events for the calling region
+        std::vector<HMMInputData> event_sequences =
+            alignments.get_event_subsequences(alignments.get_region_contig(), ref_calling_start, ref_calling_end);
+
+        // the kmer with the first base of the homopolymer in the last position
+        size_t k0 = hap_hp_start - hap_calling_start - kmer_size + 1;
+
+        // Likelihoods indexed by run length. Only MIN_HP_LENGTH..MAX_HP_LENGTH are filled in.
+        // One slot past MAX_HP_LENGTH is allocated because the verbose report below reads
+        // index hp_length + 1, which equals MAX_HP_LENGTH + 1 for the longest run.
+        std::vector<double> duration_likelihoods(MAX_HP_LENGTH + 2, 0.0f);
+        std::vector<double> event_likelihoods(MAX_HP_LENGTH + 2, 0.0f);
+
+        for(size_t j = 0; j < event_sequences.size(); ++j) {
+            assert(kmer_size == event_sequences[j].read->pore_model[0].k);
+            assert(kmer_size == 6);
+
+            // skip small event regions
+            if( abs(event_sequences[j].event_start_idx - event_sequences[j].event_stop_idx) < 10) {
+                continue;
+            }
+
+            // Fit a gamma distribution to the durations in this region of the read
+            const SquiggleRead* read = event_sequences[j].read;
+            size_t strand = event_sequences[j].strand;
+            double local_time = fabs(read->get_time(event_sequences[j].event_start_idx, strand) - read->get_time(event_sequences[j].event_stop_idx, strand));
+            double local_bases = calling_sequence.size();
+            double local_avg = local_time / local_bases;
+            GammaParameters params;
+            params.shape = 2.461964;
+            params.rate = (1 / local_avg) * params.shape;
+            if(opt::verbose > 3) {
+                fprintf(stderr, "[fixhp] RATE local: %s\t%.6lf\n", read->read_name.c_str(), local_avg);
+            }
+
+            // Calculate the duration likelihood of an l-mer at this hp
+            // we align to a modified version of the haplotype sequence which contains the l-mer
+            for(int var_sequence_length = MIN_HP_LENGTH; var_sequence_length <= MAX_HP_LENGTH; ++var_sequence_length) {
+                int var_sequence_diff = var_sequence_length - hp_length;
+                std::string variant_sequence = calling_sequence;
+                if(var_sequence_diff < 0) {
+                    variant_sequence.erase(hap_hp_start - hap_calling_start, abs(var_sequence_diff));
+                } else if(var_sequence_diff > 0) {
+                    variant_sequence.insert(hap_hp_start - hap_calling_start, var_sequence_diff, hp_base);
+                }
+
+                // align events
+                std::vector<double> durations_by_kmer = DurationModel::generate_aligned_durations(variant_sequence,
+                                                                                                  event_sequences[j],
+                                                                                                  alignment_flags);
+                // event current measurement likelihood using the standard HMM
+                event_likelihoods[var_sequence_length] += profile_hmm_score(variant_sequence, event_sequences[j], alignment_flags);
+
+                // the call window parameter determines how much flanking sequence around the HP we include in the total duration calculation
+                int call_window = 2;
+                size_t variant_offset_start = k0 + 4 - call_window;
+                size_t variant_offset_end = k0 + hp_length + var_sequence_diff + call_window;
+                double sum_duration = 0.0f;
+                for(size_t k = variant_offset_start; k < variant_offset_end; k++) {
+                    sum_duration += durations_by_kmer[k];
+                }
+
+                double num_kmers = variant_offset_end - variant_offset_start;
+                double log_gamma = sum_duration > MIN_DURATION ? DurationModel::log_gamma_sum(sum_duration, params, num_kmers) : 0.0f;
+                duration_likelihoods[var_sequence_length] += log_gamma;
+                if(opt::verbose > 3) {
+                    // hp_length and the offset difference are size_t, so they are printed with %zu
+                    fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%zu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma);
+                }
+            }
+        }
+
+        std::stringstream duration_lik_out;
+        std::stringstream event_lik_out;
+        std::vector<double> score_by_length(duration_likelihoods.size());
+
+        // make a call
+        double max_score = -INFINITY;
+        size_t call = -1;
+
+        for(size_t len = MIN_HP_LENGTH; len <= MAX_HP_LENGTH; ++len) {
+            assert(len < duration_likelihoods.size());
+            double d_lik = duration_likelihoods[len];
+            double e_lik = event_likelihoods[len];
+
+            double score = d_lik + e_lik;
+            score_by_length[len] = score;
+            if(score > max_score) {
+                max_score = score;
+                call = len;
+            }
+            duration_lik_out << d_lik << "\t";
+            event_lik_out << e_lik << "\t";
+        }
+
+        // improvement of the best length over the length currently in the haplotype
+        double score = max_score - score_by_length[hp_length];
+        if(opt::verbose > 3) {
+            double del_score = duration_likelihoods[hp_length - 1] - duration_likelihoods[hp_length];
+            double ins_score = duration_likelihoods[hp_length + 1] - duration_likelihoods[hp_length];
+            double del_e_score = event_likelihoods[hp_length - 1] - event_likelihoods[hp_length];
+            double ins_e_score = event_likelihoods[hp_length + 1] - event_likelihoods[hp_length];
+            fprintf(stderr, "CALL\t%zu\t%.2lf\n", call, score);
+            fprintf(stderr, "LIKELIHOOD\t%s\n", duration_lik_out.str().c_str());
+            fprintf(stderr, "EIKELIHOOD\t%s\n", event_lik_out.str().c_str());
+            fprintf(stderr, "REF_SCORE\t%zu\t%zu\t%.2lf\t%.2lf\n", ref_hp_start, hp_length, del_score, ins_score);
+            fprintf(stderr, "EVENT_SCORE\t%zu\t%zu\t%.2lf\t%.2lf\n", ref_hp_start, hp_length, del_e_score, ins_e_score);
+            fprintf(stderr, "COMBINED_SCORE\t%zu\t%zu\t%.2lf\t%.2lf\n", ref_hp_start, hp_length, del_score + del_e_score, ins_score + ins_e_score);
+        }
+
+        if(score < CALL_THRESHOLD)
+            continue;
+
+        int size_diff = call - hp_length;
+        std::string contig = fixed_haplotype.get_reference_name();
+        if(size_diff > 0) {
+            // add a 1bp insertion in this region
+            // the variant might conflict with other variants in the region
+            // so we try multiple positions
+            // NB: it is intended that if the call is a 2bp (or greater) insertion
+            // we only insert 1bp (for now)
+            for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) {
+                Variant v;
+                v.ref_name = contig;
+                v.ref_position = input_haplotype.get_reference_position_for_haplotype_base(k);
+                if(v.ref_position == std::string::npos) {
+                    continue;
+                }
+                v.ref_seq = fixed_haplotype.substr_by_reference(v.ref_position, v.ref_position).get_sequence();
+                if(v.ref_seq.size() == 1 && v.ref_seq[0] == hp_base) {
+                    v.alt_seq = v.ref_seq + hp_base;
+                    v.quality = score;
+                    // if the variant can be added here (ie it doesnt overlap a
+                    // conflicting variant) then stop
+                    if(fixed_haplotype.apply_variant(v)) {
+                        break;
+                    }
+                }
+            }
+        } else if(size_diff < 0) {
+            // add a 1bp deletion at this position
+            for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) {
+                Variant v;
+                v.ref_name = contig;
+                v.ref_position = input_haplotype.get_reference_position_for_haplotype_base(k);
+                v.quality = score;
+                if(v.ref_position == std::string::npos) {
+                    continue;
+                }
+                v.ref_seq = fixed_haplotype.substr_by_reference(v.ref_position, v.ref_position + 1).get_sequence();
+                if(v.ref_seq.size() == 2 && v.ref_seq[0] == hp_base && v.ref_seq[1] == hp_base) {
+                    v.alt_seq = v.ref_seq[0];
+
+                    // if the variant can be added here (ie it doesnt overlap a
+                    // conflicting variant) then stop
+                    if(fixed_haplotype.apply_variant(v)) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    return fixed_haplotype;
+}
+
+Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments,
+ const std::vector<Variant>& candidate_variants,
+ uint32_t alignment_flags)
+{
+ Haplotype derived_haplotype(alignments.get_region_contig(), alignments.get_region_start(), alignments.get_reference());
+
size_t curr_variant_idx = 0;
while(curr_variant_idx < candidate_variants.size()) {
-
- // Group the variants that are within calling_span bases of each other
+
+ // Group the variants that are within calling_span bases of each other
size_t end_variant_idx = curr_variant_idx + 1;
while(end_variant_idx < candidate_variants.size()) {
- int distance = candidate_variants[end_variant_idx].ref_position -
+ int distance = candidate_variants[end_variant_idx].ref_position -
candidate_variants[end_variant_idx - 1].ref_position;
- if(distance > calling_span)
+ if(distance > opt::min_distance_between_variants)
break;
end_variant_idx++;
}
-
+
size_t num_variants = end_variant_idx - curr_variant_idx;
- int calling_start = candidate_variants[curr_variant_idx].ref_position - calling_span;
+ int calling_start = candidate_variants[curr_variant_idx].ref_position - opt::min_flanking_sequence;
int calling_end = candidate_variants[end_variant_idx - 1].ref_position +
candidate_variants[end_variant_idx - 1].ref_seq.length() +
- calling_span;
+ opt::min_flanking_sequence;
int calling_size = calling_end - calling_start;
if(opt::verbose > 2) {
fprintf(stderr, "%zu variants in span [%d %d]\n", num_variants, calling_start, calling_end);
}
-
- // Only try to call variants if there is a reasonable amount and the window is not too large
- if(num_variants <= 10 && calling_size <= 100) {
+
+ // Only try to call if the window is not too large
+ if(calling_size <= 200) {
// Subset the haplotype to the region we are calling
- Haplotype calling_haplotype =
+ Haplotype calling_haplotype =
derived_haplotype.substr_by_reference(calling_start, calling_end);
-
+
// Get the events for the calling region
- std::vector<HMMInputData> event_sequences =
- alignments.get_event_subsequences(contig, calling_start, calling_end);
-
+ std::vector<HMMInputData> event_sequences =
+ alignments.get_event_subsequences(alignments.get_region_contig(), calling_start, calling_end);
+
// Subset the variants
- std::vector<Variant> calling_variants(candidate_variants.begin() + curr_variant_idx,
+ std::vector<Variant> calling_variants(candidate_variants.begin() + curr_variant_idx,
candidate_variants.begin() + end_variant_idx);
-
+
// Select the best set of variants
- std::vector<Variant> selected_variants =
- select_variant_set(calling_variants, calling_haplotype, event_sequences, alignment_flags);
+ std::vector<Variant> selected_variants =
+ select_variant_set(calling_variants, calling_haplotype, event_sequences, opt::max_haplotypes, alignment_flags);
// optionally annotate each variant with fraction of reads supporting A,C,G,T at this position
if(opt::calculate_all_support) {
@@ -294,7 +757,19 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start,
selected_variants[vi].write_vcf(stderr);
}
}
- }
+
+ if(opt::debug_alignments) {
+ print_debug_stats(alignments.get_region_contig(),
+ calling_start,
+ calling_end,
+ calling_haplotype,
+ derived_haplotype.substr_by_reference(calling_start, calling_end),
+ event_sequences,
+ alignment_flags);
+ }
+ } else {
+ fprintf(stderr, "Warning: %zu variants in span, region not called [%d %d]\n", num_variants, calling_start, calling_end);
+ }
// advance to start of next region
curr_variant_idx = end_variant_idx;
@@ -303,6 +778,112 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start,
return derived_haplotype;
}
+
+// Run the calling pipeline for one region of a contig and return the called haplotype.
+//
+// The region is loaded with BUFFER extra bases on each side. Candidate variants come
+// from the alignments, or from opt::candidates_file when one is given. In consensus
+// mode every single-base edit of the region is added to the candidates, then up to
+// MAX_ROUNDS rounds of screen -> call -> expand are run, and the resulting sequence is
+// written as FASTA to opt::consensus_output. Homopolymer lengths are re-called
+// afterwards when opt::fix_homopolymers is set.
+//
+// Exits the program if the consensus output file cannot be opened.
+Haplotype call_variants_for_region(const std::string& contig, int region_start, int region_end)
+{
+    const int BUFFER = opt::min_flanking_sequence + 10;
+    uint32_t alignment_flags = HAF_ALLOW_PRE_CLIP | HAF_ALLOW_POST_CLIP;
+
+    // load the region, accounting for the buffering
+    if(region_start < BUFFER)
+        region_start = BUFFER;
+    AlignmentDB alignments(opt::reads_file, opt::genome_file, opt::bam_file, opt::event_bam_file, opt::calibrate);
+
+    if(!opt::alternative_model_type.empty()) {
+        alignments.set_alternative_model_type(opt::alternative_model_type);
+    }
+
+    alignments.load_region(contig, region_start - BUFFER, region_end + BUFFER);
+
+    // if the end of the region plus the buffer sequence goes past
+    // the end of the chromosome, we adjust the region end here
+    region_end = alignments.get_region_end() - BUFFER;
+
+    if(opt::verbose > 4) {
+        fprintf(stderr, "input region: %s\n", alignments.get_reference_substring(contig, region_start - BUFFER, region_end + BUFFER).c_str());
+    }
+
+    // Step 1. Discover putative variants across the whole region
+    std::vector<Variant> candidate_variants;
+    if(opt::candidates_file.empty()) {
+        candidate_variants = alignments.get_variants_in_region(contig, region_start, region_end, opt::min_candidate_frequency, 20);
+    } else {
+        candidate_variants = get_variants_from_vcf(opt::candidates_file, contig, region_start, region_end);
+    }
+
+    if(opt::consensus_mode) {
+
+        // generate every single-base edit of the region; they are screened by score below
+        std::vector<Variant> single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags);
+
+        // insert these into the candidate set
+        candidate_variants.insert(candidate_variants.end(), single_base_edits.begin(), single_base_edits.end());
+
+        // deduplicate variants
+        std::set<Variant, VariantKeyComp> dedup_set(candidate_variants.begin(), candidate_variants.end());
+        candidate_variants.clear();
+        candidate_variants.insert(candidate_variants.end(), dedup_set.begin(), dedup_set.end());
+        std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition);
+    }
+
+    // Step 2. Call variants
+
+    // in consensus mode we iterate until a maximum number of rounds is reached
+    size_t round = 0;
+    size_t MAX_ROUNDS = 5;
+    Haplotype called_haplotype(alignments.get_region_contig(),
+                               alignments.get_region_start(),
+                               alignments.get_reference());
+
+    // NOTE(review): this loop is skipped entirely when opt::consensus_mode is 0, so
+    // outside consensus mode the candidates from Step 1 are never applied and
+    // called_haplotype stays the plain reference -- confirm this is intended.
+    while(opt::consensus_mode && round++ < MAX_ROUNDS) {
+        if(opt::verbose > 3) {
+            fprintf(stderr, "Round %zu\n", round);
+        }
+
+        // Filter the variant set down by only including those that individually contribute a positive score
+        std::vector<Variant> filtered_variants = screen_variants_by_score(alignments,
+                                                                          candidate_variants,
+                                                                          alignment_flags);
+
+        // Combine variants into sets that maximize their haplotype score
+        called_haplotype = call_haplotype_from_candidates(alignments,
+                                                          filtered_variants,
+                                                          alignment_flags);
+
+        // always true here, as the loop condition already requires consensus mode
+        if(opt::consensus_mode) {
+            // Expand the called variant set by adding nearby variants
+            std::vector<Variant> called_variants = called_haplotype.get_variants();
+            candidate_variants = expand_variants(alignments,
+                                                 called_variants,
+                                                 region_start,
+                                                 region_end,
+                                                 alignment_flags);
+        }
+    }
+
+    if(opt::fix_homopolymers) {
+        called_haplotype = fix_homopolymers(called_haplotype, alignments);
+    }
+
+    if(opt::consensus_mode) {
+        FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w");
+
+        // fopen returns NULL on failure; fail with a message instead of crashing in fprintf
+        if(consensus_fp == NULL) {
+            fprintf(stderr, "Error: could not open consensus output file %s for writing\n", opt::consensus_output.c_str());
+            exit(EXIT_FAILURE);
+        }
+
+        fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(),
+                                                 alignments.get_region_start(),
+                                                 alignments.get_region_end(),
+                                                 called_haplotype.get_sequence().c_str());
+        fclose(consensus_fp);
+    }
+
+    return called_haplotype;
+}
+
void parse_call_variants_options(int argc, char** argv)
{
bool die = false;
@@ -320,9 +901,16 @@ void parse_call_variants_options(int argc, char** argv)
case '?': die = true; break;
case 't': arg >> opt::num_threads; break;
case 'v': opt::verbose++; break;
+ case OPT_CONSENSUS: arg >> opt::consensus_output; opt::consensus_mode = 1; break;
+ case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break;
+ case OPT_MODELS_FOFN: arg >> opt::models_fofn; break;
case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break;
case OPT_SNPS_ONLY: opt::snps_only = 1; break;
case OPT_PROGRESS: opt::show_progress = 1; break;
+ case OPT_P_SKIP: arg >> g_p_skip; break;
+ case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break;
+ case OPT_P_BAD: arg >> g_p_bad; break;
+ case OPT_P_BAD_SELF: arg >> g_p_bad_self; break;
case OPT_HELP:
std::cout << CONSENSUS_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
@@ -349,7 +937,7 @@ void parse_call_variants_options(int argc, char** argv)
std::cerr << SUBPROGRAM ": a --reads file must be provided\n";
die = true;
}
-
+
if(opt::genome_file.empty()) {
std::cerr << SUBPROGRAM ": a --genome file must be provided\n";
die = true;
@@ -360,7 +948,15 @@ void parse_call_variants_options(int argc, char** argv)
die = true;
}
- if (die)
+ if(opt::models_fofn.empty()) {
+ std::cerr << SUBPROGRAM ": a --models file must be provided\n";
+ die = true;
+ } else {
+ // initialize the model set from the fofn
+ PoreModelSet::initialize(opt::models_fofn);
+ }
+
+ if (die)
{
std::cout << "\n" << CONSENSUS_USAGE_MESSAGE;
exit(EXIT_FAILURE);
@@ -381,7 +977,7 @@ int call_variants_main(int argc, char** argv)
std::string contig;
int start_base;
int end_base;
-
+
parser >> contig >> start_base >> end_base;
end_base = std::min(end_base, get_contig_length(contig) - 1);
@@ -407,4 +1003,5 @@ int call_variants_main(int argc, char** argv)
if(out_fp != stdout) {
fclose(out_fp);
}
+ return 0;
}
diff --git a/src/nanopolish_call_variants.h b/src/nanopolish_call_variants.h
index 4566524..51248f6 100644
--- a/src/nanopolish_call_variants.h
+++ b/src/nanopolish_call_variants.h
@@ -8,6 +8,6 @@
#ifndef NANOPOLISH_CALL_VARIANTS_H
#define NANOPOLISH_CALL_VARIANTS_H
-void call_variants_main(int argc, char** argv);
+int call_variants_main(int argc, char** argv);
#endif
diff --git a/src/nanopolish_consensus.cpp b/src/nanopolish_consensus.cpp
index 785cf9e..e6cb619 100644
--- a/src/nanopolish_consensus.cpp
+++ b/src/nanopolish_consensus.cpp
@@ -27,6 +27,7 @@
#include "nanopolish_anchor.h"
#include "nanopolish_fast5_map.h"
#include "nanopolish_hmm_input_sequence.h"
+#include "nanopolish_pore_model_set.h"
#include "profiler.h"
#include "progress.h"
#include "stdaln.h"
@@ -57,7 +58,7 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
"Copyright 2015 Ontario Institute for Cancer Research\n";
static const char *CONSENSUS_USAGE_MESSAGE =
-"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n"
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] -w contig:start-end --reads reads.fa --bam alignments.bam --genome genome.fa\n"
"Compute a new consensus sequence for an assembly using a signal-level HMM\n"
"\n"
" -v, --verbose display verbose output\n"
@@ -69,6 +70,7 @@ static const char *CONSENSUS_USAGE_MESSAGE =
" -g, --genome=FILE the genome we are computing a consensus for is in FILE\n"
" -o, --outfile=FILE write result to FILE [default: stdout]\n"
" -t, --threads=NUM use NUM threads (default: 1)\n"
+" --models-fofn=FILE read alternative k-mer models from FILE\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
@@ -79,13 +81,15 @@ namespace opt
static std::string genome_file;
static std::string output_file;
static std::string window;
+ static std::string models_fofn;
+ static std::string alternative_model_type = DEFAULT_MODEL_TYPE;
static int show_progress = 0;
static int num_threads = 1;
}
static const char* shortopts = "r:b:g:t:w:o:v";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_VCF, OPT_PROGRESS };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_VCF, OPT_PROGRESS, OPT_MODELS_FOFN };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -95,6 +99,7 @@ static const struct option longopts[] = {
{ "window", required_argument, NULL, 'w' },
{ "outfile", required_argument, NULL, 'o' },
{ "threads", required_argument, NULL, 't' },
+ { "models-fofn", required_argument, NULL, OPT_MODELS_FOFN },
{ "progress", no_argument, NULL, OPT_PROGRESS },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
@@ -126,6 +131,7 @@ std::vector<HMMInputData> get_input_for_columns(HMMRealignmentInput& window,
if(start_sa.rc != end_sa.rc)
continue;
+
HMMInputData data;
uint32_t read_idx = rsi / 2;
@@ -151,11 +157,7 @@ std::vector<HMMInputData> get_input_for_columns(HMMRealignmentInput& window,
// scoring functions without writing a bunch of code
double score_sequence(const std::string& sequence, const HMMInputData& data)
{
- //return score_skip_merge(sequence, state);
- //return score_khmm_model_postmerge(sequence, state);
- //return khmm_score(sequence, state, AP_GLOBAL);
return profile_hmm_score(sequence, data);
- //return score_emission_dp(sequence, state);
}
@@ -220,7 +222,6 @@ bool sortIndexedPathScoreDesc(const IndexedPathScore& a, const IndexedPathScore&
void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input)
{
PROFILE_FUNC("score_paths")
- double MIN_FIT = INFINITY;
size_t CULL_RATE = 5;
double CULL_MIN_SCORE = -30.0f;
double CULL_MIN_IMPROVED_FRACTION = 0.2f;
@@ -254,7 +255,7 @@ void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input)
fprintf(stderr, "Scoring %d\n", ri);
}
- const HMMInputData& data = input[ri];
+ //const HMMInputData& data = input[ri];
std::vector<IndexedPathScore> result(paths.size());
// Score all paths
@@ -306,6 +307,7 @@ void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input)
std::stable_sort(paths.begin(), paths.end(), sortPathConsScoreDesc);
#if DEBUG_PATH_SELECTION
+ double MIN_FIT = INFINITY;
for(size_t pi = 0; pi < paths.size(); ++pi) {
// Calculate the length of the matching prefix with the initial sequence
@@ -313,20 +315,16 @@ void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input)
char initial = s == first ? 'I' : ' ';
- printf("%zu\t%s\t%.1lf\t%zu %c %s", pi, paths[pi].path.c_str(), paths[pi].score, paths[pi].sum_rank, initial, paths[pi].mutdesc.c_str());
+ fprintf(stderr, "%zu\t%s\t%.1lf\t%zu %c %s", pi, paths[pi].path.c_str(), paths[pi].score, paths[pi].sum_rank, initial, paths[pi].mutdesc.c_str());
// If this is the truth path or the best path, show the scores for all reads
if(pi <= 1 || initial == 'I') {
for(uint32_t ri = 0; ri < input.size(); ++ri) {
const HMMInputData& data = input[ri];
- const KHMMParameters& parameters = data.read->parameters[data.strand];
- if( fabs(parameters.fit_quality) > MIN_FIT)
- continue;
-
double curr = score_sequence(paths[pi].path, input[ri]);
- printf("%.1lf,%.2lf ", parameters.fit_quality, curr);
+ fprintf(stderr, "%.2lf ", curr);
}
}
- printf("\n");
+ fprintf(stderr, "\n");
}
#endif
@@ -340,7 +338,7 @@ void extend_paths(PathConsVector& paths, int maxk = 2)
for(int k = 1; k <= maxk; ++k) {
- for(int pi = 0; pi < paths.size(); ++pi) {
+ for(unsigned pi = 0; pi < paths.size(); ++pi) {
std::string first(k, 'A');
std::string extension = first;
@@ -539,8 +537,10 @@ void filter_outlier_data(std::vector<HMMInputData>& input, const std::string& se
if(opt::verbose >= 1) {
fprintf(stderr, "OUTLIER_FILTER %d %.2lf %.2lf %.2lf\n", ri, curr, n_events, lp_per_event);
}
-
- if(fabs(lp_per_event) < 3.5f) {
+
+ // R9 thresholds
+ double threshold = model_stdv() ? 8.0f : 4.0f; // TODO: check
+ if(fabs(lp_per_event) < threshold) {
out_rs.push_back(rs);
}
}
@@ -560,7 +560,7 @@ std::string join_sequences_at_kmer(const std::string& a, const std::string& b, c
return a + b.substr(k);
}
-void run_splice_segment(HMMRealignmentInput& window, uint32_t segment_id)
+void run_splice_segment(HMMRealignmentInput& window, uint32_t segment_id, const uint32_t k)
{
// The structure of the data looks like this:
@@ -593,11 +593,11 @@ void run_splice_segment(HMMRealignmentInput& window, uint32_t segment_id)
// set up the input data for the HMM
std::vector<HMMInputData> data = get_input_for_columns(window, start_column, end_column);
-
- // assume models for all the reads have the same k
- assert(!data.empty());
- const uint32_t k = data[0].read->pore_model[data[0].strand].k;
+ if(opt::verbose > 0) {
+ fprintf(stderr, "correcting segment %u with %zu reads\n", segment_id, data.size());
+ }
+
// The current consensus sequence
std::string original = join_sequences_at_kmer(s_m_base, m_e_base, k);
std::string base = original;
@@ -669,8 +669,12 @@ void train_segment(HMMRealignmentInput& window, uint32_t segment_id)
// Set up the the input data for the HMM
std::vector<HMMInputData> input = get_input_for_columns(window, start_column, end_column);
+ // no training can be performed if there are no reads for this segment
+ if(input.empty()) {
+ return;
+ }
+
// assume models for all the reads have the same k
- assert(!input.empty());
const uint32_t k = input[0].read->pore_model[input[0].strand].k;
std::string segment_sequence = join_sequences_at_kmer(s_m_base, m_e_base, k);
@@ -699,18 +703,32 @@ void train(HMMRealignmentInput& window)
std::string call_consensus_for_window(const Fast5Map& name_map, const std::string& contig, int start_base, int end_base)
{
const int minor_segment_stride = 50;
- HMMRealignmentInput window = build_input_for_region(opt::bam_file, opt::genome_file, name_map, contig, start_base, end_base, minor_segment_stride);
+ HMMRealignmentInput window = build_input_for_region(opt::bam_file,
+ opt::genome_file,
+ name_map,
+ contig,
+ start_base,
+ end_base,
+ minor_segment_stride,
+ opt::alternative_model_type);
+ uint32_t num_segments = window.anchored_columns.size();
- if(window.reads.empty()) {
+ // If there are no reads or not enough segments, do not try to call a consensus sequence
+ if(window.reads.empty() || num_segments < 3) {
// No data for this window, just return the original sequence as the consensus
assert(!window.original_sequence.empty());
return window.original_sequence;
}
+
+ if(opt::verbose > 0) {
+ fprintf(stderr, "correcting window %s:%d-%d with %zu reads\n", contig.c_str(), start_base, end_base, window.reads.size());
+ }
//
// Train the HMM
//
- train(window);
+ WARN_ONCE("Debug: using default transition parameters");
+ //train(window);
// assume models for all the reads have the same k
const uint32_t k = window.reads[0]->pore_model[T_IDX].k;
@@ -721,7 +739,6 @@ std::string call_consensus_for_window(const Fast5Map& name_map, const std::strin
std::string reference = "";
std::string consensus = "";
- uint32_t num_segments = window.anchored_columns.size();
uint32_t start_segment_id = 0;
// Copy the base segments before they are updated
@@ -744,7 +761,7 @@ std::string call_consensus_for_window(const Fast5Map& name_map, const std::strin
}
// run the consensus algorithm for this segment
- run_splice_segment(window, segment_id);
+ run_splice_segment(window, segment_id, k);
// run_splice_segment updates the base_sequence of the current anchor, grab it and append
std::string base = window.anchored_columns[segment_id].base_sequence;
@@ -786,6 +803,7 @@ void parse_consensus_options(int argc, char** argv)
case '?': die = true; break;
case 't': arg >> opt::num_threads; break;
case 'v': opt::verbose++; break;
+ case OPT_MODELS_FOFN: arg >> opt::models_fofn; break;
case OPT_PROGRESS: opt::show_progress = 1; break;
case OPT_HELP:
std::cout << CONSENSUS_USAGE_MESSAGE;
@@ -823,6 +841,19 @@ void parse_consensus_options(int argc, char** argv)
std::cerr << SUBPROGRAM ": a --bam file must be provided\n";
die = true;
}
+
+ if(opt::models_fofn.empty()) {
+ std::cerr << SUBPROGRAM ": a --models file must be provided\n";
+ die = true;
+ } else {
+ // initialize the model set from the fofn
+ PoreModelSet::initialize(opt::models_fofn);
+ }
+
+ if(opt::window.empty()) {
+ std::cerr << SUBPROGRAM ": the -w (window) parameter must be provided\n";
+ die = true;
+ }
if (die)
{
@@ -837,7 +868,7 @@ int consensus_main(int argc, char** argv)
omp_set_num_threads(opt::num_threads);
Fast5Map name_map(opt::reads_file);
-
+
// Parse the window string
// Replace ":" and "-" with spaces to make it parseable with stringstream
std::replace(opt::window.begin(), opt::window.end(), ':', ' ');
@@ -872,4 +903,5 @@ int consensus_main(int argc, char** argv)
if(out_fp != stdout) {
fclose(out_fp);
}
+ return 0;
}
diff --git a/src/nanopolish_consensus.h b/src/nanopolish_consensus.h
index 38afea1..2cefc03 100644
--- a/src/nanopolish_consensus.h
+++ b/src/nanopolish_consensus.h
@@ -9,6 +9,6 @@
#ifndef NANOPOLISH_CONSENSUS_H
#define NANOPOLISH_CONSENSUS_H
-void consensus_main(int argc, char** argv);
+int consensus_main(int argc, char** argv);
#endif
diff --git a/src/nanopolish_getmodel.cpp b/src/nanopolish_getmodel.cpp
index 6f44689..6902c4d 100644
--- a/src/nanopolish_getmodel.cpp
+++ b/src/nanopolish_getmodel.cpp
@@ -120,4 +120,5 @@ int getmodel_main(int argc, char** argv)
gDNAAlphabet.lexicographic_next(kmer); // advance kmer
}
}
+ return 0;
}
diff --git a/src/nanopolish_getmodel.h b/src/nanopolish_getmodel.h
index c2d3aab..7d5ffb9 100644
--- a/src/nanopolish_getmodel.h
+++ b/src/nanopolish_getmodel.h
@@ -9,6 +9,6 @@
#ifndef NANOPOLISH_GETMODEL_H
#define NANOPOLISH_GETMODEL_H
-void getmodel_main(int argc, char** argv);
+int getmodel_main(int argc, char** argv);
#endif
diff --git a/src/nanopolish_haplotype.cpp b/src/nanopolish_haplotype.cpp
index df01957..5816278 100644
--- a/src/nanopolish_haplotype.cpp
+++ b/src/nanopolish_haplotype.cpp
@@ -26,7 +26,7 @@ Haplotype::Haplotype(const std::string& ref_name,
}
//
-void Haplotype::apply_variant(const Variant& v)
+bool Haplotype::apply_variant(const Variant& v)
{
// Search the coordinate map for the reference position
size_t derived_idx = _find_derived_index_by_ref_lower_bound(v.ref_position);
@@ -36,7 +36,7 @@ void Haplotype::apply_variant(const Variant& v)
if(derived_idx == m_coordinate_map.size() ||
m_coordinate_map[derived_idx] != v.ref_position)
{
- return;
+ return false;
}
// Check that the string matches
@@ -45,7 +45,7 @@ void Haplotype::apply_variant(const Variant& v)
// no match, variant conflicts with haplotype sequence
if(m_sequence.substr(derived_idx, rl) != v.ref_seq) {
- return;
+ return false;
}
// update sequence
@@ -67,10 +67,11 @@ void Haplotype::apply_variant(const Variant& v)
assert(m_coordinate_map.size() == m_sequence.size());
m_variants.push_back(v);
+ return true;
}
// return a new haplotype subsetted by reference coordinates
-Haplotype Haplotype::substr_by_reference(size_t start, size_t end)
+Haplotype Haplotype::substr_by_reference(size_t start, size_t end) const
{
assert(start >= m_ref_position);
assert(start <= m_ref_position + m_reference.length());
@@ -111,7 +112,32 @@ Haplotype Haplotype::substr_by_reference(size_t start, size_t end)
return ret;
}
-size_t Haplotype::_find_derived_index_by_ref_lower_bound(size_t ref_index)
+size_t Haplotype::get_reference_position_for_haplotype_base(size_t i) const
+{
+ assert(i < m_coordinate_map.size());
+ return m_coordinate_map[i] == INSERTED_POSITION ? std::string::npos : m_coordinate_map[i];
+}
+
+void Haplotype::get_enclosing_reference_range_for_haplotype_range(size_t& hap_lower, size_t& hap_upper,
+ size_t& ref_lower, size_t& ref_upper) const
+{
+ while(hap_lower > 0 && m_coordinate_map[hap_lower] == INSERTED_POSITION) {
+ hap_lower--;
+ }
+
+ while(hap_upper < m_coordinate_map.size() && m_coordinate_map[hap_upper] == INSERTED_POSITION) {
+ hap_upper++;
+ }
+
+ if(hap_lower == 0 || hap_upper >= m_coordinate_map.size()) {
+ hap_lower = hap_upper = ref_lower = ref_upper = std::string::npos;
+ } else {
+ ref_lower = m_coordinate_map[hap_lower];
+ ref_upper = m_coordinate_map[hap_upper];
+ }
+}
+
+size_t Haplotype::_find_derived_index_by_ref_lower_bound(size_t ref_index) const
{
for(size_t i = 0; i < m_coordinate_map.size(); ++i) {
if(m_coordinate_map[i] != INSERTED_POSITION && m_coordinate_map[i] >= ref_index) {
diff --git a/src/nanopolish_haplotype.h b/src/nanopolish_haplotype.h
index 1943fdf..9cf54fe 100644
--- a/src/nanopolish_haplotype.h
+++ b/src/nanopolish_haplotype.h
@@ -26,14 +26,35 @@ class Haplotype
// get the sequence of the reference
const std::string& get_reference() const { return m_reference; }
+ // get the reference location
+ const std::string get_reference_name() const { return m_ref_name; }
+ const size_t get_reference_position() const { return m_ref_position; }
+ const size_t get_reference_end() const { return m_ref_position + m_reference.length(); }
+
+ // return the reference position corresponding to base i of the haplotype
+ // returns std::string::npos if the base was inserted into the haplotype
+ // and therefore has no corresponding reference base
+ size_t get_reference_position_for_haplotype_base(size_t i) const;
+
// add a variant into the haplotype
- void apply_variant(const Variant& v);
+ // returns true if the variant is successfully added to the haplotype
+ bool apply_variant(const Variant& v);
// return all the variants on this haplotype
std::vector<Variant> get_variants() const { return m_variants; }
+
+ // Set ref_lower and ref_upper to be valid reference (ie non-deleted/inserted) positions that
+ // contain the lower/upper positions on the haplotype.
+ // If no such range exists then the return value(s) is set to std::string::npos
+
+ // Extend the haplotype range (hap_lower/hap_upper) until both have a corresponding
+ // reference base. Set ref_lower/ref_upper to these values. If a range cannot be found,
+ // the out parameters are set to std::string::npos
+ void get_enclosing_reference_range_for_haplotype_range(size_t& hap_lower, size_t& hap_upper,
+ size_t& ref_lower, size_t& ref_upper) const;
// return a new haplotype subsetted by reference coordinates
- Haplotype substr_by_reference(size_t start, size_t end);
+ Haplotype substr_by_reference(size_t start, size_t end) const;
private:
@@ -43,7 +64,7 @@ class Haplotype
// Find the first derived index that has a corresponding
// reference position which is not less than ref_index.
// This mimics std::lower_bound
- size_t _find_derived_index_by_ref_lower_bound(size_t ref_index);
+ size_t _find_derived_index_by_ref_lower_bound(size_t ref_index) const;
//
// data
diff --git a/src/nanopolish_methyltest.cpp b/src/nanopolish_methyltest.cpp
index fe1a022..a6c4700 100644
--- a/src/nanopolish_methyltest.cpp
+++ b/src/nanopolish_methyltest.cpp
@@ -3,7 +3,7 @@
// Written by Jared Simpson (jared.simpson at oicr.on.ca)
//---------------------------------------------------------
//
-// nanopolish_methyltrain -- train a methylation model
+// nanopolish_methyltest -- test CpG sites for methylation
//
#include <stdio.h>
#include <stdlib.h>
@@ -30,6 +30,7 @@
#include "nanopolish_anchor.h"
#include "nanopolish_fast5_map.h"
#include "nanopolish_methyltrain.h"
+#include "nanopolish_pore_model_set.h"
#include "H5pubconf.h"
#include "profiler.h"
#include "progress.h"
@@ -85,7 +86,7 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
static const char *METHYLTEST_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n"
-"Train a methylation model\n"
+"Test CpG sites for methylation\n"
"\n"
" -v, --verbose display verbose output\n"
" --version display version\n"
@@ -106,6 +107,7 @@ namespace opt
static std::string genome_file;
static std::string models_fofn;
static std::string region;
+ static std::string cpg_methylation_model_type = "reftrained";
static int progress = 0;
static int num_threads = 1;
static int batch_size = 128;
@@ -129,35 +131,20 @@ static const struct option longopts[] = {
{ NULL, 0, NULL, 0 }
};
-// Expand the event indices outwards
-void bump(HMMInputData& data, int amount)
-{
- if(data.event_start_idx < data.event_stop_idx) {
- data.event_start_idx -= amount;
- data.event_stop_idx += amount;
- } else {
- data.event_start_idx += amount;
- data.event_stop_idx -= amount;
- }
-}
-// Realign the read in event space
-void test_read(const ModelMap& model_map,
- const Fast5Map& name_map,
- const faidx_t* fai,
- const bam_hdr_t* hdr,
- const bam1_t* record,
- size_t read_idx,
- const OutputHandles& handles)
+// Test CpG sites in this read for methylation
+void calculate_methylation_for_read(const Fast5Map& name_map,
+ const faidx_t* fai,
+ const bam_hdr_t* hdr,
+ const bam1_t* record,
+ size_t read_idx,
+ const OutputHandles& handles)
{
// Load a squiggle read for the mapped read
std::string read_name = bam_get_qname(record);
std::string fast5_path = name_map.get_path(read_name);
-
- // load read
SquiggleRead sr(read_name, fast5_path);
- double read_score = 0.0f;
- size_t num_sites_tested = 0;
+ // An output map from reference positions to scored CpG sites
std::map<int, ScoredSite> site_score_map;
for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
@@ -166,22 +153,13 @@ void test_read(const ModelMap& model_map,
std::vector<int> site_ends;
std::vector<int> site_count;
- double strand_score = 0.0f;
+ // replace the baked-in pore model with the methylation model
+ // (including unmethylated kmers) for this strand
+ sr.replace_model(strand_idx, opt::cpg_methylation_model_type);
- // replace model
- std::string curr_model = sr.pore_model[strand_idx].name;
+ size_t k = sr.pore_model[strand_idx].k;
- std::string methyl_model = curr_model + ".methyltrain";
- auto model_iter = model_map.find(methyl_model);
-
- if(model_iter != model_map.end()) {
- sr.pore_model[strand_idx].update_states( model_iter->second );
- } else {
- fprintf(stderr, "Error, methylated model %s not found\n", methyl_model.c_str());
- exit(EXIT_FAILURE);
- }
-
- // Align to the new model
+ // Align in event space using the new model
EventAlignmentParameters params;
params.sr = &sr;
params.fai = fai;
@@ -196,8 +174,6 @@ void test_read(const ModelMap& model_map,
continue;
std::string contig = alignment_output.front().ref_name.c_str();
- //emit_event_alignment_tsv(stdout, sr, params, alignment_output);
-
// Convert the EventAlignment to a map between reference positions and events
std::vector<AlignedPair> event_aligned_pairs;
for(size_t i = 0; i < alignment_output.size(); ++i) {
@@ -210,13 +186,13 @@ void test_read(const ModelMap& model_map,
int ref_start_pos = event_aligned_pairs.front().ref_pos;
int ref_end_pos = event_aligned_pairs.back().ref_pos;
+ // Extract the reference sequence for this region
int fetched_len = 0;
assert(ref_end_pos >= ref_start_pos);
-
- // Extract the reference sequence for this region
std::string ref_seq = get_reference_region_ts(params.fai, contig.c_str(), ref_start_pos,
ref_end_pos, &fetched_len);
+ // Remove non-ACGT bases from this reference segment
ref_seq = gDNAAlphabet.disambiguate(ref_seq);
// Scan the sequence for CpGs
@@ -228,11 +204,12 @@ void test_read(const ModelMap& model_map,
}
}
- // Batch the CpGs together
+ // Batch the CpGs together into groups that are separated by some minimum distance
int min_separation = 10;
size_t curr_idx = 0;
while(curr_idx < cpg_sites.size()) {
-
+
+ // Find the endpoint of this group of sites
size_t end_idx = curr_idx + 1;
while(end_idx < cpg_sites.size()) {
if(cpg_sites[end_idx] - cpg_sites[end_idx - 1] > min_separation)
@@ -240,6 +217,7 @@ void test_read(const ModelMap& model_map,
end_idx += 1;
}
+ // the coordinates on the reference substring for this group of sites
int sub_start_pos = cpg_sites[curr_idx] - min_separation;
int sub_end_pos = cpg_sites[end_idx - 1] + min_separation;
@@ -248,16 +226,19 @@ void test_read(const ModelMap& model_map,
std::string subseq = ref_seq.substr(sub_start_pos, sub_end_pos - sub_start_pos + 1);
std::string rc_subseq = mtest_alphabet->reverse_complement(subseq);
+ // using the reference-to-event map, look up the event indices for this segment
AlignedPairRefLBComp lb_comp;
AlignedPairConstIter start_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(),
sub_start_pos + ref_start_pos, lb_comp);
AlignedPairConstIter stop_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(),
- sub_end_pos + ref_start_pos, lb_comp);
-
- if(start_iter != event_aligned_pairs.end() && stop_iter != event_aligned_pairs.end()) {
-
- std::string site_string = ref_seq.substr(cpg_sites[curr_idx] - 3, 5);
+ sub_end_pos + ref_start_pos, lb_comp);
+
+ // Only process this region if the read is aligned within the boundaries
+ // and the span between the start/end is not unusually short
+ if(start_iter != event_aligned_pairs.end() && stop_iter != event_aligned_pairs.end() &&
+ abs(start_iter->read_pos - stop_iter->read_pos) > 10)
+ {
uint32_t hmm_flags = HAF_ALLOW_PRE_CLIP | HAF_ALLOW_POST_CLIP;
@@ -269,23 +250,21 @@ void test_read(const ModelMap& model_map,
data.rc = alignment_output.front().rc;
data.event_start_idx = start_iter->read_pos;
data.event_stop_idx = stop_iter->read_pos;
- data.event_stride = data.event_start_idx < data.event_stop_idx ? 1 : -1;
-
+ data.event_stride = data.event_start_idx <= data.event_stop_idx ? 1 : -1;
+
+ // Calculate the likelihood of the unmethylated sequence
HMMInputSequence unmethylated(subseq, rc_subseq, mtest_alphabet);
double unmethylated_score = profile_hmm_score(unmethylated, data, hmm_flags);
- // Methylate the CpGs in the sequence and score again
- std::string mcpg_subseq = gMCpGAlphabet.methylate(subseq);
- std::string rc_mcpg_subseq = gMCpGAlphabet.reverse_complement(mcpg_subseq);
-
- //printf("m_subs: %s\n", mcpg_subseq.c_str());
- //printf("m_rc_s: %s\n", rc_mcpg_subseq.c_str());
+ // Methylate all CpGs in the sequence and score again
+ std::string mcpg_subseq = mtest_alphabet->methylate(subseq);
+ std::string rc_mcpg_subseq = mtest_alphabet->reverse_complement(mcpg_subseq);
+ // Calculate the likelihood of the methylated sequence
HMMInputSequence methylated(mcpg_subseq, rc_mcpg_subseq, mtest_alphabet);
double methylated_score = profile_hmm_score(methylated, data, hmm_flags);
- double diff = methylated_score - unmethylated_score;
- ScoredSite ss;
+ // Aggregate score
int start_position = cpg_sites[curr_idx] + ref_start_pos;
auto iter = site_score_map.find(start_position);
if(iter == site_score_map.end()) {
@@ -295,23 +274,20 @@ void test_read(const ModelMap& model_map,
ss.start_position = start_position;
ss.end_position = cpg_sites[end_idx - 1] + ref_start_pos;
ss.n_cpg = end_idx - curr_idx;
- ss.sequence = site_string;
+
+ // extract the CpG site(s) with a k-mer's worth of surrounding context
+ size_t site_output_start = cpg_sites[curr_idx] - k + 1;
+ size_t site_output_end = cpg_sites[end_idx - 1] + k;
+ ss.sequence = ref_seq.substr(site_output_start, site_output_end - site_output_start);
+
+ // insert into the map
iter = site_score_map.insert(std::make_pair(start_position, ss)).first;
}
+ // set strand-specific score
+ // upon output below the strand scores will be summed
iter->second.ll_unmethylated[strand_idx] = unmethylated_score;
iter->second.ll_methylated[strand_idx] = methylated_score;
-
- /*
- // Debug alignments
- printf("Forward unmethylated: %.2lf\n", unmethylated_score);
- printf("Forward methylated: %.2lf\n", methylated_score);
- std::vector<HMMAlignmentState> um_align = profile_hmm_align(unmethylated, data, hmm_flags);
- print_alignment("unmethylated", start_position, 0, unmethylated, data, um_align);
-
- std::vector<HMMAlignmentState> m_align = profile_hmm_align(methylated, data, hmm_flags);
- print_alignment("methylated", start_position, 0, methylated, data, m_align);
- */
}
}
@@ -321,10 +297,12 @@ void test_read(const ModelMap& model_map,
#pragma omp critical(methyltest_write)
{
+ // these variables are sums over all sites within a read
double ll_ratio_sum_strand[2] = { 0.0f, 0.0f };
double ll_ratio_sum_both = 0;
size_t num_positive = 0;
+ // write all sites for this read
for(auto iter = site_score_map.begin(); iter != site_score_map.end(); ++iter) {
const ScoredSite& ss = iter->second;
@@ -336,15 +314,18 @@ void test_read(const ModelMap& model_map,
num_positive += diff > 0;
fprintf(handles.site_writer, "%s\t%d\t%d\t", ss.chromosome.c_str(), ss.start_position, ss.end_position);
- fprintf(handles.site_writer, "LL_METH=%.2lf;LL_UNMETH=%.2lf;LL_RATIO=%.2lf;", sum_ll_m, sum_ll_u, diff);
- fprintf(handles.site_writer, "N_CPG=%d;SEQUENCE=%s\n", ss.n_cpg, ss.sequence.c_str());
+ fprintf(handles.site_writer, "ReadIdx=%zu;", read_idx);
+ fprintf(handles.site_writer, "LogLikMeth=%.2lf;LogLikUnmeth=%.2lf;LogLikRatio=%.2lf;", sum_ll_m, sum_ll_u, diff);
+ fprintf(handles.site_writer, "LogLikMethByStrand=%.2lf,%.2lf;", ss.ll_methylated[0], ss.ll_methylated[1]);
+ fprintf(handles.site_writer, "LogLikUnmethByStrand=%.2lf,%.2lf;", ss.ll_unmethylated[0], ss.ll_unmethylated[1]);
+ fprintf(handles.site_writer, "NumCpGs=%d;Sequence=%s\n", ss.n_cpg, ss.sequence.c_str());
ll_ratio_sum_strand[0] += ss.ll_methylated[0] - ss.ll_unmethylated[0];
ll_ratio_sum_strand[1] += ss.ll_methylated[1] - ss.ll_unmethylated[1];
ll_ratio_sum_both += diff;
}
std::string complement_model = sr.pore_model[C_IDX].name;
- fprintf(handles.read_writer, "%s\t%.2lf\t%zu\t%s\tNUM_POSITIVE=%zu\n", fast5_path.c_str(), ll_ratio_sum_both, site_score_map.size(), complement_model.c_str(), num_positive);
+ fprintf(handles.read_writer, "%s\t%.2lf\t%zu\t%s\tNumPositive=%zu\n", fast5_path.c_str(), ll_ratio_sum_both, site_score_map.size(), complement_model.c_str(), num_positive);
for(size_t si = 0; si < NUM_STRANDS; ++si) {
std::string model = sr.pore_model[si].name;
@@ -394,7 +375,7 @@ void parse_methyltest_options(int argc, char** argv)
std::cerr << SUBPROGRAM ": a --reads file must be provided\n";
die = true;
}
-
+
if(opt::genome_file.empty()) {
std::cerr << SUBPROGRAM ": a --genome file must be provided\n";
die = true;
@@ -404,13 +385,16 @@ void parse_methyltest_options(int argc, char** argv)
std::cerr << SUBPROGRAM ": a --bam file must be provided\n";
die = true;
}
-
+
if(opt::models_fofn.empty()) {
- std::cerr << SUBPROGRAM ": a --models-fofn file must be provided\n";
+ std::cerr << SUBPROGRAM ": a --models file must be provided\n";
die = true;
+ } else {
+ // initialize the model set from the fofn
+ PoreModelSet::initialize(opt::models_fofn);
}
- if (die)
+ if (die)
{
std::cout << "\n" << METHYLTEST_USAGE_MESSAGE;
exit(EXIT_FAILURE);
@@ -423,8 +407,7 @@ int methyltest_main(int argc, char** argv)
omp_set_num_threads(opt::num_threads);
Fast5Map name_map(opt::reads_file);
- ModelMap models = read_models_fofn(opt::models_fofn);
-
+
// Open the BAM and iterate over reads
// load bam file
@@ -438,7 +421,7 @@ int methyltest_main(int argc, char** argv)
// read the bam header
bam_hdr_t* hdr = sam_hdr_read(bam_fh);
-
+
// load reference fai file
faidx_t *fai = fai_load(opt::genome_file.c_str());
@@ -492,20 +475,20 @@ int methyltest_main(int argc, char** argv)
do {
assert(num_records_buffered < records.size());
-
+
// read a record into the next slot in the buffer
result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
num_records_buffered += result >= 0;
// realign if we've hit the max buffer size or reached the end of file
if(num_records_buffered == records.size() || result < 0) {
-
+
#pragma omp parallel for
for(size_t i = 0; i < num_records_buffered; ++i) {
bam1_t* record = records[i];
size_t read_idx = num_reads_processed + i;
if( (record->core.flag & BAM_FUNMAP) == 0) {
- test_read(models, name_map, fai, hdr, record, read_idx, handles);
+ calculate_methylation_for_read(name_map, fai, hdr, record, read_idx, handles);
}
}
@@ -514,7 +497,7 @@ int methyltest_main(int argc, char** argv)
}
} while(result >= 0);
-
+
assert(num_records_buffered == 0);
progress.end();
diff --git a/src/nanopolish_methyltrain.cpp b/src/nanopolish_methyltrain.cpp
index d17d637..614699c 100644
--- a/src/nanopolish_methyltrain.cpp
+++ b/src/nanopolish_methyltrain.cpp
@@ -12,14 +12,17 @@
#include <vector>
#include <inttypes.h>
#include <assert.h>
-#include <math.h>
+#include <cmath>
#include <sys/time.h>
#include <algorithm>
#include <fstream>
#include <sstream>
+#include <iomanip>
#include <set>
+#include <map>
#include <omp.h>
#include <getopt.h>
+#include <cstddef>
#include "htslib/faidx.h"
#include "nanopolish_methyltrain.h"
#include "nanopolish_eventalign.h"
@@ -30,121 +33,32 @@
#include "nanopolish_profile_hmm.h"
#include "nanopolish_anchor.h"
#include "nanopolish_fast5_map.h"
+#include "nanopolish_model_names.h"
+#include "nanopolish_pore_model_set.h"
+#include "training_core.hpp"
#include "H5pubconf.h"
#include "profiler.h"
#include "progress.h"
+#include "logger.hpp"
-//
-// Structs
-//
+#include "nanopolish_scorereads.h"
+#include "../eigen/Eigen/Dense"
-// The state training data comes in two different
-// sizes Full and Minimal. The model training functions
-// only actually need the Minimal data but for exploration
-// the Full data is useful so left as an option.
-struct FullStateTrainingData
-{
- //
- // Functions
- //
- FullStateTrainingData(const SquiggleRead& sr,
- const EventAlignment& ea,
- uint32_t rank,
- const std::string& prev_kmer,
- const std::string& next_kmer)
- {
- // scale the observation to the expected pore model
- this->level_mean = sr.get_fully_scaled_level(ea.event_idx, ea.strand_idx);
- //this->event_stdv = sr.events[strand_idx][ea.event_idx].stdv / sr.pore_model[strand_idx].scale_sd;
- this->level_stdv = 0;
- this->duration = sr.events[ea.strand_idx][ea.event_idx].duration;
-
- this->read_var = (float)sr.pore_model[ea.strand_idx].var;
- this->ref_position = ea.ref_position;
- this->ref_strand = ea.rc;
-
- GaussianParameters model = sr.pore_model[ea.strand_idx].get_scaled_parameters(rank);
- this->z = (sr.get_drift_corrected_level(ea.event_idx, ea.strand_idx) - model.mean ) / model.stdv;
- this->prev_kmer = prev_kmer;
- this->next_kmer = next_kmer;
- }
-
- static void write_header(FILE* fp)
- {
- fprintf(fp, "model\tmodel_kmer\tlevel_mean\tlevel_stdv\tduration\tref_pos\tref_strand\tz\tread_var\tprev_kmer\tnext_kmer\n");
- }
+extern float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self;
- void write_tsv(FILE* fp, const std::string& model_name, const std::string& kmer) const
- {
- fprintf(fp, "%s\t%s\t%.2lf\t%.2lf\t%.3lf\t%d\t%d\t%.2lf\t%.2lf\t%s\t%s\n",
- model_name.c_str(),
- kmer.c_str(),
- level_mean,
- level_stdv,
- duration,
- ref_position,
- ref_strand,
- z,
- read_var,
- prev_kmer.c_str(),
- next_kmer.c_str());
- }
-
- //
- // Data
- //
-
- float level_mean;
- float level_stdv;
- float duration;
- float read_var;
-
- int ref_position;
- int ref_strand;
-
- float z;
- std::string prev_kmer;
- std::string next_kmer;
-};
-
-struct MinimalStateTrainingData
+//
+// Enums
+//
+enum TrainingTarget
{
- //
- // Functions
- //
- MinimalStateTrainingData(const SquiggleRead& sr,
- const EventAlignment& ea,
- uint32_t rank,
- const std::string& prev_kmer,
- const std::string& next_kmer)
- {
- // scale the observation to the expected pore model
- this->level_mean = sr.get_fully_scaled_level(ea.event_idx, ea.strand_idx);
- this->read_var = (float)sr.pore_model[ea.strand_idx].var;
- }
-
- static void write_header(FILE* fp)
- {
- fprintf(fp, "model\tmodel_kmer\tlevel_mean\tread_var\n");
- }
-
- void write_tsv(FILE* fp, const std::string& model_name, const std::string& kmer) const
- {
- fprintf(fp, "%s\t%s\t%.2lf\t%.2lf\n",
- model_name.c_str(),
- kmer.c_str(),
- level_mean,
- read_var);
- }
-
- //
- // Data
- //
- float level_mean;
- float read_var;
+ TT_UNMETHYLATED_KMERS,
+ TT_METHYLATED_KMERS,
+ TT_ALL_KMERS
};
-typedef MinimalStateTrainingData StateTrainingData;
+//
+// Structs
+//
struct StateSummary
{
@@ -156,14 +70,8 @@ struct StateSummary
int num_stays;
};
-struct GaussianMixture
-{
- std::vector<float> weights;
- std::vector<GaussianParameters> params;
-};
-
//
-Alphabet* mtrain_alphabet = &gMCpGAlphabet;
+const Alphabet* mtrain_alphabet = NULL;
//
// Typedefs
@@ -189,146 +97,217 @@ static const char *METHYLTRAIN_USAGE_MESSAGE =
" --version display version\n"
" --help display this help and exit\n"
" -m, --models-fofn=FILE read the models to be trained from the FOFN\n"
-" --train-unmethylated train unmethylated 5-mers instead of methylated\n"
+" --train-kmers=STR train methylated, unmethylated or all kmers\n"
+" -c --calibrate recalibrate aligned reads to model before training\n"
" --no-update-models do not write out trained models\n"
+" --output-scores optionally output read scores during training\n"
" -r, --reads=FILE the 2D ONT reads are in fasta FILE\n"
" -b, --bam=FILE the reads aligned to the genome assembly are in bam FILE\n"
-" -g, --genome=FILE the genome we are computing a consensus for is in FILE\n"
+" -g, --genome=FILE the reference genome is in FILE\n"
" -t, --threads=NUM use NUM threads (default: 1)\n"
-" -s, --out-suffix=STR name output files like model.out_suffix\n"
+" --filter-policy=STR filter reads for [R7-methylation] or [R9-nucleotide] project\n"
+" -s, --out-suffix=STR name output files like <strand>.out_suffix\n"
+" --out-fofn=FILE write the names of the output models into FILE\n"
+" --rounds=NUM number of training rounds to perform\n"
" --progress print out a progress message\n"
+" --stdv enable stdv modelling\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
{
static unsigned int verbose;
+ static unsigned int calibrate=0;
static std::string reads_file;
static std::string bam_file;
static std::string genome_file;
static std::string models_fofn;
static std::string region;
- static std::string out_suffix = ".methyltrain";
+ static std::string out_suffix = ".trained";
+ static std::string out_fofn = "trained.fofn";
+ static std::string initial_model_type = "ONT";
+ static std::string trained_model_type = "reftrained";
+
+ static TrainingTarget training_target = TT_METHYLATED_KMERS;
static bool write_models = true;
- static bool train_unmethylated = false;
- static int progress = 0;
- static int num_threads = 1;
- static int batch_size = 128;
+ static bool output_scores = false;
+ static unsigned progress = 0;
+ static unsigned num_threads = 1;
+ static unsigned batch_size = 128;
+
+ // Constants that determine which events to use for training
+ static float min_event_duration = 0.002;
+ static unsigned min_distance_from_alignment_end = 5;
+ static unsigned min_number_of_events_to_train = 100;
+ static unsigned num_training_rounds = 5;
}
-static const char* shortopts = "r:b:g:t:m:vn";
-
-enum { OPT_HELP = 1, OPT_VERSION, OPT_PROGRESS, OPT_NO_UPDATE_MODELS, OPT_TRAIN_UNMETHYLATED };
+static const char* shortopts = "r:b:g:t:m:vnc";
+
+enum { OPT_HELP = 1,
+ OPT_VERSION,
+ OPT_PROGRESS,
+ OPT_NO_UPDATE_MODELS,
+ OPT_TRAIN_KMERS,
+ OPT_OUTPUT_SCORES,
+ OPT_OUT_FOFN,
+ OPT_STDV,
+ OPT_LOG_LEVEL,
+ OPT_FILTER_POLICY,
+ OPT_NUM_ROUNDS,
+ OPT_P_SKIP,
+ OPT_P_SKIP_SELF,
+ OPT_P_BAD,
+ OPT_P_BAD_SELF
+ };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
+ { "calibrate", no_argument, NULL, 'c' },
{ "reads", required_argument, NULL, 'r' },
{ "bam", required_argument, NULL, 'b' },
{ "genome", required_argument, NULL, 'g' },
- { "window", required_argument, NULL, 'w' },
{ "threads", required_argument, NULL, 't' },
{ "models-fofn", required_argument, NULL, 'm' },
{ "out-suffix", required_argument, NULL, 's' },
+ { "stdv", no_argument, NULL, OPT_STDV },
+ { "out-fofn", required_argument, NULL, OPT_OUT_FOFN },
+ { "train-kmers", required_argument, NULL, OPT_TRAIN_KMERS },
+ { "p-skip", required_argument, NULL, OPT_P_SKIP },
+ { "p-skip-self", required_argument, NULL, OPT_P_SKIP_SELF },
+ { "p-bad", required_argument, NULL, OPT_P_BAD },
+ { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF },
+ { "output-scores", no_argument, NULL, OPT_OUTPUT_SCORES },
{ "no-update-models", no_argument, NULL, OPT_NO_UPDATE_MODELS },
- { "train-unmethylated", no_argument, NULL, OPT_TRAIN_UNMETHYLATED },
{ "progress", no_argument, NULL, OPT_PROGRESS },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
+ { "log-level", required_argument, NULL, OPT_LOG_LEVEL },
+ { "filter-policy", required_argument, NULL, OPT_FILTER_POLICY },
+ { "rounds", required_argument, NULL, OPT_NUM_ROUNDS },
{ NULL, 0, NULL, 0 }
};
-GaussianMixture train_gaussian_mixture(const std::vector<StateTrainingData>& data,
- const GaussianMixture& input_mixture)
+// recalculate shift, scale, drift, scale_sd from an alignment and the read
+// returns true if the recalibration was performed
+// note: no residual is computed or returned; only sr.pore_model[strand_idx] is updated
+bool recalibrate_model(SquiggleRead &sr,
+ const int strand_idx,
+ const std::vector<EventAlignment> &alignment_output,
+ const Alphabet* alphabet,
+ const bool scale_var,
+ const bool scale_drift)
{
+ std::vector<double> raw_events, times, level_means, level_stdvs;
+ uint32_t k = sr.pore_model[strand_idx].k;
+ const uint32_t num_equations = scale_drift ? 3 : 2;
+
+ //std::cout << "Previous pore model parameters: " << sr.pore_model[strand_idx].shift << ", "
+ // << sr.pore_model[strand_idx].scale << ", "
+ // << sr.pore_model[strand_idx].drift << ", "
+ // << sr.pore_model[strand_idx].var << std::endl;
+
+ // extract necessary vectors from the read and the pore model; note do not want scaled values
+ for ( const auto &ea : alignment_output ) {
+ if(ea.hmm_state == 'M') {
+ std::string model_kmer = ea.rc ? alphabet->reverse_complement(ea.ref_kmer) : ea.ref_kmer;
+ uint32_t rank = alphabet->kmer_rank(model_kmer.c_str(), k);
+
+ raw_events.push_back ( sr.get_uncorrected_level(ea.event_idx, strand_idx) );
+ level_means.push_back( sr.pore_model[strand_idx].states[rank].level_mean );
+ level_stdvs.push_back( sr.pore_model[strand_idx].states[rank].level_stdv );
+ if (scale_drift)
+ times.push_back ( sr.get_time(ea.event_idx, strand_idx) );
+ }
+ }
- size_t n_components = input_mixture.params.size();
- size_t n_data = data.size();
- assert(input_mixture.weights.size() == n_components);
- GaussianMixture curr_mixture = input_mixture;
-
- for(size_t iteration = 0; iteration < 10; ++iteration) {
- std::vector<double> mean_sum(n_components, 0.0f);
- std::vector<double> var_sum(n_components, 0.0f);
-
- GaussianMixture new_mixture = curr_mixture;
- for(size_t j = 0; j < n_components; ++j) {
- new_mixture.weights[j] = 0.0f;
+ const int minNumEventsToRescale = 200;
+ bool recalibrated = false;
+ if (raw_events.size() >= minNumEventsToRescale) {
+ // Assemble linear system corresponding to weighted least squares problem
+ // Can just directly call a weighted least squares solver, but there's enough
+ // structure in our problem it's a little faster just to build the normal eqn
+ // matrices ourselves
+ Eigen::MatrixXd A(num_equations, num_equations);
+ Eigen::VectorXd b(num_equations);
+
+ for (int i=0; i<num_equations; i++) {
+ b(i) = 0.;
+ for (int j=0; j<num_equations; j++)
+ A(i,j) = 0.;
}
- std::vector<std::vector<double> > resp;
-
- for(size_t i = 0; i < n_data; ++i) {
- // Calculate the posterior probability that
- // data i came from each component of the mixture
-
- // P(data i | component j) P(component j)
- std::vector<double> t(n_components, 0.0f);
- double t_sum = -INFINITY;
- for(size_t j = 0; j < n_components; ++j) {
- t[j] = log_normal_pdf(data[i].level_mean, curr_mixture.params[j]) + log(curr_mixture.weights[j]);
- if(t[j] != -INFINITY && ! std::isnan(t[j])) {
- t_sum = add_logs(t_sum, t[j]);
- }
- }
+ for (size_t i=0; i<raw_events.size(); i++) {
+ double inv_var = 1./(level_stdvs[i]*level_stdvs[i]);
+ double mu = level_means[i];
+ double e = raw_events[i];
- // store P(component j | data i)
- for(size_t j = 0; j < n_components; ++j) {
- t[j] = exp(t[j] - t_sum);
- new_mixture.weights[j] += t[j];
- }
- resp.push_back(t);
- }
-
- for(size_t j = 0; j < n_components; ++j) {
- new_mixture.weights[j] /= n_data;
- }
+ A(0,0) += inv_var; A(0,1) += mu*inv_var;
+ A(1,1) += mu*mu*inv_var;
+
+ b(0) += e*inv_var;
+ b(1) += mu*e*inv_var;
- // Calculate mean
- for(size_t i = 0; i < n_data; ++i) {
- for(size_t j = 0; j < n_components; ++j) {
- double w_ij = resp[i][j];
- mean_sum[j] += w_ij * data[i].level_mean;
+ if (scale_drift) {
+ double t = times[i];
+ A(0,2) += t*inv_var;
+ A(1,2) += mu*t*inv_var;
+ A(2,2) += t*t*inv_var;
+ b(2) += t*e*inv_var;
}
}
-
- std::vector<double> new_mean(2);
- for(size_t j = 0; j < n_components; ++j) {
- new_mean[j] = mean_sum[j] / (n_data * new_mixture.weights[j]);
+ A(1,0) = A(0,1);
+ if (scale_drift) {
+ A(2,0) = A(0,2);
+ A(2,1) = A(1,2);
}
- // Calculate variance
- for(size_t i = 0; i < n_data; ++i) {
- for(size_t j = 0; j < n_components; ++j) {
- double w_ij = resp[i][j];
- var_sum[j] += w_ij * pow( (data[i].level_mean - new_mean[j]) / data[i].read_var, 2.0);
+ // perform the linear solve
+ Eigen::VectorXd x = A.fullPivLu().solve(b);
+
+ double shift = x(0);
+ double scale = x(1);
+ double drift = scale_drift ? x(2) : 0.;
+
+ sr.pore_model[strand_idx].shift = shift;
+ sr.pore_model[strand_idx].scale = scale;
+ sr.pore_model[strand_idx].drift = drift;
+
+ if (scale_var) {
+ double var = 0.;
+ for (size_t i=0; i<raw_events.size(); i++) {
+ double yi = (raw_events[i] - shift - scale*level_means[i]);
+ if (scale_drift)
+ yi -= drift*times[i];
+ var+= yi*yi/(level_stdvs[i]*level_stdvs[i]);
}
- }
-
- std::vector<double> new_var(2);
- for(size_t j = 0; j < n_components; ++j) {
- new_var[j] = var_sum[j] / (n_data * new_mixture.weights[j]);
- }
+ var /= raw_events.size();
- for(size_t j = 0; j < n_components; ++j) {
- new_mixture.params[j] = GaussianParameters(new_mean[j], sqrt(new_var[j]));
- //fprintf(stderr, "MIXTURE\t%zu\t%.2lf\t%.2lf\t%.2lf\n", j, curr_mixture.weights[j], curr_mixture.params[j].mean, curr_mixture.params[j].stdv);
+ sr.pore_model[strand_idx].var = sqrt(var); // 'var' is really the scaling for std dev.
}
- curr_mixture = new_mixture;
+ if (sr.pore_model[strand_idx].is_scaled)
+ sr.pore_model[strand_idx].bake_gaussian_parameters();
+
+ recalibrated = true;
+ //std::cout << "Updated pore model parameters: " << sr.pore_model[strand_idx].shift << ", "
+ // << sr.pore_model[strand_idx].scale << ", "
+ // << sr.pore_model[strand_idx].drift << ", "
+ // << sr.pore_model[strand_idx].var << std::endl;
}
- return curr_mixture;
+
+ return recalibrated;
}
-// Realign the read in event space
-void train_read(const ModelMap& model_map,
- const Fast5Map& name_map,
- const faidx_t* fai,
- const bam_hdr_t* hdr,
- const bam1_t* record,
- size_t read_idx,
- int region_start,
- int region_end,
- size_t round,
- ModelTrainingMap& training)
+// Update the training data with aligned events from a read
+void add_aligned_events(const Fast5Map& name_map,
+ const faidx_t* fai,
+ const bam_hdr_t* hdr,
+ const bam1_t* record,
+ size_t read_idx,
+ int region_start,
+ int region_end,
+ size_t round,
+ ModelTrainingMap& training)
{
// Load a squiggle read for the mapped read
std::string read_name = bam_get_qname(record);
@@ -337,19 +316,16 @@ void train_read(const ModelMap& model_map,
// load read
SquiggleRead sr(read_name, fast5_path);
+ // replace the models that are built into the read with the current trained model
+ sr.replace_models(opt::trained_model_type);
+
for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
-
- // replace model with the training model
- std::string curr_model = sr.pore_model[strand_idx].name;
- auto model_iter = model_map.find(curr_model);
- if(model_iter != model_map.end()) {
- sr.pore_model[strand_idx].update_states(model_iter->second);
- } else {
- printf("Error: model %s not found\n", curr_model.c_str());
- assert(false && "Model not found");
+ // skip if 1D reads and this is the wrong strand
+ if(!sr.has_events_for_strand(strand_idx)) {
+ continue;
}
-
+
// set k
uint32_t k = sr.pore_model[strand_idx].k;
@@ -360,93 +336,107 @@ void train_read(const ModelMap& model_map,
params.hdr = hdr;
params.record = record;
params.strand_idx = strand_idx;
-
+
params.alphabet = mtrain_alphabet;
params.read_idx = read_idx;
params.region_start = region_start;
params.region_end = region_end;
std::vector<EventAlignment> alignment_output = align_read_to_ref(params);
+ if (alignment_output.size() == 0)
+ return;
- // Update model observations
- #pragma omp critical
- {
- //emit_event_alignment_tsv(stdout, sr, params, alignment_output);
+ // Update pore model based on alignment
+ std::string curr_model = sr.pore_model[strand_idx].metadata.get_short_name();
+ double orig_score = -INFINITY;
- // Get the training data for this model
- auto& emission_map = training[curr_model];
+ if (opt::output_scores) {
+ orig_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);
- for(size_t i = 0; i < alignment_output.size(); ++i) {
- const EventAlignment& ea = alignment_output[i];
- std::string model_kmer = ea.model_kmer;
+ #pragma omp critical(print)
+ std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Original " << orig_score << std::endl;
+ }
- // Grab the previous/next model kmer
- // If the read is from the same strand as the reference
- // the next kmer comes from the next alignment_output (and vice-versa)
- // other the indices are swapped
- int next_stride = ea.rc ? -1 : 1;
+ if ( opt::calibrate ) {
+ double resid = 0.;
+ recalibrate_model(sr, strand_idx, alignment_output, mtrain_alphabet, resid, true);
- std::string prev_kmer = "";
- std::string next_kmer = "";
+ if (opt::output_scores) {
+ double rescaled_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);
+ #pragma omp critical(print)
+ {
+ std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Rescaled " << rescaled_score << std::endl;
+ std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Delta " << rescaled_score-orig_score << std::endl;
+ }
+ }
+ }
- if(i > 0 && i < alignment_output.size() - 1) {
- assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1);
- assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1);
+ // Get the training data for this model
+ auto& emission_map = training[curr_model];
- // check for exactly one base of movement along the reference
- if( abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) {
- next_kmer = alignment_output[i + next_stride].model_kmer;
- }
+ for(size_t i = 0; i < alignment_output.size(); ++i) {
+ const EventAlignment& ea = alignment_output[i];
+ std::string model_kmer = ea.model_kmer;
- if( abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) {
- prev_kmer = alignment_output[i - next_stride].model_kmer;
- }
- }
+ // Grab the previous/next model kmer from the alignment_output table.
+ // If the read is from the same strand as the reference
+ // the next kmer comes from the next alignment_output (and vice-versa)
+ // otherwise the indices are swapped
+ int next_stride = ea.rc ? -1 : 1;
- uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k);
- auto& kmer_summary = emission_map[rank];
-
- // Should we use this event for training?
- bool use_for_training = i > 5 &&
- i + 5 < alignment_output.size() &&
- alignment_output[i].hmm_state == 'M' &&
- prev_kmer != "" &&
- next_kmer != "";
+ std::string prev_kmer = "";
+ std::string next_kmer = "";
- if(use_for_training) {
+ if(i > 0 && i < alignment_output.size() - 1) {
- StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer);
- kmer_summary.events.push_back(std);
+ // check that the event indices are correct for the next expected position
+ assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1);
+ assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1);
+
+ // only set the previous/next when there was exactly one base of movement along the reference
+ if( std::abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) {
+ next_kmer = alignment_output[i + next_stride].model_kmer;
}
- if(ea.hmm_state == 'M') {
- kmer_summary.num_matches += 1;
- } else if(ea.hmm_state == 'E') {
- kmer_summary.num_stays += 1;
+ if( std::abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) {
+ prev_kmer = alignment_output[i - next_stride].model_kmer;
}
+ }
+ // Get the rank of the kmer that we aligned to (on the sequencing strand, = model_kmer)
+ uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k);
+ assert(rank < emission_map.size());
+ auto& kmer_summary = emission_map[rank];
+
+ // We only use this event for training if it's not at the end of the alignment
+ // (to avoid bad alignments around the read edges) and if it's not too short (to
+ // avoid bad measurements from affecting the levels too much)
+ bool use_for_training = i > opt::min_distance_from_alignment_end &&
+ i + opt::min_distance_from_alignment_end < alignment_output.size() &&
+ alignment_output[i].hmm_state == 'M' &&
+ sr.get_duration( alignment_output[i].event_idx, strand_idx) >= opt::min_event_duration &&
+ sr.get_fully_scaled_level(alignment_output[i].event_idx, strand_idx) >= 1.0;
+
+ if(use_for_training) {
+ StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer);
+ #pragma omp critical(kmer)
+ kmer_summary.events.push_back(std);
+ }
+
+ if(ea.hmm_state == 'M') {
+ #pragma omp atomic
+ kmer_summary.num_matches += 1;
+ } else if(ea.hmm_state == 'E') {
+ #pragma omp atomic
+ kmer_summary.num_stays += 1;
}
}
} // for strands
}
-ModelMap read_models_fofn(const std::string& fofn_name)
-{
- ModelMap out;
- std::ifstream fofn_reader(fofn_name);
- std::string model_filename;
-
- while(getline(fofn_reader, model_filename)) {
- printf("reading %s\n", model_filename.c_str());
- PoreModel p(model_filename, *mtrain_alphabet);
- assert(!p.name.empty());
-
- out[p.name] = p;
- }
- return out;
-}
-
void parse_methyltrain_options(int argc, char** argv)
{
+ std::string training_target_str = "";
+ std::string filter_policy_str = "";
bool die = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
std::istringstream arg(optarg != NULL ? optarg : "");
@@ -459,15 +449,28 @@ void parse_methyltrain_options(int argc, char** argv)
case 'm': arg >> opt::models_fofn; break;
case 's': arg >> opt::out_suffix; break;
case 'v': opt::verbose++; break;
- case OPT_TRAIN_UNMETHYLATED: opt::train_unmethylated = true; break;
+ case 'c': opt::calibrate = 1; break;
+ case OPT_STDV: model_stdv() = true; break;
+ case OPT_OUT_FOFN: arg >> opt::out_fofn; break;
+ case OPT_NUM_ROUNDS: arg >> opt::num_training_rounds; break;
+ case OPT_OUTPUT_SCORES: opt::output_scores = true; break;
+ case OPT_TRAIN_KMERS: arg >> training_target_str; break;
+ case OPT_FILTER_POLICY: arg >> filter_policy_str; break;
case OPT_NO_UPDATE_MODELS: opt::write_models = false; break;
case OPT_PROGRESS: opt::progress = true; break;
+ case OPT_P_SKIP: arg >> g_p_skip; break;
+ case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break;
+ case OPT_P_BAD: arg >> g_p_bad; break;
+ case OPT_P_BAD_SELF: arg >> g_p_bad_self; break;
case OPT_HELP:
std::cout << METHYLTRAIN_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
std::cout << METHYLTRAIN_VERSION_MESSAGE;
exit(EXIT_SUCCESS);
+ case OPT_LOG_LEVEL:
+ Logger::set_level_from_option(arg.str());
+ break;
}
}
@@ -499,26 +502,56 @@ void parse_methyltrain_options(int argc, char** argv)
std::cerr << SUBPROGRAM ": a --bam file must be provided\n";
die = true;
}
-
+
if(opt::models_fofn.empty()) {
- std::cerr << SUBPROGRAM ": a --models-fofn file must be provided\n";
+ std::cerr << SUBPROGRAM ": a --models file must be provided\n";
die = true;
+ } else {
+ // initialize the model set from the fofn
+ PoreModelSet::initialize(opt::models_fofn);
+ }
+
+ // Parse the training target string
+ if(training_target_str != "") {
+ if(training_target_str == "unmethylated") {
+ opt::training_target = TT_UNMETHYLATED_KMERS;
+ } else if(training_target_str == "methylated") {
+ opt::training_target = TT_METHYLATED_KMERS;
+ } else if(training_target_str == "all") {
+ opt::training_target = TT_ALL_KMERS;
+ } else {
+ std::cerr << SUBPROGRAM ": unknown --train-kmers string\n";
+ die = true;
+ }
+ }
+
+ // Parse the filter policy string
+ if(filter_policy_str != "") {
+ if(filter_policy_str == "R9-nucleotide") {
+ } else if(filter_policy_str == "R7-methylation") {
+ // default, do nothing
+ } else {
+ std::cerr << SUBPROGRAM ": unknown --filter-policy\n";
+ die = true;
+ }
}
- if (die)
- {
+ if (die) {
std::cout << "\n" << METHYLTRAIN_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}
}
-ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_t round)
+void train_one_round(const Fast5Map& name_map, size_t round)
{
+ const PoreModelMap& current_models = PoreModelSet::get_models(opt::trained_model_type);
+
// Initialize the training summary stats for each kmer for each model
ModelTrainingMap model_training_data;
- for(auto model_iter = models.begin(); model_iter != models.end(); model_iter++) {
- std::vector<StateSummary> summaries(model_iter->second.get_num_states()); // one per kmer in the model
- model_training_data[model_iter->first] = summaries;
+ for(auto current_model_iter = current_models.begin(); current_model_iter != current_models.end(); current_model_iter++) {
+ // one summary entry per kmer in the model
+ std::vector<StateSummary> summaries(current_model_iter->second.get_num_states());
+ model_training_data[current_model_iter->first] = summaries;
}
// Open the BAM and iterate over reads
@@ -534,7 +567,7 @@ ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_
// read the bam header
bam_hdr_t* hdr = sam_hdr_read(bam_fh);
-
+
// load reference fai file
faidx_t *fai = fai_load(opt::genome_file.c_str());
@@ -548,7 +581,6 @@ ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_
// TODO: is this valid?
itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
} else {
-
fprintf(stderr, "Region: %s\n", opt::region.c_str());
itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
@@ -587,7 +619,7 @@ ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_
bam1_t* record = records[i];
size_t read_idx = num_reads_realigned + i;
if( (record->core.flag & BAM_FUNMAP) == 0) {
- train_read(models, name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data);
+ add_aligned_events(name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data);
}
}
@@ -602,122 +634,154 @@ ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_
assert(num_records_buffered == 0);
progress.end();
-
- std::stringstream training_fn;
- training_fn << opt::bam_file << ".methyltrain.tsv";
+ // open the summary file
std::stringstream summary_fn;
- summary_fn << opt::bam_file << ".methyltrain.summary";
-
+ summary_fn << "methyltrain" << opt::out_suffix << ".summary";
FILE* summary_fp = fopen(summary_fn.str().c_str(), "w");
- FILE* training_fp = fopen(training_fn.str().c_str(), "w");
+ fprintf(summary_fp, "model_short_name\tkmer\tnum_matches\tnum_skips\t"
+ "num_stays\tnum_events_for_training\twas_trained\t"
+ "trained_level_mean\ttrained_level_stdv\n");
- // training header
- StateTrainingData::write_header(training_fp);
-
- // Process the training results
- ModelMap trained_models;
-
+ // open the tsv file with the raw training data
+ std::stringstream training_fn;
+ training_fn << "methyltrain" << opt::out_suffix << ".round" << round << ".events.tsv";
+ std::ofstream training_ofs(training_fn.str());
+
+ // write out a header for the training data
+ StateTrainingData::write_header(training_ofs);
+
+ // iterate over models: template, complement_pop1, complement_pop2
for(auto model_training_iter = model_training_data.begin();
model_training_iter != model_training_data.end(); model_training_iter++) {
- // Initialize trained model from input model
- auto model_iter = models.find(model_training_iter->first);
- assert(model_iter != models.end());
+ // Initialize the trained model from the input model
+ auto current_model_iter = current_models.find(model_training_iter->first);
+ assert(current_model_iter != current_models.end());
std::string model_name = model_training_iter->first;
- std::string model_short_name = "";
-
- if(model_name == "r7.3_template_median68pA.model") {
- model_short_name = "t";
- } else if(model_name == "r7.3_complement_median68pA_pop1.model") {
- model_short_name = "c.p1";
- } else if(model_name == "r7.3_complement_median68pA_pop2.model") {
- model_short_name = "c.p2";
- } else if(model_name == "r7.3_e6_70bps_6mer_template_median68pA.model") {
- model_short_name = "t.006";
- } else if(model_name == "r7.3_e6_70bps_6mer_complement_median68pA_pop1.model") {
- model_short_name = "c.p1.006";
- } else if(model_name == "r7.3_e6_70bps_6mer_complement_median68pA_pop2.model") {
- model_short_name = "c.p2.006";
- } else {
- printf("Unknown model: %s\n", model_name.c_str());
- assert(false);
- }
+ std::string model_short_name = current_model_iter->second.metadata.get_short_name();
+
+ // Initialize the new model from the current model
+ PoreModel updated_model = current_model_iter->second;
+ uint32_t k = updated_model.k;
+ const std::vector<StateSummary>& summaries = model_training_iter->second;
- trained_models[model_training_iter->first] = model_iter->second;
- PoreModel& new_pm = trained_models[model_training_iter->first];
+ // Generate the complete set of kmers
+ std::string gen_kmer(k, 'A');
+ std::vector<std::string> all_kmers;
+ for(size_t ki = 0; ki < summaries.size(); ++ki) {
+ all_kmers.push_back(gen_kmer);
+ mtrain_alphabet->lexicographic_next(gen_kmer);
+ }
+ assert(gen_kmer == std::string(k, 'A'));
+ assert(all_kmers.front() == std::string(k, 'A'));
+ assert(all_kmers.back() == std::string(k, 'T'));
// Update means for each kmer
- uint32_t k = new_pm.k;
- std::string kmer(k, 'A');
- const std::vector<StateSummary>& summaries = model_training_iter->second;
+ #pragma omp parallel for
for(size_t ki = 0; ki < summaries.size(); ++ki) {
+ assert(ki < all_kmers.size());
+ std::string kmer = all_kmers[ki];
+
+ // write the observed values to a tsv file
+ #pragma omp critical
+ {
+ for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) {
+ summaries[ki].events[ei].write_tsv(training_ofs, model_short_name, kmer);
+ }
- // write a training file
- for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) {
- summaries[ki].events[ei].write_tsv(training_fp, model_short_name, kmer);
- }
-
- // write to the summary file
- fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\n", model_short_name.c_str(), kmer.c_str(), summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays);
-
- GaussianMixture mixture;
-
- // train a mixture model where a minority of k-mers aren't methylated
-
- // unmethylated component
- double um_rate = 0.05f;
- std::string um_kmer = gMCpGAlphabet.unmethylate(kmer);
- size_t um_ki = gMCpGAlphabet.kmer_rank(um_kmer.c_str(), k);
- GaussianParameters um_params(model_iter->second.get_parameters(um_ki).level_mean,
- model_iter->second.get_parameters(um_ki).level_stdv);
-
- mixture.weights.push_back(um_rate);
- mixture.params.push_back(um_params);
-
- GaussianParameters m_params(model_iter->second.get_parameters(ki).level_mean,
- model_iter->second.get_parameters(ki).level_stdv);
-
- mixture.weights.push_back(1 - um_rate);
- mixture.params.push_back(m_params);
-
- if(opt::verbose > 1) {
-
- fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(),
- mixture.weights[0], mixture.params[0].mean, mixture.params[0].stdv,
- mixture.weights[1], mixture.params[1].mean, mixture.params[1].stdv);
}
- GaussianMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture);
-
- if(opt::verbose > 1) {
- fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(),
- trained_mixture.weights[0], trained_mixture.params[0].mean, trained_mixture.params[0].stdv,
- trained_mixture.weights[1], trained_mixture.params[1].mean, trained_mixture.params[1].stdv);
- }
-
bool is_m_kmer = kmer.find('M') != std::string::npos;
- bool update_kmer = (is_m_kmer == !opt::train_unmethylated);
- if(update_kmer && summaries[ki].events.size() > 100) {
- new_pm.states[ki].level_mean = trained_mixture.params[1].mean;
- new_pm.states[ki].level_stdv = trained_mixture.params[1].stdv;
+ bool update_kmer = opt::training_target == TT_ALL_KMERS ||
+ (is_m_kmer && opt::training_target == TT_METHYLATED_KMERS) ||
+ (!is_m_kmer && opt::training_target == TT_UNMETHYLATED_KMERS);
+ bool trained = false;
+ // only train if there are a sufficient number of events for this kmer
+ if(update_kmer && summaries[ki].events.size() >= opt::min_number_of_events_to_train) {
+
+ // train a mixture model where a minority of k-mers aren't methylated
+ ParamMixture mixture;
+
+ float incomplete_methylation_rate = 0.05f;
+ std::string um_kmer = mtrain_alphabet->unmethylate(kmer);
+ size_t um_ki = mtrain_alphabet->kmer_rank(um_kmer.c_str(), k);
+
+ // Initialize the training parameters. If this is a kmer containing
+ // a methylation site we train a two component mixture, otherwise
+ // just fit a gaussian
+ float major_weight = is_m_kmer ? 1 - incomplete_methylation_rate : 1.0f;
+ mixture.log_weights.push_back(log(major_weight));
+ mixture.params.push_back(current_model_iter->second.get_parameters(ki));
+
+ if(is_m_kmer) {
+ // add second unmethylated component
+ mixture.log_weights.push_back(std::log(incomplete_methylation_rate));
+ mixture.params.push_back(current_model_iter->second.get_parameters(um_ki));
+ }
+
+ if(opt::verbose > 1) {
+ fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(),
+ std::exp(mixture.log_weights[0]), mixture.params[0].level_mean, mixture.params[0].level_stdv,
+ std::exp(mixture.log_weights[1]), mixture.params[1].level_mean, mixture.params[1].level_stdv);
+ }
+
+ ParamMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture);
+
+ if(opt::verbose > 1) {
+ fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(),
+ std::exp(trained_mixture.log_weights[0]), trained_mixture.params[0].level_mean, trained_mixture.params[0].level_stdv,
+ std::exp(trained_mixture.log_weights[1]), trained_mixture.params[1].level_mean, trained_mixture.params[1].level_stdv);
+ }
+
+ #pragma omp critical
+ updated_model.states[ki] = trained_mixture.params[0];
+
+ if (model_stdv()) {
+ ParamMixture ig_mixture;
+ // weights
+ ig_mixture.log_weights = trained_mixture.log_weights;
+ // states
+ ig_mixture.params.emplace_back(trained_mixture.params[0]);
+
+ if(is_m_kmer) {
+ ig_mixture.params.emplace_back(current_model_iter->second.get_parameters(um_ki));
+ }
+ // run training
+ auto trained_ig_mixture = train_invgaussian_mixture(summaries[ki].events, ig_mixture);
+
+ LOG("methyltrain", debug)
+ << "IG_INIT__MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " ["
+ << std::fixed << std::setprecision(5) << ig_mixture.params[0].sd_mean << " "
+ << ig_mixture.params[1].sd_mean << "]" << std::endl
+ << "IG_TRAIN_MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " ["
+ << trained_ig_mixture.params[0].sd_mean << " "
+ << trained_ig_mixture.params[1].sd_mean << "]" << std::endl;
+
+ // update state
+ #pragma omp critical
+ {
+ updated_model.states[ki] = trained_ig_mixture.params[0];
+ }
+ }
+
+ trained = true;
}
- /*
- if(kmer.find("CG") != std::string::npos) {
- float mu_prime = summaries[ki].mean_sum / summaries[ki].n;
- float var_prime = summaries[ki].var_sum / summaries[ki].n;
- new_pm[ki].level_mean = mu_prime;
- new_pm[ki].level_stdv = sqrt(var_prime);
- fprintf(stderr, "%s %s %.2lf %.2lf\n", model_training_iter->first.c_str(), kmer.c_str(), new_pm[ki].level_mean, new_pm[ki].level_stdv);
+ #pragma omp critical
+ {
+ fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\t%zu\t%d\t%.2lf\t%.2lf\n",
+ model_short_name.c_str(), kmer.c_str(),
+ summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays,
+ summaries[ki].events.size(), trained, updated_model.states[ki].level_mean, updated_model.states[ki].level_stdv);
}
- */
- mtrain_alphabet->lexicographic_next(kmer);
+
+ // add the updated model into the collection (or replace what is already there)
+ PoreModelSet::insert_model(opt::trained_model_type, updated_model);
}
}
-
// cleanup records
for(size_t i = 0; i < records.size(); ++i) {
bam_destroy1(records[i]);
@@ -729,21 +793,28 @@ ModelMap train_one_round(const ModelMap& models, const Fast5Map& name_map, size_
fai_destroy(fai);
sam_close(bam_fh);
hts_idx_destroy(bam_idx);
- fclose(training_fp);
fclose(summary_fp);
- return trained_models;
}
-void write_models(ModelMap& models)
+void write_models(const PoreModelMap& models, int round)
{
+ // file-of-filenames containing the new models
+ std::ofstream fofn_writer(opt::out_fofn);
+
// Write the model
for(auto model_iter = models.begin();
model_iter != models.end(); model_iter++) {
assert(!model_iter->second.model_filename.empty());
- std::string outname = model_iter->second.model_filename + opt::out_suffix;
- std::string modelname = model_iter->first + (!opt::train_unmethylated ? opt::out_suffix : "");
- models[model_iter->first].write( outname, *mtrain_alphabet, modelname );
+ std::stringstream round_ss;
+ round_ss << round;
+
+ std::string outname = model_iter->second.name + opt::out_suffix;
+ std::string modelname = model_iter->first + opt::out_suffix;
+ model_iter->second.write( outname, modelname );
+
+ // write the name of the trained model in the fofn
+ fofn_writer << outname << "\n";
}
}
@@ -753,17 +824,25 @@ int methyltrain_main(int argc, char** argv)
omp_set_num_threads(opt::num_threads);
Fast5Map name_map(opt::reads_file);
- ModelMap models = read_models_fofn(opt::models_fofn);
- static size_t TRAINING_ROUNDS = 10;
+ // copy the input model into a new type that will hold the trained models
+ const PoreModelMap& input_models = PoreModelSet::get_models(opt::initial_model_type);
+ for(auto model_iter : input_models) {
+ PoreModel model_copy = model_iter.second;
+ model_copy.type = opt::trained_model_type;
+ PoreModelSet::insert_model(model_copy.type, model_copy);
+ }
+
+ // Set the alphabet for this run to be the alphabet for the first model
+ assert(!PoreModelSet::get_models(opt::initial_model_type).empty());
+ mtrain_alphabet = PoreModelSet::get_models(opt::initial_model_type).begin()->second.pmalphabet;
- for(size_t round = 0; round < TRAINING_ROUNDS; round++) {
+ for(size_t round = 0; round < opt::num_training_rounds; round++) {
fprintf(stderr, "Starting round %zu\n", round);
- ModelMap trained_models = train_one_round(models, name_map, round);
+ train_one_round(name_map, round);
if(opt::write_models) {
- write_models(trained_models);
+ write_models(PoreModelSet::get_models(opt::trained_model_type), round);
}
- models = trained_models;
}
return EXIT_SUCCESS;
}
diff --git a/src/nanopolish_methyltrain.h b/src/nanopolish_methyltrain.h
index c57960b..c4c2ae4 100644
--- a/src/nanopolish_methyltrain.h
+++ b/src/nanopolish_methyltrain.h
@@ -8,17 +8,22 @@
#ifndef NANOPOLISH_METHYLTRAIN_H
#define NANOPOLISH_METHYLTRAIN_H
-#include <string>
-#include <map>
-#include "nanopolish_poremodel.h"
+//#include <string>
+//#include <map>
+//#include "nanopolish_poremodel.h"
+#include <vector>
+#include "nanopolish_eventalign.h"
+#include "nanopolish_squiggle_read.h"
-//
-// Typedefs
-//
-typedef std::map<std::string, PoreModel> ModelMap;
-
-// read models from a file
-ModelMap read_models_fofn(const std::string& fofn_name);
+// Recalculate the per-read scaling parameters (shift, scale, drift, scale_sd)
+// from an event alignment and the read.
+// Returns true if the recalibration was performed.
+// NOTE(review): this comment used to say that the call "sets residual to the
+// L1 norm of the residual", but the declaration has no residual output
+// parameter -- confirm against the definition whether that still applies.
+// NOTE(review): scale_var / scale_drift presumably select whether var and
+// drift are re-estimated; verify against the definition.
+bool recalibrate_model(SquiggleRead &sr,
+ const int strand_idx,
+ const std::vector<EventAlignment> &alignment_output,
+ const Alphabet* alphabet,
+ bool scale_var=true,
+ bool scale_drift=true);
int methyltrain_main(int argc, char** argv);
diff --git a/src/nanopolish_poremodel.cpp b/src/nanopolish_poremodel.cpp
index 411795a..5bd5476 100644
--- a/src/nanopolish_poremodel.cpp
+++ b/src/nanopolish_poremodel.cpp
@@ -16,48 +16,134 @@
void PoreModel::bake_gaussian_parameters()
{
scaled_params.resize(states.size());
+ scaled_states.resize(states.size());
- for(int i = 0; i < states.size(); ++i) {
+ for(unsigned i = 0; i < states.size(); ++i) {
- // these functions are provided by ONT
- scaled_params[i].mean = states[i].level_mean * scale + shift;
- scaled_params[i].stdv = states[i].level_stdv * var;
- scaled_params[i].log_stdv = log(scaled_params[i].stdv); // pre-computed for efficiency
+ // as per ONT documents
+ scaled_states[i].level_mean = states[i].level_mean * scale + shift;
+ scaled_states[i].level_stdv = states[i].level_stdv * var;
+ scaled_states[i].sd_mean = states[i].sd_mean * scale_sd;
+ scaled_states[i].sd_lambda = states[i].sd_lambda * var_sd;
+ scaled_states[i].update_sd_stdv();
- // These are not used, for now
- //scaled_state[i].sd_mean = state[i].sd_mean * scale_sd;
- //scaled_state[i].sd_stdv = state[i].sd_stdv * sqrt(pow(scale_sd, 3.0) / var_sd);
+ // for efficiency
+ scaled_states[i].update_logs();
+
+ // for compatibility
+ scaled_params[i].mean = scaled_states[i].level_mean;
+ scaled_params[i].stdv = scaled_states[i].level_stdv;
+ scaled_params[i].log_stdv = scaled_states[i].level_log_stdv;
}
is_scaled = true;
}
-PoreModel::PoreModel(const std::string filename, const Alphabet& alphabet)
+// Append to `known` every character of `kmer` that `known` does not already
+// contain, in order of first appearance in `kmer`.
+// `known` must be a NUL-terminated buffer with room for the appended
+// characters: strcat() is used and no capacity is checked here.
+void add_found_bases(char *known, const char *kmer) {
+    const size_t kmer_length = strlen(kmer);
+
+    // strspn() gives the length of the leading run of kmer made only of
+    // already-known characters; the character just past it is the next new one
+    for (unsigned first_unknown = strspn(kmer, known);
+         first_unknown != kmer_length;
+         first_unknown = strspn(kmer, known)) {
+        const char addition[2] = { kmer[first_unknown], '\0' };
+        strcat(known, addition);
+    }
+}
+
+PoreModel::PoreModel(const std::string filename, const Alphabet *alphabet) : is_scaled(false), pmalphabet(alphabet)
{
model_filename = filename;
std::ifstream model_reader(filename);
std::string model_line;
+ bool model_metadata_in_header = false;
bool firstKmer = true;
- int ninserted = 0;
+ unsigned ninserted = 0;
- shift_offset = 0.0f;
+ this->shift = 0.0;
+ this->scale = 1.0;
+ this->drift = 0.0;
+ this->var = 1.0;
+ this->scale_sd = 1.0;
+ this->var_sd = 1.0;
+ this->shift_offset = 0.0f;
+ this->scale_offset = 0.0f;
+ const size_t maxNucleotides = 50;
+ char bases[maxNucleotides+1] = "";
+
+ std::map<std::string, PoreModelStateParams> kmers;
while (getline(model_reader, model_line)) {
std::stringstream parser(model_line);
// Extract the model name from the header
if (model_line.find("#model_name") != std::string::npos) {
std::string dummy;
- parser >> dummy >> name;
+ parser >> dummy >> this->name;
}
- // Extract shift offset from the header
+ // Extract the strand from the header
+ if (model_line.find("#strand") != std::string::npos) {
+ std::string dummy;
+ std::string in_strand;
+ parser >> dummy >> in_strand;
+
+ if(in_strand == "template") {
+ this->metadata.model_idx = 0;
+ } else if(in_strand == "complement.pop1") {
+ this->metadata.model_idx = 1;
+ } else if(in_strand == "complement.pop2") {
+ this->metadata.model_idx = 2;
+ } else {
+ fprintf(stderr, "Error, unrecognized model strand %s for input file %s\n",
+ in_strand.c_str(), filename.c_str());
+ exit(EXIT_FAILURE);
+ }
+
+ model_metadata_in_header = true;
+ }
+
+ // Extract the sequencing kit version from the header
+ if (model_line.find("#kit") != std::string::npos) {
+ std::string dummy;
+ std::string in_kit;
+ parser >> dummy >> in_kit;
+
+ if(in_kit == "SQK006") {
+ this->metadata.kit = KV_SQK007;
+ } else if(in_kit == "SQK007") {
+ this->metadata.kit = KV_SQK007;
+ } else {
+ fprintf(stderr, "Error, unrecognized model kit %s for input file %s\n",
+ in_kit.c_str(), filename.c_str());
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (model_line.find("#type") != std::string::npos) {
+ std::string dummy;
+ parser >> dummy >> this->type;
+ }
+
+ // Extract shift/scale offset from the header
// This will be applied to the per-read shift values
// to allow switching between models with different averages
if (model_line.find("#shift_offset") != std::string::npos) {
std::string dummy;
- parser >> dummy >> shift_offset;
- printf("found shift offset of %.2lf\n", shift_offset);
+ parser >> dummy >> this->shift_offset;
+ }
+
+ if (model_line.find("#scale_offset") != std::string::npos) {
+ std::string dummy;
+ parser >> dummy >> this->scale_offset;
+ }
+
+ // Use the alphabet defined in the header if available
+ if (model_line.find("#alphabet") != std::string::npos) {
+ std::string dummy;
+ std::string alphabet_name;
+ parser >> dummy >> alphabet_name;
+ pmalphabet = get_alphabet_by_name(alphabet_name);
}
// skip the rest of the header
@@ -67,72 +153,42 @@ PoreModel::PoreModel(const std::string filename, const Alphabet& alphabet)
std::string kmer;
PoreModelStateParams params;
+
+ // ig_lambda (R9), weight currently not read
parser >> kmer >> params.level_mean >> params.level_stdv >> params.sd_mean >> params.sd_stdv;
+ params.update_sd_lambda();
+ params.update_logs();
+
+ kmers[kmer] = params;
+ add_found_bases(bases, kmer.c_str());
+
if (firstKmer) {
k = kmer.length();
- states.resize( alphabet.get_num_strings(k) );
-
firstKmer = false;
}
-
- states[ alphabet.kmer_rank(kmer.c_str(), k) ] = params;
- ninserted++;
}
- assert( ninserted == states.size() );
-}
-
-PoreModel::PoreModel(fast5::File *f_p, const size_t strand, const Alphabet& alphabet)
-{
-
- std::vector<fast5::Model_Entry> model = f_p->get_model(strand);
- k = (uint32_t) strlen(model[0].kmer);
- states.resize( alphabet.get_num_strings(k) );
- assert(states.size() == model.size());
+ if(!model_metadata_in_header) {
+ this->metadata = get_model_metadata_from_name(this->name);
+ }
- // Copy into the pore model for this read
- for(size_t mi = 0; mi < model.size(); ++mi) {
- const fast5::Model_Entry& curr = model[mi];
+ if (pmalphabet == nullptr)
+ pmalphabet = best_alphabet(bases);
- size_t rank = alphabet.kmer_rank(curr.kmer, k);
- states[rank] = { static_cast<float>(curr.level_mean),
- static_cast<float>(curr.level_stdv),
- static_cast<float>(curr.sd_mean),
- static_cast<float>(curr.sd_stdv) };
- }
+ assert( pmalphabet != nullptr );
- // Load the scaling parameters for the pore model
- fast5::Model_Parameters params = f_p->get_model_parameters(strand);
- drift = params.drift;
- scale = params.scale;
- scale_sd = params.scale_sd;
- shift = params.shift;
- var = params.var;
- var_sd = params.var_sd;
-
- // no offset needed when loading directly from the fast5
- shift_offset = 0.0f;
-
- // apply shift/scale transformation to the pore model states
- bake_gaussian_parameters();
-
- // Read and shorten the model name
- std::string temp_name = f_p->get_model_file(strand);
- std::string leader = "/opt/chimaera/model/";
-
- size_t lp = temp_name.find(leader);
- // leader not found
- if(lp == std::string::npos) {
- name = temp_name;
- } else {
- name = temp_name.substr(leader.size());
+ states.resize(pmalphabet->get_num_strings(k));
+ for (const auto &iter : kmers ) {
+ ninserted++;
+ states[ pmalphabet->kmer_rank(iter.first.c_str(), k) ] = iter.second;
}
+ assert( ninserted == states.size() );
- std::replace(name.begin(), name.end(), '/', '_');
+ is_scaled = false;
}
-void PoreModel::write(const std::string filename, const Alphabet& alphabet, const std::string modelname)
+void PoreModel::write(const std::string filename, const std::string modelname) const
{
std::string outmodelname = modelname;
if(modelname.empty())
@@ -140,27 +196,31 @@ void PoreModel::write(const std::string filename, const Alphabet& alphabet, cons
std::ofstream writer(filename);
writer << "#model_name\t" << outmodelname << std::endl;
-
- printf("SHIFT OFFSET: %.lf\n", shift_offset);
- writer << "#shift_offset\t" << shift_offset << std::endl;
-
- std::string curr_kmer(k,alphabet.base(0));
- for(size_t ki = 0; ki < states.size(); ++ki) {
- writer << curr_kmer << "\t" << states[ki].level_mean << "\t" << states[ki].level_stdv << "\t"
- << states[ki].sd_mean << "\t" << states[ki].sd_stdv << std::endl;
- alphabet.lexicographic_next(curr_kmer);
+ writer << "#type\t" << this->type << std::endl;
+ writer << "#kit\t" << this->metadata.get_kit_name() << std::endl;
+ writer << "#strand\t" << this->metadata.get_strand_model_name() << std::endl;
+ writer << "#shift_offset\t" << this->shift_offset << std::endl;
+ writer << "#scale_offset\t" << this->scale_offset << std::endl;
+
+ std::string curr_kmer(k, this->pmalphabet->base(0));
+ for(size_t ki = 0; ki < this->states.size(); ++ki) {
+ writer << curr_kmer << "\t" << this->states[ki].level_mean << "\t" << this->states[ki].level_stdv << "\t"
+ << this->states[ki].sd_mean << "\t" << this->states[ki].sd_stdv << std::endl;
+ this->pmalphabet->lexicographic_next(curr_kmer);
}
writer.close();
}
-void PoreModel::update_states( const PoreModel &other )
+void PoreModel::update_states( const PoreModel &other )
{
k = other.k;
+ pmalphabet = other.pmalphabet;
shift += other.shift_offset;
+ scale += other.scale_offset;
update_states( other.states );
}
-void PoreModel::update_states( const std::vector<PoreModelStateParams> &otherstates )
+void PoreModel::update_states( const std::vector<PoreModelStateParams> &otherstates )
{
states = otherstates;
if (is_scaled) {
diff --git a/src/nanopolish_poremodel.h b/src/nanopolish_poremodel.h
index d9659f9..a3985ed 100644
--- a/src/nanopolish_poremodel.h
+++ b/src/nanopolish_poremodel.h
@@ -13,6 +13,8 @@
#include "nanopolish_common.h"
#include <inttypes.h>
#include <string>
+#include <map>
+#include "nanopolish_model_names.h"
#include "../fast5/src/fast5.hpp"
//
@@ -20,25 +22,53 @@ struct PoreModelStateParams
{
double level_mean;
double level_stdv;
-
double sd_mean;
double sd_stdv;
+
+ double level_log_stdv;
+ double sd_lambda;
+ double sd_log_lambda;
+
+    // Copy the four primary parameters of a fast5 model entry into this state
+    // and refresh every derived field. Returns *this.
+    PoreModelStateParams& operator = (const fast5::Model_Entry& e)
+    {
+        level_mean = e.level_mean;
+        level_stdv = e.level_stdv;
+        sd_mean = e.sd_mean;
+        sd_stdv = e.sd_stdv;
+
+        // sd_lambda must be recomputed first: update_logs() reads it
+        update_sd_lambda();
+
+        // refresh the cached logarithms as well, so level_log_stdv and
+        // sd_log_lambda are never left uninitialized after an assignment
+        update_logs();
+        return *this;
+    }
+
+    // Derive sd_lambda from the current sd_mean and sd_stdv:
+    //   sd_lambda = sd_mean^3 / sd_stdv^2
+    void update_sd_lambda()
+    {
+        const double mean_cubed = pow(sd_mean, 3.0);
+        const double stdv_squared = pow(sd_stdv, 2.0);
+        sd_lambda = mean_cubed / stdv_squared;
+    }
+    // Derive sd_stdv from the current sd_mean and sd_lambda:
+    //   sd_stdv = (sd_mean^3 / sd_lambda)^0.5
+    void update_sd_stdv()
+    {
+        const double stdv_squared = pow(sd_mean, 3.0) / sd_lambda;
+        sd_stdv = pow(stdv_squared, .5);
+    }
+    // Refresh the cached logarithms of level_stdv and sd_lambda.
+    void update_logs()
+    {
+        sd_log_lambda = log(sd_lambda);
+        level_log_stdv = log(level_stdv);
+    }
};
//
class PoreModel
{
public:
- PoreModel(uint32_t _k=5) : is_scaled(false), k(_k) {}
+ PoreModel(uint32_t _k=5) : k(_k), is_scaled(false), pmalphabet(&gDNAAlphabet) {}
// These constructors and the output routine take an alphabet
// so that kmers are inserted/written in order
// nicer might be to store the states as a map from kmer -> state
- PoreModel(const std::string filename, const Alphabet& alphabet=gDNAAlphabet);
- PoreModel(fast5::File *f_p, const size_t strand, const Alphabet& alphabet=gDNAAlphabet);
+ PoreModel(const std::string filename, const Alphabet *alphabet=NULL);
- void write(const std::string filename, const Alphabet& alphabet, const std::string modelname="");
+ void write(const std::string filename, const std::string modelname="") const;
inline GaussianParameters get_scaled_parameters(const uint32_t kmer_rank) const
{
@@ -46,6 +76,12 @@ class PoreModel
return scaled_params[kmer_rank];
}
+ inline PoreModelStateParams get_scaled_state(const uint32_t kmer_rank) const
+ {
+ assert(is_scaled);
+ return scaled_states[kmer_rank];
+ }
+
inline PoreModelStateParams get_parameters(const uint32_t kmer_rank) const
{
return states[kmer_rank];
@@ -68,6 +104,8 @@ class PoreModel
// model metadata
std::string model_filename;
std::string name;
+ std::string type;
+ ModelMetadata metadata;
uint32_t k;
// per-read scaling parameters
@@ -78,15 +116,18 @@ class PoreModel
double scale_sd;
double var_sd;
- // to support swapping models, a .model file might contain a shift_offset field
- // which describes how to change the per-read shift values to match the incoming
- // model. This field stores this data, which might be 0.
+ // to support swapping models, a .model file might contain a shift_offset/scale_offset
+ // field which describes how to change the per-read shift values to match the incoming
+ // model.
double shift_offset;
+ double scale_offset;
- //
bool is_scaled;
+ const Alphabet *pmalphabet;
+
std::vector<PoreModelStateParams> states;
+ std::vector<PoreModelStateParams> scaled_states;
std::vector<GaussianParameters> scaled_params;
};
diff --git a/src/nanopolish_scorereads.cpp b/src/nanopolish_scorereads.cpp
new file mode 100644
index 0000000..79ff155
--- /dev/null
+++ b/src/nanopolish_scorereads.cpp
@@ -0,0 +1,593 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_scorereads -- score reads against an alignment, model
+//
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include <inttypes.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/time.h>
+#include <algorithm>
+#include <iterator>
+#include <fstream>
+#include <sstream>
+#include <set>
+#include <omp.h>
+#include <getopt.h>
+#include <cstddef>
+#include "htslib/faidx.h"
+#include "nanopolish_alphabet.h"
+#include "nanopolish_methyltrain.h"
+#include "nanopolish_eventalign.h"
+#include "nanopolish_iupac.h"
+#include "nanopolish_poremodel.h"
+#include "nanopolish_transition_parameters.h"
+#include "nanopolish_matrix.h"
+#include "nanopolish_profile_hmm.h"
+#include "nanopolish_anchor.h"
+#include "nanopolish_fast5_map.h"
+#include "nanopolish_pore_model_set.h"
+#include "H5pubconf.h"
+
+//
+// Getopt
+//
+#define SUBPROGRAM "scorereads"
+
+// Text printed for --version. PACKAGE_VERSION is a macro that is not defined
+// in this file.
+static const char *SCOREREADS_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2015 Ontario Institute for Cancer Research\n";
+
+// Text printed for --help and after an option-parsing error.
+// NOTE(review): the -m/--models-fofn entry says "optionally", but
+// parse_scorereads_options() rejects the command line when it is missing.
+static const char *SCOREREADS_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n"
+"Score reads against an alignment, model\n"
+"\n"
+" -v, --verbose display verbose output\n"
+" --version display version\n"
+" --help display this help and exit\n"
+" -m, --models-fofn=FILE optionally use these models rather than models in fast5\n"
+" -c --calibrate recalibrate aligned reads to model before scoring\n"
+" -z --zero-drift if recalibrating, keep drift at 0\n"
+" -i --individual-reads=READ,READ optional comma-delimited list of readnames to score\n"
+" -r, --reads=FILE the 2D ONT reads are in fasta FILE\n"
+" -b, --bam=FILE the reads aligned to the genome assembly are in bam FILE\n"
+" -g, --genome=FILE the genome we are computing a consensus for is in FILE\n"
+" -w, --window=STR score reads in the window STR (format: ctg:start-end)\n"
+" -t, --threads=NUM use NUM threads (default: 1)\n"
+" --train-transitions train new transition parameters from the input reads\n"
+" --learn-model-offset learn the scaling offsets for the alternative pore models\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+// Settings of the scorereads subprogram; filled in by
+// parse_scorereads_options() unless noted otherwise.
+namespace opt
+{
+ // incremented once per -v
+ static unsigned int verbose;
+ // set to 1 by -c: recalibrate each read's model before scoring
+ static unsigned int calibrate=0;
+ // -r, -b, -g, -m and -w respectively
+ static std::string reads_file;
+ static std::string bam_file;
+ static std::string genome_file;
+ static std::string models_fofn;
+ static std::string region;
+ // read names from the comma-delimited -i list; empty means score every read
+ static std::vector<std::string> readnames;
+ // model type swapped into each read; no command-line option in this file changes it
+ static std::string alternative_model_type = "ONT";
+ // set to 1 by --train-transitions
+ static int train_transitions = 0;
+ // -t
+ static int num_threads = 1;
+ // number of BAM records buffered before a batch is processed in parallel;
+ // no command-line option in this file changes it
+ static int batch_size = 128;
+
+ // Offset calculating parameters
+ // set to 1 by --learn-model-offset
+ static int learn_model_offset = 0;
+ // range [min, max) and step of the scale offsets tried by sweep_offset_parameters()
+ static double lm_min_scale_offset = -0.5;
+ static double lm_max_scale_offset = 0.5;
+ static double lm_scale_offset_stride = 0.05;
+
+ // range [min, max) and step of the shift offsets tried by sweep_offset_parameters()
+ static double lm_min_shift_offset = -20;
+ static double lm_max_shift_offset = 20;
+ static double lm_shift_offset_stride = 0.1;
+
+ // cleared by -z; forwarded to recalibrate_model() as its scale_drift argument
+ static bool scale_drift = true;
+}
+
+// Short options handed to getopt_long(); a letter followed by ':' takes an argument.
+static const char* shortopts = "i:r:b:g:t:m:w:vcz";
+
+// Codes returned by getopt_long() for the options that have no short form.
+enum { OPT_HELP = 1, OPT_VERSION, OPT_TRAIN_TRANSITIONS, OPT_LEARN_MODEL_OFFSET };
+
+// Long options; the last field of each entry is the code that
+// parse_scorereads_options() switches on. The all-zero entry ends the table.
+static const struct option longopts[] = {
+ { "verbose", no_argument, NULL, 'v' },
+ { "calibrate", no_argument, NULL, 'c' },
+ { "zero-drift", no_argument, NULL, 'z' },
+ { "reads", required_argument, NULL, 'r' },
+ { "bam", required_argument, NULL, 'b' },
+ { "genome", required_argument, NULL, 'g' },
+ { "threads", required_argument, NULL, 't' },
+ { "models-fofn", required_argument, NULL, 'm' },
+ { "individual-reads", required_argument, NULL, 'i' },
+ { "window", required_argument, NULL, 'w' },
+ { "train-transitions", no_argument, NULL, OPT_TRAIN_TRANSITIONS },
+ { "learn-model-offset", no_argument, NULL, OPT_LEARN_MODEL_OFFSET },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+
+// Score one strand of a read against the reference, segment by segment.
+//
+// The alignment is walked in steps of events_per_segment entries, starting at
+// entry events_per_segment (the first segment is skipped). Each segment is
+// scored with profile_hmm_score() and a SEGMENT line is written to stdout.
+// When transition_training is non-NULL the segment's HMM alignment is added
+// to it as training data.
+//
+// Returns the summed segment scores divided by the number of events scored,
+// or +1 when no segment was scored.
+double model_score(SquiggleRead &sr,
+ const size_t strand_idx,
+ const faidx_t *fai,
+ const std::vector<EventAlignment> &alignment_output,
+ const size_t events_per_segment,
+ TransitionParameters* transition_training)
+{
+ double curr_score = 0;
+ size_t nevents = 0;
+
+ // the signed loop bound keeps align_start_idx + events_per_segment a valid
+ // index, and makes the loop body unreachable for short alignments
+ for(int align_start_idx = events_per_segment;
+ align_start_idx < (int)alignment_output.size() - (int)events_per_segment;
+ align_start_idx += events_per_segment) {
+
+ const EventAlignment& align_start = alignment_output[align_start_idx];
+ const EventAlignment& align_end = alignment_output[align_start_idx + events_per_segment];
+ std::string contig = alignment_output.front().ref_name.c_str();
+
+ // Set up event data
+ HMMInputData data;
+ data.read = &sr;
+ data.anchor_index = -1; // unused
+ data.strand = strand_idx;
+ data.rc = alignment_output.front().rc;
+ data.event_start_idx = align_start.event_idx;
+ data.event_stop_idx = align_end.event_idx;
+ // event indices may run in either direction along the alignment
+ data.event_stride = data.event_start_idx <= data.event_stop_idx ? 1 : -1;
+
+ // Set up reference data
+ int ref_start_pos = align_start.ref_position;
+ int ref_end_pos = align_end.ref_position;
+ int fetched_len = 0;
+
+ assert(ref_end_pos >= ref_start_pos);
+
+ // Extract the reference sequence for this region
+ std::string ref_seq = get_reference_region_ts(fai, contig.c_str(), ref_start_pos,
+ ref_end_pos, &fetched_len);
+
+ // skip segments whose reference span is not longer than the model's k
+ if (fetched_len <= (int)sr.pore_model[strand_idx].k)
+ continue;
+
+ const Alphabet *alphabet = sr.pore_model[strand_idx].pmalphabet;
+
+ ref_seq = alphabet->disambiguate(ref_seq);
+ HMMInputSequence sequence(ref_seq, alphabet->reverse_complement(ref_seq), alphabet);
+
+ // Run HMM using current model
+ // (this happens before the segment-local recalibration below, so the
+ // score uses the scaling parameters the read arrived with)
+ double segment_score = profile_hmm_score(sequence, data, 0);
+ int events_in_segment = abs(data.event_start_idx - data.event_stop_idx) + 1;
+
+ // Calculate scaling parameters for this local segment
+ std::vector<EventAlignment> event_alignment_sub(alignment_output.begin() + align_start_idx,
+ alignment_output.begin() + align_start_idx + events_per_segment);
+
+ // remember the current scaling so it can be put back after reporting
+ double curr_shift = sr.pore_model[strand_idx].shift;
+ double curr_scale = sr.pore_model[strand_idx].scale;
+ double curr_drift = sr.pore_model[strand_idx].drift;
+ double curr_var = sr.pore_model[strand_idx].var;
+
+ // NOTE(review): the DNA alphabet is passed here rather than the model's
+ // own pmalphabet used above -- confirm this is intended
+ recalibrate_model(sr, strand_idx, event_alignment_sub, &gDNAAlphabet, true, opt::scale_drift);
+
+ // report the segment score together with the segment-local scaling
+ fprintf(stdout, "SEGMENT\t%s\t%zu\t%.3lf\t%d\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n",
+ sr.read_name.c_str(),
+ nevents,
+ segment_score / events_in_segment,
+ events_in_segment,
+ sr.pore_model[strand_idx].shift,
+ sr.pore_model[strand_idx].scale,
+ sr.pore_model[strand_idx].drift,
+ sr.pore_model[strand_idx].var);
+
+ // restore the saved scaling and re-bake the scaled model parameters
+ sr.pore_model[strand_idx].shift = curr_shift;
+ sr.pore_model[strand_idx].scale = curr_scale;
+ sr.pore_model[strand_idx].drift = curr_drift;
+ sr.pore_model[strand_idx].var = curr_var;
+ sr.pore_model[strand_idx].bake_gaussian_parameters();
+
+ curr_score += segment_score;
+ nevents += events_in_segment;
+
+
+ if(transition_training != NULL) {
+ std::vector<HMMAlignmentState> alignment = profile_hmm_align(sequence, data);
+ // the training object is shared between threads; serialize updates
+ #pragma omp critical
+ {
+ transition_training->add_training_from_alignment(sequence, data, alignment);
+ }
+ }
+ }
+
+ // scorereads_main() skips the read when the returned value is positive
+ if (nevents == 0)
+ return +1;
+ else
+ return curr_score/nevents;
+}
+
+// Sweep shift/scale offsets for an alternative pore model on one segment.
+//
+// The segment of events_per_segment aligned events that follows the first
+// (skipped) segment is scored with the model currently attached to the read.
+// The strand's model is then replaced with the model of type
+// alternative_model_type and re-scored for every (scale_offset, shift_offset)
+// pair in the opt::lm_* ranges. One row per pair is written to offset_fp and
+// the best pair is reported on stderr.
+//
+// Returns without doing anything when the alignment is too short for the
+// segment or the fetched reference is not longer than the model's k.
+// On return the read still carries the alternative model, with the shift and
+// scale it had before the sweep.
+void sweep_offset_parameters(SquiggleRead &sr,
+                             const size_t strand_idx,
+                             const size_t read_idx,
+                             const faidx_t *fai,
+                             const std::vector<EventAlignment> &alignment_output,
+                             const size_t events_per_segment,
+                             const std::string alternative_model_type,
+                             FILE* offset_fp)
+{
+    // skip the first segment
+    const size_t align_start_idx = events_per_segment;
+    const size_t align_end_idx = align_start_idx + events_per_segment;
+
+    // skip if there is not a full segment of events. Both indices are
+    // dereferenced below, so align_end_idx itself has to be a valid position.
+    // The test is done entirely in size_t: comparing against a negative int
+    // difference would convert it to a huge unsigned value and let short
+    // alignments through.
+    if(align_end_idx >= alignment_output.size())
+        return;
+
+    const EventAlignment& align_start = alignment_output[align_start_idx];
+    const EventAlignment& align_end = alignment_output[align_end_idx];
+    std::string contig = alignment_output.front().ref_name.c_str();
+
+    // Set up event data
+    HMMInputData data;
+    data.read = &sr;
+    data.anchor_index = -1; // unused
+    data.strand = strand_idx;
+    data.rc = alignment_output.front().rc;
+    data.event_start_idx = align_start.event_idx;
+    data.event_stop_idx = align_end.event_idx;
+    data.event_stride = data.event_start_idx <= data.event_stop_idx ? 1 : -1;
+
+    // Set up reference data
+    int ref_start_pos = align_start.ref_position;
+    int ref_end_pos = align_end.ref_position;
+    int fetched_len = 0;
+
+    if(ref_end_pos < ref_start_pos) {
+        fprintf(stderr, "Error: bad coordinates [%d %d] event range: [%d %d]\n",
+                ref_start_pos, ref_end_pos, align_start.event_idx, align_end.event_idx);
+    }
+    assert(ref_end_pos >= ref_start_pos);
+
+    // Extract the reference sequence for this region
+    std::string ref_seq = get_reference_region_ts(fai, contig.c_str(), ref_start_pos,
+                                                  ref_end_pos, &fetched_len);
+
+    if (fetched_len <= (int)sr.pore_model[strand_idx].k)
+        return;
+
+    const Alphabet *alphabet = sr.pore_model[strand_idx].pmalphabet;
+    ref_seq = alphabet->disambiguate(ref_seq);
+    HMMInputSequence sequence(ref_seq, alphabet->reverse_complement(ref_seq), alphabet);
+
+    // Run HMM using current model
+    double base_score = profile_hmm_score(sequence, data, 0);
+    double base_scale = sr.pore_model[strand_idx].scale;
+    double base_shift = sr.pore_model[strand_idx].shift;
+
+    // replace the model that is built into the read with the alternative model
+    sr.replace_model(strand_idx, alternative_model_type);
+
+    double max_improvement = -INFINITY;
+    double best_shift = -INFINITY;
+    double best_scale = -INFINITY;
+
+    for(double scale_offset = opt::lm_min_scale_offset;
+        scale_offset < opt::lm_max_scale_offset;
+        scale_offset += opt::lm_scale_offset_stride)
+    {
+        for(double shift_offset = opt::lm_min_shift_offset;
+            shift_offset < opt::lm_max_shift_offset;
+            shift_offset += opt::lm_shift_offset_stride)
+        {
+            // update scales and bake them
+            sr.pore_model[strand_idx].shift = base_shift + shift_offset;
+            sr.pore_model[strand_idx].scale = base_scale + scale_offset;
+            sr.pore_model[strand_idx].bake_gaussian_parameters();
+
+            // score relative to the model the read arrived with
+            double offset_score = profile_hmm_score(sequence, data, 0);
+            double improvement = offset_score - base_score;
+            fprintf(offset_fp, "%zu\t%zu\t%.2lf\t%.2lf\t%.2lf\n", read_idx, strand_idx, scale_offset, shift_offset, improvement);
+
+            if(improvement > max_improvement) {
+                max_improvement = improvement;
+                best_scale = scale_offset;
+                best_shift = shift_offset;
+            }
+        }
+    }
+
+    fprintf(stderr, "best offsets for strand %zu scale: %.2lf shift: %.2lf improve: %.2lf base: %.2lf\n", strand_idx, best_scale, best_shift, max_improvement, base_score);
+
+    // replace parameters
+    sr.pore_model[strand_idx].shift = base_shift;
+    sr.pore_model[strand_idx].scale = base_scale;
+    sr.pore_model[strand_idx].bake_gaussian_parameters();
+}
+
+
+// Align the events of one strand of a read to the reference and return the
+// resulting event alignment. When alternative_model_type is non-empty, the
+// strand's pore model is replaced with the model of that type first, and the
+// alignment uses that model's alphabet.
+std::vector<EventAlignment> alignment_from_read(SquiggleRead& sr,
+                                                const size_t strand_idx,
+                                                const size_t read_idx,
+                                                const std::string& alternative_model_type,
+                                                const faidx_t* fai,
+                                                const bam_hdr_t* hdr,
+                                                const bam1_t* record,
+                                                int region_start,
+                                                int region_end)
+{
+    const bool swap_model = !alternative_model_type.empty();
+    if(swap_model) {
+        sr.replace_model(strand_idx, alternative_model_type);
+    }
+
+    // Align to the new model
+    EventAlignmentParameters request;
+
+    // what is being aligned
+    request.sr = &sr;
+    request.strand_idx = strand_idx;
+    request.read_idx = read_idx;
+    request.record = record;
+
+    // what it is aligned against; the alphabet is read after any model swap
+    request.fai = fai;
+    request.hdr = hdr;
+    request.alphabet = sr.pore_model[strand_idx].pmalphabet;
+
+    // reference window passed through to the aligner
+    request.region_start = region_start;
+    request.region_end = region_end;
+
+    return align_read_to_ref(request);
+}
+
+// Parse the scorereads command line into the opt:: settings.
+//
+// --help and --version print their message and exit with EXIT_SUCCESS.
+// When an option is unrecognized, a required option is missing, the thread
+// count is not positive or extra arguments are present, a message is written
+// to stderr, the usage text is printed and the process exits with
+// EXIT_FAILURE. When a models fofn is given, PoreModelSet is initialized
+// from it.
+void parse_scorereads_options(int argc, char** argv)
+{
+    bool die = false;
+    std::string readlist;
+
+    // getopt_long() returns an int and signals the end of the options with -1.
+    // The result has to be kept in an int: on platforms where plain char is
+    // unsigned, a char can never compare equal to -1 and this loop would not
+    // terminate.
+    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
+        std::istringstream arg(optarg != NULL ? optarg : "");
+        switch (c) {
+            case 'r': arg >> opt::reads_file; break;
+            case 'b': arg >> opt::bam_file; break;
+            case 'g': arg >> opt::genome_file; break;
+            case 't': arg >> opt::num_threads; break;
+            case 'm': arg >> opt::models_fofn; break;
+            case 'w': arg >> opt::region; break;
+            case 'i': arg >> readlist; break;
+            case 'v': opt::verbose++; break;
+            case 'c': opt::calibrate = 1; break;
+            case 'z': opt::scale_drift = false; break;
+            case '?': die = true; break;
+            case OPT_TRAIN_TRANSITIONS: opt::train_transitions = 1; break;
+            case OPT_LEARN_MODEL_OFFSET: opt::learn_model_offset = 1; break;
+            case OPT_HELP:
+                std::cout << SCOREREADS_USAGE_MESSAGE;
+                exit(EXIT_SUCCESS);
+            case OPT_VERSION:
+                std::cout << SCOREREADS_VERSION_MESSAGE;
+                exit(EXIT_SUCCESS);
+        }
+    }
+
+    if (argc - optind > 0) {
+        std::cerr << SUBPROGRAM ": too many arguments\n";
+        die = true;
+    }
+
+    if(opt::num_threads <= 0) {
+        std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::num_threads << "\n";
+        die = true;
+    }
+
+    if(opt::reads_file.empty()) {
+        std::cerr << SUBPROGRAM ": a --reads file must be provided\n";
+        die = true;
+    }
+
+    if(opt::genome_file.empty()) {
+        std::cerr << SUBPROGRAM ": a --genome file must be provided\n";
+        die = true;
+    }
+
+    if(opt::bam_file.empty()) {
+        std::cerr << SUBPROGRAM ": a --bam file must be provided\n";
+        die = true;
+    }
+
+    if(opt::models_fofn.empty()) {
+        std::cerr << SUBPROGRAM ": a --models file must be provided\n";
+        die = true;
+    } else {
+        // initialize the model set from the fofn
+        PoreModelSet::initialize(opt::models_fofn);
+    }
+
+    // split the -i list on commas, ignoring empty fields
+    // this is much cleaner with sregex_token_iterator, which isn't implemented in gcc until 4.9
+    if (!readlist.empty()) {
+        size_t start = readlist.find_first_not_of(","), end=start;
+        while (start != std::string::npos){
+            end = readlist.find(",", start);
+            opt::readnames.push_back(readlist.substr(start, end-start));
+            start = readlist.find_first_not_of(",", end);
+        }
+    }
+
+    if (die)
+    {
+        std::cout << "\n" << SCOREREADS_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+}
+
+
+// Entry point of the scorereads subprogram.
+//
+// Parses the command line, then reads the BAM records (restricted to
+// opt::region when one is given) in batches of opt::batch_size. Each batch is
+// processed in parallel: for every mapped read that passes the optional
+// read-name filter, and every strand that has events, the events are aligned
+// to the reference, the model is optionally recalibrated and/or swept for
+// offsets, and the per-event score is printed to stdout. Strands for which
+// model_score() returns a positive value are not printed.
+//
+// Returns 0 on completion. Exits the process when multi-threading is requested
+// without a thread-safe HDF5, or when model_offset.tsv cannot be created.
+int scorereads_main(int argc, char** argv)
+{
+    parse_scorereads_options(argc, argv);
+    omp_set_num_threads(opt::num_threads);
+
+    Fast5Map name_map(opt::reads_file);
+
+    // Open the BAM and iterate over reads
+
+    // load bam file
+    htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r");
+    assert(bam_fh != NULL);
+
+    // load bam index file
+    std::string index_filename = opt::bam_file + ".bai";
+    hts_idx_t* bam_idx = bam_index_load(index_filename.c_str());
+    assert(bam_idx != NULL);
+
+    // read the bam header
+    bam_hdr_t* hdr = sam_hdr_read(bam_fh);
+
+    // load reference fai file
+    faidx_t *fai = fai_load(opt::genome_file.c_str());
+
+    hts_itr_t* itr;
+
+    // If processing a region of the genome, only emit events aligned to this window
+    int clip_start = -1;
+    int clip_end = -1;
+
+    if(opt::region.empty()) {
+        // TODO: is this valid?
+        itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
+    } else {
+
+        fprintf(stderr, "Region: %s\n", opt::region.c_str());
+        itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
+        hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
+    }
+
+#ifndef H5_HAVE_THREADSAFE
+    if(opt::num_threads > 1) {
+        fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n");
+        fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n");
+        exit(1);
+    }
+#endif
+
+    // Initialize iteration
+    std::vector<bam1_t*> records(opt::batch_size, NULL);
+    for(size_t i = 0; i < records.size(); ++i) {
+        records[i] = bam_init1();
+    }
+
+    // Initialize transition training
+    TransitionParameters* transition_training[NUM_STRANDS];
+    if(opt::train_transitions) {
+        for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
+            transition_training[strand_idx] = new TransitionParameters;
+        }
+    } else {
+        for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
+            transition_training[strand_idx] = NULL;
+        }
+    }
+
+    // Output table for --learn-model-offset; stays NULL otherwise
+    FILE* offset_fp = NULL;
+    if(opt::learn_model_offset) {
+        offset_fp = fopen("model_offset.tsv", "w");
+        if(offset_fp == NULL) {
+            // writing through a NULL stream is undefined; stop with a message instead
+            fprintf(stderr, "Error: could not open model_offset.tsv for writing\n");
+            exit(EXIT_FAILURE);
+        }
+        fprintf(offset_fp, "read_idx\tstrand_idx\tscale_offset\tshift_offset\timprovement\n");
+    }
+
+    int result;
+    size_t num_reads_realigned = 0;
+    size_t num_records_buffered = 0;
+
+    do {
+        assert(num_records_buffered < records.size());
+
+        // read a record into the next slot in the buffer
+        result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
+        num_records_buffered += result >= 0;
+
+        // realign if we've hit the max buffer size or reached the end of file
+        if(num_records_buffered == records.size() || result < 0) {
+            #pragma omp parallel for schedule(dynamic)
+            for(size_t i = 0; i < num_records_buffered; ++i) {
+                bam1_t* record = records[i];
+                size_t read_idx = num_reads_realigned + i;
+                if( (record->core.flag & BAM_FUNMAP) == 0) {
+
+                    std::string read_name = bam_get_qname(record);
+
+                    // apply the read-name filter before touching the fast5 file,
+                    // so reads that are not wanted are never loaded
+                    // TODO: early exit when have processed all of the reads in readnames
+                    if (!opt::readnames.empty() &&
+                        std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() )
+                        continue;
+
+                    //load read
+                    std::string fast5_path = name_map.get_path(read_name);
+                    SquiggleRead sr(read_name, fast5_path);
+
+                    for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
+
+                        if(!sr.has_events_for_strand(strand_idx)) {
+                            continue;
+                        }
+
+                        // When learning model offsets, don't allow the base model to be swapped out
+                        std::string model_type_for_alignment =
+                            opt::learn_model_offset ? "" : opt::alternative_model_type;
+
+                        std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx,
+                                                                             model_type_for_alignment, fai, hdr,
+                                                                             record, clip_start, clip_end);
+                        if (ao.size() == 0)
+                            continue;
+
+                        // Update pore model based on alignment
+                        if( opt::calibrate ) {
+                            recalibrate_model(sr, strand_idx, ao, &gDNAAlphabet, true, opt::scale_drift);
+                        }
+
+                        if(opt::learn_model_offset) {
+                            sweep_offset_parameters(sr, strand_idx, read_idx, fai, ao, 500, opt::alternative_model_type, offset_fp);
+                        }
+
+                        // a positive value means the strand is not reported
+                        double score = model_score(sr, strand_idx, fai, ao, 500, transition_training[strand_idx]);
+                        if(score > 0)
+                            continue;
+
+                        #pragma omp critical(print)
+                        std::cout << read_name << " " << ( strand_idx ? "complement" : "template" )
+                                  << " " << sr.pore_model[strand_idx].name << " " << score <<
+                            " shift " << sr.pore_model[strand_idx].shift << " scale " << sr.pore_model[strand_idx].scale <<
+                            " drift " << sr.pore_model[strand_idx].drift << " var " << sr.pore_model[strand_idx].var << std::endl;
+                    }
+                }
+            }
+
+            num_reads_realigned += num_records_buffered;
+            num_records_buffered = 0;
+        }
+
+    } while(result >= 0);
+
+    if(opt::train_transitions) {
+        for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
+            fprintf(stderr, "Transition parameters for %zu\n", strand_idx);
+            transition_training[strand_idx]->train();
+            transition_training[strand_idx]->print();
+            delete transition_training[strand_idx];
+            transition_training[strand_idx] = NULL;
+        }
+    }
+
+    // cleanup records
+    for(size_t i = 0; i < records.size(); ++i) {
+        bam_destroy1(records[i]);
+    }
+
+    // cleanup
+    if(offset_fp != NULL) {
+        fclose(offset_fp);
+    }
+    sam_itr_destroy(itr);
+    bam_hdr_destroy(hdr);
+    fai_destroy(fai);
+    sam_close(bam_fh);
+    hts_idx_destroy(bam_idx);
+    return 0;
+}
+
diff --git a/src/nanopolish_scorereads.h b/src/nanopolish_scorereads.h
new file mode 100644
index 0000000..cf16820
--- /dev/null
+++ b/src/nanopolish_scorereads.h
@@ -0,0 +1,32 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_scorereads -- score reads against an alignment, model
+//
+
+#ifndef NANOPOLISH_SCOREREADS_H
+#define NANOPOLISH_SCOREREADS_H
+// NOTE(review): this header has no #includes; the including file must already declare the types used below.
+// Produce the event alignment for one strand of a read (presumably against the reference); may return an empty vector.
+std::vector<EventAlignment> alignment_from_read(SquiggleRead& sr,
+ const size_t strand_idx,
+ const size_t read_idx,
+ const std::string& alternative_model_type,
+ const faidx_t* fai,
+ const bam_hdr_t* hdr,
+ const bam1_t* record,
+ int region_start,
+ int region_end);
+// Score an alignment for one strand of a read; scorereads_main skips the read when the result is above zero.
+double model_score(SquiggleRead &sr,
+ const size_t strand_idx,
+ const faidx_t *fai,
+ const std::vector<EventAlignment> &alignment_output,
+ const size_t events_per_segment,
+ TransitionParameters* transition_training);
+// Entry point of the scorereads subprogram; takes the command line as received by main().
+int scorereads_main(int argc, char** argv);
+
+#endif
diff --git a/src/nanopolish_squiggle_read.cpp b/src/nanopolish_squiggle_read.cpp
index 91b7cd6..5b0479e 100644
--- a/src/nanopolish_squiggle_read.cpp
+++ b/src/nanopolish_squiggle_read.cpp
@@ -9,14 +9,16 @@
#include <algorithm>
#include "nanopolish_common.h"
#include "nanopolish_squiggle_read.h"
+#include "nanopolish_pore_model_set.h"
+#include "nanopolish_methyltrain.h"
#include "src/fast5.hpp"
//
-SquiggleRead::SquiggleRead(const std::string& name, const std::string& path) :
+SquiggleRead::SquiggleRead(const std::string& name, const std::string& path, const uint32_t flags) :
read_name(name),
drift_correction_performed(false)
{
- load_from_fast5(path);
+ load_from_fast5(path, flags);
// perform drift correction and other scalings
transform();
@@ -30,7 +32,7 @@ SquiggleRead::~SquiggleRead()
int SquiggleRead::get_next_event(int start, int stop, int stride, uint32_t strand) const
{
while(start != stop) {
-
+
int ei = base_to_event_map[start].indices[strand].start;
if(ei != -1)
return ei;
@@ -74,7 +76,7 @@ void SquiggleRead::transform()
}
//
-void SquiggleRead::load_from_fast5(const std::string& fast5_path)
+void SquiggleRead::load_from_fast5(const std::string& fast5_path, const uint32_t flags)
{
fast5::File* f_p;
this->fast5_path = fast5_path;
@@ -84,54 +86,261 @@ void SquiggleRead::load_from_fast5(const std::string& fast5_path)
// Check if an alternative analysis group is present in the read name
int group_id = -1;
+ /*
size_t bc_2d_pos = read_name.find("Basecall_2D");
if(bc_2d_pos != std::string::npos) {
int ret = sscanf(read_name.substr(bc_2d_pos).c_str(), "Basecall_2D_%03d_2d", &group_id);
- }
-
+ }
+ */
// default to 0 group
if(group_id == -1) {
group_id = 0;
}
-
- f_p->set_basecalled_group_id(group_id);
- // Load PoreModel for both strands
- for (size_t si = 0; si < 2; ++si) {
- pore_model[si] = PoreModel( f_p, si );
+ //f_p->set_basecalled_group_id(group_id);
+
+ auto available_groups = f_p->get_basecall_group_list();
+
+ // precedence: 2D_NNN, RNN_1D_NNN, 1D_NNN
+ std::string basecall_group = "1D_000";
+ std::string event_group = "1D_000";
- // initialize transition parameters
- parameters[si].initialize(pore_model[si].name);
+ read_type = SRT_TEMPLATE;
+
+ for(auto g : available_groups) {
+ if(g == "2D_000") {
+ basecall_group = g;
+ // for 2D reads we still take events from the 1D group
+ read_type = SRT_2D;
+ } else if(g == "RNN_1D_000" && g != "2D_000") {
+ basecall_group = g;
+ event_group = g;
+ read_type = SRT_TEMPLATE;
+ }
}
-
- // Load events for both strands
+
+ read_sequence = f_p->get_basecall_seq(basecall_group, read_type);
+
+ // Load PoreModel for both strands
+ std::vector<EventRangeForBase> event_maps_1d[NUM_STRANDS];
+ std::string read_sequences_1d[NUM_STRANDS];
+
for (size_t si = 0; si < 2; ++si) {
- std::vector<fast5::Event_Entry> f5_events = f_p->get_events(si);
-
+
+ // Do we want to load this strand?
+ if(! (read_type == SRT_2D || read_type == si) ) {
+ continue;
+ }
+
+ // Detect and reject R7 reads
+ if(f_p->have_basecall_model(si)) {
+ fprintf(stderr, "Error: R7 read detected, please use nanopolish-0.4\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // Load the events for this strand
+ std::vector<fast5::Event_Entry> f5_events = f_p->get_basecall_events(event_group, si);
+
// copy events
events[si].resize(f5_events.size());
+ std::vector<double> p_model_states;
+
for(size_t ei = 0; ei < f5_events.size(); ++ei) {
const fast5::Event_Entry& f5_event = f5_events[ei];
- events[si][ei] = { static_cast<float>(f5_event.mean),
- static_cast<float>(f5_event.stdv),
- static_cast<float>(f5_event.start),
- static_cast<float>(f5_event.length) };
+ events[si][ei] = { static_cast<float>(f5_event.mean),
+ static_cast<float>(f5_event.stdv),
+ f5_event.start,
+ static_cast<float>(f5_event.length),
+ static_cast<float>(log(f5_event.stdv)) };
+ assert(f5_event.p_model_state >= 0.0 && f5_event.p_model_state <= 1.0);
+ p_model_states.push_back(f5_event.p_model_state);
+ }
+
+ // we need the 1D event map and sequence to calculate calibration parameters
+ // these will be copied into the member fields later if this is a 1D read,
+ // or discarded if this is a 2D read
+
+ // NB we use event_group in this call rather than basecall_group as we want the 1D basecalls that match the events
+ read_sequences_1d[si] = f_p->get_basecall_seq(event_group, si == 0 ? SRT_TEMPLATE : SRT_COMPLEMENT);
+ event_maps_1d[si] = build_event_map_1d(f_p, read_sequences_1d[si], si, f5_events);
+ std::vector<EventAlignment> alignment =
+ get_eventalignment_for_1d_basecalls(read_sequences_1d[si], event_maps_1d[si], 5, si);
+
+ // JTS Hack: blacklist bad k-mer and filter out events with low p_model_state
+ double keep_fraction = 0.75;
+ std::vector<double> sorted_p_model_states = p_model_states;
+ std::sort(sorted_p_model_states.begin(), sorted_p_model_states.end());
+ double p_model_state_threshold = sorted_p_model_states[sorted_p_model_states.size() * (1 - keep_fraction)];
+
+ std::string blacklist_kmer = "CCTAG";
+ std::vector<EventAlignment> filtered;
+
+ assert(p_model_states.size() == events[si].size());
+
+ for(const auto& ea : alignment) {
+ if((!ea.rc && ea.ref_kmer == blacklist_kmer) ||
+ (ea.rc && ea.ref_kmer == gDNAAlphabet.reverse_complement(blacklist_kmer)))
+ {
+ continue;
+ }
+
+ if(p_model_states[ea.event_idx] < p_model_state_threshold)
+ continue;
+
+ filtered.push_back(ea);
+ }
+
+ // Load the pore model (if requested) and calibrate it
+ if( (flags & SRF_NO_MODEL) == 0) {
+
+ // For the template strand we only have one candidate model
+ // For complement we need to select between the two possible models
+ std::vector<const PoreModel*> candidate_models;
+ if(si == 0) {
+ candidate_models.push_back(&PoreModelSet::get_model("base", "t.007"));
+ } else {
+ for(const std::string& cmn : { "c.p1.007", "c.p2.007" } ) {
+ if(PoreModelSet::has_model("base", cmn)) {
+ candidate_models.push_back(&PoreModelSet::get_model("base", cmn));
+ }
+ }
+ }
+
+ PoreModel best_model;
+ double best_model_var = INFINITY;
+
+ for(size_t model_idx = 0; model_idx < candidate_models.size(); model_idx++) {
+
+ pore_model[si] = *candidate_models[model_idx];
+
+ // Initialize to default scaling parameters
+ pore_model[si].shift = 0.0;
+ pore_model[si].scale = 1.0;
+ pore_model[si].drift = 0.0;
+ pore_model[si].var = 1.0;
+ pore_model[si].scale_sd = 1.0;
+ pore_model[si].var_sd = 1.0;
+
+ pore_model[si].bake_gaussian_parameters();
+
+ // run recalibration to get the best set of scaling parameters and the residual
+ // between the (scaled) event levels and the model
+ bool calibrated = recalibrate_model(*this, si, filtered, pore_model[si].pmalphabet, true, false);
+ if(calibrated) {
+ if(pore_model[si].var < best_model_var) {
+ best_model_var = pore_model[si].var;
+ best_model = pore_model[si];
+ }
+ }
+
+#ifdef DEBUG_MODEL_SELECTION
+ fprintf(stderr, "[calibration] read: %s strand: %zu model_idx: %zu "
+ "scale: %.2lf shift: %.2lf drift: %.5lf var: %.2lf\n",
+ read_name.substr(0, 6).c_str(), si, model_idx, pore_model[si].scale,
+ pore_model[si].shift, pore_model[si].drift, pore_model[si].var);
+#endif
+ }
+
+ if(best_model_var != INFINITY) {
+#ifdef DEBUG_MODEL_SELECTION
+ fprintf(stderr, "[calibration] selected model with var %.4lf\n", best_model_var);
+#endif
+ pore_model[si] = best_model;
+ pore_model[si].bake_gaussian_parameters();
+
+ // initialize transition parameters
+ parameters[si].initialize(pore_model[si].metadata);
+ } else {
+ // could not find a model for this strand, discard it
+ events[si].clear();
+ }
}
}
- //
- // Load basecalled sequence
- //
- read_sequence = f_p->basecalled_2D();
-
+ // Build the map from k-mers of the read sequence to events
+ if(read_type == SRT_2D) {
+ build_event_map_2d(f_p, basecall_group);
+ } else {
+ assert(read_type < NUM_STRANDS);
+ this->base_to_event_map.swap(event_maps_1d[read_type]);
+ }
+
+ // Load raw samples if requested
+ if(flags & SRF_LOAD_RAW_SAMPLES) {
+
+ auto& sample_read_names = f_p->get_raw_samples_read_name_list();
+ if(sample_read_names.empty()) {
+ fprintf(stderr, "Error, no raw samples found\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // we assume the first raw sample read is the one we're after
+ std::string sample_read_name = sample_read_names.front();
+
+ samples = f_p->get_raw_samples(sample_read_name);
+ sample_start_time = f_p->get_raw_samples_params(sample_read_name).start_time;
+
+ // retrieve parameters
+ fast5::Channel_Id_Parameters channel_params = f_p->get_channel_id_params();
+ sample_rate = channel_params.sampling_rate;
+ }
+
+ delete f_p;
+}
+// Build the 1D event map: for each k-mer of read_sequence_1d, the first and last event on `strand` assigned to it. f_p is not used.
+std::vector<EventRangeForBase> SquiggleRead::build_event_map_1d(fast5::File* f_p,
+ const std::string& read_sequence_1d,
+ uint32_t strand,
+ std::vector<fast5::Event_Entry>& f5_events)
+{
+ std::vector<EventRangeForBase> out_event_map;
+ const uint32_t k = pore_model[T_IDX].k; // NOTE(review): always the template model's k, whatever `strand` is - confirm this is intended
+ assert(f5_events.size() == events[strand].size());
+
+ // initialize - one entry per read kmer
+ uint32_t n_read_kmers = read_sequence_1d.size() - k + 1; // NOTE(review): unsigned arithmetic, wraps if the sequence is shorter than k
+ out_event_map.resize(n_read_kmers);
+
+ // The range for the first k-mer always starts at event 0
+ assert(f5_events[0].move == 0); // also assumes there is at least one event
+ out_event_map[0].indices[strand].start = 0;
+
+ size_t curr_k_idx = 0;
+ for(size_t ei = 0; ei < f5_events.size(); ++ei) {
+ const fast5::Event_Entry& f5_event = f5_events[ei];
+
+ // Does this event correspond to a different k-mer than the previous one?
+ if(f5_event.move > 0) {
+ assert(ei != 0);
+
+ // end the range for the current k-mer
+ out_event_map[curr_k_idx].indices[strand].stop = ei - 1;
+ curr_k_idx += f5_event.move; // a move larger than 1 leaves the entries of the skipped k-mers as resize() created them
+
+ // start the range for the next kmer
+ out_event_map[curr_k_idx].indices[strand].start = ei;
+ }
+ // sanity check: the event's model_state must match the read k-mer it has been assigned to
+ assert(read_sequence_1d.compare(curr_k_idx, k,
+ array2str(f5_event.model_state), 0, k) == 0);
+ }
+
+ // end the last range
+ out_event_map[curr_k_idx].indices[strand].stop = events[strand].size() - 1;
+ assert(out_event_map[curr_k_idx].indices[strand].start <= out_event_map[curr_k_idx].indices[strand].stop);
+ return out_event_map;
+}
+
+void SquiggleRead::build_event_map_2d(fast5::File* f_p, const std::string& basecall_group)
+{
//
// Build the map from read k-mers to events
//
- std::vector<fast5::Event_Alignment_Entry> event_alignments = f_p->get_event_alignments();
+ std::vector<fast5::Event_Alignment_Entry> event_alignments = f_p->get_basecall_event_alignment(basecall_group);
assert(!read_sequence.empty());
- const uint32_t k = pore_model[T_IDX].k;
- assert(pore_model[C_IDX].k == k);
+ // R9 change: use k from the event table as this might not match the pore model
+ uint32_t k = strnlen(event_alignments[0].kmer.data(), event_alignments[0].kmer.size());
uint32_t n_read_kmers = read_sequence.size() - k + 1;
base_to_event_map.resize(n_read_kmers);
@@ -144,37 +353,28 @@ void SquiggleRead::load_from_fast5(const std::string& fast5_path)
// sequences to work out which read base each entry is referring to
uint32_t start_ea_idx = 0;
uint32_t end_ea_idx = 0;
-
+ //printf("Starting event map construction for read %s\n", read_name.c_str());
while(start_ea_idx < event_alignments.size()) {
-hack:
uint32_t prev_kidx = read_kidx;
// Advance the kmer index until we have found the read kmer
// this tuple refers to
- while(read_kidx < n_read_kmers &&
- strncmp(event_alignments[start_ea_idx].kmer,
- read_sequence.c_str() + read_kidx, k) != 0) {
+ while(read_kidx < n_read_kmers &&
+ read_sequence.compare(read_kidx, k,
+ array2str(event_alignments[start_ea_idx].kmer),
+ 0, k) != 0)
+ {
read_kidx += 1;
}
- // In the most recent version of metrichor occasionally
- // a kmer will be present in the alignment table
- // that is not in the 2D read. This awful hack
- // will skip such k-mers. It is not a long-term
- // solution, only until metrichor is fixed.
- if(read_kidx - prev_kidx > 10) {
- start_ea_idx += 1;
- read_kidx = prev_kidx;
- goto hack;
- }
-
// Advance the event alignment end index to the last tuple
// with the same kmer as the start of this range
end_ea_idx = start_ea_idx;
while(end_ea_idx < event_alignments.size() &&
- strcmp(event_alignments[start_ea_idx].kmer,
- event_alignments[end_ea_idx].kmer) == 0) {
+ array2str(event_alignments[start_ea_idx].kmer).compare(0, k,
+ array2str(event_alignments[end_ea_idx].kmer), 0, k) == 0)
+ {
end_ea_idx += 1;
}
@@ -183,27 +383,133 @@ hack:
for(uint32_t i = start_ea_idx; i < end_ea_idx; ++i) {
fast5::Event_Alignment_Entry& eae = event_alignments[i];
-
+
for(uint32_t si = 0; si <= 1; ++si) {
- uint32_t incoming_idx = si == 0 ? eae.template_index : eae.complement_index;
-
+ int incoming_idx = si == 0 ? eae.template_index : eae.complement_index;
+
+ // if a strand couldn't be loaded (typically because calibration failed) ignore
+ // the events for the strand by setting to -1
+ incoming_idx = events[si].empty() ? -1 : incoming_idx;
+
// no event for this strand, nothing to update
- if(incoming_idx == -1)
+ if(incoming_idx == -1) {
continue;
-
+ }
if(erfb.indices[si].start == -1) {
- erfb.indices[si].start = incoming_idx;
+ erfb.indices[si].start = incoming_idx;
}
erfb.indices[si].stop = incoming_idx;
- assert(erfb.indices[si].start < events[si].size());
- assert(erfb.indices[si].stop < events[si].size());
+ assert(erfb.indices[si].start < (int)events[si].size());
+ assert(erfb.indices[si].stop < (int)events[si].size());
}
}
//printf("\t[%d %d] [%d %d]\n", erfb.indices[0].start, erfb.indices[0].stop, erfb.indices[1].start, erfb.indices[1].stop);
start_ea_idx = end_ea_idx;
}
+}
- delete f_p;
+void SquiggleRead::replace_models(const std::string& model_type)
+{
+    // Swap in the model_type variant of the pore model for every strand that was loaded.
+    for(size_t si = 0; si < NUM_STRANDS; ++si) {
+
+        // a strand is loaded when the read type covers it and it still has events
+        const bool covered_by_read_type = (read_type == SRT_2D) || (read_type == si);
+        if(covered_by_read_type && has_events_for_strand(si)) {
+            // the replacement is looked up under the short name of the model currently in use
+            const auto& short_name = this->pore_model[si].metadata.get_short_name();
+            PoreModel incoming_model = PoreModelSet::get_model(model_type, short_name);
+            replace_model(si, incoming_model);
+        }
+    }
+}
+// Replace one strand's model with the model_type variant that has the same short name.
+void SquiggleRead::replace_model(size_t strand_idx, const std::string& model_type)
+{
+    const auto& current_short_name = this->pore_model[strand_idx].metadata.get_short_name();
+    PoreModel replacement = PoreModelSet::get_model(model_type, current_short_name);
+    replace_model(strand_idx, replacement);
+}
+// Update the given strand's current pore model from `model` by calling its update_states() method.
+void SquiggleRead::replace_model(size_t strand_idx, const PoreModel& model)
+{
+ this->pore_model[strand_idx].update_states( model );
+}
+
+// Return the event alignment implied by the 1D basecalls: one EventAlignment per event in the
+// range recorded for each k-mer of read_sequence_1d. K-mers without events (start == -1) are
+// skipped; hmm_state is 'M' when the k-mer rank differs from the previous event's, else 'E'.
+std::vector<EventAlignment> SquiggleRead::get_eventalignment_for_1d_basecalls(const std::string& read_sequence_1d,
+        const std::vector<EventRangeForBase>& base_to_event_map_1d,
+        const size_t k,
+        const size_t strand_idx) const
+{
+    std::vector<EventAlignment> alignment;
+    const Alphabet* alphabet = this->pore_model[strand_idx].pmalphabet;
+    // A sequence shorter than k has no k-mers (the unsigned subtraction would otherwise wrap),
+    // and we must never walk past the end of the event map we were given.
+    const size_t n_seq_kmers = read_sequence_1d.size() < k ? 0 : read_sequence_1d.size() - k + 1;
+    const size_t n_kmers = std::min(n_seq_kmers, base_to_event_map_1d.size());
+    size_t prev_kmer_rank = -1; // wraps to the largest size_t, i.e. "no previous k-mer yet"
+
+    for(size_t ki = 0; ki < n_kmers; ++ki) {
+        IndexPair event_range_for_kmer = base_to_event_map_1d[ki].indices[strand_idx];
+
+        // skip kmers without events
+        if(event_range_for_kmer.start == -1)
+            continue;
+
+        for(size_t event_idx = event_range_for_kmer.start; event_idx <= event_range_for_kmer.stop; event_idx++) {
+            assert(event_idx < this->events[strand_idx].size());
+
+            // since we use the 1D read sequence here we never have to reverse complement
+            std::string kmer = read_sequence_1d.substr(ki, k);
+            size_t kmer_rank = alphabet->kmer_rank(kmer.c_str(), k);
+
+            EventAlignment ea;
+            ea.ref_name = "read"; // not needed
+            ea.ref_kmer = kmer;
+            ea.ref_position = ki;
+            ea.read_idx = -1; // not needed
+            ea.strand_idx = strand_idx;
+            ea.event_idx = event_idx;
+            ea.rc = false;
+            ea.model_kmer = kmer;
+            ea.hmm_state = prev_kmer_rank != kmer_rank ? 'M' : 'E';
+            alignment.push_back(ea);
+            prev_kmer_rank = kmer_rank;
+        }
+    }
+    return alignment;
+}
+// Convert a sample time to an offset from sample_start_time; callers use the result to index `samples`.
+size_t SquiggleRead::get_sample_index_at_time(size_t sample_time) const
+{
+ return sample_time - sample_start_time; // NOTE(review): wraps if sample_time precedes sample_start_time
+}
+
+// Return the raw samples covered by one event, with this strand's shift, drift and scale removed.
+std::vector<float> SquiggleRead::get_scaled_samples_for_event(size_t strand_idx, size_t event_idx) const
+{
+    double event_start_time = this->events[strand_idx][event_idx].start_time;
+    double event_duration = this->events[strand_idx][event_idx].duration;
+
+    size_t start_idx = this->get_sample_index_at_time(event_start_time * this->sample_rate);
+    size_t end_idx = this->get_sample_index_at_time((event_start_time + event_duration) * this->sample_rate);
+    // never read past the samples that are actually stored (the vector may be empty or too short)
+    end_idx = std::min(end_idx, this->samples.size());
+
+    std::vector<float> out;
+    for(size_t i = start_idx; i < end_idx; ++i) {
+        double curr_sample_time = (this->sample_start_time + i) / this->sample_rate;
+        double first_sample_time = this->sample_start_time / this->sample_rate; // drift is measured from here
+        assert(curr_sample_time >= first_sample_time);
+        double s = this->samples[i];
+        double scaled_s = s - this->pore_model[strand_idx].shift;
+        scaled_s -= (curr_sample_time - first_sample_time) * this->pore_model[strand_idx].drift;
+        scaled_s /= this->pore_model[strand_idx].scale;
+        out.push_back(scaled_s);
+    }
+    return out;
}
diff --git a/src/nanopolish_squiggle_read.h b/src/nanopolish_squiggle_read.h
index 9bde393..aee80e3 100644
--- a/src/nanopolish_squiggle_read.h
+++ b/src/nanopolish_squiggle_read.h
@@ -12,15 +12,33 @@
#include "nanopolish_common.h"
#include "nanopolish_poremodel.h"
#include "nanopolish_transition_parameters.h"
+#include "nanopolish_eventalign.h"
#include <string>
+// the type of the read
+// do not change as template must always be 0 and complement 1
+enum SquiggleReadType
+{
+ SRT_TEMPLATE = 0,
+ SRT_COMPLEMENT,
+ SRT_2D
+};
+
+// Flags to control the behaviour of the read
+enum SquiggleReadFlags
+{
+ SRF_NO_MODEL = 1, // do not load a model
+ SRF_LOAD_RAW_SAMPLES = 2
+};
+
// The raw event data for a read
struct SquiggleEvent
{
float mean; // current level mean in picoamps
float stdv; // current level stdv
- float start_time; // start time of the event in seconds
+ double start_time; // start time of the event in seconds
float duration; // duration of the event in seconds
+ float log_stdv; // precompute for efficiency
};
struct IndexPair
@@ -43,7 +61,7 @@ class SquiggleRead
public:
SquiggleRead() : drift_correction_performed(false) {} // legacy TODO remove
- SquiggleRead(const std::string& name, const std::string& path);
+ SquiggleRead(const std::string& name, const std::string& path, const uint32_t flags = 0);
~SquiggleRead();
//
@@ -51,7 +69,7 @@ class SquiggleRead
//
// Load all the read data from a fast5 file
- void load_from_fast5(const std::string& fast5_path);
+ void load_from_fast5(const std::string& fast5_path, const uint32_t flags);
//
// Access to data
@@ -71,13 +89,19 @@ class SquiggleRead
assert(drift_correction_performed);
return events[strand][event_idx].mean;
}
-
- // Return the observed current level after correcting for drift
- inline float get_event_stdv(uint32_t event_idx, uint32_t strand) const
+
+ // Return the current stdv for the given event
+ inline float get_stdv(uint32_t event_idx, uint32_t strand) const
{
return events[strand][event_idx].stdv;
}
+ // Return log of the current stdv for the given event
+ inline float get_log_stdv(uint32_t event_idx, uint32_t strand) const
+ {
+ return events[strand][event_idx].log_stdv;
+ }
+
// Return the observed current level after correcting for drift, shift and scale
inline float get_fully_scaled_level(uint32_t event_idx, uint32_t strand) const
{
@@ -85,6 +109,28 @@ class SquiggleRead
float level = get_drift_corrected_level(event_idx, strand);
return (level - pore_model[strand].shift) / pore_model[strand].scale;
}
+
+ // Return the observed current level stdv, after correcting for scale
+ inline float get_scaled_stdv(uint32_t event_idx, uint32_t strand) const
+ {
+ return events[strand][event_idx].stdv / pore_model[strand].scale_sd;
+ }
+
+ inline float get_time(uint32_t event_idx, uint32_t strand) const
+ {
+ return events[strand][event_idx].start_time - events[strand][0].start_time;
+ }
+
+ // Return the event level without drift correction: the stored mean, plus time * drift if drift correction has been performed
+ inline float get_uncorrected_level(uint32_t event_idx, uint32_t strand) const
+ {
+ if (!drift_correction_performed)
+ return events[strand][event_idx].mean;
+ else {
+ double time = get_time(event_idx, strand);
+ return events[strand][event_idx].mean + (time * pore_model[strand].drift);
+ }
+ }
// Calculate the index of this k-mer on the other strand
inline int32_t flip_k_strand(int32_t k_idx) const
@@ -96,15 +142,44 @@ class SquiggleRead
// Transform each event by correcting for current drift
void transform();
- // get the index of the event tht is nearest to the given kmer
+ // get the index of the event that is nearest to the given kmer
int get_closest_event_to(int k_idx, uint32_t strand) const;
+ // replace the pore models with the models specified in the map or by a string
+ void replace_models(const std::string& model_type);
+ void replace_model(size_t strand_idx, const std::string& model_type);
+ void replace_model(size_t strand_idx, const PoreModel& model);
+
+ // returns true if this read has events for this strand
+ bool has_events_for_strand(size_t strand_idx) { return !this->events[strand_idx].empty(); }
+
+ // Create an eventalignment between the events of this read and its 1D basecalled sequence
+ std::vector<EventAlignment> get_eventalignment_for_1d_basecalls(const std::string& read_sequence_1d,
+ const std::vector<EventRangeForBase>& base_to_event_map_1d,
+ const size_t k,
+ const size_t strand_idx) const;
+
+ // Sample-level access
+ size_t get_sample_index_at_time(size_t sample_time) const;
+ std::vector<float> get_scaled_samples_for_event(size_t strand_idx, size_t event_idx) const;
+
+ // print the scaling parameters for this strand
+ void print_scaling_parameters(FILE* fp, size_t strand_idx) const
+ {
+ fprintf(fp, "shift: %.2lf scale: %.2lf drift: %.2lf var: %.2lf\n", this->pore_model[strand_idx].shift,
+ this->pore_model[strand_idx].scale,
+ this->pore_model[strand_idx].drift,
+ this->pore_model[strand_idx].var);
+ }
+
+
//
// Data
//
// unique identifier of the read
std::string read_name;
+ SquiggleReadType read_type;
std::string fast5_path;
uint32_t read_id;
std::string read_sequence;
@@ -115,6 +190,12 @@ class SquiggleRead
// one event sequence for each strand
std::vector<SquiggleEvent> events[2];
+
+ // optional fields holding the raw data
+ // this is not split into strands so there is only one vector, unlike events
+ std::vector<float> samples;
+ double sample_rate;
+ int64_t sample_start_time;
//
std::vector<EventRangeForBase> base_to_event_map;
@@ -124,7 +205,16 @@ class SquiggleRead
private:
- SquiggleRead(const SquiggleRead& other) {}
+ SquiggleRead(const SquiggleRead&) {}
+
+ // make a map from a base of the 1D read sequence to the range of events supporting that base
+ std::vector<EventRangeForBase> build_event_map_1d(fast5::File* f_p,
+ const std::string& read_sequence_1d,
+ uint32_t strand,
+ std::vector<fast5::Event_Entry>& f5_events);
+
+ // as above but for the 2D sequence. this fills in both the template and complement event indices
+ void build_event_map_2d(fast5::File* f_p, const std::string& basecall_group);
// helper for get_closest_event_to
int get_next_event(int start, int stop, int stride, uint32_t strand) const;
diff --git a/src/nanopolish_train_poremodel_from_basecalls.cpp b/src/nanopolish_train_poremodel_from_basecalls.cpp
new file mode 100644
index 0000000..a3225f6
--- /dev/null
+++ b/src/nanopolish_train_poremodel_from_basecalls.cpp
@@ -0,0 +1,398 @@
+//---------------------------------------------------------
+// Copyright 2016 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_train_poremodel_from_basecalls - train a
+// new pore model from the FAST5 output of a basecaller
+//
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include <inttypes.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/time.h>
+#include <algorithm>
+#include <sstream>
+#include <set>
+#include <omp.h>
+#include <getopt.h>
+#include "htslib/faidx.h"
+#include "nanopolish_poremodel.h"
+#include "nanopolish_squiggle_read.h"
+#include "nanopolish_methyltrain.h"
+#include "training_core.hpp"
+#include "profiler.h"
+#include "logger.hpp"
+
+//
+// Typedefs
+//
+typedef std::vector<StateTrainingData> TrainingDataVector;
+typedef std::vector<TrainingDataVector> KmerTrainingData;
+
+//
+// Getopt
+//
+#define SUBPROGRAM "train-poremodel-from-basecalls"
+
+static const char *TRAIN_POREMODEL_FROM_BASECALLS_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2016 Ontario Institute for Cancer Research\n";
+
+static const char *TRAIN_POREMODEL_FROM_BASECALLS_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] input.fofn\n"
+"Train a new pore model using the basecalled reads in input.fofn\n"
+"\n"
+" -v, --verbose display verbose output\n"
+" --version display version\n"
+" --help display this help and exit\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+namespace opt
+{
+ static unsigned int verbose;
+ static std::string fofn_file;
+}
+
+static const char* shortopts = "v";
+
+enum { OPT_HELP = 1, OPT_VERSION };
+
+static const struct option longopts[] = {
+ { "verbose", no_argument, NULL, 'v' },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+// Parse the command line into the opt:: globals; prints usage and exits on --help, --version or bad arguments.
+void parse_train_poremodel_from_basecalls_options(int argc, char** argv)
+{
+    bool die = false;
+    // getopt_long returns an int; storing it in a char breaks the -1 test where char is unsigned
+    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
+        switch (c) {
+            case '?': die = true; break;
+            case 'v': opt::verbose++; break;
+            case OPT_HELP:
+                std::cout << TRAIN_POREMODEL_FROM_BASECALLS_USAGE_MESSAGE;
+                exit(EXIT_SUCCESS);
+            case OPT_VERSION:
+                std::cout << TRAIN_POREMODEL_FROM_BASECALLS_VERSION_MESSAGE;
+                exit(EXIT_SUCCESS);
+        }
+    }
+
+    if (argc - optind < 1) {
+        std::cerr << SUBPROGRAM ": not enough arguments\n";
+        die = true;
+    }
+
+    if (argc - optind > 1) {
+        std::cerr << SUBPROGRAM ": too many arguments\n";
+        die = true;
+    }
+
+    if (die)
+    {
+        std::cout << "\n" << TRAIN_POREMODEL_FROM_BASECALLS_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+
+    // exactly one positional argument remains at this point: the file of FAST5 file names
+    opt::fofn_file = argv[optind++];
+}
+
+PoreModel initialize_pore_model(const std::vector<KmerTrainingData>& read_training_data,
+                                const size_t k)
+{
+    // Build a starting model for k-mers of length k from the read with the most events: each
+    // k-mer seen in that read gets the median of its event levels as level_mean (level_stdv 1.0).
+    // read_training_data is indexed by read, then k-mer, then event.
+    size_t num_kmers_in_alphabet = gDNAAlphabet.get_num_strings(k);
+
+    // Select the read with the most events to form the basis for the pore model distribution
+    size_t max_events = 0;
+    size_t max_events_index = 0;
+    for(size_t rti = 0; rti < read_training_data.size(); ++rti) {
+        auto& kmer_training_data = read_training_data[rti];
+        size_t total_events = 0;
+
+        for(size_t ki = 0; ki < kmer_training_data.size(); ++ki) {
+            total_events += kmer_training_data[ki].size();
+        }
+        printf("read %zu has %zu events (max: %zu, %zu)\n", rti, total_events, max_events, max_events_index);
+
+        if(total_events > max_events) {
+            max_events = total_events;
+            max_events_index = rti;
+        }
+    }
+
+    // Set the initial pore model
+    PoreModel pore_model(k);
+    pore_model.states.resize(num_kmers_in_alphabet);
+    pore_model.scaled_states.resize(num_kmers_in_alphabet);
+    pore_model.scaled_params.resize(num_kmers_in_alphabet);
+
+    pore_model.shift = 0.0;
+    pore_model.scale = 1.0;
+    pore_model.drift = 0.0;
+    pore_model.var = 1.0;
+    pore_model.scale_sd = 1.0;
+    pore_model.var_sd = 1.0;
+    pore_model.shift_offset = 0.0;
+
+    // With no reads at all there is no baseline to index into; use an empty data set instead
+    // so the model is still baked and returned rather than reading out of bounds.
+    const KmerTrainingData no_training_data;
+    auto& kmer_training_data_for_selected =
+        read_training_data.empty() ? no_training_data : read_training_data[max_events_index];
+
+    for(size_t ki = 0; ki < kmer_training_data_for_selected.size(); ++ki) {
+        std::vector<double> values;
+        std::stringstream ss;
+        for(size_t ei = 0; ei < kmer_training_data_for_selected[ki].size(); ++ei) {
+            values.push_back(kmer_training_data_for_selected[ki][ei].level_mean);
+            ss << values.back() << " ";
+        }
+
+        // k-mers the baseline read never visited keep their state untouched
+        size_t n = values.size();
+        if(n == 0) {
+            continue;
+        }
+
+        // Set the kmer's mean parameter to be the median of the recorded values
+        std::sort(values.begin(), values.end());
+        double median = (n % 2 == 0) ? (values[n / 2 - 1] + values[n / 2]) / 2.0f : values[n / 2];
+
+        pore_model.states[ki].level_mean = median;
+        pore_model.states[ki].level_stdv = 1.0;
+        pore_model.states[ki].sd_mean = 0.0;
+        pore_model.states[ki].sd_stdv = 0.0;
+        pore_model.states[ki].sd_lambda = 0.0;
+        pore_model.states[ki].update_logs();
+        printf("k: %zu median: %.2lf values: %s\n", ki, median, ss.str().c_str());
+    }
+    pore_model.bake_gaussian_parameters();
+    return pore_model;
+}
+
+void alignment_to_training_data(const SquiggleRead* read,
+                                const std::vector<EventAlignment>& alignment,
+                                const size_t k,
+                                size_t read_idx,
+                                KmerTrainingData* out_data,
+                                FILE* tsv_writer)
+{
+    // File each aligned event under the rank of its model k-mer; optionally log one TSV row per event.
+    for(const EventAlignment& aligned : alignment) {
+        const size_t rank = gDNAAlphabet.kmer_rank(aligned.model_kmer.c_str(), k);
+        assert(rank < out_data->size());
+        assert(aligned.strand_idx == 0);
+        assert(aligned.event_idx < read->events[aligned.strand_idx].size());
+
+        const SquiggleEvent& event = read->events[aligned.strand_idx][aligned.event_idx];
+        const double scaled_level = read->get_fully_scaled_level(aligned.event_idx, aligned.strand_idx);
+        const double event_stdv = event.stdv;
+
+        // Bad scale/shift values or erroneous events can drive the scaled level negative, which makes
+        // the training implode, so only levels of at least 1.0 are kept (all events are still logged).
+        if(scaled_level >= 1.0) {
+            StateTrainingData sample(scaled_level, event_stdv, read->pore_model[aligned.strand_idx].var);
+            out_data->at(rank).push_back(sample);
+        }
+        if(tsv_writer != NULL) {
+            fprintf(tsv_writer, "%zu\t%s\t%.2lf\t%.5lf\n", read_idx, aligned.model_kmer.c_str(), scaled_level, event.duration);
+        }
+    }
+}
+
+
+// Entry point of the pore model training subcommand.
+//
+// Workflow, as implemented below:
+//   1. Parse the command line and load every FAST5 named in the input fofn
+//      (one path per line) as a SquiggleRead without a model, resetting the
+//      training strand's scaling parameters to neutral defaults.
+//   2. Align events to the basecalled k-mers of each read, write one row per
+//      aligned event to train_poremodel_from_basecalls.tsv and collect the
+//      per-read training data used to build the initial pore model.
+//   3. Run 10 refinement rounds: apply the current model to each read,
+//      recalibrate the read against the k-mers that already have a
+//      distribution, gather training data from well-behaved reads, dump it
+//      to training_data.tsv and fit a new single gaussian per k-mer.
+//   4. Write the final model to r9.template.5mer.base.model.
+//
+// Returns 0 on completion, or 1 if the fofn or the event TSV cannot be opened.
+int train_poremodel_from_basecalls_main(int argc, char** argv)
+{
+    parse_train_poremodel_from_basecalls_options(argc, argv);
+
+    // Fail early instead of silently training on zero reads
+    std::ifstream fofn_reader(opt::fofn_file);
+    if(!fofn_reader.is_open()) {
+        fprintf(stderr, "error: could not open the input fofn file for reading\n");
+        return 1;
+    }
+    std::string fast5_name;
+
+    //Logger::set_level_from_option("debug1");
+
+    // parameters
+    unsigned int basecalled_k = 5; // TODO: infer this
+    size_t num_kmers_in_alphabet = gDNAAlphabet.get_num_strings(basecalled_k);
+    unsigned int training_strand = T_IDX; // template training for now
+
+    // Read input
+    std::vector<SquiggleRead*> reads;
+    while(getline(fofn_reader, fast5_name)) {
+        fprintf(stderr, "Loading %s\n", fast5_name.c_str());
+        SquiggleRead* read = new SquiggleRead(fast5_name, fast5_name, SRF_NO_MODEL);
+
+        // initialize the scaling parameters to defaults
+        PoreModel& read_pore_model = read->pore_model[training_strand];
+        read_pore_model.shift = 0.0;
+        read_pore_model.scale = 1.0;
+        read_pore_model.drift = 0.0;
+        read_pore_model.var = 1.0;
+        read_pore_model.scale_sd = 1.0;
+        read_pore_model.var_sd = 1.0;
+
+        reads.push_back(read);
+    }
+    fprintf(stderr, "Loaded %zu reads\n", reads.size());
+
+    // This vector is indexed by read, then kmer, then event
+    std::vector<KmerTrainingData> read_training_data;
+
+    FILE* tsv_writer = fopen("train_poremodel_from_basecalls.tsv", "w");
+    if(tsv_writer == NULL) {
+        fprintf(stderr, "error: could not open train_poremodel_from_basecalls.tsv for writing\n");
+
+        // release the reads loaded so far before bailing out
+        for(auto* read : reads) {
+            delete read;
+        }
+        return 1;
+    }
+    fprintf(tsv_writer, "read_idx\tkmer\tlevel_mean\tduration\n");
+
+    size_t read_idx = 0;
+    for(auto* read : reads) {
+
+        // extract alignment of events to k-mers
+        assert(read->read_type != SRT_2D);
+        std::vector<EventAlignment> alignment =
+            read->get_eventalignment_for_1d_basecalls(read->read_sequence,
+                                                      read->base_to_event_map,
+                                                      basecalled_k,
+                                                      training_strand);
+
+        // convert the alignment into model training data for this read
+        KmerTrainingData ktd(num_kmers_in_alphabet);
+        alignment_to_training_data(read,
+                                   alignment,
+                                   basecalled_k,
+                                   read_idx,
+                                   &ktd,
+                                   tsv_writer);
+
+        read_training_data.push_back(ktd);
+        read_idx++;
+    }
+
+    // The event TSV is only written during the first pass; close it here so
+    // the handle is not leaked and the buffered rows are flushed to disk.
+    fclose(tsv_writer);
+    tsv_writer = NULL;
+
+    // Select the read with the most events as the "baseline" read for generating the model
+    PoreModel initial_pore_model = initialize_pore_model(read_training_data, basecalled_k);
+    PoreModel current_pore_model = initial_pore_model;
+
+    for(size_t iteration = 0; iteration < 10; iteration++) {
+
+        // Determine the k-mers that have been trained
+        std::vector<bool> trained_kmers(num_kmers_in_alphabet, false);
+        for(size_t kmer_idx = 0; kmer_idx < num_kmers_in_alphabet; ++kmer_idx) {
+
+            // untrained kmers have a mean of 0.0
+            trained_kmers[kmer_idx] = current_pore_model.states[kmer_idx].level_mean > 1.0;
+        }
+
+        // Recalibrate the scaling parameters for each read and collect new training data
+        KmerTrainingData kmer_training_data(num_kmers_in_alphabet);
+        for(size_t read_idx = 0; read_idx < reads.size(); ++read_idx) {
+
+            SquiggleRead* read = reads[read_idx];
+
+            // Apply new model to the read
+            read->pore_model[training_strand] = current_pore_model;
+
+            // generate alignment
+            assert(read->read_type != SRT_2D);
+            std::vector<EventAlignment> alignment =
+                read->get_eventalignment_for_1d_basecalls(read->read_sequence,
+                                                          read->base_to_event_map,
+                                                          basecalled_k,
+                                                          training_strand);
+
+            // filter the alignment to only contain k-mers that have a distribution
+            std::vector<EventAlignment> filtered_alignment;
+            for(size_t i = 0; i < alignment.size(); ++i) {
+                size_t kmer_rank = gDNAAlphabet.kmer_rank(alignment[i].model_kmer.c_str(), basecalled_k);
+                if(trained_kmers[kmer_rank]) {
+                    filtered_alignment.push_back(alignment[i]);
+                }
+            }
+
+            // recalibrate shift/scale/etc using the filtered alignment
+            recalibrate_model(*read,
+                              training_strand,
+                              filtered_alignment,
+                              &gDNAAlphabet,
+                              false, true);
+
+            const PoreModel& read_model = read->pore_model[training_strand];
+            printf("[recalibration] read %zu events: %zu alignment: %zu shift: %.2lf scale: %.2lf drift: %.4lf var: %.2lf\n",
+                   read_idx,
+                   read->events[training_strand].size(),
+                   alignment.size(),
+                   read_model.shift,
+                   read_model.scale,
+                   read_model.drift,
+                   read_model.var);
+
+            // skip reads that aren't behaving well TODO: fix
+            if(read_model.scale < 0.9 || read_model.scale > 1.1) {
+                continue;
+            }
+
+            // collect kmer training data from this read
+            // (the unfiltered alignment is used here; no TSV output on this pass)
+            alignment_to_training_data(read,
+                                       alignment,
+                                       basecalled_k,
+                                       read_idx,
+                                       &kmer_training_data,
+                                       NULL);
+        }
+
+        // Write the training data as a tsv file
+        // (the file is reopened, and therefore overwritten, on every iteration)
+        std::ofstream training_data_tsv("training_data.tsv");
+        StateTrainingData::write_header(training_data_tsv);
+        std::string model_kmer(basecalled_k, 'A');
+        for(size_t kmer_idx = 0; kmer_idx < num_kmers_in_alphabet; kmer_idx++) {
+
+            for(size_t di = 0; di < kmer_training_data[kmer_idx].size(); di++) {
+                kmer_training_data[kmer_idx][di].write_tsv(training_data_tsv, "template.5mer", model_kmer);
+            }
+            gDNAAlphabet.lexicographic_next(model_kmer);
+        }
+
+        // Train new gaussians for each k-mer
+        model_kmer = std::string(basecalled_k, 'A');
+        PoreModel new_pore_model = current_pore_model;
+        for(size_t kmer_idx = 0; kmer_idx < num_kmers_in_alphabet; kmer_idx++) {
+
+            // we use the gaussian mixture machinery but only fit one component in the case
+            ParamMixture input_mixture;
+            fprintf(stderr, "training %s with %zu events\n", model_kmer.c_str(), kmer_training_data[kmer_idx].size());
+            // This is intentionally broad and doesn't matter in the one-component case
+            PoreModelStateParams initial_params;
+            initial_params.level_mean = 200;
+            initial_params.level_stdv = 50;
+            initial_params.update_logs();
+
+            input_mixture.log_weights.push_back(log(1.0));
+            input_mixture.params.push_back(initial_params);
+
+            ParamMixture trained_mixture = train_gaussian_mixture(kmer_training_data[kmer_idx], input_mixture);
+            new_pore_model.states[kmer_idx] = trained_mixture.params[0];
+            // the fitted stdv is discarded in favour of a fixed value
+            new_pore_model.states[kmer_idx].level_stdv = 1.5;
+            gDNAAlphabet.lexicographic_next(model_kmer);
+        }
+        new_pore_model.bake_gaussian_parameters();
+        current_pore_model = new_pore_model;
+    }
+    current_pore_model.write("r9.template.5mer.base.model", "r9.template.5mer.base.model");
+
+    // Deallocate input reads
+    for(auto* read : reads) {
+        delete read;
+    }
+
+    return 0;
+}
diff --git a/src/nanopolish_train_poremodel_from_basecalls.h b/src/nanopolish_train_poremodel_from_basecalls.h
new file mode 100644
index 0000000..263daa3
--- /dev/null
+++ b/src/nanopolish_train_poremodel_from_basecalls.h
@@ -0,0 +1,15 @@
+//---------------------------------------------------------
+// Copyright 2016 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+//---------------------------------------------------------
+//
+// nanopolish_train_poremodel_from_basecalls - train a
+// new pore model from the FAST5 output of a basecaller
+//
+#ifndef NANOPOLISH_TRAIN_POREMODEL_FROM_BASECALLS_H
+#define NANOPOLISH_TRAIN_POREMODEL_FROM_BASECALLS_H
+
+// Entry point of the training subcommand. argc/argv are handed to the
+// subcommand's own option parser; the definition returns 0 once the
+// trained model has been written.
+int train_poremodel_from_basecalls_main(int argc, char** argv);
+
+#endif
+
diff --git a/src/test/nanopolish_test.cpp b/src/test/nanopolish_test.cpp
index f414b39..4902d82 100644
--- a/src/test/nanopolish_test.cpp
+++ b/src/test/nanopolish_test.cpp
@@ -8,25 +8,27 @@
#define CATCH_CONFIG_MAIN
#include <stdio.h>
#include <string>
+#include <array>
+#include <vector>
+#include <random>
+
#include "logsum.h"
#include "catch.hpp"
#include "nanopolish_common.h"
#include "nanopolish_alphabet.h"
#include "nanopolish_emissions.h"
#include "nanopolish_profile_hmm.h"
-
-// This code needs to be run before any of the program logic
-// It sets up pre-computed values and caches
-void initialize()
-{
- p7_FLogsumInit();
-}
+#include "training_core.hpp"
+#include "invgauss.hpp"
+#include "logger.hpp"
TEST_CASE( "alphabet", "[alphabet]" ) {
// DNA alphabet
DNAAlphabet dna_alphabet;
MethylCpGAlphabet mc_alphabet;
+ MethylDamAlphabet dam_alphabet;
+ MethylDcmAlphabet dcm_alphabet;
REQUIRE( dna_alphabet.rank('A') == 0 );
REQUIRE( dna_alphabet.rank('C') == 1 );
@@ -69,14 +71,167 @@ TEST_CASE( "alphabet", "[alphabet]" ) {
}
REQUIRE(kmer == "TTT");
- // Test the special reverse complement model
- // for the CpG alphabet
+ // Test the methylate function in the CpG alphabet
+ REQUIRE( mc_alphabet.methylate("C") == "C");
+ REQUIRE( mc_alphabet.methylate("G") == "G");
+ REQUIRE( mc_alphabet.methylate("CG") == "MG");
+ REQUIRE( mc_alphabet.methylate("GC") == "GC");
+ REQUIRE( mc_alphabet.methylate("CGCG") == "MGMG");
+ REQUIRE( mc_alphabet.methylate("AAGCGT") == "AAGMGT");
+ REQUIRE( mc_alphabet.methylate("CGGCGT") == "MGGMGT");
+ REQUIRE( mc_alphabet.methylate("CGCGC") == "MGMGC");
+
+ // unmethylate
+ REQUIRE( mc_alphabet.unmethylate("C") == "C");
+ REQUIRE( mc_alphabet.unmethylate("CG") == "CG");
+ REQUIRE( mc_alphabet.unmethylate("M") == "C");
+ REQUIRE( mc_alphabet.unmethylate("MG") == "CG");
+ REQUIRE( mc_alphabet.unmethylate("MT") == "MT");
+
+ // disambiguate
+ REQUIRE( mc_alphabet.disambiguate("") == "");
+ REQUIRE( mc_alphabet.disambiguate("M") == "M");
+ REQUIRE( mc_alphabet.disambiguate("MT") == "AT");
+ REQUIRE( mc_alphabet.disambiguate("MG") == "MG");
+ REQUIRE( mc_alphabet.disambiguate("AMG") == "AMG");
+ REQUIRE( mc_alphabet.disambiguate("CAM") == "CAM");
+
+ // reverse complement
REQUIRE( mc_alphabet.reverse_complement("M") == "G");
REQUIRE( mc_alphabet.reverse_complement("C") == "G");
+ REQUIRE( mc_alphabet.reverse_complement("G") == "C");
REQUIRE( mc_alphabet.reverse_complement("MG") == "MG");
+ REQUIRE( mc_alphabet.reverse_complement("CG") == "CG");
REQUIRE( mc_alphabet.reverse_complement("AM") == "GT");
REQUIRE( mc_alphabet.reverse_complement("AMG") == "MGT");
+ REQUIRE( mc_alphabet.reverse_complement("AAAMG") == "MGTTT");
+ REQUIRE( mc_alphabet.reverse_complement("MGMG") == "MGMG");
+ REQUIRE( mc_alphabet.reverse_complement("MGAMG") == "MGTMG");
REQUIRE( mc_alphabet.reverse_complement("GTACATG") == dna_alphabet.reverse_complement("GTACATG"));
+
+ // Dam methylation tests
+
+ // methylate
+ REQUIRE( dam_alphabet.methylate("") == "");
+ REQUIRE( dam_alphabet.methylate("G") == "G");
+ REQUIRE( dam_alphabet.methylate("GA") == "GA");
+ REQUIRE( dam_alphabet.methylate("GAT") == "GAT");
+ REQUIRE( dam_alphabet.methylate("GATC") == "GMTC");
+ REQUIRE( dam_alphabet.methylate("GATCG") == "GMTCG");
+ REQUIRE( dam_alphabet.methylate("GATCGA") == "GMTCGA");
+ REQUIRE( dam_alphabet.methylate("GATCGAT") == "GMTCGAT");
+ REQUIRE( dam_alphabet.methylate("GATCGATC") == "GMTCGMTC");
+ REQUIRE( dam_alphabet.methylate("GMTCGATC") == "GMTCGMTC");
+ REQUIRE( dam_alphabet.methylate("GMTCGMTC") == "GMTCGMTC");
+
+ // unmethylate
+ REQUIRE( dam_alphabet.unmethylate("M") == "A");
+ REQUIRE( dam_alphabet.unmethylate("MT") == "AT");
+ REQUIRE( dam_alphabet.unmethylate("MTC") == "ATC");
+ REQUIRE( dam_alphabet.unmethylate("GM") == "GA");
+ REQUIRE( dam_alphabet.unmethylate("GMT") == "GAT");
+ REQUIRE( dam_alphabet.unmethylate("GMTC") == "GATC");
+ REQUIRE( dam_alphabet.unmethylate("GMTCG") == "GATCG");
+ REQUIRE( dam_alphabet.unmethylate("GMTCGM") == "GATCGA");
+ REQUIRE( dam_alphabet.unmethylate("GMTCGMTC") == "GATCGATC");
+ REQUIRE( dam_alphabet.unmethylate("GMTCGMT") == "GATCGAT");
+ REQUIRE( dam_alphabet.unmethylate("GMTCGM") == "GATCGA");
+ REQUIRE( dam_alphabet.unmethylate("MA") == "MA");
+ REQUIRE( dam_alphabet.unmethylate("MT") == "AT");
+ REQUIRE( dam_alphabet.unmethylate("GM") == "GA");
+ REQUIRE( dam_alphabet.unmethylate("CM") == "CM");
+
+ // disambiguate
+ REQUIRE( dam_alphabet.disambiguate("") == "");
+ REQUIRE( dam_alphabet.disambiguate("GMTC") == "GMTC");
+ REQUIRE( dam_alphabet.disambiguate("M") == "M");
+ REQUIRE( dam_alphabet.disambiguate("MT") == "MT");
+ REQUIRE( dam_alphabet.disambiguate("MTC") == "MTC");
+ REQUIRE( dam_alphabet.disambiguate("GM") == "GM");
+ REQUIRE( dam_alphabet.disambiguate("GMT") == "GMT");
+ REQUIRE( dam_alphabet.disambiguate("GMA") == "GAA");
+
+ // reverse complement
+ REQUIRE( dam_alphabet.reverse_complement("") == "");
+ REQUIRE( dam_alphabet.reverse_complement("M") == "T");
+ REQUIRE( dam_alphabet.reverse_complement("G") == "C");
+ REQUIRE( dam_alphabet.reverse_complement("GM") == "TC");
+ REQUIRE( dam_alphabet.reverse_complement("GMT") == "MTC");
+ REQUIRE( dam_alphabet.reverse_complement("GMTC") == "GMTC");
+ REQUIRE( dam_alphabet.reverse_complement("MTC") == "GMT");
+ REQUIRE( dam_alphabet.reverse_complement("TC") == "GA");
+ REQUIRE( dam_alphabet.reverse_complement("C") == "G");
+ REQUIRE( dam_alphabet.reverse_complement("GATC") == "GATC");
+ REQUIRE( dam_alphabet.reverse_complement("ATC") == "GAT");
+ REQUIRE( dam_alphabet.reverse_complement("TC") == "GA");
+ REQUIRE( dam_alphabet.reverse_complement("GAT") == "ATC");
+
+ //
+ // Dcm methylation tests
+ //
+
+ // methylate
+ REQUIRE( dcm_alphabet.methylate("") == "");
+ REQUIRE( dcm_alphabet.methylate("C") == "C");
+ REQUIRE( dcm_alphabet.methylate("CC") == "CC");
+
+ // first recognition site
+ REQUIRE( dcm_alphabet.methylate("CCA") == "CCA");
+ REQUIRE( dcm_alphabet.methylate("CCAG") == "CCAG");
+ REQUIRE( dcm_alphabet.methylate("CCAGG") == "CMAGG");
+ REQUIRE( dcm_alphabet.methylate("CAGG") == "CAGG");
+ REQUIRE( dcm_alphabet.methylate("AGG") == "AGG");
+
+ // second recognition site
+ REQUIRE( dcm_alphabet.methylate("CCT") == "CCT");
+ REQUIRE( dcm_alphabet.methylate("CCTG") == "CCTG");
+ REQUIRE( dcm_alphabet.methylate("CCTGG") == "CMTGG");
+ REQUIRE( dcm_alphabet.methylate("CTGG") == "CTGG");
+ REQUIRE( dcm_alphabet.methylate("TGG") == "TGG");
+
+ // both recognition sites
+ REQUIRE( dcm_alphabet.methylate("CCAGGCCTGG") == "CMAGGCMTGG");
+ REQUIRE( dcm_alphabet.methylate("CCAGGCCTG") == "CMAGGCCTG");
+
+ // unmethylate
+ REQUIRE( dcm_alphabet.unmethylate("M") == "C");
+ REQUIRE( dcm_alphabet.unmethylate("MA") == "CA");
+ REQUIRE( dcm_alphabet.unmethylate("MT") == "CT");
+ REQUIRE( dcm_alphabet.unmethylate("MAG") == "CAG");
+ REQUIRE( dcm_alphabet.unmethylate("MTG") == "CTG");
+ REQUIRE( dcm_alphabet.unmethylate("MAGG") == "CAGG");
+ REQUIRE( dcm_alphabet.unmethylate("MTGG") == "CTGG");
+
+ REQUIRE( dcm_alphabet.unmethylate("CM") == "CC");
+ REQUIRE( dcm_alphabet.unmethylate("GM") == "GM");
+ REQUIRE( dcm_alphabet.unmethylate("MC") == "MC");
+
+ // disambiguate
+ REQUIRE( dcm_alphabet.disambiguate("") == "");
+ REQUIRE( dcm_alphabet.disambiguate("M") == "M");
+ REQUIRE( dcm_alphabet.disambiguate("CM") == "CM");
+ REQUIRE( dcm_alphabet.disambiguate("GM") == "GA");
+ REQUIRE( dcm_alphabet.disambiguate("MA") == "MA");
+ REQUIRE( dcm_alphabet.disambiguate("MT") == "MT");
+ REQUIRE( dcm_alphabet.disambiguate("MC") == "AC");
+
+ // reverse complement
+ REQUIRE( dcm_alphabet.reverse_complement("") == "");
+ REQUIRE( dcm_alphabet.reverse_complement("M") == "G");
+ REQUIRE( dcm_alphabet.reverse_complement("MT") == "AG");
+ REQUIRE( dcm_alphabet.reverse_complement("MTG") == "MAG");
+ REQUIRE( dcm_alphabet.reverse_complement("MTGG") == "CMAG");
+
+ REQUIRE( dcm_alphabet.reverse_complement("MA") == "TG");
+ REQUIRE( dcm_alphabet.reverse_complement("MAG") == "MTG");
+ REQUIRE( dcm_alphabet.reverse_complement("MAGG") == "CMTG");
+
+ REQUIRE( dcm_alphabet.reverse_complement("CM") == "GG");
+ REQUIRE( dcm_alphabet.reverse_complement("CCAGG") == "CCTGG");
+ REQUIRE( dcm_alphabet.reverse_complement("CCTGG") == "CCAGG");
+ REQUIRE( dcm_alphabet.reverse_complement("CMAGG") == "CMTGG");
+ REQUIRE( dcm_alphabet.reverse_complement("CMTGG") == "CMAGG");
+
}
TEST_CASE( "string functions", "[string_functions]" ) {
@@ -98,6 +253,13 @@ TEST_CASE( "string functions", "[string_functions]" ) {
// complement, reverse complement
REQUIRE( dna_alphabet.reverse_complement("GATGA") == "TCATC" );
+
+ // suffix functions
+ REQUIRE( ends_with("abcd", "cd") );
+ REQUIRE( ! ends_with("abcd", "bc") );
+ REQUIRE( ! ends_with("abcd", "e") );
+ REQUIRE( ends_with("abcd", "d") );
+ REQUIRE( ends_with("abcd", "") );
}
TEST_CASE( "math", "[math]") {
@@ -119,61 +281,193 @@ std::string event_alignment_to_string(const std::vector<HMMAlignmentState>& alig
return out;
}
-TEST_CASE( "hmm", "[hmm]") {
+// Draw n_data synthetic training events from a mixture of pore model states.
+//
+// mixture           generating mixture; log_weights and params must have the
+//                   same number of entries
+// n_data            number of events to generate
+// read_var_rg       [low, high) range the per-event read_var is drawn from
+// read_scale_sd_rg  [low, high) range the per-event read_scale_sd is drawn from
+// read_var_sd_rg    [low, high) range the per-event read_var_sd is drawn from
+//
+// For every event a component is picked according to the mixture weights, the
+// three per-read factors are drawn uniformly, the component is scaled by them,
+// and then level_mean is drawn from a normal distribution and level_stdv from
+// an inverse gaussian distribution. The log_* fields are filled in alongside.
+// The generator is seeded from std::rand().
+std::vector< StateTrainingData >
+generate_training_data(const ParamMixture& mixture, size_t n_data,
+                       const std::array< float, 2 >& read_var_rg = { .5f, 1.5f },
+                       const std::array< float, 2 >& read_scale_sd_rg = { .5f, 1.5f },
+                       const std::array< float, 2 >& read_var_sd_rg = { .5f, 1.5f })
+{
+    // check parameter sizes
+    size_t n_components = mixture.log_weights.size();
+    assert(mixture.params.size() == n_components);
+    assert(read_var_rg[0] < read_var_rg[1]);
+    assert(read_scale_sd_rg[0] < read_scale_sd_rg[1]);
+    assert(read_var_sd_rg[0] < read_var_sd_rg[1]);
+    // set random seed
+    //seed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+    // catch takes care of managing the random seed
+    std::mt19937 rg(std::rand());
+    typedef std::discrete_distribution< size_t > discrete_dist;
+    typedef std::uniform_real_distribution< float > uniform_dist;
+    typedef std::normal_distribution< float > normal_dist;
+    typedef inverse_gaussian_distribution< float > inverse_gaussian_dist;
+    // generate data
+    std::vector< float > weights(n_components);
+    for (size_t j = 0; j < n_components; ++j)
+    {
+        weights[j] = std::exp(mixture.log_weights[j]);
+        LOG("gen_data", debug) << "weights " << j << " "
+                               << std::fixed << std::setprecision(2) << weights[j] << " ("
+                               << mixture.log_weights[j] << ")" << std::endl;
+    }
+    // These distributions have the same parameters for every event, so they
+    // are built once here instead of once per generated datum.
+    discrete_dist population_dist(weights.begin(), weights.end());
+    uniform_dist read_var_dist(read_var_rg[0], read_var_rg[1]);
+    uniform_dist read_scale_sd_dist(read_scale_sd_rg[0], read_scale_sd_rg[1]);
+    uniform_dist read_var_sd_dist(read_var_sd_rg[0], read_var_sd_rg[1]);
+    std::vector< float > level_mean_sum(n_components, 0.0);
+    std::vector< float > sd_mean_sum(n_components, 0.0);
+    std::vector< size_t > population_size(n_components, 0);
+    std::vector< StateTrainingData > data(n_data);
+    for (size_t i = 0; i < n_data; ++i)
+    {
+        // draw population
+        size_t j = population_dist(rg);
+        // validate the index before it is used to touch the per-component vectors
+        assert(j < n_components);
+        ++population_size[j];
+        // draw read_var
+        data[i].read_var = read_var_dist(rg);
+        data[i].log_read_var = std::log(data[i].read_var);
+        // draw read_scale_sd
+        data[i].read_scale_sd = read_scale_sd_dist(rg);
+        data[i].log_read_scale_sd = std::log(data[i].read_scale_sd);
+        // draw read_var_sd
+        data[i].read_var_sd = read_var_sd_dist(rg);
+        data[i].log_read_var_sd = std::log(data[i].read_var_sd);
+        // scale the state
+        auto scaled_params = mixture.params[j];
+        scaled_params.level_stdv *= data[i].read_var;
+        scaled_params.level_log_stdv += data[i].log_read_var;
+        scaled_params.sd_lambda *= data[i].read_var_sd / data[i].read_scale_sd;
+        scaled_params.sd_log_lambda += data[i].log_read_var_sd - data[i].log_read_scale_sd;
+        // draw level_mean & level_stdv
+        // (these depend on the per-event scaled parameters, so they are built per event)
+        data[i].level_mean = normal_dist(scaled_params.level_mean, scaled_params.level_stdv)(rg);
+        data[i].log_level_mean = std::log(data[i].level_mean);
+        data[i].level_stdv = inverse_gaussian_dist(scaled_params.sd_mean, scaled_params.sd_lambda)(rg);
+        data[i].log_level_stdv = std::log(data[i].level_stdv);
+        level_mean_sum[j] += data[i].level_mean;
+        sd_mean_sum[j] += data[i].level_stdv;
+        LOG("gen_data", debug1)
+            << "data " << i << " " << j << " "
+            << data[i].level_mean << " "
+            << data[i].level_stdv << " "
+            << data[i].read_var << " "
+            << data[i].read_scale_sd << " "
+            << data[i].read_var_sd << std::endl;
+    }
+    // report the empirical per-component averages of the generated values
+    for (size_t j = 0; j < n_components; ++j)
+    {
+        LOG("gen_data", debug)
+            << "population " << j << " "
+            << std::fixed << std::setprecision(3) << level_mean_sum[j] / population_size[j] << " "
+            << sd_mean_sum[j] / population_size[j] << std::endl;
+    }
+    return data;
+}
- // read the FAST5
- SquiggleRead sr("test_read", "test/data/LomanLabz_PC_Ecoli_K12_R7.3_2549_1_ch8_file30_strand.fast5");
- sr.transform();
+// Exercises train_gaussian_mixture and train_invgaussian_mixture on synthetic
+// data from generate_training_data. Each section generates data from a
+// two-component mixture, starts training from a mixture whose first weight is
+// um_rate, and checks the trained values against the generating ones with
+// Approx(...).epsilon(.05).
+TEST_CASE("training", "[training]")
+{
+ const unsigned n_data = 1000;
+ const float um_rate = .1;
+ // baseline state parameters; every generating and initial component starts as a copy of these
+ PoreModelStateParams um_params;
+ um_params.level_mean = 65.0;
+ um_params.level_stdv = 1.0;
+ um_params.sd_mean = 0.8;
+ um_params.sd_lambda = 7.0;
+ um_params.update_sd_stdv();
+ um_params.update_logs();
+ //Logger::set_default_level(level_wrapper::debug);
- // The reference sequence to align to:
- std::string ref_subseq = "ATCAGTAAAATAACGTAGAGCGGTAACCTTGCCATAAAGGTCGAGTTTA"
- "TTACCATCCTTGTTATAGACTTCGGCAGCGTGTGCTACGTTCGCAGCT";
+ // first, we test gaussian training only
+ SECTION("gaussian")
+ {
+ float delta_um_rate = .05;
+ float delta_level_mean = 10.0;
+ // generating mixture: first weight is um_rate + delta_um_rate, second component's level_mean is shifted
+ ParamMixture gen_mixture;
+ gen_mixture.log_weights.push_back(std::log(um_rate + delta_um_rate));
+ gen_mixture.log_weights.push_back(std::log(1 - (um_rate + delta_um_rate)));
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params[1].level_mean += delta_level_mean;
+ auto data = generate_training_data(gen_mixture, n_data);
+ // initial mixture handed to the trainer: both components start at the baseline level_mean
+ ParamMixture in_mixture;
+ in_mixture.log_weights.push_back(std::log(um_rate));
+ in_mixture.log_weights.push_back(std::log(1 - um_rate));
+ in_mixture.params.push_back(um_params);
+ in_mixture.params.push_back(um_params);
+ // encourage the second component to capture points not well fit by the first
+ in_mixture.params[1].level_stdv += 1.0;
+ in_mixture.params[1].update_logs();
+ auto out_mixture = train_gaussian_mixture(data, in_mixture);
+ CHECK( std::exp(out_mixture.log_weights[0]) == Approx( um_rate + delta_um_rate ).epsilon(.05) );
+ CHECK( out_mixture.params[0].level_mean == Approx( um_params.level_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[1].level_mean == Approx( um_params.level_mean + delta_level_mean ).epsilon(.05) );
+ }
- // Generate a HMMInputData structure to tell the HMM
- // which part of the read to align
- HMMInputData input[2];
-
- // template strand
- input[0].read = &sr;
- input[0].event_start_idx = 3;
- input[0].event_stop_idx = 88;
- input[0].event_stride = 1;
- input[0].rc = false;
- input[0].strand = 0;
-
- // complement strand
- input[1].read = &sr;
- input[1].event_start_idx = 6788;
- input[1].event_stop_idx = 6697;
- input[1].event_stride = -1;
- input[1].rc = true;
- input[1].strand = 1;
-
- // expected output
- std::string expected_alignment[2];
- expected_alignment[0] =
- "MMMMMEMKMKMMMMMMMKMMMKMMMKMMMMMMMMMKKMMEEEMMMMMMKMMMM"
- "MMMKMMMMMKMKMKMEMKKMKMKKMMMMMMEMMMMKMKMEEMMMMKMEEEEEM";
-
- expected_alignment[1] =
- "MMKMMMKMEEMMKMKMKMEMMMKMMMKMEMMMKMMMKMMMMMMMMMKKMEMMMM"
- "EMMMMMMMMKMKKMMMMMMMEMMMMMKMMMMMKMEMMMMMKMMMMMEEEEEEEEM";
-
- double expected_viterbi_last_state[2] = { -237.7690734863, -266.2348022461 };
- double expected_forward[2] = { -221.1331481934, -262.7491455078 };
-
- for(int si = 0; si <= 1; ++si) {
-
- // viterbi align
- std::vector<HMMAlignmentState> event_alignment = profile_hmm_align(ref_subseq, input[si]);
- std::string ea_str = event_alignment_to_string(event_alignment);
-
- // check
- REQUIRE( ea_str == expected_alignment[si]);
- REQUIRE( event_alignment.back().l_fm == Approx(expected_viterbi_last_state[si]));
+ // next, we test inverse gaussian training for the case where the gaussians are distinct
+ SECTION("inverse_gaussian_1")
+ {
+ float delta_um_rate = .05;
+ float delta_level_mean = 10.0;
+ float delta_sd_mean = 0.3;
+ // generating mixture: the second component differs in both level_mean and sd_mean
+ ParamMixture gen_mixture;
+ gen_mixture.log_weights.push_back(std::log(um_rate + delta_um_rate));
+ gen_mixture.log_weights.push_back(std::log(1 - (um_rate + delta_um_rate)));
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params[1].level_mean += delta_level_mean;
+ gen_mixture.params[1].sd_mean += delta_sd_mean;
+ gen_mixture.params[1].update_sd_stdv();
+ gen_mixture.params[1].update_logs();
+ auto data = generate_training_data(gen_mixture, n_data);
+ ParamMixture in_mixture;
+ in_mixture.log_weights.push_back(std::log(um_rate));
+ in_mixture.log_weights.push_back(std::log(1 - um_rate));
+ in_mixture.params.push_back(um_params);
+ in_mixture.params.push_back(um_params);
+ // encourage the second component to capture points not well fit by the first
+ in_mixture.params[1].level_stdv += 1.0;
+ in_mixture.params[1].update_logs();
+ // the gaussian fit is checked first, then used as the starting point of the inverse gaussian fit
+ auto mid_mixture = train_gaussian_mixture(data, in_mixture);
+ CHECK( std::exp(mid_mixture.log_weights[0]) == Approx( um_rate + delta_um_rate ).epsilon(.05) );
+ CHECK( mid_mixture.params[0].level_mean == Approx( um_params.level_mean ).epsilon(.05) );
+ CHECK( mid_mixture.params[1].level_mean == Approx( um_params.level_mean + delta_level_mean ).epsilon(.05) );
+ auto out_mixture = train_invgaussian_mixture(data, mid_mixture);
+ CHECK( std::exp(out_mixture.log_weights[0]) == Approx( um_rate + delta_um_rate ).epsilon(.05) );
+ CHECK( out_mixture.params[0].level_mean == Approx( um_params.level_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[1].level_mean == Approx( um_params.level_mean + delta_level_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[0].sd_mean == Approx( um_params.sd_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[1].sd_mean == Approx( um_params.sd_mean + delta_sd_mean ).epsilon(.05) );
+ }
- // forward algorithm
- double lp = profile_hmm_score(ref_subseq, input[si]);
- REQUIRE(lp == Approx(expected_forward[si]));
+ // next, we test inverse gaussian training for the case where the gaussians are very similar
+ SECTION("inverse_gaussian_2")
+ {
+ float delta_um_rate = .05;
+ float delta_level_mean = 2.0;
+ float delta_sd_mean = 0.3;
+ // same construction as inverse_gaussian_1, but the level_mean shift is only 2.0
+ ParamMixture gen_mixture;
+ gen_mixture.log_weights.push_back(std::log(um_rate + delta_um_rate));
+ gen_mixture.log_weights.push_back(std::log(1 - (um_rate + delta_um_rate)));
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params.push_back(um_params);
+ gen_mixture.params[1].level_mean += delta_level_mean;
+ gen_mixture.params[1].sd_mean += delta_sd_mean;
+ gen_mixture.params[1].update_sd_stdv();
+ gen_mixture.params[1].update_logs();
+ auto data = generate_training_data(gen_mixture, n_data);
+ ParamMixture in_mixture;
+ in_mixture.log_weights.push_back(std::log(um_rate));
+ in_mixture.log_weights.push_back(std::log(1 - um_rate));
+ in_mixture.params.push_back(um_params);
+ in_mixture.params.push_back(um_params);
+ // encourage the second component to capture points not well fit by the first
+ in_mixture.params[1].level_stdv += 1.0;
+ in_mixture.params[1].update_logs();
+ auto mid_mixture = train_gaussian_mixture(data, in_mixture);
+ CHECK( std::exp(mid_mixture.log_weights[0]) == Approx( um_rate + delta_um_rate ).epsilon(.05) );
+ CHECK( mid_mixture.params[0].level_mean == Approx( um_params.level_mean ).epsilon(.05) );
+ CHECK( mid_mixture.params[1].level_mean == Approx( um_params.level_mean + delta_level_mean ).epsilon(.05) );
+ auto out_mixture = train_invgaussian_mixture(data, mid_mixture);
+ CHECK( std::exp(out_mixture.log_weights[0]) == Approx( um_rate + delta_um_rate ).epsilon(.05) );
+ CHECK( out_mixture.params[0].level_mean == Approx( um_params.level_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[1].level_mean == Approx( um_params.level_mean + delta_level_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[0].sd_mean == Approx( um_params.sd_mean ).epsilon(.05) );
+ CHECK( out_mixture.params[1].sd_mean == Approx( um_params.sd_mean + delta_sd_mean ).epsilon(.05) );
 }
 }
diff --git a/src/training_core.cpp b/src/training_core.cpp
new file mode 100644
index 0000000..426d44d
--- /dev/null
+++ b/src/training_core.cpp
@@ -0,0 +1,265 @@
+#include "training_core.hpp"
+#include "nanopolish_emissions.h"
+#include "logsumset.hpp"
+#include "logger.hpp"
+
+using std::string;
+using std::vector;
+using std::multiset;
+using std::endl;
+
+// Compile-time switch: true when USE_MULTISET_LOGSUM is defined, false
+// otherwise. It is passed to the logsumset accumulators constructed by the
+// training routines below.
+const bool use_multiset_logsum =
+#ifndef USE_MULTISET_LOGSUM
+ false;
+#else
+ true;
+#endif
+
+// Fit a mixture of gaussians over event level means with EM.
+//
+// data           training events; level_mean, log_level_mean, read_var and
+//                log_read_var are read from every entry
+// input_mixture  starting log-weights and state parameters; must hold exactly
+//                one log-weight per component
+//
+// Returns the mixture after 10 EM iterations. Only log_weights, level_mean,
+// level_stdv and level_log_stdv are re-estimated; every other field of the
+// component parameters is carried over from input_mixture. All sums are
+// accumulated in log space through logsumset.
+//
+// NOTE(review): when data is empty, log_n_data is -inf and the returned
+// parameters depend on what logsumset yields for an empty set - confirm
+// before relying on the output for empty input.
+ParamMixture train_gaussian_mixture(const vector< StateTrainingData >& data, const ParamMixture& input_mixture)
+{
+    size_t n_components = input_mixture.params.size();
+    size_t n_data = data.size();
+    float log_n_data = std::log(n_data);
+    assert(input_mixture.log_weights.size() == n_components);
+    ParamMixture curr_mixture = input_mixture;
+
+    for(size_t iteration = 0; iteration < 10; ++iteration) {
+        ParamMixture new_mixture = curr_mixture;
+
+        // compute log_pdfs
+        //
+        // pdf[i][j] := gauss(mu_j, sigma_j * read_var_i, level_mean_i)
+        //
+        vector< vector< float > > log_pdf(n_data);
+        for(size_t i = 0; i < n_data; ++i) {
+            log_pdf[i].resize(n_components);
+            for(size_t j = 0; j < n_components; ++j) {
+                // We need to scale the mixture component parameters by the per-read var factor
+                PoreModelStateParams scaled_state = curr_mixture.params[j];
+                scaled_state.level_stdv *= data[i].read_var;
+                scaled_state.level_log_stdv += data[i].log_read_var;
+                log_pdf[i][j] = log_normal_pdf(data[i].level_mean, scaled_state);
+                assert(not std::isnan(log_pdf[i][j]));
+                LOG("training_core", debug1)
+                    << "pdf " << i << " " << j << " "
+                    << std::scientific << std::exp(log_pdf[i][j]) << " ("
+                    << std::fixed << std::setprecision(2) << log_pdf[i][j] << ")" << endl;
+            }
+        }
+
+        // compute responsibilities
+        //
+        // resp[i][j] := ( w_j * pdf[i][j] ) / sum_k ( w_k * pdf[i][k] )
+        //
+        vector< vector< float > > log_resp(n_data);
+        for(size_t i = 0; i < n_data; ++i) {
+            log_resp[i].resize(n_components);
+            logsumset< float > denom_terms(use_multiset_logsum);
+            for(size_t j = 0; j < n_components; ++j) {
+                float v = curr_mixture.log_weights[j] + log_pdf[i][j];
+                log_resp[i][j] = v;
+                denom_terms.add(v);
+                LOG("training_core", debug1)
+                    << "resp_numer " << i << " " << j << " "
+                    << std::scientific << std::exp(v) << " ("
+                    << std::fixed << std::setprecision(2) << v << ")" << endl;
+            }
+            float log_denom = denom_terms.val();
+            LOG("training_core", debug1) << "resp_denom " << i << " "
+                << std::scientific << log_denom << " " << std::exp(log_denom) << std::endl;
+            // normalize in log space so the responsibilities of datum i sum to one
+            for(size_t j = 0; j < n_components; ++j) {
+                log_resp[i][j] -= log_denom;
+                LOG("training_core", debug1)
+                    << "resp " << i << " " << j << " "
+                    << std::fixed << std::setprecision(5) << std::exp(log_resp[i][j]) << " ("
+                    << std::fixed << std::setprecision(2) << log_resp[i][j] << ")" << endl;
+            }
+        }
+
+        // update weights
+        //
+        // w'[j] := sum_i resp[i][j] / n_data
+        //
+        for (size_t j = 0; j < n_components; ++j) {
+            logsumset< float > numer_terms(use_multiset_logsum);
+            for (size_t i = 0; i < n_data; ++i) {
+                numer_terms.add(log_resp[i][j]);
+            }
+            float log_numer = numer_terms.val();
+            new_mixture.log_weights[j] = log_numer - log_n_data;
+        }
+
+        // update means
+        //
+        // mu_j := sum_i ( resp[i][j] * level_mean_i ) / sum_i resp[i][j]
+        //       = sum_i ( resp[i][j] * level_mean_i ) / ( w'[j] * n_data )
+        //
+        // sized by the number of components (not a fixed 2) so that mixtures
+        // with more than two components do not write out of bounds
+        vector< float > new_log_mean(n_components);
+        for (size_t j = 0; j < n_components; ++j) {
+            logsumset< float > numer_terms(use_multiset_logsum);
+            for (size_t i = 0; i < n_data; ++i) {
+                numer_terms.add(log_resp[i][j] + data[i].log_level_mean);
+            }
+            float log_numer = numer_terms.val();
+            new_log_mean[j] = log_numer - (log_n_data + new_mixture.log_weights[j]);
+        }
+
+        // update stdvs
+        //
+        // var_j := sum_i ( resp[i][j] * ( ( level_mean_i - mu_j ) / read_var_i )^2 ) / sum_i resp[i][j]
+        //        = sum_i ( resp[i][j] * ( ( level_mean_i - mu_j ) / read_var_i )^2 ) / ( w'[j] * n_data )
+        //
+        vector< float > new_log_var(n_components);
+        for (size_t j = 0; j < n_components; ++j) {
+            logsumset< float > numer_terms(use_multiset_logsum);
+            for (size_t i = 0; i < n_data; ++i) {
+                float v = std::abs(data[i].level_mean - std::exp(new_log_mean[j]));
+                // a zero or NaN deviation contributes log(1) = 0 instead of log(0)
+                numer_terms.add(log_resp[i][j] + (not std::isnan(v) and v > 0? 2.0 * (std::log(v) - data[i].log_read_var) : 0.0));
+            }
+            float log_numer = numer_terms.val();
+            new_log_var[j] = log_numer - (log_n_data + new_mixture.log_weights[j]);
+        }
+
+        // convert the log-space estimates back into state parameters
+        for(size_t j = 0; j < n_components; ++j) {
+            new_mixture.params[j].level_mean = std::exp(new_log_mean[j]);
+            new_mixture.params[j].level_log_stdv = .5 * new_log_var[j];
+            new_mixture.params[j].level_stdv = std::exp(new_mixture.params[j].level_log_stdv);
+            LOG("training_core", debug)
+                << "new_mixture " << iteration << " " << j << " "
+                << std::fixed << std::setprecision(5) << std::exp(new_mixture.log_weights[j]) << " "
+                << std::setprecision(3) << new_mixture.params[j].level_mean << " "
+                << new_mixture.params[j].level_stdv << endl;
+        }
+
+        curr_mixture = new_mixture;
+    }
+    return curr_mixture;
+}
+
+ParamMixture train_invgaussian_mixture(const vector< StateTrainingData >& data, const ParamMixture& in_mixture)
+{ // Re-estimates each component's sd_mean from data over 10 fixed iterations; log_weights are returned unchanged.
+ size_t n_components = in_mixture.params.size();
+ assert(in_mixture.log_weights.size() == n_components);
+ size_t n_data = data.size();
+ auto crt_mixture = in_mixture; // working estimate, refined on every iteration; in_mixture is only read
+
+ for (size_t j = 0; j < n_components; ++j) {
+ LOG("training_core", debug)
+ << "in_mixture " << j << " "
+ << std::fixed << std::setprecision(5) << std::exp(in_mixture.log_weights[j]) << " "
+ << std::setprecision(5) << in_mixture.params[j].sd_mean << endl;
+ }
+
+ // Compute the log gaussian pdf of every data point under every input component.
+ // These depend only on in_mixture, so they are evaluated once, before the iteration loop.
+ // log_pdf[i][j].first = log of gauss(mu_j, sigma_j * read_var_i, level_mean_i)
+ // (log_pdf[i][j].second is filled in later, inside the iteration loop)
+ vector< vector< std::pair< float, float > > > log_pdf(n_data);
+ for (size_t i = 0; i < n_data; ++i) {
+ log_pdf[i].resize(n_components);
+ for (size_t j = 0; j < n_components; ++j) {
+ PoreModelStateParams scaled_state = in_mixture.params[j];
+ scaled_state.level_stdv *= data[i].read_var;
+ scaled_state.level_log_stdv += data[i].log_read_var;
+ log_pdf[i][j].first = log_normal_pdf(data[i].level_mean, scaled_state);
+ assert(not std::isnan(log_pdf[i][j].first));
+ LOG("training_core", debug1)
+ << "gauss_pdf " << i << " " << j << " "
+ << std::scientific << std::exp(log_pdf[i][j].first) << " ("
+ << std::fixed << std::setprecision(2) << log_pdf[i][j].first << ")" << endl;
+ }
+ }
+
+ // Compute the gaussian weights, normalised per data point:
+ //
+ // g_weights[i][j] := ( w_j * pdf[i][j].first ) / sum_k ( w_k * pdf[i][k].first )
+ // Stored as logs in log_g_weights; the sum over k is accumulated with logsumset.
+ vector< vector< float > > log_g_weights(n_data);
+ for (size_t i = 0; i < n_data; ++i) {
+ log_g_weights[i].resize(n_components);
+ logsumset< float > denom_terms(use_multiset_logsum);
+ for (size_t j = 0; j < n_components; ++j) {
+ float v = in_mixture.log_weights[j] + log_pdf[i][j].first;
+ log_g_weights[i][j] = v;
+ denom_terms.add(v);
+ }
+ float log_denom = denom_terms.val();
+ for (size_t j = 0; j < n_components; ++j) {
+ log_g_weights[i][j] -= log_denom;
+ LOG("training_core", debug1)
+ << "g_weights " << i << " " << j << " "
+ << std::fixed << std::setprecision(5) << std::exp(log_g_weights[i][j]) << " ("
+ << std::fixed << std::setprecision(2) << log_g_weights[i][j] << ")" << endl;
+ }
+ }
+
+ for (size_t iteration = 0; iteration < 10; ++iteration) { // fixed iteration count; there is no convergence test
+ // Compute the log inverse gaussian pdfs under the current estimate (crt_mixture):
+ //
+ // pdf[i][j].second = invgauss(eta_j, lambda_j * ( read_var_sd_i / read_scale_sd_i ), level_stdv_i)
+ // sd_lambda and sd_log_lambda are scaled together so the linear and log values stay in step.
+ for (size_t i = 0; i < n_data; ++i) {
+ for (size_t j = 0; j < n_components; ++j) {
+ PoreModelStateParams scaled_state = crt_mixture.params[j];
+ scaled_state.sd_lambda *= data[i].read_var_sd / data[i].read_scale_sd;
+ scaled_state.sd_log_lambda += data[i].log_read_var_sd - data[i].log_read_scale_sd;
+ log_pdf[i][j].second = log_invgauss_pdf(data[i].level_stdv, data[i].log_level_stdv, scaled_state);
+ assert(not std::isnan(log_pdf[i][j].second));
+ LOG("training_core", debug1)
+ << "invgauss_pdf " << i << " " << j << " "
+ << std::scientific << std::exp(log_pdf[i][j].second) << " ("
+ << std::fixed << std::setprecision(2) << log_pdf[i][j].second << ")" << endl;
+ }
+ }
+ // Compute the inverse gaussian weights (responsibilities), normalised per data point:
+ //
+ // ig_weights[i][j] := ( g_weights[i][j] * pdf[i][j].second ) / sum_k ( g_weights[i][k] * pdf[i][k].second )
+ // Stored as logs in log_ig_weights, which is rebuilt on every iteration.
+ vector< vector< float > > log_ig_weights(n_data);
+ for (size_t i = 0; i < n_data; ++i) {
+ log_ig_weights[i].resize(n_components);
+ logsumset< float > denom_terms(use_multiset_logsum);
+ for (size_t j = 0; j < n_components; ++j) {
+ float v = log_g_weights[i][j] + log_pdf[i][j].second;
+ log_ig_weights[i][j] = v;
+ denom_terms.add(v);
+ }
+ float log_denom = denom_terms.val();
+ for (size_t j = 0; j < n_components; ++j) {
+ log_ig_weights[i][j] -= log_denom;
+ LOG("training_core", debug1)
+ << "ig_weights " << i << " " << j << " "
+ << std::fixed << std::setprecision(5) << std::exp(log_ig_weights[i][j]) << " ("
+ << std::fixed << std::setprecision(2) << log_ig_weights[i][j] << ")" << endl;
+ }
+ }
+
+ // Update eta (stored as sd_mean), with both sums accumulated in log space:
+ //
+ // eta_j := sum_i ( ig_weights[i][j] * lambda'_ij * level_stdv_i ) / sum_i ( ig_weights[i][j] * lambda'_ij )
+ // lambda'_ij := lambda_j * ( read_var_sd_i / read_scale_sd_i )
+ // Only sd_mean is assigned directly; update_sd_stdv() and update_logs() are then called on the component.
+ auto new_mixture = crt_mixture;
+ for (size_t j = 0; j < n_components; ++j) {
+ logsumset< float > numer_terms(use_multiset_logsum);
+ logsumset< float > denom_terms(use_multiset_logsum);
+ for (size_t i = 0; i < n_data; ++i) {
+ float v = log_ig_weights[i][j] + crt_mixture.params[j].sd_log_lambda + (data[i].log_read_var_sd - data[i].log_read_scale_sd);
+ numer_terms.add(v + data[i].log_level_stdv);
+ denom_terms.add(v);
+ }
+ float log_numer = numer_terms.val();
+ float log_denom = denom_terms.val();
+ new_mixture.params[j].sd_mean = std::exp(log_numer - log_denom);
+ new_mixture.params[j].update_sd_stdv();
+ new_mixture.params[j].update_logs();
+ LOG("training_core", debug)
+ << "new_mixture " << iteration << " " << j << " "
+ << std::fixed << std::setprecision(5) << std::exp(new_mixture.log_weights[j]) << " "
+ << std::setprecision(5) << new_mixture.params[j].sd_mean << endl;
+ }
+ std::swap(crt_mixture, new_mixture); // the new estimate becomes the current one for the next pass
+ } // for iteration
+
+ return crt_mixture;
+} // train_invgaussian_mixture
diff --git a/src/training_core.hpp b/src/training_core.hpp
new file mode 100644
index 0000000..9d9b293
--- /dev/null
+++ b/src/training_core.hpp
@@ -0,0 +1,176 @@
+#ifndef __TRAINING_CORE_HPP
+#define __TRAINING_CORE_HPP
+
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <vector>
+
+#include "nanopolish_eventalign.h"
+#include "nanopolish_squiggle_read.h"
+
+// The state training data comes in two different
+// sizes, Full and Minimal. The model training functions
+// only actually need the Minimal data, but for exploration
+// the Full data is useful, so it is left as an option.
+
+struct MinimalStateTrainingData
+{
+ // Per-event observation used for model training; every value is stored next to its natural log.
+ // Functions
+ // NOTE(review): the defaulted constructor does not initialize the float members (no in-class initializers).
+ MinimalStateTrainingData() = default;
+ MinimalStateTrainingData(const SquiggleRead& sr,
+ const EventAlignment& ea,
+ uint32_t,
+ const std::string&,
+ const std::string&)
+ {
+ // take the scaled level mean/stdv of this event and the strand's var, scale_sd and var_sd; the three unnamed arguments are ignored
+ this->level_mean = sr.get_fully_scaled_level(ea.event_idx, ea.strand_idx);
+ this->log_level_mean = std::log(this->level_mean);
+ this->level_stdv = sr.get_scaled_stdv(ea.event_idx, ea.strand_idx);
+ this->log_level_stdv = std::log(this->level_stdv);
+ this->read_var = sr.pore_model[ea.strand_idx].var;
+ this->log_read_var = std::log(this->read_var);
+ this->read_scale_sd = sr.pore_model[ea.strand_idx].scale_sd;
+ this->log_read_scale_sd = std::log(this->read_scale_sd);
+ this->read_var_sd = sr.pore_model[ea.strand_idx].var_sd;
+ this->log_read_var_sd = std::log(this->read_var_sd);
+ }
+
+ MinimalStateTrainingData(double level_mean,
+ double level_stdv,
+ double read_var)
+ {
+ // store the caller-supplied values as given (no scaling is applied here) and cache their logs
+ this->level_mean = level_mean;
+ this->log_level_mean = std::log(this->level_mean);
+ this->level_stdv = level_stdv;
+ this->log_level_stdv = std::log(this->level_stdv);
+ this->read_var = read_var;
+ this->log_read_var = std::log(this->read_var);
+ this->read_scale_sd = 1.0; // placeholder, not taken from a read (its log is 0)
+ this->log_read_scale_sd = std::log(this->read_scale_sd);
+ this->read_var_sd = 1.0; // placeholder, not taken from a read (its log is 0)
+ this->log_read_var_sd = std::log(this->read_var_sd);
+ }
+
+ static void write_header(std::ostream& os) // header row followed by std::endl (newline + flush)
+ {
+ write_header_nonl(os);
+ os << std::endl;
+ }
+ static void write_header_nonl(std::ostream& os) // tab-separated column names, no trailing newline
+ {
+ os << "model\tmodel_kmer\tlevel_mean\tlevel_stdv\tread_var\tread_scale_sd\tread_var_sd";
+ }
+
+ void write_tsv(std::ostream& os, const std::string& model_name, const std::string& kmer) const // one data row followed by std::endl
+ {
+ write_tsv_nonl(os, model_name, kmer);
+ os << std::endl;
+ }
+
+ void write_tsv_nonl(std::ostream& os, const std::string& model_name, const std::string& kmer) const // same columns as the header, no newline; the log_* members are not written
+ {
+ os << model_name << '\t'
+ << kmer << '\t'
+ << std::fixed << std::setprecision(2) << level_mean << '\t' // NOTE(review): std::fixed and precision 2 stay set on os after this call
+ << level_stdv << '\t'
+ << read_var << '\t'
+ << read_scale_sd << '\t'
+ << read_var_sd;
+ }
+
+ //
+ // Data
+ // Each log_* member holds std::log of the member declared just above it (set by the constructors).
+ float level_mean;
+ float log_level_mean;
+ float level_stdv;
+ float log_level_stdv;
+ float read_var;
+ float log_read_var;
+ float read_scale_sd;
+ float log_read_scale_sd;
+ float read_var_sd;
+ float log_read_var_sd;
+}; // struct MinimalStateTrainingData
+
+struct FullStateTrainingData
+ : public MinimalStateTrainingData
+{
+ // Minimal data plus per-event context: duration, reference position/strand, z and the flanking k-mers.
+ // Functions
+ // NOTE(review): the write_* functions below hide the base-class ones of the same name; none of them is virtual.
+ FullStateTrainingData() = default;
+ FullStateTrainingData(const SquiggleRead& sr,
+ const EventAlignment& ea,
+ uint32_t rank,
+ const std::string& prev_kmer,
+ const std::string& next_kmer)
+ : MinimalStateTrainingData(sr, ea, rank, prev_kmer, next_kmer)
+ {
+ this->duration = sr.events[ea.strand_idx][ea.event_idx].duration;
+ this->ref_position = ea.ref_position;
+ this->ref_strand = ea.rc;
+ GaussianParameters model = sr.pore_model[ea.strand_idx].get_scaled_parameters(rank); // scaled model parameters looked up by rank
+ this->z = (sr.get_drift_corrected_level(ea.event_idx, ea.strand_idx) - model.mean ) / model.stdv;
+ this->prev_kmer = prev_kmer;
+ this->next_kmer = next_kmer;
+ }
+
+ static void write_header(std::ostream& os) // header row followed by std::endl (newline + flush)
+ {
+ write_header_nonl(os);
+ os << std::endl;
+ }
+ static void write_header_nonl(std::ostream& os) // base-class columns, then the six extra column names
+ {
+ MinimalStateTrainingData::write_header_nonl(os);
+ os << "\tduration\tref_pos\tref_strand\tz\tprev_kmer\tnext_kmer";
+ }
+
+ void write_tsv(std::ostream& os, const std::string& model_name, const std::string& kmer) const // one data row followed by std::endl
+ {
+ write_tsv_nonl(os, model_name, kmer);
+ os << std::endl;
+ }
+ void write_tsv_nonl(std::ostream& os, const std::string& model_name, const std::string& kmer) const // base-class columns, then the extra fields, no newline
+ {
+ MinimalStateTrainingData::write_tsv_nonl(os, model_name, kmer);
+ os << '\t' << duration << '\t' // NOTE(review): the base-class call above left os in std::fixed with precision 2, which applies to duration and z
+ << ref_position << '\t'
+ << ref_strand << '\t'
+ << z << '\t'
+ << prev_kmer << '\t'
+ << next_kmer;
+ }
+
+ // ref_strand is filled from ea.rc; z is (drift-corrected level - model.mean) / model.stdv.
+ // Data
+ // NOTE(review): the defaulted constructor leaves duration, ref_position, ref_strand and z uninitialized.
+ float duration;
+ int ref_position;
+ int ref_strand;
+ float z;
+ std::string prev_kmer;
+ std::string next_kmer;
+}; // struct FullStateTrainingData
+
+typedef MinimalStateTrainingData StateTrainingData; // record type taken by the training functions; the commented-out line below selects the Full variant instead
+//typedef FullStateTrainingData StateTrainingData;
+
+struct ParamMixture
+{
+ std::vector< float > log_weights; // log of each component's mixture weight
+ std::vector< PoreModelStateParams > params; // per-component parameters; train_invgaussian_mixture asserts this has the same size as log_weights
+}; // struct ParamMixture
+
+// training functions: each takes the training data and a starting mixture, and returns the re-estimated mixture by value
+ParamMixture train_gaussian_mixture (const std::vector< StateTrainingData >& data, const ParamMixture& input_mixture);
+ParamMixture train_invgaussian_mixture(const std::vector< StateTrainingData >& data, const ParamMixture& input_mixture);
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/nanopolish.git
More information about the debian-med-commit
mailing list