[med-svn] [last-align] 01/06: New upstream version 809
Andreas Tille
tille at debian.org
Wed Nov 30 16:43:45 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository last-align.
commit fe2831f574fda25dad1cda3b6da2a834d8bbd9d5
Author: Andreas Tille <tille at debian.org>
Date: Wed Nov 30 16:02:08 2016 +0100
New upstream version 809
---
ChangeLog.txt | 225 ++++++++++-
doc/last-split.html | 8 +
doc/last-split.txt | 8 +
doc/last-train.html | 18 +-
doc/last-train.txt | 21 +-
doc/maf-convert.html | 5 +
doc/maf-convert.txt | 5 +
scripts/last-postmask | 17 +-
scripts/last-train | 166 ++++++--
scripts/maf-convert | 867 ++++++++++++++++++++++------------------
src/LastalArguments.cc | 2 +
src/split/cbrc_split_aligner.cc | 558 ++++++++++++++------------
src/split/cbrc_split_aligner.hh | 62 +--
src/split/last-split-main.cc | 20 +-
src/split/last-split.cc | 11 +-
src/split/last-split.hh | 2 +
src/stringify.hh | 24 +-
src/version.hh | 2 +-
18 files changed, 1286 insertions(+), 735 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 00416a1..c286315 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,8 +1,231 @@
+2016-11-24 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Made last-split a bit faster.
+ [604434e30b9b] [tip]
+
+ * src/split/cbrc_split_aligner.cc:
+ Enabled last-split -g to read multi-volume genomes.
+ [39d60e63649c]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [e390c8a6b953]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [a651f673eccd]
+
+ * src/split/cbrc_split_aligner.cc:
+ Refactoring.
+ [3cf4563e1e06]
+
+ * src/split/cbrc_split_aligner.cc:
+ Made last-split -g check for alignment coordinates beyond ends of
+ sequences.
+ [271148f2fd22]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh,
+ test/last-test.sh:
+ Refactoring.
+ [3bcd06103556]
+
+2016-11-22 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-split.txt, src/split/cbrc_split_aligner.cc,
+ src/split/cbrc_split_aligner.hh, src/split/last-split-main.cc,
+ src/split/last-split.cc, src/split/last-split.hh, src/stringify.hh,
+ test/last-split-test.out:
+ Added last-split -b option, to skip troublesome query sequences.
+ [99208596e2df]
+
+2016-11-15 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc:
+ Made last-split's spliced alignment a bit faster.
+ [7a7418e343ec]
+
+ * src/split/cbrc_split_aligner.hh, src/split/last-split.cc:
+ Put memory info in last-split verbose output.
+ [a9b828b51430]
+
+ * test/maf-convert-test.out, test/maf-convert-test.sh:
+ Added tests for maf-convert.
+ [b9fb391b111f]
+
+ * doc/maf-convert.txt, scripts/maf-convert:
+ Added maf-convert -j option!
+ [e80789dbe5a9]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [60a355a2060c]
+
+2016-11-11 Martin C. Frith <Martin C. Frith>
+
+ * scripts/maf-convert:
+ Refactoring.
+ [640ec6e760a4]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [2b817a03775e]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [2e56e3884ce1]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [3a530469efd4]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [a6f7e46b53e8]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [4b680ca58d11]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [ed5ab3b3b096]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [49af9e116654]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [90cdd16c533d]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [5a56c7267682]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [5718d170c521]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [5e862e6f4593]
+
+ * scripts/maf-convert:
+ Minor modifications.
+ [1b0f793a2df3]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [fd3de758e253]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [17d5f147e3e8]
+
+ * scripts/maf-convert:
+ Minor refactoring.
+ [9137dbc02893]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [b6a9b0636a43]
+
+ * scripts/maf-convert:
+ Made maf-convert a bit faster.
+ [6c8368485b73]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [21080ba28e06]
+
+2016-11-10 Martin C. Frith <Martin C. Frith>
+
+ * scripts/maf-convert:
+ Refactoring.
+ [e289a5a60161]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [08dff31a448d]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [a6e60fafb4c7]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [3e274b1d6e63]
+
+ * scripts/maf-convert:
+ Refactoring.
+ [3fdccdc68124]
+
+ * src/LastalArguments.cc:
+ Reduced memory use of lastal with multi-threads.
+ [3d55bdf04914]
+
+ * doc/last-train.txt, src/split/cbrc_split_aligner.cc, test/102.maf,
+ test/last-split-test.out, test/last-split-test.sh:
+ Reverted an unintended minor change in last-split output.
+ [41a1f195a956]
+
+2016-11-04 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [ebfe8b16e512]
+
+ * scripts/last-train:
+ Reduced last-train's default sample size to 1 million.
+ [3fa8121533e6]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Made last-split's spliced alignment a bit faster, maybe.
+ [9e43ca208315]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [15807aae76c1]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [a08e93976d22]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactoring.
+ [5a2487a536ff]
+
+ * doc/last-train.txt, scripts/last-train,
+ src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Documented rows/columns of last-train matrices.
+ [ef684382d11c]
+
+2016-11-01 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-train:
+ Made last-train print matrices more prettily.
+ [ea0e36883c48]
+
+ * doc/last-train.txt, scripts/last-train:
+ Improved last-train doc.
+ [567a4a35cd4b]
+
+2016-10-25 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-train.txt, scripts/last-train:
+ Made last-train use a random sample of the query sequences.
+ [ae037299752f]
+
+ * scripts/last-postmask:
+ Made last-postmask a bit faster.
+ [5e99b6e235e0]
+
2016-09-29 Martin C. Frith <Martin C. Frith>
* test/last-test.out, test/last-test.sh:
Added (incomplete) tests for gapless overlap alignment.
- [19b58d988059] [tip]
+ [19b58d988059]
* src/LastalArguments.cc, src/gaplessPssmXdrop.cc,
src/gaplessPssmXdrop.hh, src/gaplessTwoQualityXdrop.cc,
diff --git a/doc/last-split.html b/doc/last-split.html
index c54d670..32bad5c 100644
--- a/doc/last-split.html
+++ b/doc/last-split.html
@@ -623,6 +623,14 @@ above INT will not necessarily get high mismap probabilities.</p>
probabilities. Note that the mismap and score limits still
apply.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-b</span>, <span class="option">--bytes=<var>B</var></span></kbd></td>
+<td>Skip any query sequence that would require more than B bytes
+of memory to process. (This only limits the size of some
+core data-structures: the total memory use will be greater.)
+A warning is written for each skipped sequence. You can use
+suffixes such as K (KibiBytes), M (MebiBytes), G (GibiBytes),
+T (TebiBytes), e.g. -b20G.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-v</span>, <span class="option">--verbose</span></kbd></td>
<td>Show progress information on the screen.</td></tr>
<tr><td class="option-group">
diff --git a/doc/last-split.txt b/doc/last-split.txt
index 609f3ad..0bda7ff 100644
--- a/doc/last-split.txt
+++ b/doc/last-split.txt
@@ -231,6 +231,14 @@ Options
probabilities. Note that the mismap and score limits still
apply.
+ -b, --bytes=B
+ Skip any query sequence that would require more than B bytes
+ of memory to process. (This only limits the size of some
+ core data-structures: the total memory use will be greater.)
+ A warning is written for each skipped sequence. You can use
+ suffixes such as K (KibiBytes), M (MebiBytes), G (GibiBytes),
+ T (TebiBytes), e.g. -b20G.
+
-v, --verbose
Show progress information on the screen.
diff --git a/doc/last-train.html b/doc/last-train.html
index 9e0202a..a8aa00f 100644
--- a/doc/last-train.html
+++ b/doc/last-train.html
@@ -330,8 +330,8 @@ lastdb mydb reference.fasta
last-train mydb queries.fasta
</pre>
<p>last-train prints a summary of each alignment step, followed by the
-final score parameters in a format that can be read by lastal's -p
-option.</p>
+final score parameters in a format that can be read by <a class="reference external" href="lastal.html#score-options">lastal's -p
+option</a>.</p>
<div class="section" id="options">
<h2>Options</h2>
<blockquote>
@@ -369,9 +369,15 @@ e.g. score(A→G) = score(G→A).</td></tr>
<td>Ignore alignments with > PID% identity. This aims to
optimize the parameters for low-similarity alignments,
similarly to the BLOSUM matrices.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--sample=<var>N</var></span></kbd></td>
+<td>Use only the first N query letters, after randomizing the
+order of sequences. 0 means use all query letters.</td></tr>
</tbody>
</table>
</blockquote>
+<p>All options below this point are passed to lastal to do the
+alignments: they are described in more detail at <a class="reference external" href="lastal.html">lastal.html</a>.</p>
</div>
<div class="section" id="initial-parameter-options">
<h3>Initial parameter options</h3>
@@ -414,10 +420,10 @@ similarly to the BLOSUM matrices.</td></tr>
<tbody valign="top">
<tr><td class="option-group">
<kbd><span class="option">-D <var>LENGTH</var></span></kbd></td>
-<td>Query letters per random alignment.</td></tr>
+<td>Query letters per random alignment. (See <a class="reference external" href="last-evalues.html">here</a>.)</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-E <var>EG2</var></span></kbd></td>
-<td>Maximum expected alignments per square giga.</td></tr>
+<td>Maximum expected alignments per square giga. (See <a class="reference external" href="last-evalues.html">here</a>.)</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-s <var>NUMBER</var></span></kbd></td>
<td>Which query strand to use: 0=reverse, 1=forward, 2=both.</td></tr>
@@ -454,7 +460,9 @@ distribution. If they do not (which is often the case), the results
may be poor.</p>
</li>
<li><p class="first">last-train can fail for various reasons, e.g. if the sequences are
-too dissimilar.</p>
+too dissimilar. If it fails to find any alignments, you could try
+<a class="reference external" href="last-tutorial.html#example-6-find-very-short-dna-alignments">reducing the alignment significance threshold</a> with
+option -D.</p>
</li>
</ul>
</div>
diff --git a/doc/last-train.txt b/doc/last-train.txt
index 97cf0f9..8ee7885 100644
--- a/doc/last-train.txt
+++ b/doc/last-train.txt
@@ -15,8 +15,8 @@ The usage is like this::
last-train mydb queries.fasta
last-train prints a summary of each alignment step, followed by the
-final score parameters in a format that can be read by lastal's -p
-option.
+final score parameters in a format that can be read by `lastal's -p
+option <lastal.html#score-options>`_.
Options
-------
@@ -40,6 +40,12 @@ Training options
Ignore alignments with > PID% identity. This aims to
optimize the parameters for low-similarity alignments,
similarly to the BLOSUM matrices.
+ --sample=N
+ Use only the first N query letters, after randomizing the
+ order of sequences. 0 means use all query letters.
+
+All options below this point are passed to lastal to do the
+alignments: they are described in more detail at `<lastal.html>`_.
Initial parameter options
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -55,8 +61,10 @@ Initial parameter options
Alignment options
~~~~~~~~~~~~~~~~~
- -D LENGTH Query letters per random alignment.
- -E EG2 Maximum expected alignments per square giga.
+ -D LENGTH Query letters per random alignment. (See `here
+ <last-evalues.html>`_.)
+ -E EG2 Maximum expected alignments per square giga. (See `here
+ <last-evalues.html>`_.)
-s NUMBER Which query strand to use: 0=reverse, 1=forward, 2=both.
-S NUMBER Score matrix applies to forward strand of: 0=reference,
1=query. This matters only if the scores lack
@@ -77,4 +85,7 @@ Bugs
may be poor.
* last-train can fail for various reasons, e.g. if the sequences are
- too dissimilar.
+ too dissimilar. If it fails to find any alignments, you could try
+ `reducing the alignment significance threshold
+ <last-tutorial.html#example-6-find-very-short-dna-alignments>`_ with
+ option -D.
diff --git a/doc/maf-convert.html b/doc/maf-convert.html
index 7590a60..a98040e 100644
--- a/doc/maf-convert.html
+++ b/doc/maf-convert.html
@@ -351,6 +351,11 @@ starting with 'p', run lastal with option -j set to 4 or higher.)</p>
nucleotides. This affects psl format only (the first 4
columns).</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-j</span>, <span class="option">--join=<var>N</var></span></kbd></td>
+<td>Join neighboring alignments if they are co-linear and
+separated by at most N letters. This affects psl format
+only.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-n</span>, <span class="option">--noheader</span></kbd></td>
<td>Omit any header lines from the output. This may be useful if
you concatenate outputs, e.g. from parallel jobs.</td></tr>
diff --git a/doc/maf-convert.txt b/doc/maf-convert.txt
index a4ffcd9..f78acee 100644
--- a/doc/maf-convert.txt
+++ b/doc/maf-convert.txt
@@ -32,6 +32,11 @@ Options
nucleotides. This affects psl format only (the first 4
columns).
+ -j, --join=N
+ Join neighboring alignments if they are co-linear and
+ separated by at most N letters. This affects psl format
+ only.
+
-n, --noheader
Omit any header lines from the output. This may be useful if
you concatenate outputs, e.g. from parallel jobs.
diff --git a/scripts/last-postmask b/scripts/last-postmask
index 750ee8e..6e7b9d4 100755
--- a/scripts/last-postmask
+++ b/scripts/last-postmask
@@ -12,7 +12,7 @@
# Limitations: doesn't (yet) handle sequence quality data,
# frameshifts, or generalized affine gaps.
-import fileinput, itertools, optparse, os, signal, sys
+import itertools, optparse, os, signal, sys
def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
defaultScore = min(map(min, matrix))
@@ -56,12 +56,12 @@ def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
print line,
print
-def lastPostmask(args):
+def doOneFile(file):
scoreMatrix = []
maf = []
seqs = []
- for line in fileinput.input(args):
+ for line in file:
if line[0] == "#":
print line,
w = line.split()
@@ -90,6 +90,17 @@ def lastPostmask(args):
if line[0] == "s": seqs.append(line.split()[6])
if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
+def lastPostmask(args):
+ if args:
+ for i in args:
+ if i == "-":
+ doOneFile(sys.stdin)
+ else:
+ with open(i) as file:
+ doOneFile(file)
+ else:
+ doOneFile(sys.stdin)
+
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
diff --git a/scripts/last-train b/scripts/last-train
index a96532a..de39072 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -1,10 +1,71 @@
#! /usr/bin/env python
# Copyright 2015 Martin C. Frith
-import fileinput, math, optparse, os, signal, subprocess, sys
+import math, optparse, os, random, signal, subprocess, sys, tempfile
+
+def writeWords(outFile, words):
+ outFile.write(" ".join(words) + "\n")
+
+def seqInput(fileNames):
+ for name in fileNames:
+ with open(name) as file:
+ seqType = 0
+ for line in file:
+ if seqType == 0:
+ if line[0] == ">":
+ seqType = 1
+ seq = []
+ elif line[0] == "@":
+ seqType = 2
+ lineType = 1
+ elif seqType == 1: # fasta
+ if line[0] == ">":
+ yield "".join(seq), ""
+ seq = []
+ else:
+ seq.append(line.rstrip())
+ elif seqType == 2: # fastq
+ if lineType == 1:
+ seq = line.rstrip()
+ elif lineType == 3:
+ yield seq, line.rstrip()
+ lineType = (lineType + 1) % 4
+ if seqType == 1: yield "".join(seq), ""
+
+def randomNumbersAndLengths(lengths, sampleSize):
+ numbersAndLengths = list(enumerate(lengths))
+ random.shuffle(numbersAndLengths)
+ for i, x in numbersAndLengths:
+ if x < sampleSize:
+ yield i, x
+ sampleSize -= x
+ else:
+ yield i, sampleSize
+ break
-def tabSeparatedString(things):
- return "\t".join(map(str, things))
+def writeSeqSample(numbersAndLengths, seqInput, outfile):
+ j = 0
+ for i, x in enumerate(seqInput):
+ if j < len(numbersAndLengths) and numbersAndLengths[j][0] == i:
+ seqlen = numbersAndLengths[j][1]
+ seq, qual = x
+ if qual:
+ outfile.write("@" + str(i) + "\n")
+ outfile.write(seq[:seqlen])
+ outfile.write("\n+\n")
+ outfile.write(qual[:seqlen])
+ else:
+ outfile.write(">" + str(i) + "\n")
+ outfile.write(seq[:seqlen])
+ outfile.write("\n")
+ j += 1
+
+def getSeqSample(opts, queryFiles, outfile):
+ lengths = (len(i[0]) for i in seqInput(queryFiles))
+ sampleSize = int(round(opts.sample))
+ numbersAndLengths = sorted(randomNumbersAndLengths(lengths, sampleSize))
+ writeSeqSample(numbersAndLengths, seqInput(queryFiles), outfile)
+ print "# sampled sequences:", len(numbersAndLengths)
def scaleFromHeader(lines):
for line in lines:
@@ -70,6 +131,45 @@ def scoreFromProb(scale, prob):
def costFromProb(scale, prob):
return -scoreFromProb(scale, prob)
+def guessAlphabet(matrixSize):
+ if matrixSize == 4: return "ACGT"
+ if matrixSize == 20: return "ACDEFGHIKLMNPQRSTVWY"
+ raise Exception("can't handle unusual alphabets")
+
+def matrixWithLetters(matrix):
+ alphabet = guessAlphabet(len(matrix))
+ return [alphabet] + [[a] + i for a, i in zip(alphabet, matrix)]
+
+def writeMatrixHead(outFile, prefix, alphabet, formatString):
+ writeWords(outFile, [prefix + " "] + [formatString % k for k in alphabet])
+
+def writeMatrixBody(outFile, prefix, alphabet, matrix, formatString):
+ for i, j in zip(alphabet, matrix):
+ writeWords(outFile, [prefix + i] + [formatString % k for k in j])
+
+def writeCountMatrix(outFile, matrix, prefix):
+ alphabet = guessAlphabet(len(matrix))
+ writeMatrixHead(outFile, prefix, alphabet, "%-14s")
+ writeMatrixBody(outFile, prefix, alphabet, matrix, "%-14s")
+
+def writeProbMatrix(outFile, matrix, prefix):
+ alphabet = guessAlphabet(len(matrix))
+ writeMatrixHead(outFile, prefix, alphabet, "%-14s")
+ writeMatrixBody(outFile, prefix, alphabet, matrix, "%-14g")
+
+def writeScoreMatrix(outFile, matrix, prefix):
+ alphabet = guessAlphabet(len(matrix))
+ writeMatrixHead(outFile, prefix, alphabet, "%6s")
+ writeMatrixBody(outFile, prefix, alphabet, matrix, "%6s")
+
+def writeMatrixWithLetters(outFile, matrix, prefix):
+ head = matrix[0]
+ tail = matrix[1:]
+ left = [i[0] for i in tail]
+ body = [i[1:] for i in tail]
+ writeMatrixHead(outFile, prefix, head, "%6s")
+ writeMatrixBody(outFile, prefix, left, body, "%6s")
+
def matProbsFromCounts(counts, opts):
r = range(len(counts))
if opts.revsym: # add complement (reverse strand) substitutions
@@ -82,23 +182,15 @@ def matProbsFromCounts(counts, opts):
print "# substitution percent identity: %g" % (100 * identities / total)
print
- print "# count matrix:"
- for i in counts:
- print "#", tabSeparatedString(i)
+ print "# count matrix (query letters = columns, reference letters = rows):"
+ writeCountMatrix(sys.stdout, counts, "# ")
print
- print "# probability matrix:"
- for i in probs:
- print "#", tabSeparatedString("%g" % j for j in i)
+ print "# probability matrix (query letters = columns, reference letters = rows):"
+ writeProbMatrix(sys.stdout, probs, "# ")
print
return probs
-def printMatScores(scores):
- print "# score matrix:"
- for i in scores:
- print "#", tabSeparatedString(i)
- print
-
def gapProbsFromCounts(counts, opts):
matches, deletes, inserts, delOpens, insOpens, alignments = counts
if not alignments: raise Exception("no alignments")
@@ -168,23 +260,9 @@ def gapCostsFromProbs(scale, probs):
if insExtendCost == 0: insExtendCost = 1
return delExistCost, insExistCost, delExtendCost, insExtendCost
-def guessAlphabet(matrixSize):
- if matrixSize == 4: return "ACGT"
- if matrixSize == 20: return "ACDEFGHIKLMNPQRSTVWY"
- raise Exception("can't handle unusual alphabets")
-
-def matrixWithLetters(matrix):
- alphabet = guessAlphabet(len(matrix))
- return [alphabet] + [[a] + i for a, i in zip(alphabet, matrix)]
-
def writeLine(out, *things):
out.write(" ".join(map(str, things)) + "\n")
-def writeMatrixWithLetters(matrix, out):
- writeLine(out, " ", tabSeparatedString(matrix[0]))
- for i in matrix[1:]:
- writeLine(out, i[0], tabSeparatedString(i[1:]))
-
def writeGapCosts(gapCosts, out):
delExistCost, insExistCost, delExtendCost, insExtendCost = gapCosts
writeLine(out, "#last -a", delExistCost)
@@ -218,7 +296,7 @@ def fixedLastalArgs(opts):
if opts.Q: x.append("-Q" + opts.Q)
return x
-def lastTrain(opts, args):
+def doTraining(opts, args):
tryToMakeChildProgramsFindable()
scaleIncrease = 20 # while training, up-scale the scores by this amount
x = fixedLastalArgs(opts)
@@ -248,6 +326,7 @@ def lastTrain(opts, args):
while True:
print "#", " ".join(x)
print
+ sys.stdout.flush()
matCounts, gapCounts = countsFromLastOutput(q.stdout, opts)
gapProbs = gapProbsFromCounts(gapCounts, opts)
gapCosts = gapCostsFromProbs(internalScale, gapProbs)
@@ -258,7 +337,9 @@ def lastTrain(opts, args):
else:
matProbs = matProbsFromCounts(matCounts, opts)
matScores = matScoresFromProbs(internalScale, matProbs)
- printMatScores(matScores)
+ print "# score matrix (query letters = columns, reference letters = rows):"
+ writeScoreMatrix(sys.stdout, matScores, "# ")
+ print
parameters = gapCosts, matScores
if parameters in oldParameters: break
oldParameters.append(parameters)
@@ -268,7 +349,7 @@ def lastTrain(opts, args):
x += args
p = subprocess.Popen(x, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
writeGapCosts(gapCosts, p.stdin)
- writeMatrixWithLetters(internalMatrix, p.stdin)
+ writeMatrixWithLetters(p.stdin, internalMatrix, "")
p.stdin.close()
# in python2.6, the next line must come after p.stdin.close()
q = subprocess.Popen(y, stdin=p.stdout, stdout=subprocess.PIPE)
@@ -280,11 +361,26 @@ def lastTrain(opts, args):
if not opts.Q:
matScores = matScoresFromProbs(externalScale, matProbs)
externalMatrix = matrixWithLetters(matScores)
- writeMatrixWithLetters(externalMatrix, sys.stdout)
+ print "# score matrix (query letters = columns, reference letters = rows):"
+ writeMatrixWithLetters(sys.stdout, externalMatrix, "")
+
+def lastTrain(opts, args):
+ if opts.sample:
+ random.seed(math.pi)
+ refName = args[0]
+ queryFiles = args[1:]
+ try:
+ with tempfile.NamedTemporaryFile(delete=False) as f:
+ getSeqSample(opts, queryFiles, f)
+ doTraining(opts, [refName, f.name])
+ finally:
+ os.remove(f.name)
+ else:
+ doTraining(opts, args)
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
- usage = "%prog lastdb-name sequence-file(s)"
+ usage = "%prog [options] lastdb-name sequence-file(s)"
description = "Try to find suitable score parameters for aligning the given sequences."
op = optparse.OptionParser(usage=usage, description=description)
og = optparse.OptionGroup(op, "Training options")
@@ -296,6 +392,8 @@ if __name__ == "__main__":
help="force insertion/deletion symmetry")
og.add_option("--pid", type="float", default=100, help=
"skip alignments with > PID% identity (default: %default)")
+ og.add_option("--sample", type="float", default=1000000, metavar="N",
+ help="use a random sample of N letters (default: %default)")
op.add_option_group(og)
og = optparse.OptionGroup(op, "Initial parameter options")
og.add_option("-r", metavar="SCORE",
diff --git a/scripts/maf-convert b/scripts/maf-convert
index 8a67408..7b1d92c 100755
--- a/scripts/maf-convert
+++ b/scripts/maf-convert
@@ -1,131 +1,177 @@
#! /usr/bin/env python
# Copyright 2010, 2011, 2013, 2014 Martin C. Frith
# Read MAF-format alignments: write them in other formats.
-# Seems to work with Python 2.x, x>=4
+# Seems to work with Python 2.x, x>=6
# By "MAF" we mean "multiple alignment format" described in the UCSC
# Genome FAQ, not e.g. "MIRA assembly format".
from itertools import *
-import sys, os, fileinput, math, operator, optparse, signal, string
+import math, optparse, os, signal, sys
def maxlen(s):
return max(map(len, s))
-def joined(things, delimiter):
- return delimiter.join(map(str, things))
-
-identityTranslation = string.maketrans("", "")
-def deleted(myString, deleteChars):
- return myString.translate(identityTranslation, deleteChars)
-
-def quantify(iterable, pred=bool):
- """Count how many times the predicate is true."""
- return sum(map(pred, iterable))
-
-class Maf:
- def __init__(self, aLine, sLines, qLines, pLines):
- self.namesAndValues = dict(i.split("=") for i in aLine[1:])
-
- if not sLines: raise Exception("empty alignment")
- cols = zip(*sLines)
- self.seqNames = cols[1]
- self.alnStarts = map(int, cols[2])
- self.alnSizes = map(int, cols[3])
- self.strands = cols[4]
- self.seqSizes = map(int, cols[5])
- self.alnStrings = cols[6]
-
- self.qLines = qLines
- self.pLines = pLines
-
-def dieUnlessPairwise(maf):
- if len(maf.alnStrings) != 2:
- raise Exception("pairwise alignments only, please")
-
-def insertionCounts(alnString):
- gaps = alnString.count("-")
- forwardFrameshifts = alnString.count("\\")
- reverseFrameshifts = alnString.count("/")
- letters = len(alnString) - gaps - forwardFrameshifts - reverseFrameshifts
- return letters, forwardFrameshifts, reverseFrameshifts
-
-def coordinatesPerLetter(alnString, alnSize):
- letters, forwardShifts, reverseShifts = insertionCounts(alnString)
- if forwardShifts or reverseShifts or letters < alnSize: return 3
- else: return 1
+def pairOrDie(sLines, formatName):
+ if len(sLines) != 2:
+ e = "for %s, each alignment must have 2 sequences" % formatName
+ raise Exception(e)
+ return sLines
-def mafLetterSizes(maf):
- return map(coordinatesPerLetter, maf.alnStrings, maf.alnSizes)
+def isMatch(alignmentColumn):
+ # No special treatment of ambiguous bases/residues: same as NCBI BLAST.
+ first = alignmentColumn[0].upper()
+ for i in alignmentColumn[1:]:
+ if i.upper() != first: return False
+ return True
-def alnLetterSizes(sLines):
- return [coordinatesPerLetter(i[6], int(i[3])) for i in sLines]
+def isGapless(alignmentColumn):
+ return "-" not in alignmentColumn
-def isTranslated(sLines):
- for i in sLines:
- alnString = i[6]
- if "\\" in alnString or "/" in alnString: return True
- if len(alnString) - alnString.count("-") < int(i[3]): return True
- return False
+def gapRunCount(row):
+ """Get the number of runs of gap characters."""
+ return sum(k == "-" for k, v in groupby(row))
-def insertionSize(alnString, letterSize):
- """Get the length of sequence included in the alnString."""
- letters, forwardShifts, reverseShifts = insertionCounts(alnString)
- return letters * letterSize + forwardShifts - reverseShifts
+def alignmentRowsFromColumns(columns):
+ return imap(''.join, izip(*columns))
def symbolSize(symbol, letterSize):
- if symbol == "-": return 0
if symbol == "\\": return 1
if symbol == "/": return -1
return letterSize
-def isMatch(alignmentColumn):
- # No special treatment of ambiguous bases/residues: same as NCBI BLAST.
- first = alignmentColumn[0].upper()
- for i in alignmentColumn[1:]:
- if i.upper() != first: return False
- return True
+def insertSize(row, letterSize):
+ """Get the length of sequence included in the row."""
+ return (len(row) - row.count("-")) * letterSize - 4 * row.count("/") - 2 * row.count("\\")
-def isGapless(alignmentColumn):
- return "-" not in alignmentColumn
+def insertSizeText(row, letterSize):
+ return str(insertSize(row, letterSize))
def matchAndInsertSizes(alignmentColumns, letterSizes):
"""Get sizes of gapless blocks, and of the inserts between them."""
for k, v in groupby(alignmentColumns, isGapless):
if k:
- sizeOfGaplessBlock = len(list(v))
- yield str(sizeOfGaplessBlock)
+ yield str(sum(1 for i in v))
else:
- blockRows = izip(*v)
- blockRows = imap(''.join, blockRows)
- insertSizes = imap(insertionSize, blockRows, letterSizes)
- yield joined(insertSizes, ":")
+ yield ":".join(imap(insertSizeText,
+ alignmentRowsFromColumns(v), letterSizes))
+
+##### Routines for reading MAF format: #####
+
+def updateEvalueParameters(opts, line):
+ for field in line.split():
+ try:
+ k, v = field.split("=")
+ x = float(v)
+ if k == "lambda":
+ opts.bitScoreA = x / math.log(2)
+ if k == "K":
+ opts.bitScoreB = math.log(x, 2)
+ except ValueError:
+ pass
+
+def scoreAndEvalue(aLine):
+ score = evalue = None
+ for i in aLine.split():
+ if i.startswith("score="):
+ score = i[6:]
+ elif i.startswith("E="):
+ evalue = i[2:]
+ return score, evalue
+
+def mafInput(opts, lines):
+ aLine = ""
+ sLines = []
+ qLines = []
+ pLines = []
+ for line in lines:
+ if line[0] == "s":
+ junk, seqName, beg, span, strand, seqLen, row = line.split()
+ beg = int(beg)
+ span = int(span)
+ seqLen = int(seqLen)
+ if "\\" in row or "/" in row or len(row) - row.count("-") < span:
+ letterSize = 3
+ else:
+ letterSize = 1
+ fields = seqName, seqLen, strand, letterSize, beg, beg + span, row
+ sLines.append(fields)
+ elif line[0] == "a":
+ aLine = line
+ elif line[0] == "q":
+ qLines.append(line)
+ elif line[0] == "p":
+ pLines.append(line)
+ elif line.isspace():
+ if sLines: yield aLine, sLines, qLines, pLines
+ aLine = ""
+ sLines = []
+ qLines = []
+ pLines = []
+ elif line[0] == "#":
+ updateEvalueParameters(opts, line)
+ if opts.isKeepComments:
+ print line,
+ if sLines: yield aLine, sLines, qLines, pLines
+
+def isJoinable(opts, oldMaf, newMaf):
+ x = oldMaf[1]
+ y = newMaf[1]
+ if x[-1][2] == "-":
+ x, y = y, x
+ return all(i[:4] == j[:4] and i[5] <= j[4] and j[4] - i[5] <= opts.join
+ for i, j in zip(x, y))
+
+def fixOrder(mafs):
+ sLines = mafs[0][1]
+ if sLines[-1][2] == "-":
+ mafs.reverse()
+
+def mafGroupInput(opts, lines):
+ x = []
+ for i in mafInput(opts, lines):
+ if x and not isJoinable(opts, x[-1], i):
+ fixOrder(x)
+ yield x
+ x = []
+ x.append(i)
+ if x:
+ fixOrder(x)
+ yield x
##### Routines for converting to AXT format: #####
axtCounter = count()
def writeAxt(maf):
- if maf.strands[0] != "+":
+ aLine, sLines, qLines, pLines = maf
+
+ if sLines[0][2] != "+":
raise Exception("for AXT, the 1st strand in each alignment must be +")
# Convert to AXT's 1-based coordinates:
- alnStarts = imap(operator.add, maf.alnStarts, repeat(1))
- alnEnds = imap(operator.add, maf.alnStarts, maf.alnSizes)
+ ranges = [(i[0], str(i[4] + 1), str(i[5]), i[2]) for i in sLines]
- rows = zip(maf.seqNames, alnStarts, alnEnds, maf.strands)
- head, tail = rows[0], rows[1:]
+ head, body = ranges[0], ranges[1:]
- outWords = []
- outWords.append(axtCounter.next())
+ outWords = [str(axtCounter.next())]
outWords.extend(head[:3])
- outWords.extend(chain(*tail))
- outWords.append(maf.namesAndValues["score"])
+ for i in body:
+ outWords.extend(i)
+
+ score, evalue = scoreAndEvalue(aLine)
+ if score:
+ outWords.append(score)
- print joined(outWords, " ")
- for i in maf.alnStrings: print i
+ print " ".join(outWords)
+ for i in sLines:
+ print i[6]
print # print a blank line at the end
+def mafConvertToAxt(opts, lines):
+ for maf in mafInput(opts, lines):
+ writeAxt(maf)
+
##### Routines for converting to tabular format: #####
def writeTab(maf):
@@ -133,117 +179,139 @@ def writeTab(maf):
score = "0"
endWords = []
- for i in aLine:
- if i.startswith("score="): score = i[6:]
- elif len(i) > 1: endWords.append(i)
-
- outWords = []
- outWords.append(score)
-
- for i in sLines: outWords.extend(i[1:6])
-
- alnStrings = [i[6] for i in sLines]
- alignmentColumns = zip(*alnStrings)
- letterSizes = alnLetterSizes(sLines)
- gapStrings = matchAndInsertSizes(alignmentColumns, letterSizes)
- gapWord = ",".join(gapStrings)
+ for i in aLine.split():
+ if i.startswith("score="):
+ score = i[6:]
+ elif len(i) > 1:
+ endWords.append(i)
+
+ outWords = [score]
+
+ for seqName, seqLen, strand, letterSize, beg, end, row in sLines:
+ x = seqName, str(beg), str(end - beg), strand, str(seqLen)
+ outWords.extend(x)
+
+ letterSizes = [i[3] for i in sLines]
+ rows = [i[6] for i in sLines]
+ alignmentColumns = izip(*rows)
+ gapWord = ",".join(matchAndInsertSizes(alignmentColumns, letterSizes))
outWords.append(gapWord)
print "\t".join(outWords + endWords)
+def mafConvertToTab(opts, lines):
+ for maf in mafInput(opts, lines):
+ writeTab(maf)
+
##### Routines for converting to PSL format: #####
-def pslBlocks(alignmentColumns, alnStarts, letterSizes):
+def pslBlocks(opts, mafs, outCounts):
"""Get sizes and start coordinates of gapless blocks in an alignment."""
- start1, start2 = alnStarts
- letterSize1, letterSize2 = letterSizes
- size = 0
- for x, y in alignmentColumns:
- if x == "-":
- if size:
- yield size, start1, start2
- start1 += size * letterSize1
- start2 += size * letterSize2
- size = 0
- start2 += symbolSize(y, letterSize2)
- elif y == "-":
- if size:
- yield size, start1, start2
- start1 += size * letterSize1
- start2 += size * letterSize2
- size = 0
- start1 += symbolSize(x, letterSize1)
- else:
- size += 1
- if size: yield size, start1, start2
+ # repMatches is always zero
+ # for proteins, nCount is always zero, because that's what BLATv34 does
+ normalBases = "ACGTU"
+ matches = mismatches = repMatches = nCount = 0
+
+ for maf in mafs:
+ sLines = maf[1]
+ fieldsA, fieldsB = pairOrDie(sLines, "PSL")
+ letterSizeA, begA, endA, rowA = fieldsA[3:7]
+ letterSizeB, begB, endB, rowB = fieldsB[3:7]
+
+ size = 0
+ for x, y in izip(rowA.upper(), rowB.upper()):
+ if x == "-":
+ if size:
+ yield size, begA, begB
+ begA += size * letterSizeA
+ begB += size * letterSizeB
+ size = 0
+ begB += symbolSize(y, letterSizeB)
+ elif y == "-":
+ if size:
+ yield size, begA, begB
+ begA += size * letterSizeA
+ begB += size * letterSizeB
+ size = 0
+ begA += symbolSize(x, letterSizeA)
+ else:
+ size += 1
+ if x in normalBases and y in normalBases or opts.protein:
+ if x == y:
+ matches += 1
+ else:
+ mismatches += 1
+ else:
+ nCount += 1
+ if size:
+ yield size, begA, begB
+
+ outCounts[0:4] = matches, mismatches, repMatches, nCount
+
+def pslNumInserts(blocks, letterSizeA, letterSizeB):
+ numInsertA = numInsertB = 0
+ for i, x in enumerate(blocks):
+ size, begA, begB = x
+ if i:
+ if begA > endA:
+ numInsertA += 1
+ if begB > endB:
+ numInsertB += 1
+ endA = begA + size * letterSizeA
+ endB = begB + size * letterSizeB
+ return numInsertA, numInsertB
def pslCommaString(things):
# UCSC software seems to prefer a trailing comma
- return joined(things, ",") + ","
-
-def gapRunCount(letters):
- """Get the number of runs of gap characters."""
- uniqLetters = map(operator.itemgetter(0), groupby(letters))
- return uniqLetters.count("-")
+ return ",".join(map(str, things)) + ","
-def pslEndpoints(alnStart, alnSize, strand, seqSize):
- alnEnd = alnStart + alnSize
- if strand == "+": return alnStart, alnEnd
- else: return seqSize - alnEnd, seqSize - alnStart
+def pslEnds(headMafFields, tailMafFields):
+ seqName, seqLen, strand, letterSize, beg = headMafFields[0:5]
+ end = tailMafFields[5]
+ if strand == "-":
+ beg, end = seqLen - end, seqLen - beg
+ return seqName, seqLen, strand, letterSize, beg, end
-def caseSensitivePairwiseMatchCounts(columns, isProtein):
- # repMatches is always zero
- # for proteins, nCount is always zero, because that's what BLATv34 does
- standardBases = "ACGTU"
- matches = mismatches = repMatches = nCount = 0
- for i in columns:
- if "-" in i: continue
- x, y = i
- if x in standardBases and y in standardBases or isProtein:
- if x == y: matches += 1
- else: mismatches += 1
- else: nCount += 1
- return matches, mismatches, repMatches, nCount
-
-def writePsl(maf, isProtein):
- dieUnlessPairwise(maf)
-
- alnStrings = map(str.upper, maf.alnStrings)
- alignmentColumns = zip(*alnStrings)
- letterSizes = mafLetterSizes(maf)
-
- matchCounts = caseSensitivePairwiseMatchCounts(alignmentColumns, isProtein)
+def writePsl(opts, mafs):
+ matchCounts = [0] * 4
+ blocks = list(pslBlocks(opts, mafs, matchCounts))
matches, mismatches, repMatches, nCount = matchCounts
numGaplessColumns = sum(matchCounts)
- qNumInsert = gapRunCount(maf.alnStrings[0])
- qBaseInsert = maf.alnSizes[1] - numGaplessColumns * letterSizes[1]
+ headA, headB = mafs[0][1]
+ tailA, tailB = mafs[-1][1]
- tNumInsert = gapRunCount(maf.alnStrings[1])
- tBaseInsert = maf.alnSizes[0] - numGaplessColumns * letterSizes[0]
+ seqNameA, seqLenA, strandA, letterSizeA, begA, endA = pslEnds(headA, tailA)
+ baseInsertA = endA - begA - numGaplessColumns * letterSizeA
- strand = maf.strands[1]
- if max(letterSizes) > 1:
- strand += maf.strands[0]
- elif maf.strands[0] != "+":
- raise Exception("for non-translated PSL, the 1st strand in each alignment must be +")
+ seqNameB, seqLenB, strandB, letterSizeB, begB, endB = pslEnds(headB, tailB)
+ baseInsertB = endB - begB - numGaplessColumns * letterSizeB
- tName, qName = maf.seqNames
- tSeqSize, qSeqSize = maf.seqSizes
+ numInsertA, numInsertB = pslNumInserts(blocks, letterSizeA, letterSizeB)
- rows = zip(maf.alnStarts, maf.alnSizes, maf.strands, maf.seqSizes)
- tStart, tEnd = pslEndpoints(*rows[0])
- qStart, qEnd = pslEndpoints(*rows[1])
+ strand = strandB
+ if letterSizeA > 1 or letterSizeB > 1:
+ strand += strandA
+ elif strandA != "+":
+ raise Exception("for non-translated PSL, the 1st strand in each alignment must be +")
- blocks = list(pslBlocks(alignmentColumns, maf.alnStarts, letterSizes))
blockCount = len(blocks)
- blockSizes, tStarts, qStarts = map(pslCommaString, zip(*blocks))
+ blockSizes, blockStartsA, blockStartsB = map(pslCommaString, zip(*blocks))
outWords = (matches, mismatches, repMatches, nCount,
- qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand,
- qName, qSeqSize, qStart, qEnd, tName, tSeqSize, tStart, tEnd,
- blockCount, blockSizes, qStarts, tStarts)
- print joined(outWords, "\t")
+ numInsertB, baseInsertB, numInsertA, baseInsertA, strand,
+ seqNameB, seqLenB, begB, endB, seqNameA, seqLenA, begA, endA,
+ blockCount, blockSizes, blockStartsB, blockStartsA)
+
+ print "\t".join(map(str, outWords))
+
+def mafConvertToPsl(opts, lines):
+ if opts.join:
+ for i in mafGroupInput(opts, lines):
+ writePsl(opts, i)
+ else:
+ for i in mafInput(opts, lines):
+ writePsl(opts, [i])
##### Routines for converting to SAM format: #####
@@ -253,19 +321,19 @@ def readGroupId(readGroupItems):
return i[3:]
raise Exception("readgroup must include ID")
-def readSequenceLengths(lines):
+def readSequenceLengths(fileNames):
"""Read name & length of topmost sequence in each maf block."""
- sequenceLengths = {} # an OrderedDict might be nice
- isSearching = True
- for line in lines:
- if line.isspace(): isSearching = True
- elif isSearching:
- w = line.split()
- if w[0] == "s":
- seqName, seqSize = w[1], w[5]
- sequenceLengths[seqName] = seqSize
- isSearching = False
- return sequenceLengths
+ for i in fileNames:
+ with open(i) as f:
+ fields = None
+ for line in f:
+ if fields:
+ if line.isspace():
+ fields = None
+ else:
+ if line[0] == "s":
+ fields = line.split()
+ yield fields[1], fields[5]
def naturalSortKey(s):
"""Return a key that sorts strings in "natural" order."""
@@ -277,16 +345,30 @@ def karyotypicSortKey(s):
if s == "MT": return ["~"]
return naturalSortKey(s)
-def writeSamHeader(sequenceLengths, dictFile, readGroupItems):
+def copyDictFile(lines):
+ for line in lines:
+ if line.startswith("@SQ"):
+ sys.stdout.write(line)
+ elif not line[0] == "@":
+ break
+
+def writeSamHeader(opts, fileNames):
print "@HD\tVN:1.3\tSO:unsorted"
- for k in sorted(sequenceLengths, key=karyotypicSortKey):
- print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
- if dictFile:
- for i in fileinput.input(dictFile):
- if i.startswith("@SQ"): print i,
- elif not i.startswith("@"): break
- if readGroupItems:
- print "@RG\t" + "\t".join(readGroupItems)
+
+ if opts.dictionary:
+ sequenceLengths = dict(readSequenceLengths(fileNames))
+ for k in sorted(sequenceLengths, key=karyotypicSortKey):
+ print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
+
+ if opts.dictfile:
+ if opts.dictfile == "-":
+ copyDictFile(sys.stdin)
+ else:
+ with open(opts.dictfile) as f:
+ copyDictFile(f)
+
+ if opts.readgroup:
+ print "@RG\t" + "\t".join(opts.readgroup.split())
mapqMissing = "255"
mapqMaximum = "254"
@@ -338,16 +420,22 @@ def cigarParts(beg, alignmentColumns, end):
if end: yield str(end) + "H"
-def writeSam(maf, rg):
+def writeSam(readGroup, maf):
aLine, sLines, qLines, pLines = maf
+ fieldsA, fieldsB = pairOrDie(sLines, "SAM")
+ seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
+ seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
- if isTranslated(sLines):
+ if letterSizeA > 1 or letterSizeB > 1:
raise Exception("this looks like translated DNA - can't convert to SAM format")
+ if strandA != "+":
+ raise Exception("for SAM, the 1st strand in each alignment must be +")
+
score = None
evalue = None
mapq = mapqMissing
- for i in aLine:
+ for i in aLine.split():
if i.startswith("score="):
v = i[6:]
if v.isdigit(): score = "AS:i:" + v # it must be an integer
@@ -356,176 +444,195 @@ def writeSam(maf, rg):
elif i.startswith("mismap="):
mapq = mapqFromProb(i[7:])
- if len(sLines) != 2:
- raise Exception("for SAM, each alignment must have 2 sequences")
- rFields, qFields = sLines
- s, rName, rStart, rAlnSize, rStrand, rSeqSize, rAlnString = rFields
- s, qName, qStart, qAlnSize, qStrand, qSeqSize, qAlnString = qFields
- if rStrand != "+":
- raise Exception("for SAM, the 1st strand in each alignment must be +")
- pos = str(int(rStart) + 1) # convert to 1-based coordinate
+ pos = str(begA + 1) # convert to 1-based coordinate
- alignmentColumns = zip(rAlnString.upper(), qAlnString.upper())
+ alignmentColumns = zip(rowA.upper(), rowB.upper())
- qStart = int(qStart)
- qRevStart = int(qSeqSize) - qStart - int(qAlnSize)
- cigar = "".join(cigarParts(qStart, iter(alignmentColumns), qRevStart))
+ revBegB = seqLenB - endB
+ cigar = "".join(cigarParts(begB, iter(alignmentColumns), revBegB))
- seq = deleted(qAlnString, "-")
+ seq = rowB.translate(None, "-")
- if qLines and qLines[-1][1] == qName:
- qual = ''.join(j
- for i, j in izip(qAlnString, qLines[-1][2]) if i != "-")
- else:
- qual = "*"
+ qual = "*"
+ if qLines:
+ qFields = qLines[-1].split()
+ if qFields[1] == seqNameB:
+ qual = ''.join(j for i, j in izip(rowB, qFields[2]) if i != "-")
# It's hard to get all the pair info, so this is very
# incomplete, but hopefully good enough.
# I'm not sure whether to add 2 and/or 8 to flag.
- if qName.endswith("/1"):
- qName = qName[:-2]
- if qStrand == "+": flag = "99" # 1 + 2 + 32 + 64
+ if seqNameB.endswith("/1"):
+ seqNameB = seqNameB[:-2]
+ if strandB == "+": flag = "99" # 1 + 2 + 32 + 64
else: flag = "83" # 1 + 2 + 16 + 64
- elif qName.endswith("/2"):
- qName = qName[:-2]
- if qStrand == "+": flag = "163" # 1 + 2 + 32 + 128
+ elif seqNameB.endswith("/2"):
+ seqNameB = seqNameB[:-2]
+ if strandB == "+": flag = "163" # 1 + 2 + 32 + 128
else: flag = "147" # 1 + 2 + 16 + 128
else:
- if qStrand == "+": flag = "0"
+ if strandB == "+": flag = "0"
else: flag = "16"
- editDistance = sum(1 for x, y in alignmentColumns if x != y)
+ editDistance = sum(x != y for x, y in alignmentColumns)
# no special treatment of ambiguous bases: might be a minor bug
editDistance = "NM:i:" + str(editDistance)
- outWords = [qName, flag, rName, pos, mapq, cigar, "*\t0\t0", seq, qual]
- outWords.append(editDistance)
- if score: outWords.append(score)
- if evalue: outWords.append(evalue)
- if rg: outWords.append(rg)
- print "\t".join(outWords)
+ out = [seqNameB, flag, seqNameA, pos, mapq, cigar, "*\t0\t0", seq, qual]
+ out.append(editDistance)
+ if score: out.append(score)
+ if evalue: out.append(evalue)
+ if readGroup: out.append(readGroup)
+ print "\t".join(out)
+
+def mafConvertToSam(opts, lines):
+ readGroup = ""
+ if opts.readgroup:
+ readGroup = "RG:Z:" + readGroupId(opts.readgroup.split())
+ for maf in mafInput(opts, lines):
+ writeSam(readGroup, maf)
##### Routines for converting to BLAST-like format: #####
def pairwiseMatchSymbol(alignmentColumn):
- if isMatch(alignmentColumn): return "|"
- else: return " "
+ if isMatch(alignmentColumn):
+ return "|"
+ else:
+ return " "
def strandText(strand):
- if strand == "+": return "Plus"
- else: return "Minus"
+ if strand == "+":
+ return "Plus"
+ else:
+ return "Minus"
-def blastCoordinate(oneBasedCoordinate, strand, seqSize):
- if strand == "-":
- oneBasedCoordinate = seqSize - oneBasedCoordinate + 1
- return str(oneBasedCoordinate)
+def blastBegCoordinate(zeroBasedCoordinate, strand, seqLen):
+ if strand == "+":
+ return str(zeroBasedCoordinate + 1)
+ else:
+ return str(seqLen - zeroBasedCoordinate)
+
+def blastEndCoordinate(zeroBasedCoordinate, strand, seqLen):
+ if strand == "+":
+ return str(zeroBasedCoordinate)
+ else:
+ return str(seqLen - zeroBasedCoordinate + 1)
+
+def nextCoordinate(coordinate, row, letterSize):
+ return coordinate + insertSize(row, letterSize)
def chunker(things, chunkSize):
for i in range(0, len(things), chunkSize):
yield things[i:i+chunkSize]
-def blastChunker(maf, lineSize, alignmentColumns):
- letterSizes = mafLetterSizes(maf)
- coords = maf.alnStarts
+def blastChunker(sLines, lineSize, alignmentColumns):
+ seqLens = [i[1] for i in sLines]
+ strands = [i[2] for i in sLines]
+ letterSizes = [i[3] for i in sLines]
+ coords = [i[4] for i in sLines]
for chunkCols in chunker(alignmentColumns, lineSize):
- chunkRows = zip(*chunkCols)
- chunkRows = map(''.join, chunkRows)
- starts = [i + 1 for i in coords] # change to 1-based coordinates
- starts = map(blastCoordinate, starts, maf.strands, maf.seqSizes)
- increments = map(insertionSize, chunkRows, letterSizes)
- coords = map(operator.add, coords, increments)
- ends = map(blastCoordinate, coords, maf.strands, maf.seqSizes)
- yield chunkCols, chunkRows, starts, ends
-
-def writeBlast(maf, opts, oldQueryName):
- dieUnlessPairwise(maf)
-
- if maf.seqNames[1] != oldQueryName:
- print "Query= %s" % maf.seqNames[1]
- print " (%s letters)" % maf.seqSizes[1]
+ chunkRows = list(alignmentRowsFromColumns(chunkCols))
+ begs = map(blastBegCoordinate, coords, strands, seqLens)
+ coords = map(nextCoordinate, coords, chunkRows, letterSizes)
+ ends = map(blastEndCoordinate, coords, strands, seqLens)
+ yield chunkCols, chunkRows, begs, ends
+
+def writeBlast(opts, maf, oldQueryName):
+ aLine, sLines, qLines, pLines = maf
+ fieldsA, fieldsB = pairOrDie(sLines, "Blast")
+ seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
+ seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
+
+ if seqNameB != oldQueryName:
+ print "Query= " + seqNameB
+ print " (%s letters)" % seqLenB
print
- print ">%s" % maf.seqNames[0]
- print " Length = %s" % maf.seqSizes[0]
+ print ">" + seqNameA
+ print " Length = %s" % seqLenA
print
- score = maf.namesAndValues["score"]
- if opts.bitScoreA is not None and opts.bitScoreB is not None:
+ score, evalue = scoreAndEvalue(aLine)
+
+ if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
scoreLine = " Score = %.3g bits (%s)" % (bitScore, score)
else:
scoreLine = " Score = %s" % score
- try: scoreLine += ", Expect = %s" % maf.namesAndValues["E"]
- except KeyError: pass
+
+ if evalue:
+ scoreLine += ", Expect = %s" % evalue
+
print scoreLine
- alignmentColumns = zip(*maf.alnStrings)
+ alignmentColumns = zip(rowA, rowB)
alnSize = len(alignmentColumns)
- matches = quantify(alignmentColumns, isMatch)
+ matches = sum(x.upper() == y.upper() for x, y in alignmentColumns)
matchPercent = 100 * matches // alnSize # round down, like BLAST
identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent)
- gaps = alnSize - quantify(alignmentColumns, isGapless)
- gapPercent = 100 * gaps // alnSize # round down, like BLAST
- if gaps: identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent)
+ gaps = rowA.count("-") + rowB.count("-")
+ if gaps:
+ gapPercent = 100 * gaps // alnSize # round down, like BLAST
+ identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent)
print identLine
- strands = map(strandText, maf.strands)
- print " Strand = %s / %s" % (strands[1], strands[0])
+ print " Strand = %s / %s" % (strandText(strandB), strandText(strandA))
print
- for chunk in blastChunker(maf, opts.linesize, alignmentColumns):
- cols, rows, starts, ends = chunk
- startWidth = maxlen(starts)
- matchSymbols = map(pairwiseMatchSymbol, cols)
- matchSymbols = ''.join(matchSymbols)
- print "Query: %-*s %s %s" % (startWidth, starts[1], rows[1], ends[1])
- print " %-*s %s" % (startWidth, " ", matchSymbols)
- print "Sbjct: %-*s %s %s" % (startWidth, starts[0], rows[0], ends[0])
+ for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
+ cols, rows, begs, ends = chunk
+ begWidth = maxlen(begs)
+ matchSymbols = ''.join(map(pairwiseMatchSymbol, cols))
+ print "Query: %-*s %s %s" % (begWidth, begs[1], rows[1], ends[1])
+ print " %-*s %s" % (begWidth, " ", matchSymbols)
+ print "Sbjct: %-*s %s %s" % (begWidth, begs[0], rows[0], ends[0])
print
-def blastEndsFromMafLine(fields):
- beg = int(fields[2])
- span = int(fields[3])
- if fields[4] == "+":
- return beg + 1, beg + span
+def mafConvertToBlast(opts, lines):
+ oldQueryName = ""
+ for maf in mafInput(opts, lines):
+ writeBlast(opts, maf, oldQueryName)
+ sLines = maf[1]
+ oldQueryName = sLines[1][0]
+
+def blastDataFromMafFields(fields):
+ seqName, seqLen, strand, letterSize, beg, end, row = fields
+ if strand == "+":
+ beg += 1
else:
- seqSize = int(fields[5])
- beg2 = seqSize - beg
- return beg2, beg2 - span + 1
+ beg = seqLen - beg
+ end = seqLen - end + 1
+ return seqName, str(beg), str(end), row.upper()
-def writeBlastTab(maf, opts):
+def writeBlastTab(opts, maf):
aLine, sLines, qLines, pLines = maf
+ fieldsA, fieldsB = pairOrDie(sLines, "BlastTab")
+ seqNameA, begA, endA, rowA = blastDataFromMafFields(fieldsA)
+ seqNameB, begB, endB, rowB = blastDataFromMafFields(fieldsB)
- score = None
- evalue = None
- for i in aLine:
- if i.startswith("score="): score = i[6:]
- elif i.startswith("E="): evalue = i[2:]
-
- sLine, qLine = sLines
- alnStrings = [i[6] for i in sLines]
- alignmentColumns = zip(*alnStrings)
+ alignmentColumns = zip(rowA, rowB)
alnSize = len(alignmentColumns)
- matches = quantify(alignmentColumns, isMatch)
+ matches = sum(x == y for x, y in alignmentColumns)
matchPercent = "%.2f" % (100.0 * matches / alnSize)
- mismatches = quantify(alignmentColumns, isGapless) - matches
- gapOpens = sum(map(gapRunCount, alnStrings))
-
- qName = qLine[1]
- qBeg, qEnd = blastEndsFromMafLine(qLine)
+ mismatches = alnSize - matches - rowA.count("-") - rowB.count("-")
+ gapOpens = gapRunCount(rowA) + gapRunCount(rowB)
- sName = sLine[1]
- sBeg, sEnd = blastEndsFromMafLine(sLine)
+ out = [seqNameB, seqNameA, matchPercent, str(alnSize), str(mismatches),
+ str(gapOpens), begB, endB, begA, endA]
- out = [qName, sName, matchPercent, alnSize, mismatches, gapOpens,
- qBeg, qEnd, sBeg, sEnd]
+ score, evalue = scoreAndEvalue(aLine)
if evalue:
out.append(evalue)
if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
out.append("%.3g" % bitScore)
- print joined(out, "\t")
+
+ print "\t".join(out)
+
+def mafConvertToBlastTab(opts, lines):
+ for maf in mafInput(opts, lines):
+ writeBlastTab(opts, maf)
##### Routines for converting to HTML format: #####
@@ -586,110 +693,102 @@ def multipleMatchSymbol(alignmentColumn):
if isMatch(alignmentColumn): return "*"
else: return " "
-def writeHtml(maf, lineSize):
+def writeHtml(opts, maf):
+ aLine, sLines, qLines, pLines = maf
+
scoreLine = "Alignment"
- try:
- scoreLine += " score=%s" % maf.namesAndValues["score"]
- scoreLine += ", expect=%s" % maf.namesAndValues["E"]
- except KeyError: pass
+ score, evalue = scoreAndEvalue(aLine)
+ if score:
+ scoreLine += " score=" + score
+ if evalue:
+ scoreLine += ", expect=" + evalue
print "<h3>%s:</h3>" % scoreLine
- if maf.pLines:
- probRows = [i[1] for i in maf.pLines]
+ if pLines:
+ probRows = [i.split()[1] for i in pLines]
probCols = izip(*probRows)
classes = imap(probabilityClass, probCols)
else:
classes = repeat(None)
- nameWidth = maxlen(maf.seqNames)
- alignmentColumns = zip(*maf.alnStrings)
+ seqNames = [i[0] for i in sLines]
+ nameWidth = maxlen(seqNames)
+ rows = [i[6] for i in sLines]
+ alignmentColumns = zip(*rows)
print '<pre>'
- for chunk in blastChunker(maf, lineSize, alignmentColumns):
- cols, rows, starts, ends = chunk
- startWidth = maxlen(starts)
+ for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
+ cols, rows, begs, ends = chunk
+ begWidth = maxlen(begs)
endWidth = maxlen(ends)
- matchSymbols = map(multipleMatchSymbol, cols)
- matchSymbols = ''.join(matchSymbols)
- classChunk = islice(classes, lineSize)
+ matchSymbols = ''.join(map(multipleMatchSymbol, cols))
+ classChunk = islice(classes, opts.linesize)
classRuns = list(identicalRuns(classChunk))
- for n, s, r, e in zip(maf.seqNames, starts, rows, ends):
+ for n, b, r, e in zip(seqNames, begs, rows, ends):
spans = [htmlSpan(r, i) for i in classRuns]
spans = ''.join(spans)
- formatParams = nameWidth, n, startWidth, s, spans, endWidth, e
+ formatParams = nameWidth, n, begWidth, b, spans, endWidth, e
print '%-*s %*s %s %*s' % formatParams
- print ' ' * nameWidth, ' ' * startWidth, matchSymbols
+ print ' ' * nameWidth, ' ' * begWidth, matchSymbols
print
print '</pre>'
-##### Routines for reading MAF format: #####
+def mafConvertToHtml(opts, lines):
+ for maf in mafInput(opts, lines):
+ writeHtml(opts, maf)
-def mafInput(lines, opts, isKeepComments):
- a = []
- s = []
- q = []
- p = []
- for i in lines:
- w = i.split()
- if not w:
- if a: yield a, s, q, p
- a = []
- s = []
- q = []
- p = []
- elif w[0] == "a":
- a = w
- elif w[0] == "s":
- s.append(w)
- elif w[0] == "q":
- q.append(w)
- elif w[0] == "p":
- p.append(w)
- elif i[0] == "#":
- for j in w:
- try:
- k, v = j.split("=")
- x = float(v)
- if k == "lambda": opts.bitScoreA = x / math.log(2)
- if k == "K": opts.bitScoreB = math.log(x, 2)
- except ValueError: pass
- if isKeepComments: print i,
- if a: yield a, s, q, p
+##### Main program: #####
def isFormat(myString, myFormat):
return myFormat.startswith(myString)
+def mafConvertOneFile(opts, formatName, lines):
+ if isFormat(formatName, "axt"):
+ mafConvertToAxt(opts, lines)
+ elif isFormat(formatName, "blast"):
+ mafConvertToBlast(opts, lines)
+ elif isFormat(formatName, "blasttab"):
+ mafConvertToBlastTab(opts, lines)
+ elif isFormat(formatName, "html"):
+ mafConvertToHtml(opts, lines)
+ elif isFormat(formatName, "psl"):
+ mafConvertToPsl(opts, lines)
+ elif isFormat(formatName, "sam"):
+ mafConvertToSam(opts, lines)
+ elif isFormat(formatName, "tabular"):
+ mafConvertToTab(opts, lines)
+ else:
+ raise Exception("unknown format: " + formatName)
+
def mafConvert(opts, args):
- format = args[0].lower()
- if isFormat(format, "sam"):
- if opts.dictionary: d = readSequenceLengths(fileinput.input(args[1:]))
- else: d = {}
- if opts.readgroup:
- readGroupItems = opts.readgroup.split()
- rg = "RG:Z:" + readGroupId(readGroupItems)
- else:
- readGroupItems = []
- rg = ""
- if not opts.noheader: writeSamHeader(d, opts.dictfile, readGroupItems)
- inputLines = fileinput.input(args[1:])
- if isFormat(format, "html") and not opts.noheader: writeHtmlHeader()
- isKeepCommentLines = isFormat(format, "tabular") and not opts.noheader
+ formatName = args[0].lower()
+ fileNames = args[1:]
+
+ opts.isKeepComments = False
opts.bitScoreA = None
opts.bitScoreB = None
- oldQueryName = ""
- for maf in mafInput(inputLines, opts, isKeepCommentLines):
- if isFormat(format, "axt"): writeAxt(Maf(*maf))
- elif isFormat(format, "blast"):
- maf = Maf(*maf)
- writeBlast(maf, opts, oldQueryName)
- oldQueryName = maf.seqNames[1]
- elif isFormat(format, "blasttab"): writeBlastTab(maf, opts)
- elif isFormat(format, "html"): writeHtml(Maf(*maf), opts.linesize)
- elif isFormat(format, "psl"): writePsl(Maf(*maf), opts.protein)
- elif isFormat(format, "sam"): writeSam(maf, rg)
- elif isFormat(format, "tabular"): writeTab(maf)
- else: raise Exception("unknown format: " + format)
- if isFormat(format, "html") and not opts.noheader: print "</body></html>"
+
+ if not opts.noheader:
+ if isFormat(formatName, "html"):
+ writeHtmlHeader()
+ if isFormat(formatName, "sam"):
+ writeSamHeader(opts, fileNames)
+ if isFormat(formatName, "tabular"):
+ opts.isKeepComments = True
+
+ if fileNames:
+ for i in fileNames:
+ if i == "-":
+ mafConvertOneFile(opts, formatName, sys.stdin)
+ else:
+ with open(i) as f:
+ mafConvertOneFile(opts, formatName, f)
+ else:
+ mafConvertOneFile(opts, formatName, sys.stdin)
+
+ if not opts.noheader:
+ if isFormat(formatName, "html"):
+ print "</body></html>"
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
@@ -709,6 +808,8 @@ if __name__ == "__main__":
op = optparse.OptionParser(usage=usage, description=description)
op.add_option("-p", "--protein", action="store_true",
help="assume protein alignments, for psl match counts")
+ op.add_option("-j", "--join", type="float", metavar="N",
+ help="join co-linear alignments separated by <= N letters")
op.add_option("-n", "--noheader", action="store_true",
help="omit any header lines from the output")
op.add_option("-d", "--dictionary", action="store_true",
@@ -719,7 +820,7 @@ if __name__ == "__main__":
help="read group info for sam format")
op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS",
help="line length for blast and html formats (default: %default)")
- (opts, args) = op.parse_args()
+ opts, args = op.parse_args()
if opts.linesize <= 0: op.error("option -l: should be >= 1")
if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f")
if len(args) < 1: op.error("I need a format-name and some MAF alignments")
diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc
index f9b2dbf..0a16f11 100644
--- a/src/LastalArguments.cc
+++ b/src/LastalArguments.cc
@@ -477,6 +477,8 @@ void LastalArguments::setDefaultsFromAlphabet( bool isDna, bool isProtein,
batchSize = 0x100000; // 1 Mbyte
else if( outputType == 0 )
batchSize = 0x1000000; // 16 Mbytes
+ else if( !isVolumes )
+ batchSize = 0x800000; // 8 Mbytes
else if( inputFormat == sequenceFormat::prb )
batchSize = 0x2000000; // 32 Mbytes (?)
else
diff --git a/src/split/cbrc_split_aligner.cc b/src/split/cbrc_split_aligner.cc
index af29ae0..c395a5f 100644
--- a/src/split/cbrc_split_aligner.cc
+++ b/src/split/cbrc_split_aligner.cc
@@ -129,54 +129,51 @@ int SplitAligner::calcSpliceScore(double dist) const {
return std::floor(scale * s + 0.5);
}
-// The dinucleotide immediately downstream of the given coordinate
-unsigned SplitAligner::spliceBegSignal(unsigned coordinate,
- char strand) const {
- if (strand == '+') {
- const uchar *genomeBeg = genome.seqReader();
- const uchar *p = genomeBeg + coordinate;
- unsigned n1 = alphabet.numbersToUppercase[*p];
- if (n1 >= 4) return 16;
- unsigned n2 = alphabet.numbersToUppercase[*(p + 1)];
- if (n2 >= 4) return 16;
- return n1 * 4 + n2;
- } else {
- const uchar *genomeEnd = genome.seqReader() + genome.finishedSize();
- const uchar *p = genomeEnd - coordinate;
- unsigned n1 = alphabet.numbersToUppercase[*(p - 1)];
- if (n1 >= 4) return 16;
- unsigned n2 = alphabet.numbersToUppercase[*(p - 2)];
- if (n2 >= 4) return 16;
- return 15 - (n1 * 4 + n2); // reverse-complement
- }
+// The dinucleotide immediately downstream on the forward strand
+static unsigned spliceBegSignalFwd(const uchar *seqPtr,
+ const uchar *toUnmasked) {
+ unsigned n1 = toUnmasked[*seqPtr];
+ if (n1 >= 4) return 16;
+ unsigned n2 = toUnmasked[*(seqPtr + 1)];
+ if (n2 >= 4) return 16;
+ return n1 * 4 + n2;
}
-// The dinucleotide immediately upstream of the given coordinate
-unsigned SplitAligner::spliceEndSignal(unsigned coordinate,
- char strand) const {
- if (strand == '+') {
- const uchar *genomeBeg = genome.seqReader();
- const uchar *p = genomeBeg + coordinate;
- unsigned n2 = alphabet.numbersToUppercase[*(p - 1)];
- if (n2 >= 4) return 16;
- unsigned n1 = alphabet.numbersToUppercase[*(p - 2)];
- if (n1 >= 4) return 16;
- return n1 * 4 + n2;
- } else {
- const uchar *genomeEnd = genome.seqReader() + genome.finishedSize();
- const uchar *p = genomeEnd - coordinate;
- unsigned n2 = alphabet.numbersToUppercase[*p];
- if (n2 >= 4) return 16;
- unsigned n1 = alphabet.numbersToUppercase[*(p + 1)];
- if (n1 >= 4) return 16;
- return 15 - (n1 * 4 + n2); // reverse-complement
- }
+// The dinucleotide immediately downstream on the reverse strand
+static unsigned spliceBegSignalRev(const uchar *seqPtr,
+ const uchar *toUnmasked) {
+ unsigned n1 = toUnmasked[*(seqPtr - 1)];
+ if (n1 >= 4) return 16;
+ unsigned n2 = toUnmasked[*(seqPtr - 2)];
+ if (n2 >= 4) return 16;
+ return 15 - (n1 * 4 + n2); // reverse-complement
+}
+
+// The dinucleotide immediately upstream on the forward strand
+static unsigned spliceEndSignalFwd(const uchar *seqPtr,
+ const uchar *toUnmasked) {
+ unsigned n2 = toUnmasked[*(seqPtr - 1)];
+ if (n2 >= 4) return 16;
+ unsigned n1 = toUnmasked[*(seqPtr - 2)];
+ if (n1 >= 4) return 16;
+ return n1 * 4 + n2;
+}
+
+// The dinucleotide immediately upstream on the reverse strand
+static unsigned spliceEndSignalRev(const uchar *seqPtr,
+ const uchar *toUnmasked) {
+ unsigned n2 = toUnmasked[*seqPtr];
+ if (n2 >= 4) return 16;
+ unsigned n1 = toUnmasked[*(seqPtr + 1)];
+ if (n1 >= 4) return 16;
+ return 15 - (n1 * 4 + n2); // reverse-complement
}
unsigned SplitAligner::findScore(unsigned j, long score) const {
for (unsigned i = 0; i < numAlns; ++i) {
if (dpBeg(i) >= j || dpEnd(i) < j) continue;
- if (cell(Vmat, i, j) + spliceBegScore(i, j) == score) return i;
+ size_t ij = matrixRowOrigins[i] + j;
+ if (Vmat[ij] + spliceBegScore(ij) == score) return i;
}
return numAlns;
}
@@ -184,16 +181,18 @@ unsigned SplitAligner::findScore(unsigned j, long score) const {
unsigned SplitAligner::findSpliceScore(unsigned i, unsigned j,
long score) const {
assert(splicePrior > 0.0);
+ size_t ij = matrixRowOrigins[i] + j;
unsigned iSeq = rnameAndStrandIds[i];
- unsigned iEnd = cell(spliceEndCoords, i, j);
- int iScore = spliceEndScore(i, j);
+ unsigned iEnd = spliceEndCoords[ij];
+ int iScore = spliceEndScore(ij);
for (unsigned k = 0; k < numAlns; k++) {
if (rnameAndStrandIds[k] != iSeq) continue;
if (dpBeg(k) >= j || dpEnd(k) < j) continue;
- unsigned kBeg = cell(spliceBegCoords, k, j);
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kBeg = spliceBegCoords[kj];
if (iEnd <= kBeg) continue;
- int s = iScore + spliceBegScore(k, j) + spliceScore(iEnd - kBeg);
- if (cell(Vmat, k, j) + s == score) return k;
+ int s = iScore + spliceBegScore(kj) + spliceScore(iEnd - kBeg);
+ if (Vmat[kj] + s == score) return k;
}
return numAlns;
}
@@ -201,30 +200,29 @@ unsigned SplitAligner::findSpliceScore(unsigned i, unsigned j,
long SplitAligner::scoreFromSplice(unsigned i, unsigned j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
+ size_t ij = matrixRowOrigins[i] + j;
long score = LONG_MIN;
unsigned iSeq = rnameAndStrandIds[i];
- unsigned iEnd = cell(spliceEndCoords, i, j);
+ unsigned iEnd = spliceEndCoords[ij];
for (/* noop */; oldInplayPos < oldNumInplay; ++oldInplayPos) {
unsigned k = oldInplayAlnIndices[oldInplayPos];
if (rnameAndStrandIds[k] < iSeq) continue;
- if (rnameAndStrandIds[k] > iSeq) return score;
- if (rBegs[k] >= iEnd) return score;
- unsigned kBeg = cell(spliceBegCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rBegs[k] >= iEnd) return score;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kBeg = spliceBegCoords[kj];
if (kBeg >= rBegs[i] || rBegs[i] - kBeg <= maxSpliceDist) break;
}
- int iScore = spliceEndScore(i, j);
-
for (unsigned y = oldInplayPos; y < oldNumInplay; ++y) {
unsigned k = oldInplayAlnIndices[y];
- if (rnameAndStrandIds[k] > iSeq) break;
- if (rBegs[k] >= iEnd) break;
- unsigned kBeg = cell(spliceBegCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rBegs[k] >= iEnd) break;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kBeg = spliceBegCoords[kj];
if (iEnd <= kBeg) continue;
if (iEnd - kBeg > maxSpliceDist) continue;
- int s = iScore + spliceBegScore(k, j) + spliceScore(iEnd - kBeg);
- score = std::max(score, cell(Vmat, k, j) + s);
+ score = std::max(score,
+ Vmat[kj] + spliceBegScore(kj) + spliceScore(iEnd - kBeg));
}
return score;
@@ -319,18 +317,19 @@ long SplitAligner::viterbi() {
long sMax = INT_MIN/2;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
- size_t k = matrixRowOrigins[i] + j;
+ size_t ij = matrixRowOrigins[i] + j;
- long s = std::max(Vmat[k] + Dmat[k],
- scoreFromJump + spliceEndScore(i, j));
- if (restartProb <= 0 && alns[i].qstart == j && s < 0) s = 0;
+ long s = scoreFromJump;
if (splicePrior > 0.0)
s = std::max(s,
scoreFromSplice(i, j, oldNumInplay, oldInplayPos));
- s += Amat[k];
+ s += spliceEndScore(ij);
+ s = std::max(s, Vmat[ij] + Dmat[ij]);
+ if (restartProb <= 0 && alns[i].qstart == j && s < 0) s = 0;
+ s += Amat[ij];
- Vmat[k+1] = s;
- sMax = std::max(sMax, s + spliceBegScore(i, j+1));
+ Vmat[ij + 1] = s;
+ sMax = std::max(sMax, s + spliceBegScore(ij + 1));
}
maxScore = std::max(sMax, maxScore);
cell(Vvec, j+1) = maxScore;
@@ -376,9 +375,9 @@ void SplitAligner::traceBack(long viterbiScore,
queryEnds.push_back(j);
for (;;) {
- long score = cell(Vmat, i, j);
--j;
- score -= cell(Amat, i, j);
+ size_t ij = matrixRowOrigins[i] + j;
+ long score = Vmat[ij + 1] - Amat[ij];
if (restartProb <= 0 && alns[i].qstart == j && score == 0) {
queryBegs.push_back(j);
return;
@@ -390,10 +389,10 @@ void SplitAligner::traceBack(long viterbiScore,
// makes some other kinds of boundary less clean. What's the best
// procedure for tied scores?
- bool isStay = (score == scoreIndel(i, j));
+ bool isStay = (score == Vmat[ij] + Dmat[ij]);
if (isStay && alns[i].qstrand == '+') continue;
- long s = score - spliceEndScore(i, j);
+ long s = score - spliceEndScore(ij);
long t = s - restartScore;
if (t == cell(Vvec, j)) {
queryBegs.push_back(j);
@@ -417,8 +416,9 @@ int SplitAligner::segmentScore(unsigned alnNum,
int score = 0;
unsigned i = alnNum;
for (unsigned j = queryBeg; j < queryEnd; ++j) {
- score += cell(Amat, i, j);
- if (j > queryBeg) score += cell(Dmat, i, j);
+ size_t ij = matrixRowOrigins[i] + j;
+ score += Amat[ij];
+ if (j > queryBeg) score += Dmat[ij];
}
return score;
}
@@ -426,30 +426,28 @@ int SplitAligner::segmentScore(unsigned alnNum,
double SplitAligner::probFromSpliceF(unsigned i, unsigned j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
+ size_t ij = matrixRowOrigins[i] + j;
double sum = 0.0;
unsigned iSeq = rnameAndStrandIds[i];
- unsigned iEnd = cell(spliceEndCoords, i, j);
+ unsigned iEnd = spliceEndCoords[ij];
for (/* noop */; oldInplayPos < oldNumInplay; ++oldInplayPos) {
unsigned k = oldInplayAlnIndices[oldInplayPos];
if (rnameAndStrandIds[k] < iSeq) continue;
- if (rnameAndStrandIds[k] > iSeq) return sum;
- if (rBegs[k] >= iEnd) return sum;
- unsigned kBeg = cell(spliceBegCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rBegs[k] >= iEnd) return sum;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kBeg = spliceBegCoords[kj];
if (kBeg >= rBegs[i] || rBegs[i] - kBeg <= maxSpliceDist) break;
}
- double iProb = spliceEndProb(i, j);
-
for (unsigned y = oldInplayPos; y < oldNumInplay; ++y) {
unsigned k = oldInplayAlnIndices[y];
- if (rnameAndStrandIds[k] > iSeq) break;
- if (rBegs[k] >= iEnd) break;
- unsigned kBeg = cell(spliceBegCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rBegs[k] >= iEnd) break;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kBeg = spliceBegCoords[kj];
if (iEnd <= kBeg) continue;
if (iEnd - kBeg > maxSpliceDist) continue;
- double p = iProb * spliceBegProb(k, j) * spliceProb(iEnd - kBeg);
- sum += cell(Fmat, k, j) * p;
+ sum += Fmat[kj] * spliceBegProb(kj) * spliceProb(iEnd - kBeg);
}
return sum;
@@ -458,30 +456,28 @@ double SplitAligner::probFromSpliceF(unsigned i, unsigned j,
double SplitAligner::probFromSpliceB(unsigned i, unsigned j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
+ size_t ij = matrixRowOrigins[i] + j;
double sum = 0.0;
unsigned iSeq = rnameAndStrandIds[i];
- unsigned iBeg = cell(spliceBegCoords, i, j);
+ unsigned iBeg = spliceBegCoords[ij];
for (/* noop */; oldInplayPos < oldNumInplay; ++oldInplayPos) {
unsigned k = oldInplayAlnIndices[oldInplayPos];
if (rnameAndStrandIds[k] < iSeq) continue;
- if (rnameAndStrandIds[k] > iSeq) return sum;
- if (rEnds[k] <= iBeg) return sum;
- unsigned kEnd = cell(spliceEndCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rEnds[k] <= iBeg) return sum;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kEnd = spliceEndCoords[kj];
if (kEnd <= rEnds[i] || kEnd - rEnds[i] <= maxSpliceDist) break;
}
- double iProb = spliceBegProb(i, j);
-
for (unsigned y = oldInplayPos; y < oldNumInplay; ++y) {
unsigned k = oldInplayAlnIndices[y];
- if (rnameAndStrandIds[k] > iSeq) break;
- if (rEnds[k] <= iBeg) break;
- unsigned kEnd = cell(spliceEndCoords, k, j);
+ if (rnameAndStrandIds[k] > iSeq || rEnds[k] <= iBeg) break;
+ size_t kj = matrixRowOrigins[k] + j;
+ unsigned kEnd = spliceEndCoords[kj];
if (kEnd <= iBeg) continue;
if (kEnd - iBeg > maxSpliceDist) continue;
- double p = iProb * spliceEndProb(k, j) * spliceProb(kEnd - iBeg);
- sum += cell(Bmat, k, j) * p;
+ sum += Bmat[kj] * spliceEndProb(kj) * spliceProb(kEnd - iBeg);
}
return sum;
@@ -513,17 +509,19 @@ void SplitAligner::forward() {
double rNew = 1.0;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
- size_t k = matrixRowOrigins[i] + j;
+ size_t ij = matrixRowOrigins[i] + j;
- double p = Fmat[k] * Dexp[k] + probFromJump * spliceEndProb(i, j);
- if (restartProb <= 0 && alns[i].qstart == j) p += begprob;
+ double p = probFromJump;
if (splicePrior > 0.0)
p += probFromSpliceF(i, j, oldNumInplay, oldInplayPos);
- p = p * Aexp[k] / r;
+ p *= spliceEndProb(ij);
+ p += Fmat[ij] * Dexp[ij];
+ if (restartProb <= 0 && alns[i].qstart == j) p += begprob;
+ p = p * Aexp[ij] / r;
- Fmat[k+1] = p;
+ Fmat[ij + 1] = p;
if (alns[i].qend == j+1) zF += p;
- pSum += p * spliceBegProb(i, j+1);
+ pSum += p * spliceBegProb(ij + 1);
rNew += p;
}
begprob /= r;
@@ -559,17 +557,19 @@ void SplitAligner::backward() {
double pSum = 0.0;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
- size_t k = matrixRowOrigins[i] + j;
+ size_t ij = matrixRowOrigins[i] + j;
- double p = Bmat[k] * Dexp[k] + probFromJump * spliceBegProb(i, j);
- if (restartProb <= 0 && alns[i].qend == j) p += endprob;
+ double p = probFromJump;
if (splicePrior > 0.0)
p += probFromSpliceB(i, j, oldNumInplay, oldInplayPos);
- p = p * Aexp[k-1] / r;
+ p *= spliceBegProb(ij);
+ p += Bmat[ij] * Dexp[ij];
+ if (restartProb <= 0 && alns[i].qend == j) p += endprob;
+ p = p * Aexp[ij - 1] / r;
- Bmat[k-1] = p;
+ Bmat[ij - 1] = p;
//if (alns[i].qstart == j-1) zB += p;
- pSum += p * spliceEndProb(i, j-1);
+ pSum += p * spliceEndProb(ij - 1);
}
endprob /= r;
sumProb = pSum * restartProb + sumProb / r;
@@ -587,11 +587,12 @@ SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
unsigned i = alnNum;
unsigned j = queryBeg;
for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
+ size_t ij = matrixRowOrigins[i] + j;
if (alns[i].qalign[pos] == '-') {
- double value = cell(Fmat, i, j) * cell(Bmat, i, j) * cell(Dexp, i, j) / cell(rescales, j);
+ double value = Fmat[ij] * Bmat[ij] * Dexp[ij] / cell(rescales, j);
output.push_back(value);
} else {
- double value = cell(Fmat, i, j+1) * cell(Bmat, i, j) / cell(Aexp, i, j);
+ double value = Fmat[ij + 1] * Bmat[ij] / Aexp[ij];
if (value != value) value = 0.0;
output.push_back(value);
j++;
@@ -600,7 +601,7 @@ SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
return output;
}
-// The next 2 routines represent affine gap scores in a cunning way.
+// The next routine represents affine gap scores in a cunning way.
// Amat holds scores at query bases, and at every base that is aligned
// to a gap it gets a score of insExistenceScore + insExtensionScore.
// Dmat holds scores between query bases, and between every pair of
@@ -609,149 +610,133 @@ SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
// if we jump from one alignment to another in the middle of a gap.
void SplitAligner::calcBaseScores(unsigned i) {
- int firstGapScore = insExistenceScore + insExtensionScore;
+ const int firstInsScore = insExistenceScore + insExtensionScore;
+ const int tweenInsScore = -insExistenceScore;
+
const UnsplitAlignment& a = alns[i];
+ const size_t origin = matrixRowOrigins[i];
- int *b = &cell(Amat, i, dpBeg(i));
- int *s = &cell(Amat, i, a.qstart);
- int *f = &cell(Amat, i, a.qend);
- int *e = &cell(Amat, i, dpEnd(i));
+ int *AmatB = &Amat[origin + dpBeg(i)];
+ int *DmatB = &Dmat[origin + dpBeg(i)];
+ int *AmatS = &Amat[origin + a.qstart];
+ int *AmatE = &Amat[origin + dpEnd(i)];
- while (b < s) *b++ = firstGapScore;
+ int delScore = 0;
+ int insCompensationScore = 0;
+
+ // treat any query letters before the alignment as insertions:
+ while (AmatB < AmatS) {
+ *AmatB++ = firstInsScore;
+ *DmatB++ = delScore + insCompensationScore;
+ delScore = 0;
+ insCompensationScore = tweenInsScore;
+ }
const char *rAlign = a.ralign;
const char *qAlign = a.qalign;
const char *qQual = a.qQual;
- while (b < f) {
+ while (*qAlign) {
unsigned char x = *rAlign++;
unsigned char y = *qAlign++;
int q = qQual ? (*qQual++ - qualityOffset) : (numQualCodes - 1);
- if (y == '-') /* noop */;
- else if (x == '-') *b++ = firstGapScore;
- else {
+ if (x == '-') { // gap in reference sequence: insertion
+ *AmatB++ = firstInsScore;
+ *DmatB++ = delScore + insCompensationScore;
+ delScore = 0;
+ insCompensationScore = tweenInsScore;
+ } else if (y == '-') { // gap in query sequence: deletion
+ if (delScore == 0) delScore = gapExistenceScore;
+ delScore += gapExtensionScore;
+ insCompensationScore = 0;
+ } else {
assert(q >= 0);
if (q >= numQualCodes) q = numQualCodes - 1;
- *b++ = score_mat[x % 64][y % 64][q];
+ *AmatB++ = score_mat[x % 64][y % 64][q];
+ *DmatB++ = delScore;
+ delScore = 0;
+ insCompensationScore = 0;
}
// Amazingly, in ASCII, '.' equals 'n' mod 64.
// So '.' will get the same scores as 'n'.
}
- while (b < e) *b++ = firstGapScore;
-}
-
-void SplitAligner::calcInsScores(unsigned i) {
- const UnsplitAlignment& a = alns[i];
- bool isExt = false;
-
- int *b = &cell(Dmat, i, dpBeg(i));
- int *s = &cell(Dmat, i, a.qstart);
- int *e = &cell(Dmat, i, dpEnd(i));
-
- while (b < s) {
- *b++ = (isExt ? -insExistenceScore : 0);
- isExt = true;
+ // treat any query letters after the alignment as insertions:
+ while (AmatB < AmatE) {
+ *AmatB++ = firstInsScore;
+ *DmatB++ = delScore + insCompensationScore;
+ delScore = 0;
+ insCompensationScore = tweenInsScore;
}
- const char *rAlign = a.ralign;
- const char *qAlign = a.qalign;
-
- while (*qAlign) {
- bool isDel = (*qAlign++ == '-');
- bool isIns = (*rAlign++ == '-');
- if (!isDel) *b++ = (isIns && isExt ? -insExistenceScore : 0);
- isExt = isIns;
- }
-
- while (b < e) {
- *b++ = (isExt ? -insExistenceScore : 0);
- isExt = true;
- }
-
- *b++ = 0;
-}
-
-void SplitAligner::calcDelScores(unsigned i) {
- const UnsplitAlignment& a = alns[i];
- int *b = &cell(Dmat, i, a.qstart);
- const char *qAlign = a.qalign;
- int delScore = 0;
- while (*qAlign) {
- if (*qAlign++ == '-') { // deletion in query
- if (delScore == 0) delScore = gapExistenceScore;
- delScore += gapExtensionScore;
- } else {
- *b++ += delScore;
- delScore = 0;
- }
- }
- *b++ += delScore;
+ *DmatB++ = delScore;
}
-void SplitAligner::calcScoreMatrices() {
- resizeMatrix(Amat);
- resizeMatrix(Dmat);
-
- for (unsigned i = 0; i < numAlns; i++) {
- calcBaseScores(i);
- calcInsScores(i);
- calcDelScores(i);
+void SplitAligner::initRbegsAndEnds() {
+ for (unsigned i = 0; i < numAlns; ++i) {
+ const UnsplitAlignment& a = alns[i];
+ rBegs[i] = a.rstart;
+ rEnds[i] = a.rend;
}
}
-void SplitAligner::initSpliceCoords() {
- resizeMatrix(spliceBegCoords);
- resizeMatrix(spliceEndCoords);
-
- for (unsigned i = 0; i < numAlns; ++i) {
- const UnsplitAlignment& a = alns[i];
- unsigned j = dpBeg(i);
- unsigned k = a.rstart;
-
- if (!chromosomeIndex.empty()) {
- StringNumMap::const_iterator f = chromosomeIndex.find(a.rname);
- if (f == chromosomeIndex.end())
- err("can't find " + std::string(a.rname) + " in the genome");
- unsigned c = f->second;
- if (a.qstrand == '+') k += genome.seqBeg(c);
- else k += genome.finishedSize() - genome.seqEnd(c);
- }
+void SplitAligner::initSpliceCoords(unsigned i) {
+ const UnsplitAlignment& a = alns[i];
+ unsigned j = dpBeg(i);
+ unsigned k = a.rstart;
- rBegs[i] = k;
+ cell(spliceBegCoords, i, j) = k;
+ while (j < a.qstart) {
+ cell(spliceEndCoords, i, j) = k;
+ ++j;
cell(spliceBegCoords, i, j) = k;
- while (j < a.qstart) {
- cell(spliceEndCoords, i, j) = k;
- ++j;
- cell(spliceBegCoords, i, j) = k;
- }
- for (unsigned x = 0; a.ralign[x]; ++x) {
- if (a.qalign[x] != '-') cell(spliceEndCoords, i, j) = k;
- if (a.qalign[x] != '-') ++j;
- if (a.ralign[x] != '-') ++k;
- if (a.qalign[x] != '-') cell(spliceBegCoords, i, j) = k;
- }
- while (j < dpEnd(i)) {
- cell(spliceEndCoords, i, j) = k;
- ++j;
- cell(spliceBegCoords, i, j) = k;
- }
+ }
+ for (unsigned x = 0; a.ralign[x]; ++x) {
+ if (a.qalign[x] != '-') cell(spliceEndCoords, i, j) = k;
+ if (a.qalign[x] != '-') ++j;
+ if (a.ralign[x] != '-') ++k;
+ if (a.qalign[x] != '-') cell(spliceBegCoords, i, j) = k;
+ }
+ while (j < dpEnd(i)) {
cell(spliceEndCoords, i, j) = k;
- rEnds[i] = k;
+ ++j;
+ cell(spliceBegCoords, i, j) = k;
}
+ cell(spliceEndCoords, i, j) = k;
+
+ assert(k == a.rend); // xxx
}
-void SplitAligner::initSpliceSignals() {
- resizeMatrix(spliceBegSignals);
- resizeMatrix(spliceEndSignals);
+void SplitAligner::initSpliceSignals(unsigned i) {
+ const uchar *toUnmasked = alphabet.numbersToUppercase;
+ const UnsplitAlignment& a = alns[i];
- for (unsigned i = 0; i < numAlns; ++i) {
- char strand = alns[i].qstrand;
- for (unsigned j = dpBeg(i); j <= dpEnd(i); ++j) {
- cell(spliceBegSignals, i, j) =
- spliceBegSignal(cell(spliceBegCoords, i, j), strand);
- cell(spliceEndSignals, i, j) =
- spliceEndSignal(cell(spliceEndCoords, i, j), strand);
+ StringNumMap::const_iterator f = chromosomeIndex.find(a.rname);
+ if (f == chromosomeIndex.end())
+ err("can't find " + std::string(a.rname) + " in the genome");
+ unsigned v = f->second % maxGenomeVolumes();
+ unsigned c = f->second / maxGenomeVolumes();
+ const uchar *chromBeg = genome[v].seqReader() + genome[v].seqBeg(c);
+ const uchar *chromEnd = genome[v].seqReader() + genome[v].seqEnd(c);
+ if (a.rend > chromEnd - chromBeg)
+ err("alignment beyond the end of " + std::string(a.rname));
+
+ size_t rowBeg = matrixRowOrigins[i] + dpBeg(i);
+ const unsigned *begCoords = &spliceBegCoords[rowBeg];
+ const unsigned *endCoords = &spliceEndCoords[rowBeg];
+ unsigned char *begSignals = &spliceBegSignals[rowBeg];
+ unsigned char *endSignals = &spliceEndSignals[rowBeg];
+ unsigned dpLen = dpEnd(i) - dpBeg(i);
+
+ if (a.qstrand == '+') {
+ for (unsigned j = 0; j <= dpLen; ++j) {
+ begSignals[j] = spliceBegSignalFwd(chromBeg + begCoords[j], toUnmasked);
+ endSignals[j] = spliceEndSignalFwd(chromBeg + endCoords[j], toUnmasked);
+ }
+ } else {
+ for (unsigned j = 0; j <= dpLen; ++j) {
+ begSignals[j] = spliceBegSignalRev(chromEnd - begCoords[j], toUnmasked);
+ endSignals[j] = spliceEndSignalRev(chromEnd - endCoords[j], toUnmasked);
}
}
}
@@ -780,27 +765,13 @@ void SplitAligner::initRnameAndStrandIds() {
}
}
-void SplitAligner::initForwardBackward() {
- resizeMatrix(Aexp);
- transform(Amat.begin(), Amat.end(), Aexp.begin(), scaledExp);
-
- resizeMatrix(Dexp);
- transform(Dmat.begin(), Dmat.end(), Dexp.begin(), scaledExp);
-
- // if x/scale < about -745, then exp(x/scale) will be exactly 0.0
-}
-
-int SplitAligner::maxJumpScore() const {
- int m = jumpScore;
- if (splicePrior > 0.0) {
- double s = spliceTerm1 - meanLogDist + sdevLogDist * sdevLogDist / 2.0;
- int maxSpliceScore = std::floor(scale * s + 0.5);
- m = std::max(m, maxSpliceScore);
- }
- if (!chromosomeIndex.empty()) {
- m += arrayMax(spliceBegScores) + arrayMax(spliceEndScores);
- }
- return m;
+void SplitAligner::dpExtensionMinScores(int maxJumpScore,
+ size_t& minScore1,
+ size_t& minScore2) const {
+ if (!chromosomeIndex.empty()) maxJumpScore += maxSpliceBegEndScore;
+ assert(maxJumpScore + insExistenceScore <= 0);
+ minScore1 = 1 - (maxJumpScore + insExistenceScore);
+ minScore2 = 1 - (maxJumpScore + maxJumpScore + insExistenceScore);
}
static size_t dpExtension(size_t maxScore, size_t minScore, size_t divisor) {
@@ -842,10 +813,8 @@ void SplitAligner::initDpBounds() {
size_t minScore1 = -1;
size_t minScore2 = -1;
if (jumpProb > 0.0 || splicePrior > 0.0) {
- int m = maxJumpScore();
- assert(m + insExistenceScore <= 0);
- minScore1 = 1 - (m + insExistenceScore);
- minScore2 = 1 - (m + m + insExistenceScore);
+ int m = (splicePrior > 0.0) ? maxSpliceScore : jumpScore;
+ dpExtensionMinScores(m, minScore1, minScore2);
}
for (unsigned i = 0; i < numAlns; ++i) {
@@ -875,15 +844,12 @@ void SplitAligner::initDpBounds() {
}
}
-void SplitAligner::initForOneQuery(std::vector<UnsplitAlignment>::const_iterator beg,
- std::vector<UnsplitAlignment>::const_iterator end) {
+void SplitAligner::layout(std::vector<UnsplitAlignment>::const_iterator beg,
+ std::vector<UnsplitAlignment>::const_iterator end) {
assert(end > beg);
numAlns = end - beg;
alns = beg;
- initDpBounds();
- calcScoreMatrices();
-
sortedAlnIndices.resize(numAlns);
for (unsigned i = 0; i < numAlns; ++i) sortedAlnIndices[i] = i;
oldInplayAlnIndices.resize(numAlns);
@@ -893,13 +859,47 @@ void SplitAligner::initForOneQuery(std::vector<UnsplitAlignment>::const_iterator
rEnds.resize(numAlns);
if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
- initSpliceCoords();
- if (!chromosomeIndex.empty()) initSpliceSignals();
+ initRbegsAndEnds();
//initRnameAndStrandIds();
}
initRnameAndStrandIds();
- initForwardBackward();
+ initDpBounds();
+}
+
+size_t SplitAligner::memory(bool isViterbi, bool isBothSpliceStrands) const {
+ size_t numOfStrands = isBothSpliceStrands ? 2 : 1;
+ size_t x = 2 * sizeof(int) + 2 * sizeof(double);
+ if (splicePrior > 0 || !chromosomeIndex.empty()) x += 2 * sizeof(unsigned);
+ if (!chromosomeIndex.empty()) x += 2;
+ if (isViterbi) x += sizeof(long) * numOfStrands;
+ x += 2 * sizeof(double) * numOfStrands;
+ return x * cellsPerDpMatrix();
+}
+
+void SplitAligner::initMatricesForOneQuery() {
+ resizeMatrix(Amat);
+ resizeMatrix(Dmat);
+ resizeMatrix(Aexp);
+ resizeMatrix(Dexp);
+
+ for (unsigned i = 0; i < numAlns; i++) calcBaseScores(i);
+
+ if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
+ resizeMatrix(spliceBegCoords);
+ resizeMatrix(spliceEndCoords);
+ for (unsigned i = 0; i < numAlns; ++i) initSpliceCoords(i);
+ }
+
+ if (!chromosomeIndex.empty()) {
+ resizeMatrix(spliceBegSignals);
+ resizeMatrix(spliceEndSignals);
+ for (unsigned i = 0; i < numAlns; ++i) initSpliceSignals(i);
+ }
+
+ transform(Amat.begin(), Amat.end(), Aexp.begin(), scaledExp);
+ transform(Dmat.begin(), Dmat.end(), Dexp.begin(), scaledExp);
+ // if x/scale < about -745, then exp(x/scale) will be exactly 0.0
}
void SplitAligner::flipSpliceSignals() {
@@ -951,14 +951,18 @@ void SplitAligner::setSpliceParams(double splicePriorIn,
if (splicePrior <= 0.0) return;
const double rootTwoPi = std::sqrt(8.0 * std::atan(1.0));
+ double s2 = sdevLogDist * sdevLogDist;
spliceTerm1 = -std::log(sdevLogDist * rootTwoPi / splicePrior);
- spliceTerm2 = -1.0 / (2.0 * sdevLogDist * sdevLogDist);
+ spliceTerm2 = -0.5 / s2;
+
+ double max1 = spliceTerm1 - meanLogDist + s2 * 0.5;
+ int max2 = std::floor(scale * max1 + 0.5);
+ maxSpliceScore = std::max(max2, jumpScore);
// Set maxSpliceDist so as to ignore splices whose score would be
// less than jumpScore. By solving this quadratic equation:
// spliceTerm1 + spliceTerm2 * (logDist - meanLogDist)^2 - logDist =
// jumpScore / scale
- double s2 = sdevLogDist * sdevLogDist;
double r = s2 + 2 * (spliceTerm1 - meanLogDist - jumpScore / scale);
if (r < 0) {
maxSpliceDist = 0;
@@ -1053,6 +1057,8 @@ void SplitAligner::setSpliceSignals() {
spliceBegProbs[i] = scaledExp(spliceBegScores[i]);
spliceEndProbs[i] = scaledExp(spliceEndScores[i]);
}
+
+ maxSpliceBegEndScore = arrayMax(spliceBegScores) + arrayMax(spliceEndScores);
}
void SplitAligner::printParameters() const {
@@ -1080,10 +1086,11 @@ void SplitAligner::printParameters() const {
}
}
-void SplitAligner::readGenome(const std::string& baseName) {
- std::string alphabetLetters;
- unsigned seqCount = -1;
- unsigned volumes = -1;
+static void readPrjFile(const std::string& baseName,
+ std::string& alphabetLetters,
+ unsigned& seqCount,
+ unsigned& volumes) {
+ seqCount = volumes = -1;
std::string fileName = baseName + ".prj";
std::ifstream f(fileName.c_str());
@@ -1098,19 +1105,40 @@ void SplitAligner::readGenome(const std::string& baseName) {
if( word == "volumes" ) iss >> volumes;
}
- if (alphabetLetters != "ACGT" || seqCount+1 == 0)
- err("can't read file: " + fileName);
+ if (alphabetLetters != "ACGT") err("can't read file: " + fileName);
+}
- if (volumes+1 > 0 && volumes > 1)
- err("can't read multi-volume lastdb files, sorry");
+void SplitAligner::readGenomeVolume(const std::string& baseName,
+ unsigned seqCount,
+ unsigned volumeNumber) {
+ if (seqCount + 1 == 0) err("can't read: " + baseName);
- genome.fromFiles(baseName, seqCount, 0);
+ genome[volumeNumber].fromFiles(baseName, seqCount, 0);
- for (unsigned i = 0; i < seqCount; ++i) {
- std::string n = genome.seqName(i);
- if (!chromosomeIndex.insert(std::make_pair(n, i)).second)
+ for (unsigned long long i = 0; i < seqCount; ++i) {
+ std::string n = genome[volumeNumber].seqName(i);
+ unsigned long long j = i * maxGenomeVolumes() + volumeNumber;
+ if (!chromosomeIndex.insert(std::make_pair(n, j)).second)
err("duplicate sequence name: " + n);
}
+}
+
+void SplitAligner::readGenome(const std::string& baseName) {
+ std::string alphabetLetters;
+ unsigned seqCount, volumes;
+ readPrjFile(baseName, alphabetLetters, seqCount, volumes);
+
+ if (volumes + 1 > 0 && volumes > 1) {
+ if (volumes > maxGenomeVolumes()) err("too many volumes: " + baseName);
+ for (unsigned i = 0; i < volumes; ++i) {
+ std::string b = baseName + stringify(i);
+ unsigned c, v;
+ readPrjFile(b, alphabetLetters, c, v);
+ readGenomeVolume(b, c, i);
+ }
+ } else {
+ readGenomeVolume(baseName, seqCount, 0);
+ }
alphabet.fromString(alphabetLetters);
}
diff --git a/src/split/cbrc_split_aligner.hh b/src/split/cbrc_split_aligner.hh
index 73161dc..53db5b0 100644
--- a/src/split/cbrc_split_aligner.hh
+++ b/src/split/cbrc_split_aligner.hh
@@ -59,9 +59,20 @@ public:
// Outputs some algorithm parameters on lines starting with "#"
void printParameters() const;
- // Prepares to analyze some candidate alignments for one query sequence
- void initForOneQuery(std::vector<UnsplitAlignment>::const_iterator beg,
- std::vector<UnsplitAlignment>::const_iterator end);
+ // Prepares to analyze some candidate alignments for one query
+ // sequence: sets the number of DP matrix cells (and thus memory)
+ void layout(std::vector<UnsplitAlignment>::const_iterator beg,
+ std::vector<UnsplitAlignment>::const_iterator end);
+
+ // The number of cells in each dynamic programming matrix
+ size_t cellsPerDpMatrix() const
+ { return matrixRowOrigins[numAlns-1] + dpEnd(numAlns-1) + 1; }
+
+ // Bytes of memory needed for the current query sequence (roughly)
+ size_t memory(bool isViterbi, bool isBothSpliceStrands) const;
+
+ // Call this before viterbi/forward/backward, and after layout
+ void initMatricesForOneQuery();
long viterbi(); // returns the optimal split-alignment score
@@ -143,6 +154,8 @@ private:
double spliceTerm1;
double spliceTerm2;
unsigned maxSpliceDist;
+ int maxSpliceScore;
+ int maxSpliceBegEndScore;
std::vector<unsigned> spliceBegCoords;
std::vector<unsigned> spliceEndCoords;
std::vector<unsigned char> spliceBegSignals;
@@ -153,31 +166,29 @@ private:
std::vector<int> spliceScoreTable; // lookup table
std::vector<double> spliceProbTable; // lookup table
unsigned spliceTableSize;
- MultiSequence genome;
+ MultiSequence genome[32];
Alphabet alphabet;
- typedef std::map<std::string, unsigned> StringNumMap;
+ typedef std::map<std::string, unsigned long long> StringNumMap;
StringNumMap chromosomeIndex;
int spliceBegScores[4 * 4 + 1]; // donor score for any dinucleotide
int spliceEndScores[4 * 4 + 1]; // acceptor score for any dinucleotide
double spliceBegProbs[4 * 4 + 1];
double spliceEndProbs[4 * 4 + 1];
- unsigned spliceBegSignal(unsigned coordinate, char strand) const;
- unsigned spliceEndSignal(unsigned coordinate, char strand) const;
- int spliceBegScore(unsigned i, unsigned j) const {
+ int spliceBegScore(size_t ij) const {
if (chromosomeIndex.empty()) return 0;
- return spliceBegScores[cell(spliceBegSignals, i, j)];
+ return spliceBegScores[spliceBegSignals[ij]];
}
- int spliceEndScore(unsigned i, unsigned j) const {
+ int spliceEndScore(size_t ij) const {
if (chromosomeIndex.empty()) return 0;
- return spliceEndScores[cell(spliceEndSignals, i, j)];
+ return spliceEndScores[spliceEndSignals[ij]];
}
- double spliceBegProb(unsigned i, unsigned j) const {
+ double spliceBegProb(size_t ij) const {
if (chromosomeIndex.empty()) return 1;
- return spliceBegProbs[cell(spliceBegSignals, i, j)];
+ return spliceBegProbs[spliceBegSignals[ij]];
}
- double spliceEndProb(unsigned i, unsigned j) const {
+ double spliceEndProb(size_t ij) const {
if (chromosomeIndex.empty()) return 1;
- return spliceEndProbs[cell(spliceEndSignals, i, j)];
+ return spliceEndProbs[spliceEndSignals[ij]];
}
int calcSpliceScore(double dist) const;
int spliceScore(unsigned d) const
@@ -186,11 +197,16 @@ private:
{ return scaledExp(calcSpliceScore(dist)); }
double spliceProb(unsigned d) const
{ return d < spliceTableSize ? spliceProbTable[d] : calcSpliceProb(d); }
- void initSpliceCoords();
- void initSpliceSignals();
+ void initSpliceCoords(unsigned i);
+ void initSpliceSignals(unsigned i);
void initRnameAndStrandIds();
+ void initRbegsAndEnds();
+ static size_t maxGenomeVolumes() { return sizeof genome / sizeof *genome; }
+ void readGenomeVolume(const std::string& baseName,
+ unsigned seqCount, unsigned volumeNumber);
- int maxJumpScore() const;
+ void dpExtensionMinScores(int maxJumpScore,
+ size_t& minScore1, size_t& minScore2) const;
void updateInplayAlnIndicesF(unsigned& sortedAlnPos,
unsigned& oldNumInplay,
@@ -239,7 +255,7 @@ private:
// stored in a flat vector. There are numAlns rows, and row i
// has dpEnd(i) - dpBeg(i) + 1 cells. (The final cell per row
// is used in some matrices but not others.)
- m.resize(matrixRowOrigins[numAlns-1] + dpEnd(numAlns-1) + 1);
+ m.resize(cellsPerDpMatrix());
}
double probFromSpliceF(unsigned i, unsigned j, unsigned oldNumInplay,
@@ -249,15 +265,7 @@ private:
unsigned& oldInplayPos) const;
void calcBaseScores(unsigned i);
- void calcInsScores(unsigned i);
- void calcDelScores(unsigned i);
- void calcScoreMatrices();
- void initForwardBackward();
void initDpBounds();
-
- long scoreIndel(unsigned i, unsigned j) const {
- return cell(Vmat, i, j) + cell(Dmat, i, j);
- }
};
}
diff --git a/src/split/last-split-main.cc b/src/split/last-split-main.cc
index 01cea8f..6453048 100644
--- a/src/split/last-split-main.cc
+++ b/src/split/last-split-main.cc
@@ -11,6 +11,16 @@
#include <cstdlib> // EXIT_SUCCESS, EXIT_FAILURE
#include <iostream>
+static size_t defaultBytes(bool isSplicedAlignment) {
+ size_t b = isSplicedAlignment ? 8 : 8 * 1024;
+ for (int i = 0; i < 3; ++i) {
+ size_t n = b * 1024;
+ if (n / 1024 != b) return -1;
+ b = n;
+ }
+ return b;
+}
+
static void run(int argc, char* argv[]) {
LastSplitOptions opts;
@@ -22,6 +32,7 @@ static void run(int argc, char* argv[]) {
opts.mismap = 0.01;
opts.score = -1;
opts.no_split = false;
+ opts.bytes = 0;
opts.verbose = false;
opts.isSplicedAlignment = false;
@@ -58,11 +69,12 @@ Options:\n\
+ cbrc::stringify(opts.mismap) + ")\n\
-s, --score=INT minimum alignment score (default=e OR e+t*ln[100])\n\
-n, --no-split write original, not split, alignments\n\
+ -b, --bytes=B maximum memory (default=8T for split, 8G for spliced)\n\
-v, --verbose be verbose\n\
-V, --version show version information and exit\n\
";
- const char sOpts[] = "hg:d:c:t:M:S:m:s:nvV";
+ const char sOpts[] = "hg:d:c:t:M:S:m:s:nb:vV";
static struct option lOpts[] = {
{ "help", no_argument, 0, 'h' },
@@ -75,6 +87,7 @@ Options:\n\
{ "mismap", required_argument, 0, 'm' },
{ "score", required_argument, 0, 's' },
{ "no-split", no_argument, 0, 'n' },
+ { "bytes", required_argument, 0, 'b' },
{ "verbose", no_argument, 0, 'v' },
{ "version", no_argument, 0, 'V' },
{ 0, 0, 0, 0}
@@ -119,6 +132,9 @@ Options:\n\
case 'n':
opts.no_split = true;
break;
+ case 'b':
+ cbrc::unstringifySize(opts.bytes, optarg);
+ break;
case 'v':
opts.verbose = true;
break;
@@ -130,6 +146,8 @@ Options:\n\
}
}
+ if (!opts.bytes) opts.bytes = defaultBytes(opts.isSplicedAlignment);
+
opts.inputFileNames.assign(argv + optind, argv + argc);
if (opts.inputFileNames.empty()) opts.inputFileNames.push_back("-");
diff --git a/src/split/last-split.cc b/src/split/last-split.cc
index 7db0432..b18a156 100644
--- a/src/split/last-split.cc
+++ b/src/split/last-split.cc
@@ -127,7 +127,16 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg,
cbrc::SplitAligner& sa, const LastSplitOptions& opts,
bool isAlreadySplit) {
if (opts.verbose) std::cerr << beg->qname << "\t" << (end - beg);
- sa.initForOneQuery(beg, end);
+ sa.layout(beg, end);
+ if (opts.verbose) std::cerr << "\tcells=" << sa.cellsPerDpMatrix();
+ size_t bytes = sa.memory(!opts.no_split, opts.direction == 2);
+ if (bytes > opts.bytes) {
+ if (opts.verbose) std::cerr << "\n";
+ std::cerr << "last-split: skipping sequence " << beg->qname
+ << " (" << bytes << " bytes)\n";
+ return;
+ }
+ sa.initMatricesForOneQuery();
if (opts.direction != 0) {
sa.forward();
diff --git a/src/split/last-split.hh b/src/split/last-split.hh
index 96a86c3..5c13c7f 100644
--- a/src/split/last-split.hh
+++ b/src/split/last-split.hh
@@ -13,6 +13,7 @@
#ifndef LAST_SPLIT_HH
#define LAST_SPLIT_HH
+#include <stddef.h> // size_t
#include <string>
#include <vector>
@@ -26,6 +27,7 @@ struct LastSplitOptions {
double mismap;
int score;
bool no_split;
+ size_t bytes;
bool verbose;
bool isSplicedAlignment;
std::vector<std::string> inputFileNames;
diff --git a/src/stringify.hh b/src/stringify.hh
index ea846c7..6971726 100644
--- a/src/stringify.hh
+++ b/src/stringify.hh
@@ -38,15 +38,20 @@ void unstringifySize( T& x, const std::string& s ){
std::string suffix;
if( iss >> suffix ){
- int multiplier;
- /**/ if( suffix == "K" ) multiplier = 1024; // "KibiBytes"
- else if( suffix == "M" ) multiplier = 1024*1024; // "MebiBytes"
- else if( suffix == "G" ) multiplier = 1024*1024*1024; // "GibiBytes"
+ int i;
+ /**/ if( suffix == "K" ) i = 1; // "KibiBytes"
+ else if( suffix == "M" ) i = 2; // "MebiBytes"
+ else if( suffix == "G" ) i = 3; // "GibiBytes"
+ else if( suffix == "T" ) i = 4; // "TebiBytes"
+ else if( suffix == "P" ) i = 5; // "PebiBytes"
else throw std::runtime_error( "can't interpret: " + s );
- if( (x * multiplier) / multiplier != x ){ // check for overflow
- throw std::runtime_error( "can't interpret (too big): " + s );
+
+ while( i-- ){
+ if( (x * 1024) / 1024 != x ){
+ throw std::runtime_error( "can't interpret (too big): " + s );
+ }
+ x *= 1024;
}
- x *= multiplier;
}
if( !(iss >> std::ws).eof() ){
@@ -54,5 +59,6 @@ void unstringifySize( T& x, const std::string& s ){
}
}
-} // end namespace cbrc
-#endif // STRINGIFY_HH
+}
+
+#endif
diff --git a/src/version.hh b/src/version.hh
index 406a230..9420baa 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"759"
+"809"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list