[med-svn] [last-align] 01/08: New upstream version 921
Andreas Tille
tille at debian.org
Mon Feb 5 09:44:33 UTC 2018
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository last-align.
commit 86ec81874c63279f774ca37a17478fe534085b7c
Author: Andreas Tille <tille at debian.org>
Date: Mon Feb 5 10:29:16 2018 +0100
New upstream version 921
---
ChangeLog.txt | 177 +++++++-
doc/last-dotplot.html | 99 +++--
doc/last-dotplot.txt | 82 +++-
doc/last-train.html | 3 +
doc/last-train.txt | 2 +
scripts/last-dotplot | 829 ++++++++++++++++++++++++------------
scripts/last-train | 3 +
src/Alignment.cc | 36 +-
src/Alignment.hh | 13 +-
src/AlignmentWrite.cc | 70 +--
src/Centroid.cc | 290 ++++++-------
src/Centroid.hh | 73 ++--
src/CyclicSubsetSeed.cc | 9 +-
src/LastEvaluer.cc | 10 +-
src/LastEvaluer.hh | 3 +-
src/LastdbArguments.cc | 13 +-
src/LastdbArguments.hh | 1 +
src/MultiSequence.cc | 92 +++-
src/MultiSequence.hh | 90 ++--
src/MultiSequenceQual.cc | 56 +--
src/ScoreMatrix.cc | 7 +-
src/alp/sls_pvalues.cpp | 4 +-
src/io.cc | 23 -
src/io.hh | 61 +--
src/last-pair-probs.cc | 15 +-
src/last.hh | 55 +++
src/lastal.cc | 76 +---
src/lastdb.cc | 69 ++-
src/makefile | 45 +-
src/mcf_zstream.hh | 4 +-
src/split/cbrc_split_aligner.cc | 10 +-
src/split/cbrc_unsplit_alignment.cc | 3 +-
src/split/cbrc_unsplit_alignment.hh | 2 +
src/split/last-split.cc | 2 +-
src/version.hh | 2 +-
src/zio.hh | 37 ++
36 files changed, 1462 insertions(+), 904 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 7c46466..eb4f5b2 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,8 +1,183 @@
+2018-01-05 Martin C. Frith <Martin C. Frith>
+
+ * src/Alignment.cc, src/Alignment.hh, src/Centroid.cc,
+ src/Centroid.hh:
+ Refactor
+ [8430dbf5abe7] [tip]
+
+ * src/Centroid.cc:
+ Try to calculate lastal probs a tiny bit faster
+ [1abf5130f283]
+
+ * src/Alignment.cc, src/Alignment.hh, src/AlignmentWrite.cc,
+ src/Centroid.cc, src/Centroid.hh:
+ Refactor
+ [57012f1307c8]
+
+ * src/Alignment.cc, src/Centroid.cc, src/Centroid.hh:
+ Refactor
+ [c98504d1610e]
+
+ * src/Centroid.cc, src/Centroid.hh:
+ Refactor
+ [c83131b68177]
+
+2017-12-21 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ last-dotplot: add strand/orientation options
+ [ffe68ba3d865]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [6079fabb49a5]
+
+2017-12-05 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ last-dotplot: add alignment-order options
+ [f4ca19d126e3]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [e87abc7ae9c6]
+
+2017-11-08 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ Tweak last-dotplot range-end pixels
+ [8834139fa8a8]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Add last-dotplot secondary alignments option
+ [4080f3f6f73d]
+
+ * scripts/last-dotplot:
+ Add gap-cutting options to last-dotplot
+ [42653029d900]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Remove last-dotplot trim options
+ [92dfad507286]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [6840398323e1]
+
+ * scripts/last-dotplot:
+ Tweak last-dotplot's sort options
+ [87991b57ed04]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [21fbf377641f]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [8d52cef22afa]
+
+ * scripts/last-dotplot:
+ Tweak dotplot labels
+ [c2f50cd4e29d]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Make last-dotplot try to find a nice font
+ [3af6b02e3f9c]
+
+ * src/CyclicSubsetSeed.cc, src/ScoreMatrix.cc, src/io.cc, src/io.hh,
+ src/last-pair-probs.cc, src/lastal.cc, src/lastdb.cc, src/makefile,
+ src/zio.hh:
+ Try to fix build fails on some computers
+ [db84a7e4ff8a]
+
+2017-11-02 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ Tweak last-dotplot's label placement
+ [2003b209d750]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [5ce3e786e05a]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [ccf8902bf86c]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Change last-dotplot's label options
+ [f6a9c15287ea]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [f7bc7d099e2b]
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [2238466f36bd]
+
+ * scripts/last-dotplot:
+ Tweak the dotplot colors
+ [8eb06508b3ed]
+
+2017-10-24 Martin C. Frith <Martin C. Frith>
+
+ * src/CyclicSubsetSeed.cc, src/MultiSequence.cc, src/MultiSequence.hh,
+ src/MultiSequenceQual.cc, src/ScoreMatrix.cc:
+ Make the code a bit more robust
+ [efb59c80bd48]
+
+ * src/split/cbrc_split_aligner.cc,
+ src/split/cbrc_unsplit_alignment.cc,
+ src/split/cbrc_unsplit_alignment.hh, src/split/last-split.cc:
+ Make last-split work with lastdb strand option
+ [e6f6334cedb8]
+
+ * doc/last-train.txt, scripts/last-train, src/LastEvaluer.cc,
+ src/LastEvaluer.hh, src/alp/sls_pvalues.cpp, src/lastal.cc:
+ Fix some E-value & last-train fails
+ [c208904d9c3e]
+
+2017-10-20 Martin C. Frith <Martin C. Frith>
+
+ * src/AlignmentWrite.cc, src/LastdbArguments.cc,
+ src/LastdbArguments.hh, src/MultiSequence.cc, src/MultiSequence.hh,
+ src/lastdb.cc:
+ Add lastdb strand option
+ [bef94bc275d5]
+
+ * src/MultiSequence.hh, src/last.hh, src/lastal.cc, src/lastdb.cc,
+ src/makefile, src/mcf_zstream.hh:
+ Refactor
+ [a4cdd7fbaee2]
+
+2017-10-19 Martin C. Frith <Martin C. Frith>
+
+ * src/Alignment.hh, src/AlignmentWrite.cc, src/MultiSequence.cc,
+ src/MultiSequence.hh, src/lastal.cc:
+ Start implementing reverse strands in lastdb
+ [5187bd412fb6]
+
+ * src/MultiSequence.cc, src/MultiSequence.hh,
+ src/MultiSequenceQual.cc:
+ Refactor
+ [3ba00f2e93e0]
+
+ * src/MultiSequence.cc, src/MultiSequence.hh,
+ src/MultiSequenceQual.cc, src/lastdb.cc:
+ Refactor
+ [69da6e43d607]
+
+ * src/LastdbArguments.cc, src/MultiSequence.cc, src/MultiSequence.hh,
+ src/lastal.cc:
+ Refactor
+ [284c69b1bf3e]
+
2017-10-10 Martin C. Frith <Martin C. Frith>
* scripts/last-train:
Make last-train print the lastal version
- [c2ce08b5a76a] [tip]
+ [c2ce08b5a76a]
* src/Alphabet.cc, src/Alphabet.hh, src/MultiSequence.hh,
src/lastal.cc, src/lastdb.cc:
diff --git a/doc/last-dotplot.html b/doc/last-dotplot.html
index 21cc1d9..8ebcced 100644
--- a/doc/last-dotplot.html
+++ b/doc/last-dotplot.html
@@ -329,16 +329,12 @@ last-dotplot my-alignments my-plot.png
<pre class="literal-block">
last-dotplot alns alns.gif
</pre>
-<p>To get a nicer font, try something like:</p>
-<pre class="literal-block">
-last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
-</pre>
-<p>or:</p>
-<pre class="literal-block">
-last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png
-</pre>
-<p>If the fonts are located somewhere different on your computer, change
-this as appropriate.</p>
+<div class="section" id="terminology">
+<h2>Terminology</h2>
+<p>last-dotplot shows alignments of one set of sequences against another
+set of sequences. This document calls a "set of sequences" a
+"genome", though it need not actually be a genome.</p>
+</div>
<div class="section" id="choosing-sequences">
<h2>Choosing sequences</h2>
<p>If there are too many sequences, the dotplot will be very cluttered,
@@ -387,6 +383,14 @@ last-dotplot -1 'chr?' -1 'chr??' alns alns.png
last-dotplot -1 chr9:0-1000 alns alns.png
</pre>
</div>
+<div class="section" id="fonts">
+<h2>Fonts</h2>
+<p>last-dotplot tries to find a nice font on your computer, but may fail
+and use an ugly font. You can specify a font like this:</p>
+<pre class="literal-block">
+last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
+</pre>
+</div>
<div class="section" id="options">
<h2>Options</h2>
<blockquote>
@@ -419,25 +423,64 @@ last-dotplot -1 chr9:0-1000 alns alns.png
<kbd><span class="option">-r <var>COLOR</var></span>, <span class="option">--reversecolor=<var>COLOR</var></span></kbd></td>
<td>Color for reverse alignments.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">--alignments=<var>FILE</var></span></kbd></td>
+<td>Read secondary alignments. For example: we could use primary
+alignment data with one human DNA read aligned to the human
+genome, and secondary alignment data with the whole chimpanzee
+versus human genomes. last-dotplot will show the parts of the
+secondary alignments that are near the primary alignments.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">--sort1=<var>N</var></span></kbd></td>
<td>Put the 1st genome's sequences left-to-right in order of: their
-appearance in the input (0), their names (1), their lengths (2).</td></tr>
+appearance in the input (0), their names (1), their lengths (2),
+the top-to-bottom order of (the midpoints of) their alignments
+(3). You can use two colon-separated values, e.g. "2:1" to
+specify 2 for primary and 1 for secondary alignments.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">--sort2=<var>N</var></span></kbd></td>
<td>Put the 2nd genome's sequences top-to-bottom in order of: their
-appearance in the input (0), their names (1), their lengths (2).</td></tr>
-<tr><td class="option-group">
-<kbd><span class="option">--trim1</span></kbd></td>
-<td>Trim unaligned sequence flanks from the 1st (horizontal) genome.</td></tr>
-<tr><td class="option-group">
-<kbd><span class="option">--trim2</span></kbd></td>
-<td>Trim unaligned sequence flanks from the 2nd (vertical) genome.</td></tr>
+appearance in the input (0), their names (1), their lengths (2),
+the left-to-right order of (the midpoints of) their alignments
+(3).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--strands1=<var>N</var></span></kbd></td>
+<td>Put the 1st genome's sequences: in forwards orientation (0), in
+the orientation of most of their aligned bases (1). In the
+latter case, the labels will be colored (in the same way as the
+alignments) to indicate the sequences' orientations. You can
+use two colon-separated values for primary and secondary
+alignments.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--strands2=<var>N</var></span></kbd></td>
+<td>Put the 2nd genome's sequences: in forwards orientation (0), in
+the orientation of most of their aligned bases (1).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--max-gap1=<var>FRAC</var></span></kbd></td>
+<td>Maximum unaligned gap in the 1st genome. For example, if two
+parts of one DNA read align to widely-separated parts of one
+chromosome, it's probably best to cut the intervening region
+from the dotplot. FRAC is a fraction of the length of the
+(primary) alignments. You can specify "inf" to keep all
+unaligned gaps. You can use two comma-separated values,
+e.g. "0.5,3" to specify 0.5 for end-gaps (unaligned sequence
+ends) and 3 for mid-gaps (between alignments). You can use two
+colon-separated values (each of which may be comma-separated)
+for primary and secondary alignments.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--max-gap2=<var>FRAC</var></span></kbd></td>
+<td>Maximum unaligned gap in the 2nd genome.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--pad=<var>FRAC</var></span></kbd></td>
+<td>Length of pad to leave when cutting unaligned gaps.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">--border-pixels=<var>INT</var></span></kbd></td>
<td>Number of pixels between sequences.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">--border-color=<var>COLOR</var></span></kbd></td>
<td>Color for pixels between sequences.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--margin-color=<var>COLOR</var></span></kbd></td>
+<td>Color for the margins.</td></tr>
</tbody>
</table>
</blockquote>
@@ -455,17 +498,21 @@ appearance in the input (0), their names (1), their lengths (2).</td></tr>
<kbd><span class="option">-s <var>SIZE</var></span>, <span class="option">--fontsize=<var>SIZE</var></span></kbd></td>
<td>TrueType or OpenType font size.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">--labels1=<var>N</var></span></kbd></td>
+<td>Label the displayed regions of the 1st genome with their:
+sequence name (0), name:length (1), name:start:length (2),
+name:start-end (3).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--labels2=<var>N</var></span></kbd></td>
+<td>Label the displayed regions of the 2nd genome with their:
+sequence name (0), name:length (1), name:start:length (2),
+name:start-end (3).</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">--rot1=<var>ROT</var></span></kbd></td>
<td>Text rotation for the 1st genome: h(orizontal) or v(ertical).</td></tr>
<tr><td class="option-group">
<kbd><span class="option">--rot2=<var>ROT</var></span></kbd></td>
<td>Text rotation for the 2nd genome: h(orizontal) or v(ertical).</td></tr>
-<tr><td class="option-group">
-<kbd><span class="option">--lengths1</span></kbd></td>
-<td>Show sequence lengths for the 1st (horizontal) genome.</td></tr>
-<tr><td class="option-group">
-<kbd><span class="option">--lengths2</span></kbd></td>
-<td>Show sequence lengths for the 2nd (vertical) genome.</td></tr>
</tbody>
</table>
</blockquote>
@@ -554,11 +601,11 @@ of unknown sequence.</p>
<p>An unsequenced gap will be shown only if it covers at least one whole
pixel.</p>
</div>
+</div>
<div class="section" id="colors">
-<h3>Colors</h3>
+<h2>Colors</h2>
<p>Colors can be specified in <a class="reference external" href="http://effbot.org/imagingbook/imagecolor.htm">various ways described here</a>.</p>
</div>
</div>
-</div>
</body>
</html>
diff --git a/doc/last-dotplot.txt b/doc/last-dotplot.txt
index 2c1ff46..a908a88 100644
--- a/doc/last-dotplot.txt
+++ b/doc/last-dotplot.txt
@@ -12,16 +12,12 @@ The output can be in any format supported by the Imaging Library::
last-dotplot alns alns.gif
-To get a nicer font, try something like::
+Terminology
+-----------
- last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
-
-or::
-
- last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png
-
-If the fonts are located somewhere different on your computer, change
-this as appropriate.
+last-dotplot shows alignments of one set of sequences against another
+set of sequences. This document calls a "set of sequences" a
+"genome", though it need not actually be a genome.
Choosing sequences
------------------
@@ -59,6 +55,14 @@ You can also specify a sequence range; for example this gets the first
last-dotplot -1 chr9:0-1000 alns alns.png
+Fonts
+-----
+
+last-dotplot tries to find a nice font on your computer, but may fail
+and use an ugly font. You can specify a font like this::
+
+ last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
+
Options
-------
@@ -78,20 +82,54 @@ Options
Color for forward alignments.
-r COLOR, --reversecolor=COLOR
Color for reverse alignments.
+ --alignments=FILE
+ Read secondary alignments. For example: we could use primary
+ alignment data with one human DNA read aligned to the human
+ genome, and secondary alignment data with the whole chimpanzee
+ versus human genomes. last-dotplot will show the parts of the
+ secondary alignments that are near the primary alignments.
--sort1=N
Put the 1st genome's sequences left-to-right in order of: their
- appearance in the input (0), their names (1), their lengths (2).
+ appearance in the input (0), their names (1), their lengths (2),
+ the top-to-bottom order of (the midpoints of) their alignments
+ (3). You can use two colon-separated values, e.g. "2:1" to
+ specify 2 for primary and 1 for secondary alignments.
--sort2=N
Put the 2nd genome's sequences top-to-bottom in order of: their
- appearance in the input (0), their names (1), their lengths (2).
- --trim1
- Trim unaligned sequence flanks from the 1st (horizontal) genome.
- --trim2
- Trim unaligned sequence flanks from the 2nd (vertical) genome.
+ appearance in the input (0), their names (1), their lengths (2),
+ the left-to-right order of (the midpoints of) their alignments
+ (3).
+ --strands1=N
+ Put the 1st genome's sequences: in forwards orientation (0), in
+ the orientation of most of their aligned bases (1). In the
+ latter case, the labels will be colored (in the same way as the
+ alignments) to indicate the sequences' orientations. You can
+ use two colon-separated values for primary and secondary
+ alignments.
+ --strands2=N
+ Put the 2nd genome's sequences: in forwards orientation (0), in
+ the orientation of most of their aligned bases (1).
+ --max-gap1=FRAC
+ Maximum unaligned gap in the 1st genome. For example, if two
+ parts of one DNA read align to widely-separated parts of one
+ chromosome, it's probably best to cut the intervening region
+ from the dotplot. FRAC is a fraction of the length of the
+ (primary) alignments. You can specify "inf" to keep all
+ unaligned gaps. You can use two comma-separated values,
+ e.g. "0.5,3" to specify 0.5 for end-gaps (unaligned sequence
+ ends) and 3 for mid-gaps (between alignments). You can use two
+ colon-separated values (each of which may be comma-separated)
+ for primary and secondary alignments.
+ --max-gap2=FRAC
+ Maximum unaligned gap in the 2nd genome.
+ --pad=FRAC
+ Length of pad to leave when cutting unaligned gaps.
--border-pixels=INT
Number of pixels between sequences.
--border-color=COLOR
Color for pixels between sequences.
+ --margin-color=COLOR
+ Color for the margins.
Text options
~~~~~~~~~~~~
@@ -100,14 +138,18 @@ Text options
TrueType or OpenType font file.
-s SIZE, --fontsize=SIZE
TrueType or OpenType font size.
+ --labels1=N
+ Label the displayed regions of the 1st genome with their:
+ sequence name (0), name:length (1), name:start:length (2),
+ name:start-end (3).
+ --labels2=N
+ Label the displayed regions of the 2nd genome with their:
+ sequence name (0), name:length (1), name:start:length (2),
+ name:start-end (3).
--rot1=ROT
Text rotation for the 1st genome: h(orizontal) or v(ertical).
--rot2=ROT
Text rotation for the 2nd genome: h(orizontal) or v(ertical).
- --lengths1
- Show sequence lengths for the 1st (horizontal) genome.
- --lengths2
- Show sequence lengths for the 2nd (vertical) genome.
Annotation options
~~~~~~~~~~~~~~~~~~
@@ -166,7 +208,7 @@ An unsequenced gap will be shown only if it covers at least one whole
pixel.
Colors
-~~~~~~
+------
Colors can be specified in `various ways described here
<http://effbot.org/imagingbook/imagecolor.htm>`_.
diff --git a/doc/last-train.html b/doc/last-train.html
index f510a51..ee0fee2 100644
--- a/doc/last-train.html
+++ b/doc/last-train.html
@@ -346,6 +346,9 @@ zcat queries.fasta.gz | last-train mydb
<tr><td class="option-group">
<kbd><span class="option">-h</span>, <span class="option">--help</span></kbd></td>
<td>Show a help message, with default option values, and exit.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">-v</span>, <span class="option">--verbose</span></kbd></td>
+<td>Show more details of intermediate steps.</td></tr>
</tbody>
</table>
</blockquote>
diff --git a/doc/last-train.txt b/doc/last-train.txt
index c965ff3..89ae977 100644
--- a/doc/last-train.txt
+++ b/doc/last-train.txt
@@ -27,6 +27,8 @@ Options
-h, --help
Show a help message, with default option values, and exit.
+ -v, --verbose
+ Show more details of intermediate steps.
Training options
~~~~~~~~~~~~~~~~
diff --git a/scripts/last-dotplot b/scripts/last-dotplot
index f29fed2..419ba82 100755
--- a/scripts/last-dotplot
+++ b/scripts/last-dotplot
@@ -9,14 +9,21 @@
# according to the number of aligned nt-pairs within it, but the
# result is too faint. How can this be done better?
+import collections
+import functools
import gzip
-import fnmatch, itertools, optparse, os, re, sys
+from fnmatch import fnmatchcase
+from operator import itemgetter
+import subprocess
+import itertools, optparse, os, re, sys
# Try to make PIL/PILLOW work:
try: from PIL import Image, ImageDraw, ImageFont, ImageColor
except ImportError: import Image, ImageDraw, ImageFont, ImageColor
def myOpen(fileName): # faster than fileinput
+ if fileName is None:
+ return []
if fileName == "-":
return sys.stdin
if fileName.endswith(".gz"):
@@ -28,20 +35,29 @@ def warn(message):
prog = os.path.basename(sys.argv[0])
sys.stderr.write(prog + ": " + message + "\n")
-def croppedBlocks(blocks, range1, range2):
- cropBeg1, cropEnd1 = range1
- cropBeg2, cropEnd2 = range2
- if blocks[0][0] < 0: cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
- if blocks[0][1] < 0: cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
- for beg1, beg2, size in blocks:
- b1 = max(cropBeg1, beg1)
- e1 = min(cropEnd1, beg1 + size)
- if b1 >= e1: continue
- offset = beg2 - beg1
- b2 = max(cropBeg2, b1 + offset)
- e2 = min(cropEnd2, e1 + offset)
- if b2 >= e2: continue
- yield b2 - offset, b2, e2 - b2
+def groupByFirstItem(things):
+ for k, v in itertools.groupby(things, itemgetter(0)):
+ yield k, [i[1:] for i in v]
+
+def croppedBlocks(blocks, ranges1, ranges2):
+ headBeg1, headBeg2, headSize = blocks[0]
+ for r1 in ranges1:
+ for r2 in ranges2:
+ cropBeg1, cropEnd1 = r1
+ if headBeg1 < 0:
+ cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
+ cropBeg2, cropEnd2 = r2
+ if headBeg2 < 0:
+ cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
+ for beg1, beg2, size in blocks:
+ b1 = max(cropBeg1, beg1)
+ e1 = min(cropEnd1, beg1 + size)
+ if b1 >= e1: continue
+ offset = beg2 - beg1
+ b2 = max(cropBeg2, b1 + offset)
+ e2 = min(cropEnd2, e1 + offset)
+ if b2 >= e2: continue
+ yield b2 - offset, b2, e2 - b2
def tabBlocks(beg1, beg2, blocks):
'''Get the gapless blocks of an alignment, from LAST tabular format.'''
@@ -102,7 +118,7 @@ def alignmentInput(lines):
yield chr1, seqlen1, chr2, seqlen2, blocks
mafCount = 0
-def seqRangeFromText(text):
+def seqRequestFromText(text):
if ":" in text:
pattern, interval = text.rsplit(":", 1)
if "-" in interval:
@@ -110,52 +126,154 @@ def seqRangeFromText(text):
return pattern, int(beg), int(end) # beg may be negative
return text, 0, sys.maxsize
-def rangeFromSeqName(seqRanges, name, seqLen):
- if not seqRanges: return 0, seqLen
- base = name.split(".")[-1] # allow for names like hg19.chr7
- for pat, beg, end in seqRanges:
- if fnmatch.fnmatchcase(name, pat) or fnmatch.fnmatchcase(base, pat):
- return max(beg, 0), min(end, seqLen)
- return None
-
-def updateSeqs(isTrim, seqNames, seqLimits, seqName, seqRange, blocks, index):
- if seqName not in seqLimits:
- seqNames.append(seqName)
- if isTrim:
- beg = blocks[0][index]
- end = blocks[-1][index] + blocks[-1][2]
- if beg < 0: beg, end = -end, -beg
- if seqName in seqLimits:
- b, e = seqLimits[seqName]
- seqLimits[seqName] = min(b, beg), max(e, end)
- else:
- seqLimits[seqName] = beg, end
+def rangesFromSeqName(seqRequests, name, seqLen):
+ if seqRequests:
+ base = name.split(".")[-1] # allow for names like hg19.chr7
+ for pat, beg, end in seqRequests:
+ if fnmatchcase(name, pat) or fnmatchcase(base, pat):
+ yield max(beg, 0), min(end, seqLen)
+ else:
+ yield 0, seqLen
+
+def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
+ beg, end = coveredRange
+ if beg < 0:
+ coveredRange = -end, -beg
+ if seqName in coverDict:
+ coverDict[seqName].append(coveredRange)
else:
- seqLimits[seqName] = seqRange
+ coverDict[seqName] = [coveredRange]
+ for beg, end in ranges:
+ r = seqName, beg, end
+ seqRanges.append(r)
def readAlignments(fileName, opts):
'''Get alignments and sequence limits, from MAF or tabular format.'''
- seqRanges1 = map(seqRangeFromText, opts.seq1)
- seqRanges2 = map(seqRangeFromText, opts.seq2)
+ seqRequests1 = map(seqRequestFromText, opts.seq1)
+ seqRequests2 = map(seqRequestFromText, opts.seq2)
alignments = []
- seqNames1 = []
- seqNames2 = []
- seqLimits1 = {}
- seqLimits2 = {}
+ seqRanges1 = []
+ seqRanges2 = []
+ coverDict1 = {}
+ coverDict2 = {}
lines = myOpen(fileName)
for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
- range1 = rangeFromSeqName(seqRanges1, seqName1, seqLen1)
- if not range1: continue
- range2 = rangeFromSeqName(seqRanges2, seqName2, seqLen2)
- if not range2: continue
- b = list(croppedBlocks(list(blocks), range1, range2))
+ ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
+ if not ranges1: continue
+ ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
+ if not ranges2: continue
+ b = list(croppedBlocks(list(blocks), ranges1, ranges2))
+ if not b: continue
+ aln = seqName1, seqName2, b
+ alignments.append(aln)
+ coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
+ updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
+ coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
+ updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
+ return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
+
+def nameAndRangesFromDict(cropDict, seqName):
+ if seqName in cropDict:
+ return seqName, cropDict[seqName]
+ n = seqName.split(".")[-1]
+ if n in cropDict:
+ return n, cropDict[n]
+ return seqName, []
+
+def rangesForSecondaryAlignments(primaryRanges, seqLen):
+ if primaryRanges:
+ return primaryRanges
+ return [(0, seqLen)]
+
+def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
+ cropDict1 = dict(groupByFirstItem(cropRanges1))
+ cropDict2 = dict(groupByFirstItem(cropRanges2))
+
+ alignments = []
+ seqRanges1 = []
+ seqRanges2 = []
+ coverDict1 = {}
+ coverDict2 = {}
+ lines = myOpen(opts.alignments)
+ for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
+ seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
+ seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
+ if not ranges1 and not ranges2:
+ continue
+ r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
+ r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
+ b = list(croppedBlocks(list(blocks), r1, r2))
if not b: continue
aln = seqName1, seqName2, b
alignments.append(aln)
- updateSeqs(opts.trim1, seqNames1, seqLimits1, seqName1, range1, b, 0)
- updateSeqs(opts.trim2, seqNames2, seqLimits2, seqName2, range2, b, 1)
- return alignments, seqNames1, seqNames2, seqLimits1, seqLimits2
+ if not ranges1:
+ coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
+ updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
+ if not ranges2:
+ coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
+ updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
+ return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
+
+def twoValuesFromOption(text, separator):
+ if separator in text:
+ return text.split(separator)
+ return text, text
+
+def mergedRanges(ranges):
+ oldBeg, maxEnd = ranges[0]
+ for beg, end in ranges:
+ if beg > maxEnd:
+ yield oldBeg, maxEnd
+ oldBeg = beg
+ maxEnd = end
+ elif end > maxEnd:
+ maxEnd = end
+ yield oldBeg, maxEnd
+
+def mergedRangesPerSeq(coverDict):
+ for k, v in coverDict.iteritems():
+ v.sort()
+ yield k, list(mergedRanges(v))
+
+def coveredLength(mergedCoverDict):
+ return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
+
+def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
+ maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
+ maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
+ maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
+
+ for seqName, rangeBeg, rangeEnd in seqRanges:
+ seqBlocks = coverDict[seqName]
+ blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
+ if blocks[0][0] - rangeBeg > maxEndGap:
+ rangeBeg = blocks[0][0] - endPad
+ for j, y in enumerate(blocks):
+ if j:
+ x = blocks[j - 1]
+ if y[0] - x[1] > maxMidGap:
+ yield seqName, rangeBeg, x[1] + midPad
+ rangeBeg = y[0] - midPad
+ if rangeEnd - blocks[-1][1] > maxEndGap:
+ rangeEnd = blocks[-1][1] + endPad
+ yield seqName, rangeBeg, rangeEnd
+
+def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
+ if strandOpt == "1":
+ forwardMinusReverse = collections.defaultdict(int)
+ for i in alignments:
+ blocks = i[2]
+ beg1, beg2, size = blocks[0]
+ numOfAlignedLetterPairs = sum(i[2] for i in blocks)
+ if (beg1 < 0) != (beg2 < 0): # opposite-strand alignment
+ numOfAlignedLetterPairs *= -1
+ forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
+ strandNum = 0
+ for seqName, beg, end in seqRanges:
+ if strandOpt == "1":
+ strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
+ yield seqName, beg, end, strandNum
def natural_sort_key(my_string):
'''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
@@ -163,17 +281,93 @@ def natural_sort_key(my_string):
parts[1::2] = map(int, parts[1::2])
return parts
-def textDimensions(imageDraw, font, textRot, text):
- x, y = imageDraw.textsize(text, font=font)
- return (y, x) if textRot else (x, y)
-
-def get_text_sizes(my_strings, font, fontsize, image_mode, textRot):
- '''Get widths & heights, in pixels, of some strings.'''
- if fontsize == 0: return [(0, 0) for i in my_strings]
- image_size = 1, 1
- im = Image.new(image_mode, image_size)
- draw = ImageDraw.Draw(im)
- return [textDimensions(draw, font, textRot, i) for i in my_strings]
+def nameKey(oneSeqRanges):
+ return natural_sort_key(oneSeqRanges[0][0])
+
+def sizeKey(oneSeqRanges):
+ return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
+
+def alignmentKey(seqNamesToLists, oneSeqRanges):
+ seqName = oneSeqRanges[0][0]
+ alignmentsOfThisSequence = seqNamesToLists[seqName]
+ numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
+ toMiddle = numOfAlignedLetterPairs // 2
+ for i in alignmentsOfThisSequence:
+ toMiddle -= i[3]
+ if toMiddle < 0:
+ return i[1:3] # sequence-rank and "position" of this alignment
+
+def rankAndFlipPerSeq(seqRanges):
+ rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
+ for rank, group in enumerate(rangesGroupedBySeqName):
+ seqName, ranges = group
+ strandNum = next(ranges)[3]
+ flip = 1 if strandNum < 2 else -1
+ yield seqName, (rank, flip)
+
+def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
+ otherIndex = 1 - seqIndex
+ for i in alignments:
+ blocks = i[2]
+ otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
+ otherPos = otherFlip * abs(blocks[0][otherIndex] +
+ blocks[-1][otherIndex] + blocks[-1][2])
+ numOfAlignedLetterPairs = sum(i[2] for i in blocks)
+ yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
+
+def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
+ rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
+ g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
+ for i in g:
+ if i[0][3] > 1:
+ i.reverse()
+ if sortOpt == "1":
+ g.sort(key=nameKey)
+ if sortOpt == "2":
+ g.sort(key=sizeKey)
+ if sortOpt == "3":
+ otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
+ alns = sorted(alignmentSortData(alignments, seqIndex,
+ otherNamesToRanksAndFlips))
+ alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
+ seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
+ g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
+ return [j for i in g for j in i]
+
+def allSortedRanges(opts, alignments, alignmentsB,
+ seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
+ o1, oB1 = twoValuesFromOption(opts.strands1, ":")
+ o2, oB2 = twoValuesFromOption(opts.strands2, ":")
+ if o1 == "1" and o2 == "1":
+ raise Exception("the strand options have circular dependency")
+ seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
+ seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
+ seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
+ seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
+
+ o1, oB1 = twoValuesFromOption(opts.sort1, ":")
+ o2, oB2 = twoValuesFromOption(opts.sort2, ":")
+ if o1 == "3" and o2 == "3":
+ raise Exception("the sort options have circular dependency")
+ if o1 != "3":
+ s1 = mySortedRanges(seqRanges1, o1, None, None, None)
+ if o2 != "3":
+ s2 = mySortedRanges(seqRanges2, o2, None, None, None)
+ if o1 == "3":
+ s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
+ if o2 == "3":
+ s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
+ t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
+ t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
+ return s1 + t1, s2 + t2
+
+def prettyNum(n):
+ t = str(n)
+ groups = []
+ while t:
+ groups.append(t[-3:])
+ t = t[:-3]
+ return ",".join(reversed(groups))
def sizeText(size):
suffixes = "bp", "kb", "Mb", "Gb"
@@ -184,67 +378,79 @@ def sizeText(size):
if size < j * 1000 or i == len(suffixes) - 1:
return "%.0f" % (1.0 * size / j) + x
-def seqNameAndSizeText(seqName, seqSize):
- return seqName + ": " + sizeText(seqSize)
-
-def getSeqInfo(sortOpt, seqNames, seqLimits,
- font, fontsize, image_mode, isShowSize, textRot):
- '''Return miscellaneous information about the sequences.'''
- if sortOpt == 1:
- seqNames.sort(key=natural_sort_key)
- seqSizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
- for i in seqNames:
- r = seqLimits[i]
- out = i, str(r[0]), str(r[1])
+def labelText(seqRange, labelOpt):
+ seqName, beg, end, strandNum = seqRange
+ if labelOpt == 1:
+ return seqName + ": " + sizeText(end - beg)
+ if labelOpt == 2:
+ return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
+ if labelOpt == 3:
+ return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
+ return seqName
+
+def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
+ if fontsize:
+ image_size = 1, 1
+ im = Image.new(image_mode, image_size)
+ draw = ImageDraw.Draw(im)
+ x = y = 0
+ for r in seqRanges:
+ text = labelText(r, labelOpt)
+ if fontsize:
+ x, y = draw.textsize(text, font=font)
+ if textRot:
+ x, y = y, x
+ yield text, x, y, r[3]
+
+def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
+ for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
+ out = [seqName, str(rangeBeg), str(rangeEnd)]
+ if strandNum > 0:
+ out.append(".+-"[strandNum])
warn("\t".join(out))
warn("")
- if sortOpt == 2:
- seqRecords = sorted(zip(seqSizes, seqNames), reverse=True)
- seqSizes = [i[0] for i in seqRecords]
- seqNames = [i[1] for i in seqRecords]
- if isShowSize:
- seqLabels = map(seqNameAndSizeText, seqNames, seqSizes)
- else:
- seqLabels = seqNames
- labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode, textRot)
- margin = max(i[1] for i in labelSizes)
+ rangeSizes = [e - b for n, b, e, s in sortedRanges]
+ labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
+ imageMode, textRot))
+ margin = max(i[2] for i in labs)
# xxx the margin may be too big, because some labels may get omitted
- return seqNames, seqSizes, seqLabels, labelSizes, margin
+ return rangeSizes, labs, margin
def div_ceil(x, y):
'''Return x / y rounded up.'''
q, r = divmod(x, y)
return q + (r != 0)
-def get_bp_per_pix(seq_sizes, pix_tween_seqs, pix_limit):
+def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
'''Get the minimum bp-per-pixel that fits in the size limit.'''
- seq_num = len(seq_sizes)
- seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
- if seq_pix_limit < seq_num:
+ warn("choosing bp per pixel...")
+ numOfRanges = len(rangeSizes)
+ maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
+ if maxPixelsInRanges < numOfRanges:
raise Exception("can't fit the image: too many sequences?")
- negLimit = -seq_pix_limit
- negBpPerPix = sum(seq_sizes) // negLimit
+ negLimit = -maxPixelsInRanges
+ negBpPerPix = sum(rangeSizes) // negLimit
while True:
- if sum(i // negBpPerPix for i in seq_sizes) >= negLimit:
+ if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
return -negBpPerPix
negBpPerPix -= 1
-def get_seq_starts(seq_pix, pix_tween_seqs, margin):
- '''Get the start pixel for each sequence.'''
- seq_starts = []
- pix_tot = margin - pix_tween_seqs
- for i in seq_pix:
- pix_tot += pix_tween_seqs
- seq_starts.append(pix_tot)
+def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
+ '''Get the start pixel for each range.'''
+ rangePixBegs = []
+ pix_tot = margin - pixTweenRanges
+ for i in rangePixLens:
+ pix_tot += pixTweenRanges
+ rangePixBegs.append(pix_tot)
pix_tot += i
- return seq_starts
+ return rangePixBegs
-def get_pix_info(seq_sizes, bp_per_pix, pix_tween_seqs, margin):
- '''Return pixel information about the sequences.'''
- seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
- seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
- tot_pix = seq_starts[-1] + seq_pix[-1]
- return seq_pix, seq_starts, tot_pix
+def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
+ '''Return pixel information about the ranges.'''
+ rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
+ rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
+ tot_pix = rangePixBegs[-1] + rangePixLens[-1]
+ return rangePixBegs, rangePixLens, tot_pix
def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
while True:
@@ -258,7 +464,6 @@ def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
size -= next_pix
def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
- beg2 = -1 - beg2
while True:
q1, r1 = divmod(beg1, bp_per_pix)
q2, r2 = divmod(beg2, bp_per_pix)
@@ -269,21 +474,31 @@ def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
beg2 -= next_pix
size -= next_pix
-def alignmentPixels(width, height, alignments, bp_per_pix, origins1, origins2):
+def strandAndOrigin(ranges, beg, size):
+ isReverseStrand = (beg < 0)
+ if isReverseStrand:
+ beg = -(beg + size)
+ for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
+ if rangeEnd > beg:
+ return (isReverseStrand != isReverseRange), origin
+
+def alignmentPixels(width, height, alignments, bp_per_pix,
+ rangeDict1, rangeDict2):
hits = [0] * (width * height) # the image data
for seq1, seq2, blocks in alignments:
- ori1 = origins1[seq1]
- ori2 = origins2[seq2]
+ beg1, beg2, size = blocks[0]
+ isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
+ isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
for beg1, beg2, size in blocks:
- if beg1 < 0:
+ if isReverse1:
beg1 = -(beg1 + size)
beg2 = -(beg2 + size)
- if beg2 >= 0:
+ if isReverse1 == isReverse2:
drawLineForward(hits, width, bp_per_pix,
- beg1 + ori1, beg2 + ori2, size)
+ ori1 + beg1, ori2 + beg2, size)
else:
drawLineReverse(hits, width, bp_per_pix,
- beg1 + ori1, beg2 - ori2, size)
+ ori1 + beg1, ori2 - beg2 - 1, size)
return hits
def expandedSeqDict(seqDict):
@@ -297,40 +512,41 @@ def expandedSeqDict(seqDict):
newDict[base] = x
return newDict
-def readBed(fileName, seqLimits):
- if not fileName: return
+def readBed(fileName, rangeDict):
for line in myOpen(fileName):
w = line.split()
if not w: continue
seqName = w[0]
- if seqName not in seqLimits: continue
+ if seqName not in rangeDict: continue
beg = int(w[1])
end = int(w[2])
layer = 900
- color = "#ffe4ff"
+ color = "#fbf"
if len(w) > 4:
if w[4] != ".":
layer = float(w[4])
if len(w) > 5:
if len(w) > 8 and w[8].count(",") == 2:
color = "rgb(" + w[8] + ")"
- elif w[5] == "+":
- color = "#fff4f4"
- elif w[5] == "-":
- color = "#f4f4ff"
+ else:
+ strand = w[5]
+ isRev = rangeDict[seqName][0][2]
+ if strand == "+" and not isRev or strand == "-" and isRev:
+ color = "#ffe8e8"
+ if strand == "-" and not isRev or strand == "+" and isRev:
+ color = "#e8e8ff"
yield layer, color, seqName, beg, end
def commaSeparatedInts(text):
return map(int, text.rstrip(",").split(","))
-def readGenePred(opts, fileName, seqLimits):
- if not fileName: return
+def readGenePred(opts, fileName, rangeDict):
for line in myOpen(fileName):
fields = line.split()
if not fields: continue
if fields[2] not in "+-": fields = fields[1:]
seqName = fields[1]
- if seqName not in seqLimits: continue
+ if seqName not in rangeDict: continue
#strand = fields[2]
cdsBeg = int(fields[5])
cdsEnd = int(fields[6])
@@ -342,20 +558,19 @@ def readGenePred(opts, fileName, seqLimits):
e = min(end, cdsEnd)
if b < e: yield 400, opts.cds_color, seqName, b, e
-def readRmsk(fileName, seqLimits):
- if not fileName: return
+def readRmsk(fileName, rangeDict):
for line in myOpen(fileName):
fields = line.split()
if len(fields) == 17: # rmsk.txt
seqName = fields[5]
- if seqName not in seqLimits: continue # do this ASAP for speed
+ if seqName not in rangeDict: continue # do this ASAP for speed
beg = int(fields[6])
end = int(fields[7])
strand = fields[9]
repeatClass = fields[11]
elif len(fields) == 15: # .out
seqName = fields[4]
- if seqName not in seqLimits: continue
+ if seqName not in rangeDict: continue
beg = int(fields[5]) - 1
end = int(fields[6])
strand = fields[8]
@@ -363,25 +578,24 @@ def readRmsk(fileName, seqLimits):
else:
continue
if repeatClass in ("Low_complexity", "Simple_repeat"):
- yield 200, "#ffe4ff", seqName, beg, end
- elif strand == "+":
- yield 100, "#fff4f4", seqName, beg, end
+ yield 200, "#fbf", seqName, beg, end
+ elif (strand == "+") != rangeDict[seqName][0][2]:
+ yield 100, "#ffe8e8", seqName, beg, end
else:
- yield 100, "#f4f4ff", seqName, beg, end
+ yield 100, "#e8e8ff", seqName, beg, end
def isExtraFirstGapField(fields):
return fields[4].isdigit()
-def readGaps(opts, fileName, seqLimits):
+def readGaps(opts, fileName, rangeDict):
'''Read locations of unsequenced gaps, from an agp or gap file.'''
- if not fileName: return
for line in myOpen(fileName):
w = line.split()
if not w or w[0][0] == "#": continue
if isExtraFirstGapField(w): w = w[1:]
if w[4] not in "NU": continue
seqName = w[0]
- if seqName not in seqLimits: continue
+ if seqName not in rangeDict: continue
end = int(w[2])
beg = end - int(w[5]) # zero-based coordinate
if w[7] == "yes":
@@ -389,144 +603,222 @@ def readGaps(opts, fileName, seqLimits):
else:
yield 2000, opts.unbridged_color, seqName, beg, end
-def bedBoxes(beds, seqLimits, origins, margin, edge, isTop, bpPerPix):
- for layer, color, seqName, beg, end in beds:
- cropBeg, cropEnd = seqLimits[seqName]
- beg = max(beg, cropBeg)
- end = min(end, cropEnd)
- if beg >= end: continue
- ori = origins[seqName]
- if layer <= 1000:
- # include partly-covered pixels
- b = (ori + beg) // bpPerPix
- e = div_ceil(ori + end, bpPerPix)
- else:
- # exclude partly-covered pixels
- b = div_ceil(ori + beg, bpPerPix)
- e = (ori + end) // bpPerPix
- if e <= b: continue
- if isTop:
- box = b, margin, e, edge
- else:
- box = margin, b, edge, e
- yield layer, color, box
+def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
+ for layer, color, seqName, bedBeg, bedEnd in beds:
+ for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
+ beg = max(bedBeg, rangeBeg)
+ end = min(bedEnd, rangeEnd)
+ if beg >= end: continue
+ if isReverseRange:
+ beg, end = -end, -beg
+ if layer <= 1000:
+ # include partly-covered pixels
+ b = (origin + beg) // bpPerPix
+ e = div_ceil(origin + end, bpPerPix)
+ else:
+ # exclude partly-covered pixels
+ b = div_ceil(origin + beg, bpPerPix)
+ e = (origin + end) // bpPerPix
+ if e <= b: continue
+ if bedEnd >= rangeEnd: # include partly-covered end pixels
+ if isReverseRange:
+ b = (origin + beg) // bpPerPix
+ else:
+ e = div_ceil(origin + end, bpPerPix)
+ if isTop:
+ box = b, margin, e, edge
+ else:
+ box = margin, b, edge, e
+ yield layer, color, box
def drawAnnotations(im, boxes):
# xxx use partial transparency for different-color overlaps?
for layer, color, box in boxes:
im.paste(color, box)
-def make_label(text, text_size, range_start, range_size):
- '''Return an axis label with endpoint & sort-order information.'''
- text_width, text_height = text_size
- label_start = range_start + (range_size - text_width) // 2
- label_end = label_start + text_width
- sort_key = text_width - range_size
- return sort_key, label_start, label_end, text, text_height
-
-def get_nonoverlapping_labels(labels, label_space):
+def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
+ '''Return axis labels with endpoint & sort-order information.'''
+ maxWidth = end - beg
+ for i, j, k in zip(labels, rangePixBegs, rangePixLens):
+ text, textWidth, textHeight, strandNum = i
+ if textWidth > maxWidth:
+ continue
+ labelBeg = j + (k - textWidth) // 2
+ labelEnd = labelBeg + textWidth
+ sortKey = textWidth - k
+ if labelBeg < beg:
+ sortKey += maxWidth * (beg - labelBeg)
+ labelBeg = beg
+ labelEnd = beg + textWidth
+ if labelEnd > end:
+ sortKey += maxWidth * (labelEnd - end)
+ labelEnd = end
+ labelBeg = end - textWidth
+ yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
+
+def nonoverlappingLabels(labels, minPixTweenLabels):
'''Get a subset of non-overlapping axis labels, greedily.'''
- nonoverlapping_labels = []
+ out = []
for i in labels:
- if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
- for j in nonoverlapping_labels]:
- nonoverlapping_labels.append(i)
- return nonoverlapping_labels
-
-def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix, textRot,
- textAln, font, image_mode, opts):
+ beg = i[1] - minPixTweenLabels
+ end = i[2] + minPixTweenLabels
+ if all(j[2] <= beg or j[1] >= end for j in out):
+ out.append(i)
+ return out
+
+def axisImage(labels, rangePixBegs, rangePixLens, textRot,
+ textAln, font, image_mode, opts):
'''Make an image of axis labels.'''
- min_pos = seq_starts[0]
- max_pos = seq_starts[-1] + seq_pix[-1]
- margin = max(i[1] for i in name_sizes)
- labels = map(make_label, seqNames, name_sizes, seq_starts, seq_pix)
- labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
- labels.sort()
+ beg = rangePixBegs[0]
+ end = rangePixBegs[-1] + rangePixLens[-1]
+ margin = max(i[2] for i in labels)
+ labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
minPixTweenLabels = 0 if textRot else opts.label_space
- labels = get_nonoverlapping_labels(labels, minPixTweenLabels)
- image_size = (margin, max_pos) if textRot else (max_pos, margin)
- im = Image.new(image_mode, image_size, opts.border_color)
+ labels = nonoverlappingLabels(labels, minPixTweenLabels)
+ image_size = (margin, end) if textRot else (end, margin)
+ im = Image.new(image_mode, image_size, opts.margin_color)
draw = ImageDraw.Draw(im)
- for i in labels:
- base = margin - i[4] if textAln else 0
- position = (base, i[1]) if textRot else (i[1], base)
- draw.text(position, i[3], font=font, fill=opts.text_color)
+ for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
+ base = margin - textHeight if textAln else 0
+ position = (base, labelBeg) if textRot else (labelBeg, base)
+ fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
+ draw.text(position, text, font=font, fill=fill)
return im
-def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
- for i, j in zip(seqNames, seq_starts):
- yield i, bp_per_pix * j - seqLimits[i][0]
+def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
+ for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
+ seqName, rangeBeg, rangeEnd, strandNum = i
+ isReverseRange = (strandNum > 1)
+ if isReverseRange:
+ origin = bpPerPix * (j + k) + rangeBeg
+ else:
+ origin = bpPerPix * j - rangeBeg
+ yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
+
+def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
+ a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
+ for k, v in itertools.groupby(a, itemgetter(0)):
+ yield k, [i[1] for i in v]
+
+def getFont(opts):
+ if opts.fontfile:
+ return ImageFont.truetype(opts.fontfile, opts.fontsize)
+ fileNames = []
+ try:
+ x = ["fc-match", "-f%{file}", "arial"]
+ p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = p.communicate()
+ fileNames.append(out)
+ except OSError as e:
+ warn("fc-match error: " + str(e))
+ fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
+ for i in fileNames:
+ try:
+ font = ImageFont.truetype(i, opts.fontsize)
+ warn("font: " + i)
+ return font
+ except IOError as e:
+ warn("font load error: " + str(e))
+ return ImageFont.load_default()
def lastDotplot(opts, args):
- if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize)
- else: font = ImageFont.load_default()
-
+ font = getFont(opts)
image_mode = 'RGB'
forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
zipped_colors = zip(forward_color, reverse_color)
overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
+ maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
+ maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
+
warn("reading alignments...")
- alignmentInfo = readAlignments(args[0], opts)
- alignments, seqNames1, seqNames2, seqLimits1, seqLimits2 = alignmentInfo
- warn("done")
+ alnData = readAlignments(args[0], opts)
+ alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
if not alignments: raise Exception("there are no alignments")
+ warn("cutting...")
+ coverDict1 = dict(mergedRangesPerSeq(coverDict1))
+ coverDict2 = dict(mergedRangesPerSeq(coverDict2))
+ minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
+ pad = int(opts.pad * minAlignedBases)
+ cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
+ maxGap1, pad, pad))
+ cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
+ maxGap2, pad, pad))
+
+ warn("reading secondary alignments...")
+ alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
+ alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
+ warn("cutting...")
+ coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
+ coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
+ cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
+ maxGapB1, 0, 0)
+ cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
+ maxGapB2, 0, 0)
+
+ warn("sorting...")
+ sortOut = allSortedRanges(opts, alignments, alignmentsB,
+ cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
+ sortedRanges1, sortedRanges2 = sortOut
textRot1 = "vertical".startswith(opts.rot1)
- i1 = getSeqInfo(opts.sort1, seqNames1, seqLimits1,
- font, opts.fontsize, image_mode, opts.lengths1, textRot1)
- seqNames1, seqSizes1, seqLabels1, labelSizes1, tMargin = i1
+ i1 = dataFromRanges(sortedRanges1, font,
+ opts.fontsize, image_mode, opts.labels1, textRot1)
+ rangeSizes1, labelData1, tMargin = i1
textRot2 = "horizontal".startswith(opts.rot2)
- i2 = getSeqInfo(opts.sort2, seqNames2, seqLimits2,
- font, opts.fontsize, image_mode, opts.lengths2, textRot2)
- seqNames2, seqSizes2, seqLabels2, labelSizes2, lMargin = i2
-
- warn("choosing bp per pixel...")
- pix_limit1 = opts.width - lMargin
- pix_limit2 = opts.height - tMargin
- bpPerPix1 = get_bp_per_pix(seqSizes1, opts.border_pixels, pix_limit1)
- bpPerPix2 = get_bp_per_pix(seqSizes2, opts.border_pixels, pix_limit2)
+ i2 = dataFromRanges(sortedRanges2, font,
+ opts.fontsize, image_mode, opts.labels2, textRot2)
+ rangeSizes2, labelData2, lMargin = i2
+
+ maxPixels1 = opts.width - lMargin
+ maxPixels2 = opts.height - tMargin
+ bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
+ bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
bpPerPix = max(bpPerPix1, bpPerPix2)
warn("bp per pixel = " + str(bpPerPix))
- seq_pix1, seq_starts1, width = get_pix_info(seqSizes1, bpPerPix,
- opts.border_pixels, lMargin)
- seq_pix2, seq_starts2, height = get_pix_info(seqSizes2, bpPerPix,
- opts.border_pixels, tMargin)
+ p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
+ rangePixBegs1, rangePixLens1, width = p1
+ rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
+ rangePixLens1, bpPerPix))
+
+ p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
+ rangePixBegs2, rangePixLens2, height = p2
+ rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
+ rangePixLens2, bpPerPix))
+
warn("width: " + str(width))
warn("height: " + str(height))
- origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bpPerPix))
- origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bpPerPix))
-
warn("processing alignments...")
- hits = alignmentPixels(width, height, alignments, bpPerPix,
- origins1, origins2)
- warn("done")
+ hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
+ rangeDict1, rangeDict2)
- image_size = width, height
- im = Image.new(image_mode, image_size, opts.background_color)
+ warn("reading annotations...")
- seqLimits1 = expandedSeqDict(seqLimits1)
- seqLimits2 = expandedSeqDict(seqLimits2)
- origins1 = expandedSeqDict(origins1)
- origins2 = expandedSeqDict(origins2)
+ rangeDict1 = expandedSeqDict(rangeDict1)
+ beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
+ readRmsk(opts.rmsk1, rangeDict1),
+ readGenePred(opts, opts.genePred1, rangeDict1),
+ readGaps(opts, opts.gap1, rangeDict1))
+ b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
- beds1 = itertools.chain(readBed(opts.bed1, seqLimits1),
- readRmsk(opts.rmsk1, seqLimits1),
- readGenePred(opts, opts.genePred1, seqLimits1),
- readGaps(opts, opts.gap1, seqLimits1))
- b1 = bedBoxes(beds1, seqLimits1, origins1, tMargin, height, True, bpPerPix)
-
- beds2 = itertools.chain(readBed(opts.bed2, seqLimits2),
- readRmsk(opts.rmsk2, seqLimits2),
- readGenePred(opts, opts.genePred2, seqLimits2),
- readGaps(opts, opts.gap2, seqLimits2))
- b2 = bedBoxes(beds2, seqLimits2, origins2, lMargin, width, False, bpPerPix)
+ rangeDict2 = expandedSeqDict(rangeDict2)
+ beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
+ readRmsk(opts.rmsk2, rangeDict2),
+ readGenePred(opts, opts.genePred2, rangeDict2),
+ readGaps(opts, opts.gap2, rangeDict2))
+ b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
boxes = sorted(itertools.chain(b1, b2))
+
+ warn("drawing...")
+
+ image_size = width, height
+ im = Image.new(image_mode, image_size, opts.background_color)
+
drawAnnotations(im, boxes)
for i in range(height):
@@ -538,22 +830,22 @@ def lastDotplot(opts, args):
elif store_value == 3: im.putpixel(xy, overlap_color)
if opts.fontsize != 0:
- axis1 = get_axis_image(seqLabels1, labelSizes1, seq_starts1, seq_pix1,
- textRot1, False, font, image_mode, opts)
+ axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
+ textRot1, False, font, image_mode, opts)
if textRot1:
axis1 = axis1.transpose(Image.ROTATE_90)
- axis2 = get_axis_image(seqLabels2, labelSizes2, seq_starts2, seq_pix2,
- textRot2, textRot2, font, image_mode, opts)
+ axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
+ textRot2, textRot2, font, image_mode, opts)
if not textRot2:
axis2 = axis2.transpose(Image.ROTATE_270)
im.paste(axis1, (0, 0))
im.paste(axis2, (0, 0))
- for i in seq_starts1[1:]:
+ for i in rangePixBegs1[1:]:
box = i - opts.border_pixels, tMargin, i, height
im.paste(opts.border_color, box)
- for i in seq_starts2[1:]:
+ for i in rangePixBegs2[1:]:
box = lMargin, i - opts.border_pixels, width, i
im.paste(opts.border_color, box)
@@ -583,35 +875,51 @@ if __name__ == "__main__":
help="color for forward alignments (default: %default)")
op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
help="color for reverse alignments (default: %default)")
- op.add_option("--sort1", type="int", default=1, metavar="N",
+ op.add_option("--alignments", metavar="FILE", help="secondary alignments")
+ op.add_option("--sort1", default="1", metavar="N",
help="genome1 sequence order: 0=input order, 1=name order, "
- "2=length order (default=%default)")
- op.add_option("--sort2", type="int", default=1, metavar="N",
+ "2=length order, 3=alignment order (default=%default)")
+ op.add_option("--sort2", default="1", metavar="N",
help="genome2 sequence order: 0=input order, 1=name order, "
- "2=length order (default=%default)")
- op.add_option("--trim1", action="store_true",
- help="trim unaligned sequence flanks from the 1st genome")
- op.add_option("--trim2", action="store_true",
- help="trim unaligned sequence flanks from the 2nd genome")
+ "2=length order, 3=alignment order (default=%default)")
+ op.add_option("--strands1", default="0", metavar="N", help=
+ "genome1 sequence orientation: 0=forward orientation, "
+ "1=alignment orientation (default=%default)")
+ op.add_option("--strands2", default="0", metavar="N", help=
+ "genome2 sequence orientation: 0=forward orientation, "
+ "1=alignment orientation (default=%default)")
+ op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
+ "maximum unaligned (end,mid) gap in genome1: "
+ "fraction of aligned length (default=%default)")
+ op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
+ "maximum unaligned (end,mid) gap in genome2: "
+ "fraction of aligned length (default=%default)")
+ op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
+ "pad length when cutting unaligned gaps: "
+ "fraction of aligned length (default=%default)")
op.add_option("--border-pixels", metavar="INT", type="int", default=1,
help="number of pixels between sequences (default=%default)")
- op.add_option("--border-color", metavar="COLOR", default="#dcdcdc",
+ op.add_option("--border-color", metavar="COLOR", default="black",
help="color for pixels between sequences (default=%default)")
- # xxx --margin-color?
+ # --break-color and/or --break-pixels for intra-sequence breaks?
+ op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
+ help="margin color")
og = optparse.OptionGroup(op, "Text options")
og.add_option("-f", "--fontfile", metavar="FILE",
help="TrueType or OpenType font file")
- og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
+ og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
help="TrueType or OpenType font size (default: %default)")
+ og.add_option("--labels1", type="int", default=0, metavar="N", help=
+ "genome1 labels: 0=name, 1=name:length, "
+ "2=name:start:length, 3=name:start-end (default=%default)")
+ og.add_option("--labels2", type="int", default=0, metavar="N", help=
+ "genome2 labels: 0=name, 1=name:length, "
+ "2=name:start:length, 3=name:start-end (default=%default)")
og.add_option("--rot1", metavar="ROT", default="h",
help="text rotation for the 1st genome (default=%default)")
og.add_option("--rot2", metavar="ROT", default="v",
help="text rotation for the 2nd genome (default=%default)")
- og.add_option("--lengths1", action="store_true",
- help="show sequence lengths for the 1st (horizontal) genome")
- og.add_option("--lengths2", action="store_true",
- help="show sequence lengths for the 2nd (vertical) genome")
op.add_option_group(og)
og = optparse.OptionGroup(op, "Annotation options")
@@ -630,9 +938,9 @@ if __name__ == "__main__":
help="read genome1 genes from genePred file")
og.add_option("--genePred2", metavar="FILE",
help="read genome2 genes from genePred file")
- og.add_option("--exon-color", metavar="COLOR", default="#dfd",
+ og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
help="color for exons (default=%default)")
- og.add_option("--cds-color", metavar="COLOR", default="#bdb",
+ og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
help="color for protein-coding regions (default=%default)")
op.add_option_group(og)
@@ -643,18 +951,17 @@ if __name__ == "__main__":
help="read genome2 unsequenced gaps from agp or gap file")
og.add_option("--bridged-color", metavar="COLOR", default="yellow",
help="color for bridged gaps (default: %default)")
- og.add_option("--unbridged-color", metavar="COLOR", default="pink",
+ og.add_option("--unbridged-color", metavar="COLOR", default="orange",
help="color for unbridged gaps (default: %default)")
op.add_option_group(og)
(opts, args) = op.parse_args()
if len(args) != 2: op.error("2 arguments needed")
- opts.text_color = "black"
opts.background_color = "white"
opts.label_space = 5 # minimum number of pixels between axis labels
try: lastDotplot(opts, args)
except KeyboardInterrupt: pass # avoid silly error message
- except Exception, e:
+ except Exception as e:
prog = os.path.basename(sys.argv[0])
sys.exit(prog + ": error: " + str(e))
diff --git a/scripts/last-train b/scripts/last-train
index 4b360c7..a324194 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -360,6 +360,7 @@ def fixedLastalArgs(opts, lastalProgName):
if opts.k: x.append("-k" + opts.k)
if opts.P: x.append("-P" + opts.P)
if opts.Q: x.append("-Q" + opts.Q)
+ if opts.verbose: x.append("-" + "v" * opts.verbose)
return x
def doTraining(opts, args):
@@ -452,6 +453,8 @@ if __name__ == "__main__":
usage = "%prog [options] lastdb-name sequence-file(s)"
description = "Try to find suitable score parameters for aligning the given sequences."
op = optparse.OptionParser(usage=usage, description=description)
+ op.add_option("-v", "--verbose", action="count",
+ help="show more details of intermediate steps")
og = optparse.OptionGroup(op, "Training options")
og.add_option("--revsym", action="store_true",
help="force reverse-complement symmetry")
diff --git a/src/Alignment.cc b/src/Alignment.cc
index 0ef6c6c..551a642 100644
--- a/src/Alignment.cc
+++ b/src/Alignment.cc
@@ -96,7 +96,7 @@ void Alignment::makeXdrop( Centroid& centroid,
}
// extend a gapped alignment in the left/reverse direction from the seed:
- std::vector<uchar>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
+ std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
extend( blocks, columnAmbiguityCodes, centroid, greedyAligner, isGreedy,
seq1, seq2, seed.beg1(), seed.beg2(), false, globality,
scoreMatrix, smMax, maxDrop, gap, frameshiftCost,
@@ -116,7 +116,7 @@ void Alignment::makeXdrop( Centroid& centroid,
// extend a gapped alignment in the right/forward direction from the seed:
std::vector<SegmentPair> forwardBlocks;
- std::vector<uchar> forwardAmbiguities;
+ std::vector<char> forwardAmbiguities;
extend( forwardBlocks, forwardAmbiguities, centroid, greedyAligner, isGreedy,
seq1, seq2, seed.end1(), seed.end2(), true, globality,
scoreMatrix, smMax, maxDrop, gap, frameshiftCost,
@@ -262,13 +262,12 @@ bool Alignment::hasGoodSegment(const uchar *seq1, const uchar *seq2,
return false;
}
-static void getColumnAmbiguities(const Centroid& centroid,
- std::vector<uchar>& ambiguityCodes,
- const std::vector<SegmentPair>& chunks,
- bool isForward) {
+static void getColumnCodes(const Centroid& centroid, std::vector<char>& codes,
+ const std::vector<SegmentPair>& chunks,
+ bool isForward) {
for (size_t i = 0; i < chunks.size(); ++i) {
const SegmentPair& x = chunks[i];
- centroid.getMatchAmbiguities(ambiguityCodes, x.end1(), x.end2(), x.size);
+ centroid.getMatchAmbiguities(codes, x.end1(), x.end2(), x.size);
size_t j = i + 1;
bool isNext = (j < chunks.size());
size_t end1 = isNext ? chunks[j].end1() : 0;
@@ -276,17 +275,17 @@ static void getColumnAmbiguities(const Centroid& centroid,
// ASSUMPTION: if there is an insertion adjacent to a deletion,
// the deletion will get printed first.
if (isForward) {
- centroid.getInsertAmbiguities(ambiguityCodes, x.beg2(), end2);
- centroid.getDeleteAmbiguities(ambiguityCodes, x.beg1(), end1);
+ centroid.getInsertAmbiguities(codes, x.beg2(), end2);
+ centroid.getDeleteAmbiguities(codes, x.beg1(), end1);
} else {
- centroid.getDeleteAmbiguities(ambiguityCodes, x.beg1(), end1);
- centroid.getInsertAmbiguities(ambiguityCodes, x.beg2(), end2);
+ centroid.getDeleteAmbiguities(codes, x.beg1(), end1);
+ centroid.getInsertAmbiguities(codes, x.beg2(), end2);
}
}
}
void Alignment::extend( std::vector< SegmentPair >& chunks,
- std::vector< uchar >& ambiguityCodes,
+ std::vector< char >& columnCodes,
Centroid& centroid,
GreedyXdropAligner& greedyAligner, bool isGreedy,
const uchar* seq1, const uchar* seq2,
@@ -358,7 +357,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
score += extensionScore;
- if( outputType < 5 || outputType == 7 ){ // ordinary max-score alignment
+ if( outputType < 5 || outputType > 6 ){ // ordinary max-score alignment
size_t end1, end2, size;
if( isGreedy ){
while( greedyAligner.getNextChunk( end1, end2, size ) )
@@ -375,16 +374,19 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
if( outputType > 3 ){ // calculate match probabilities
assert( !isGreedy );
assert( !sm2qual );
- centroid.reset();
- centroid.forward( seq1, seq2, start1, start2, isForward, globality, gap );
- centroid.backward( seq1, seq2, start1, start2, isForward, globality, gap );
+ if (!isForward) {
+ --start1;
+ --start2;
+ }
+ centroid.doForwardBackwardAlgorithm(seq1, seq2, start1, start2, isForward,
+ gap, globality);
if( outputType > 4 && outputType < 7 ){ // gamma-centroid / LAMA alignment
centroid.dp( gamma );
centroid.traceback( chunks, gamma );
}
- getColumnAmbiguities( centroid, ambiguityCodes, chunks, isForward );
+ getColumnCodes(centroid, columnCodes, chunks, isForward);
extras.fullScore += centroid.logPartitionFunction();
if( outputType == 7 ){
diff --git a/src/Alignment.hh b/src/Alignment.hh
index 4d1e9fc..de4e580 100644
--- a/src/Alignment.hh
+++ b/src/Alignment.hh
@@ -62,7 +62,7 @@ struct AlignmentText {
struct AlignmentExtras {
// Optional (probabilistic) attributes of an alignment.
// To save memory, these are outside the main Alignment struct.
- std::vector<uchar> columnAmbiguityCodes; // char or uchar?
+ std::vector<char> columnAmbiguityCodes;
std::vector<double> expectedCounts; // expected emission & transition counts
double fullScore; // a.k.a. forward score, sum-of-paths score
AlignmentExtras() : fullScore(0) {}
@@ -111,7 +111,7 @@ struct Alignment{
const uchar *qual1, const uchar *qual2) const;
AlignmentText write(const MultiSequence& seq1, const MultiSequence& seq2,
- size_t seqNum2, char strand, const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer, int format,
const AlignmentExtras& extras) const;
@@ -127,7 +127,7 @@ struct Alignment{
size_t end2() const{ return blocks.back().end2(); }
void extend( std::vector< SegmentPair >& chunks,
- std::vector< uchar >& ambiguityCodes,
+ std::vector< char >& columnCodes,
Centroid& centroid,
GreedyXdropAligner& greedyAligner, bool isGreedy,
const uchar* seq1, const uchar* seq2,
@@ -143,20 +143,19 @@ struct Alignment{
double gamma, int outputType );
AlignmentText writeTab(const MultiSequence& seq1, const MultiSequence& seq2,
- size_t seqNum2, char strand, bool isTranslated,
+ size_t seqNum2, bool isTranslated,
const LastEvaluer& evaluer,
const AlignmentExtras& extras) const;
AlignmentText writeMaf(const MultiSequence& seq1, const MultiSequence& seq2,
- size_t seqNum2, char strand, const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer,
const AlignmentExtras& extras) const;
AlignmentText writeBlastTab(const MultiSequence& seq1,
const MultiSequence& seq2,
- size_t seqNum2, char strand,
- const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer,
bool isExtraColumns) const;
diff --git a/src/AlignmentWrite.cc b/src/AlignmentWrite.cc
index 51b7a32..010db57 100644
--- a/src/AlignmentWrite.cc
+++ b/src/AlignmentWrite.cc
@@ -90,21 +90,19 @@ private:
AlignmentText Alignment::write(const MultiSequence& seq1,
const MultiSequence& seq2,
- size_t seqNum2, char strand,
- const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer, int format,
const AlignmentExtras& extras) const {
assert( !blocks.empty() );
if( format == 'm' )
- return writeMaf( seq1, seq2, seqNum2, strand, seqData2,
+ return writeMaf( seq1, seq2, seqNum2, seqData2,
isTranslated, alph, evaluer, extras );
if( format == 't' )
- return writeTab( seq1, seq2, seqNum2, strand,
- isTranslated, evaluer, extras );
+ return writeTab( seq1, seq2, seqNum2, isTranslated, evaluer, extras );
else
- return writeBlastTab( seq1, seq2, seqNum2, strand, seqData2,
+ return writeBlastTab( seq1, seq2, seqNum2, seqData2,
isTranslated, alph, evaluer, format == 'B' );
}
@@ -178,8 +176,7 @@ static char* writeTags( const LastEvaluer& evaluer, double queryLength,
AlignmentText Alignment::writeTab(const MultiSequence& seq1,
const MultiSequence& seq2,
- size_t seqNum2, char strand,
- bool isTranslated,
+ size_t seqNum2, bool isTranslated,
const LastEvaluer& evaluer,
const AlignmentExtras& extras) const {
size_t alnBeg1 = beg1();
@@ -197,7 +194,9 @@ AlignmentText Alignment::writeTab(const MultiSequence& seq1,
IntText sc(score);
std::string n1 = seq1.seqName(seqNum1);
+ char strand1 = seq1.strand(seqNum1);
std::string n2 = seq2.seqName(seqNum2);
+ char strand2 = seq2.strand(seqNum2);
IntText b1(alnBeg1 - seqStart1);
IntText b2(alnBeg2 - seqStart2);
IntText r1(alnEnd1 - alnBeg1);
@@ -220,13 +219,13 @@ AlignmentText Alignment::writeTab(const MultiSequence& seq1,
char *text = new char[textLen + 1];
Writer w(text);
w << sc << t;
- w << n1 << t << b1 << t << r1 << t << '+' << t << s1 << t;
- w << n2 << t << b2 << t << r2 << t << strand << t << s2 << t;
+ w << n1 << t << b1 << t << r1 << t << strand1 << t << s1 << t;
+ w << n2 << t << b2 << t << r2 << t << strand2 << t << s2 << t;
w.copy(&blockText[0] + blockText.size() - blockLen, blockLen);
w.copy(tags, tagLen);
w << '\0';
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand, score, 0, 0, text);
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
}
static void putLeft(Writer &w, const std::string &t, size_t width) {
@@ -274,13 +273,6 @@ static void writeMafHeadQ(char *out,
w.fill(qLineBlankLen, ' ');
}
-// Write the first part of a "p" line:
-static void writeMafHeadP(char *out, size_t pLineBlankLen) {
- Writer w(out);
- w << 'p' << ' ';
- w.fill(pLineBlankLen, ' ');
-}
-
// Write a "c" line
static void writeMafLineC(std::vector<char> &cLine,
const std::vector<double> &counts) {
@@ -297,13 +289,12 @@ static void writeMafLineC(std::vector<char> &cLine,
AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
const MultiSequence& seq2,
- size_t seqNum2, char strand,
- const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer,
const AlignmentExtras& extras) const {
double fullScore = extras.fullScore;
- const std::vector<uchar>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
+ const std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
size_t alnBeg1 = beg1();
size_t alnEnd1 = end1();
@@ -323,7 +314,9 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
size_t aLineLen = aLineEnd - aLine;
const std::string n1 = seq1.seqName(seqNum1);
+ char strand1 = seq1.strand(seqNum1);
const std::string n2 = seq2.seqName(seqNum2);
+ char strand2 = seq2.strand(seqNum2);
IntText b1(alnBeg1 - seqStart1);
IntText b2(alnBeg2 - seqStart2);
IntText r1(alnEnd1 - alnBeg1);
@@ -356,7 +349,7 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
char *dest = std::copy(aLine, aLineEnd, text);
- writeMafHeadS(dest, n1, nw, b1, bw, r1, rw, '+', s1, sw);
+ writeMafHeadS(dest, n1, nw, b1, bw, r1, rw, strand1, s1, sw);
dest = writeTopSeq(seq1.seqReader(), alph, 0, frameSize2, dest + headLen);
*dest++ = '\n';
@@ -367,7 +360,7 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
*dest++ = '\n';
}
- writeMafHeadS(dest, n2, nw, b2, bw, r2, rw, strand, s2, sw);
+ writeMafHeadS(dest, n2, nw, b2, bw, r2, rw, strand2, s2, sw);
dest = writeBotSeq(seqData2, alph, 0, frameSize2, dest + headLen);
*dest++ = '\n';
@@ -379,25 +372,25 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
*dest++ = '\n';
}
+ Writer w(dest);
+
if (!columnAmbiguityCodes.empty()) {
- writeMafHeadP(dest, pLineBlankLen);
- dest = copy(columnAmbiguityCodes.begin(),
- columnAmbiguityCodes.end(), dest + headLen);
- *dest++ = '\n';
+ w << 'p' << ' ';
+ w.fill(pLineBlankLen, ' ');
+ w.copy(&columnAmbiguityCodes[0], columnAmbiguityCodes.size());
+ w << '\n';
}
- dest = copy(cLine.begin(), cLine.end(), dest);
+ if (!cLine.empty()) w.copy(&cLine[0], cLine.size());
- *dest++ = '\n'; // blank line afterwards
- *dest++ = '\0';
+ w << '\n' << '\0'; // blank line afterwards
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand, score, 0, 0, text);
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
}
AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
const MultiSequence& seq2,
- size_t seqNum2, char strand,
- const uchar* seqData2,
+ size_t seqNum2, const uchar* seqData2,
bool isTranslated, const Alphabet& alph,
const LastEvaluer& evaluer,
bool isExtraColumns) const {
@@ -406,6 +399,7 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
size_t seqNum1 = seq1.whichSequence(alnBeg1);
size_t seqStart1 = seq1.seqBeg(seqNum1);
size_t seqLen1 = seq1.seqLen(seqNum1);
+ char strand1 = seq1.strand(seqNum1);
size_t size2 = seq2.padLen(seqNum2);
size_t frameSize2 = isTranslated ? (size2 / 3) : 0;
@@ -413,6 +407,7 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
size_t alnEnd2 = aaToDna( end2(), frameSize2 );
size_t seqStart2 = seq2.seqBeg(seqNum2) - seq2.padBeg(seqNum2);
size_t seqLen2 = seq2.seqLen(seqNum2);
+ char strand2 = seq2.strand(seqNum2);
size_t alnSize = numColumns( frameSize2 );
size_t matches = matchCount( blocks, seq1.seqReader(), seqData2,
@@ -423,9 +418,14 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
size_t blastAlnBeg1 = alnBeg1 + 1; // 1-based coordinate
size_t blastAlnEnd1 = alnEnd1;
+ if (strand1 == '-') {
+ blastAlnBeg1 = seqStart1 + seqLen1 - alnBeg1;
+ blastAlnEnd1 = seqStart1 + seqLen1 - alnEnd1 + 1; // 1-based coordinate
+ seqStart1 = 0;
+ }
size_t blastAlnBeg2 = alnBeg2 + 1; // 1-based coordinate
size_t blastAlnEnd2 = alnEnd2;
- if (strand == '-') {
+ if (strand2 == '-') {
blastAlnBeg2 = size2 - alnBeg2;
blastAlnEnd2 = size2 - alnEnd2 + 1; // 1-based coordinate
/*
@@ -474,7 +474,7 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
if (isExtraColumns) w << t << s2 << t << s1 << t << sc;
w << '\n' << '\0';
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand, score,
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score,
alnSize, matches, text);
}
diff --git a/src/Centroid.cc b/src/Centroid.cc
index 36edd35..98303be 100644
--- a/src/Centroid.cc
+++ b/src/Centroid.cc
@@ -96,7 +96,6 @@ namespace cbrc{
}
void Centroid::initForwardMatrix(){
- scale.assign ( numAntidiagonals + 2, 1.0 ); // scaling
size_t n = xa.scoreEndIndex( numAntidiagonals );
if ( fM.size() < n ) {
@@ -110,9 +109,6 @@ namespace cbrc{
}
void Centroid::initBackwardMatrix(){
- mD.assign( numAntidiagonals + 2, 0.0 );
- mI.assign( numAntidiagonals + 2, 0.0 );
-
size_t n = xa.scoreEndIndex( numAntidiagonals );
bM.assign( n, 0.0 );
bD.assign( n, 0.0 );
@@ -120,51 +116,20 @@ namespace cbrc{
bP.assign( n, 0.0 );
}
- void Centroid::initDecodingMatrix(){
- X.resize( fM.size() );
- }
-
- void Centroid::updateScore( double score, size_t antiDiagonal, size_t cur ){
- if( bestScore < score ){
- bestScore = score;
- bestAntiDiagonal = antiDiagonal;
- bestPos1 = cur;
- }
- }
-
- // xxx this will go wrong for non-delimiters with severe mismatch scores
- static bool isDelimiter(uchar c, const double *expScores) {
- return expScores[c] <= 0.0;
- }
-
- void Centroid::forward( const uchar* seq1, const uchar* seq2,
- size_t start1, size_t start2,
- bool isForward, int globality,
- const GeneralizedAffineGapCosts& gap ){
-
- //std::cout << "[forward] start1=" << start1 << "," << "start2=" << start2 << "," << "isForward=" << isForward << std::endl;
- seq1 += start1;
- seq2 += start2;
- const ExpMatrixRow* pssm = isPssm ? pssmExp2 + start2 : 0;
- const int seqIncrement = isForward ? 1 : -1;
-
+ void Centroid::forward(const uchar* seq1, const uchar* seq2,
+ const ExpMatrixRow* pssm, bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap,
+ int globality) {
initForwardMatrix();
-
- double Z = 0.0; // partion function of forward values
-
+ const int seqIncrement = isExtendFwd ? 1 : -1;
const bool isAffine = gap.isAffine();
- const int E = gap.delExtend;
- const int F = gap.delExist;
- const int EI = gap.insExtend;
- const int FI = gap.insExist;
- const int P = gap.pairExtend;
- const double eE = EXP ( - E / T );
- const double eF = EXP ( - F / T );
- const double eEI = EXP ( - EI / T );
- const double eFI = EXP ( - FI / T );
- const double eP = EXP ( - P / T );
-
+ const double eE = EXP ( - gap.delExtend / T );
+ const double eF = EXP ( - gap.delExist / T );
+ const double eEI = EXP ( - gap.insExtend / T );
+ const double eFI = EXP ( - gap.insExist / T );
+ const double eP = EXP ( - gap.pairExtend / T );
assert( gap.insExist == gap.delExist || eP <= 0.0 );
+ double Z = 0.0; // partion function of forward values
for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals
const size_t seq1beg = seq1start( k );
@@ -193,26 +158,28 @@ namespace cbrc{
const double* fM0last = fM0 + xa.numCellsAndPads( k ) - 1;
- const uchar* s1 = seqPtr( seq1, isForward, seq1beg );
+ const uchar* s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
*fM0++ = *fD0++ = *fI0++ = *fP0++ = 0.0; // add one pad cell
if (! isPssm) {
- const uchar* s2 = seqPtr( seq2, isForward, seq2pos );
+ const uchar* s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
if (isAffine) {
while (1) {
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
+ const unsigned letter1 = *s1;
+ const unsigned letter2 = *s2;
+ const double matchProb = match_score[letter1][letter2];
*fD0 = xM * eF + xD;
*fI0 = (xM + xD) * eFI + xI;
- *fM0 = (xM + xD + xI) * match_score[ *s1 ][ *s2 ];
+ const double total = xM + xD + xI;
+ *fM0 = total * matchProb;
sum_f += xM;
- if( globality && (isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score)) ){
- Z += xM + xD + xI;
- }
+ if (globality && matchProb <= 0) Z += total; // xxx
+
if (fM0 == fM0last) break;
fM0++; fD0++; fI0++;
fM2++; fD1++; fI1++;
@@ -225,15 +192,17 @@ namespace cbrc{
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
const double xP = *fP2 * seP;
- *fD0 = xM * eF + xD + xP;
- *fI0 = (xM + xD) * eFI + xI + xP;
- *fM0 = (xM + xD + xI + xP) * match_score[ *s1 ][ *s2 ];
+ const unsigned letter1 = *s1;
+ const unsigned letter2 = *s2;
+ const double matchProb = match_score[letter1][letter2];
*fP0 = xM * eF + xP;
+ *fD0 = xM * eF + xP + xD;
+ *fI0 = (xM + xD) * eFI + (xI + xP);
+ const double total = (xM + xD) + (xI + xP);
+ *fM0 = total * matchProb;
sum_f += xM;
- if( globality && (isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score)) ){
- Z += xM + xD + xI + xP;
- }
+ if (globality && matchProb <= 0) Z += total; // xxx
+
if (fM0 == fM0last) break;
fM0++; fD0++; fI0++; fP0++;
fM2++; fD1++; fI1++; fP2++;
@@ -242,21 +211,23 @@ namespace cbrc{
}
}
} else {
- const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos );
+ const ExpMatrixRow* p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
if (isAffine) {
while (1) {
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
+ const unsigned letter1 = *s1;
+ const double *matchProbs = *p2;
+ const double matchProb = matchProbs[letter1];
*fD0 = xM * eF + xD;
*fI0 = (xM + xD) * eFI + xI;
- *fM0 = (xM + xD + xI) * (*p2)[ *s1 ];
+ const double total = xM + xD + xI;
+ *fM0 = total * matchProb;
sum_f += xM;
- if ( globality && (isDelimiter(0, *p2) ||
- isDelimiter(*s1, *pssm)) ){
- Z += xM + xD + xI;
- }
+ if (globality && matchProb <= 0) Z += total; // xxx
+
if (fM0 == fM0last) break;
fM0++; fD0++; fI0++;
fM2++; fD1++; fI1++;
@@ -269,15 +240,17 @@ namespace cbrc{
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
const double xP = *fP2 * seP;
- *fD0 = xM * eF + xD + xP;
- *fI0 = (xM + xD) * eFI + xI + xP;
- *fM0 = (xM + xD + xI + xP) * (*p2)[ *s1 ];
+ const unsigned letter1 = *s1;
+ const double *matchProbs = *p2;
+ const double matchProb = matchProbs[letter1];
*fP0 = xM * eF + xP;
+ *fD0 = xM * eF + xP + xD;
+ *fI0 = (xM + xD) * eFI + (xI + xP);
+ const double total = (xM + xD) + (xI + xP);
+ *fM0 = total * matchProb;
sum_f += xM;
- if ( globality && (isDelimiter(0, *p2) ||
- isDelimiter(*s1, *pssm)) ){
- Z += xM + xD + xI + xP;
- }
+ if (globality && matchProb <= 0) Z += total; // xxx
+
if (fM0 == fM0last) break;
fM0++; fD0++; fI0++; fP0++;
fM2++; fD1++; fI1++; fP2++;
@@ -299,33 +272,20 @@ namespace cbrc{
// added by M. Hamada
// compute posterior probabilities while executing backward algorithm
- void Centroid::backward( const uchar* seq1, const uchar* seq2,
- size_t start1, size_t start2,
- bool isForward, int globality,
- const GeneralizedAffineGapCosts& gap ){
-
- //std::cout << "[backward] start1=" << start1 << "," << "start2=" << start2 << "," << "isForward=" << isForward << std::endl;
- seq1 += start1;
- seq2 += start2;
- const ExpMatrixRow* pssm = isPssm ? pssmExp2 + start2 : 0;
- const int seqIncrement = isForward ? 1 : -1;
-
+ void Centroid::backward(const uchar* seq1, const uchar* seq2,
+ const ExpMatrixRow* pssm, bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap,
+ int globality) {
initBackwardMatrix();
-
+ const int seqIncrement = isExtendFwd ? 1 : -1;
const bool isAffine = gap.isAffine();
- const int E = gap.delExtend;
- const int F = gap.delExist;
- const int EI = gap.insExtend;
- const int FI = gap.insExist;
- const int P = gap.pairExtend;
- const double eE = EXP ( - E / T );
- const double eF = EXP ( - F / T );
- const double eEI = EXP ( - EI / T );
- const double eFI = EXP ( - FI / T );
- const double eP = EXP ( - P / T );
- double scaledUnit = 1.0;
-
+ const double eE = EXP ( - gap.delExtend / T );
+ const double eF = EXP ( - gap.delExist / T );
+ const double eEI = EXP ( - gap.insExtend / T );
+ const double eFI = EXP ( - gap.insExist / T );
+ const double eP = EXP ( - gap.pairExtend / T );
assert( gap.insExist == gap.delExist || eP <= 0.0 );
+ double scaledUnit = 1.0;
for( size_t k = numAntidiagonals; k-- > 0; ){
const size_t seq1beg = seq1start( k );
@@ -361,22 +321,24 @@ namespace cbrc{
double* mDout = &mD[ seq1beg ];
double* mIout = &mI[ seq2pos ];
- const uchar* s1 = seqPtr( seq1, isForward, seq1beg );
+ const uchar *s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
if (! isPssm ) {
- const uchar* s2 = seqPtr( seq2, isForward, seq2pos );
+ const uchar *s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
if (isAffine) {
while (1) {
- double yM = *bM0 * match_score[ *s1 ][ *s2 ];
- double yD = *bD0;
- double yI = *bI0;
+ const double matchProb = match_score[*s1][*s2];
+ const double yM = (*bM0) * matchProb;
+ const double yD = *bD0;
+ const double yI = *bI0;
double zM = yM + yD * eF + yI * eFI;
double zD = yM + yD + yI * eFI;
double zI = yM + yI;
if( globality ){
- if( isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score) ){
+ if( matchProb <= 0 ){
+ // xxx should get here only at delimiters, but will get
+ // here for non-delimiters with severe mismatch scores
zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
}
}else{
@@ -399,17 +361,17 @@ namespace cbrc{
}
} else {
while (1) {
- double yM = *bM0 * match_score[ *s1 ][ *s2 ];
- double yD = *bD0;
- double yI = *bI0;
- double yP = *bP0;
+ const double matchProb = match_score[*s1][*s2];
+ const double yM = (*bM0) * matchProb;
+ const double yD = *bD0;
+ const double yI = *bI0;
+ const double yP = *bP0;
double zM = yM + yD * eF + yI * eFI + yP * eF;
double zD = yM + yD + yI * eFI;
double zI = yM + yI;
- double zP = yM + yP + yD + yI;
+ double zP = yM + yD + yI + yP;
if( globality ){
- if( isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score) ){
+ if( matchProb <= 0 ){ // xxx
zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
zP += scaledUnit;
}
@@ -435,19 +397,19 @@ namespace cbrc{
}
}
} else {
- const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos );
+ const ExpMatrixRow *p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
if (isAffine) {
while (1) {
- double yM = *bM0 * ( *p2 )[ *s1 ];
- double yD = *bD0;
- double yI = *bI0;
+ const double matchProb = (*p2)[*s1];
+ const double yM = (*bM0) * matchProb;
+ const double yD = *bD0;
+ const double yI = *bI0;
double zM = yM + yD * eF + yI * eFI;
double zD = yM + yD + yI * eFI;
double zI = yM + yI;
if( globality ){
- if( isDelimiter(0, *p2) ||
- isDelimiter(*s1, *pssm) ){
+ if( matchProb <= 0 ){ // xxx
zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
}
}else{
@@ -470,17 +432,17 @@ namespace cbrc{
}
} else {
while (1) {
- double yM = *bM0 * ( *p2 )[ *s1 ];
- double yD = *bD0;
- double yI = *bI0;
- double yP = *bP0;
+ const double matchProb = (*p2)[*s1];
+ const double yM = (*bM0) * matchProb;
+ const double yD = *bD0;
+ const double yI = *bI0;
+ const double yP = *bP0;
double zM = yM + yD * eF + yI * eFI + yP * eF;
double zD = yM + yD + yI * eFI;
double zI = yM + yI;
- double zP = yM + yP + yD + yI;
+ double zP = yM + yD + yI + yP;
if( globality ){
- if( isDelimiter(0, *p2) ||
- isDelimiter(*s1, *pssm) ){
+ if( matchProb <= 0 ){ // xxx
zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
zP += scaledUnit;
}
@@ -509,27 +471,19 @@ namespace cbrc{
}
//std::cout << "# bM[0]=" << bM[0] << std::endl;
- //ExpectedCount ec;
- //computeExpectedCounts ( seq1, seq2, start1, start2, isForward, gap, ec );
- //ec.write (std::cerr);
}
double Centroid::dp( double gamma ){
- if (outputType == 5 ) return dp_centroid( gamma );
- else if (outputType == 6 ) return dp_ama (gamma);
+ bestScore = 0;
+ bestAntiDiagonal = 0;
+ bestPos1 = 0;
+ X.resize(fM.size());
+ if (outputType == 5) return dp_centroid(gamma);
+ if (outputType == 6) return dp_ama(gamma);
return 0;
}
- void Centroid::traceback( std::vector< SegmentPair >& chunks,
- double gamma ) const{
- if (outputType==5) traceback_centroid( chunks, gamma );
- else if (outputType==6) traceback_ama( chunks, gamma);
- }
-
double Centroid::dp_centroid( double gamma ){
-
- initDecodingMatrix();
-
for( size_t k = 1; k < numAntidiagonals; ++k ){ // loop over antidiagonals
const size_t scoreEnd = xa.scoreEndIndex( k );
double* X0 = &X[ scoreEnd ];
@@ -589,9 +543,6 @@ namespace cbrc{
}
double Centroid::dp_ama( double gamma ){
-
- initDecodingMatrix();
-
mX1.assign ( numAntidiagonals + 2, 1.0 );
mX2.assign ( numAntidiagonals + 2, 1.0 );
@@ -684,7 +635,7 @@ namespace cbrc{
// Return an ASCII code representing an error probability. The
// printable codes are 33--126, but here we use a maximum of 125, so
// that 126 is reserved for special cases.
- static uchar asciiProbability( double probCorrect ){
+ static char asciiProbability( double probCorrect ){
assert( probCorrect >= 0 );
//assert( probCorrect <= 1 ); // can fail: floating point is imperfect
double e = 1 - probCorrect;
@@ -693,10 +644,10 @@ namespace cbrc{
int i = static_cast<int>(g); // round fractions down
int j = i + 33;
int k = std::min( j, 125 );
- return static_cast<uchar>(k);
+ return static_cast<char>(k);
}
- void Centroid::getMatchAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void Centroid::getMatchAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq2end,
size_t length) const {
while (length) {
@@ -707,13 +658,13 @@ namespace cbrc{
}
}
- void Centroid::getDeleteAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void Centroid::getDeleteAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq1beg) const {
for (size_t i = seq1end; i > seq1beg; --i)
ambiguityCodes.push_back(asciiProbability(mD[i]));
}
- void Centroid::getInsertAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void Centroid::getInsertAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq2end, size_t seq2beg) const {
for (size_t i = seq2end; i > seq2beg; --i)
ambiguityCodes.push_back(asciiProbability(mI[i]));
@@ -729,25 +680,20 @@ namespace cbrc{
void Centroid::computeExpectedCounts ( const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
- bool isForward,
+ bool isExtendFwd,
const GeneralizedAffineGapCosts& gap,
ExpectedCount& c ) const{
seq1 += start1;
seq2 += start2;
const ExpMatrixRow* pssm = isPssm ? pssmExp2 + start2 : 0;
- const int seqIncrement = isForward ? 1 : -1;
+ const int seqIncrement = isExtendFwd ? 1 : -1;
const bool isAffine = gap.isAffine();
- const int E = gap.delExtend;
- const int F = gap.delExist;
- const int EI = gap.insExtend;
- const int FI = gap.insExist;
- const int P = gap.pairExtend;
- const double eE = EXP ( - E / T );
- const double eF = EXP ( - F / T );
- const double eEI = EXP ( - EI / T );
- const double eFI = EXP ( - FI / T );
- const double eP = EXP ( - P / T );
+ const double eE = EXP ( - gap.delExtend / T );
+ const double eF = EXP ( - gap.delExist / T );
+ const double eEI = EXP ( - gap.insExtend / T );
+ const double eFI = EXP ( - gap.insExist / T );
+ const double eP = EXP ( - gap.pairExtend / T );
assert( gap.insExist == gap.delExist || eP <= 0.0 );
@@ -761,8 +707,8 @@ namespace cbrc{
const double seEI = eEI * scale1;
const double seP = eP * scale12;
- const uchar* s1 = seqPtr( seq1, isForward, seq1beg );
- const uchar* s2 = seqPtr( seq2, isForward, seq2pos );
+ const uchar* s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
+ const uchar* s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
const size_t scoreEnd = xa.scoreEndIndex( k );
const double* bM0 = &bM[ scoreEnd + 1 ];
@@ -786,10 +732,12 @@ namespace cbrc{
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
- const double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ const unsigned letter1 = *s1;
+ const unsigned letter2 = *s2;
+ const double yM = *bM0 * match_score[letter1][letter2];
const double yD = *bD0;
const double yI = *bI0;
- c.emit[*s1][*s2] += (xM + xD + xI) * yM;
+ c.emit[letter1][letter2] += (xM + xD + xI) * yM;
c.MM += xM * yM;
c.DM += xD * yM;
c.IM += xI * yM;
@@ -810,11 +758,13 @@ namespace cbrc{
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
const double xP = *fP2 * seP;
- const double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ const unsigned letter1 = *s1;
+ const unsigned letter2 = *s2;
+ const double yM = *bM0 * match_score[letter1][letter2];
const double yD = *bD0;
const double yI = *bI0;
const double yP = *bP0;
- c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM;
+ c.emit[letter1][letter2] += (xM + xD + xI + xP) * yM;
c.MM += xM * yM;
c.DM += xD * yM;
c.IM += xI * yM;
@@ -837,17 +787,18 @@ namespace cbrc{
}
}
else {
- const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos );
+ const ExpMatrixRow* p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
if (isAffine) {
while (1) { // inner most loop
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
- const double yM = *bM0 * ( *p2 )[ *s1 ];
+ const unsigned letter1 = *s1;
+ const double yM = *bM0 * (*p2)[letter1];
const double yD = *bD0;
const double yI = *bI0;
- c.emit[*s1][*s2] += (xM + xD + xI) * yM; // xxx
+ c.emit[letter1][*s2] += (xM + xD + xI) * yM; // xxx
c.MM += xM * yM;
c.DM += xD * yM;
c.IM += xI * yM;
@@ -869,11 +820,12 @@ namespace cbrc{
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
const double xP = *fP2 * seP;
- const double yM = *bM0 * ( *p2 )[ *s1 ];
+ const unsigned letter1 = *s1;
+ const double yM = *bM0 * (*p2)[letter1];
const double yD = *bD0;
const double yI = *bI0;
const double yP = *bP0;
- c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM; // xxx
+ c.emit[letter1][*s2] += (xM + xD + xI + xP) * yM; // xxx
c.MM += xM * yM;
c.DM += xD * yM;
c.IM += xI * yM;
diff --git a/src/Centroid.hh b/src/Centroid.hh
index 9c3b8f5..afb7603 100644
--- a/src/Centroid.hh
+++ b/src/Centroid.hh
@@ -25,6 +25,7 @@ namespace cbrc{
ExpectedCount ();
std::ostream& write (std::ostream& os) const;
};
+
/**
* (1) Forward and backward algorithm on the DP region given by Xdrop algorithm
* (2) \gamma-centroid decoding
@@ -40,23 +41,31 @@ namespace cbrc{
const uchar* sequenceBeg, const uchar* qualityBeg );
void setOutputType( int m ) { outputType = m; }
- void reset( ) {
+ // start1 is the index of the first letter to look at in seq1
+ // start2 is the index of the first letter to look at in seq2
+
+ void doForwardBackwardAlgorithm(const uchar* seq1, const uchar* seq2,
+ size_t start1, size_t start2,
+ bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap,
+ int globality) {
+ seq1 += start1;
+ seq2 += start2;
+ const ExpMatrixRow *pssm = isPssm ? pssmExp2 + start2 : 0;
numAntidiagonals = xa.numAntidiagonals();
- bestScore = 0;
- bestAntiDiagonal = 0;
- bestPos1 =0;
+ scale.assign(numAntidiagonals + 2, 1.0);
+ forward(seq1, seq2, pssm, isExtendFwd, gap, globality);
+ mD.assign(numAntidiagonals + 2, 0.0);
+ mI.assign(numAntidiagonals + 2, 0.0);
+ backward(seq1, seq2, pssm, isExtendFwd, gap, globality);
}
- void forward( const uchar* seq1, const uchar* seq2,
- size_t start1, size_t start2, bool isForward,
- int globality, const GeneralizedAffineGapCosts& gap );
-
- void backward( const uchar* seq1, const uchar* seq2,
- size_t start1, size_t start2, bool isForward,
- int globality, const GeneralizedAffineGapCosts& gap );
-
double dp( double gamma );
- void traceback( std::vector< SegmentPair >& chunks, double gamma ) const;
+
+ void traceback(std::vector<SegmentPair> &chunks, double gamma) const {
+ if (outputType==5) traceback_centroid(chunks, gamma);
+ if (outputType==6) traceback_ama(chunks, gamma);
+ }
double dp_centroid( double gamma );
void traceback_centroid( std::vector< SegmentPair >& chunks, double gamma ) const;
@@ -64,23 +73,23 @@ namespace cbrc{
double dp_ama( double gamma );
void traceback_ama( std::vector< SegmentPair >& chunks, double gamma ) const;
- void getMatchAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void getMatchAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq2end,
size_t length) const;
- void getDeleteAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void getDeleteAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq1beg) const;
- void getInsertAmbiguities(std::vector<uchar>& ambiguityCodes,
+ void getInsertAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq2end, size_t seq2beg) const;
double logPartitionFunction() const; // a.k.a. full score, forward score
// Added by MH (2008/10/10) : compute expected counts for transitions and emissions
- void computeExpectedCounts ( const uchar* seq1, const uchar* seq2,
- size_t start1, size_t start2, bool isForward,
- const GeneralizedAffineGapCosts& gap,
- ExpectedCount& count ) const;
+ void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
+ size_t start1, size_t start2, bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap,
+ ExpectedCount& count) const;
private:
typedef double ExpMatrixRow[scoreMatrixRowSize];
@@ -119,24 +128,30 @@ namespace cbrc{
size_t bestAntiDiagonal;
size_t bestPos1;
+ void forward(const uchar* seq1, const uchar* seq2,
+ const ExpMatrixRow* pssm, bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap, int globality);
+
+ void backward(const uchar* seq1, const uchar* seq2,
+ const ExpMatrixRow* pssm, bool isExtendFwd,
+ const GeneralizedAffineGapCosts& gap, int globality);
+
void initForwardMatrix();
void initBackwardMatrix();
- void initDecodingMatrix();
- void updateScore( double score, size_t antiDiagonal, size_t cur );
+ void updateScore(double score, size_t antiDiagonal, size_t cur) {
+ if (bestScore < score) {
+ bestScore = score;
+ bestAntiDiagonal = antiDiagonal;
+ bestPos1 = cur;
+ }
+ }
// start of the x-drop region (i.e. number of skipped seq1 letters
// before the x-drop region) for this antidiagonal
size_t seq1start( size_t antidiagonal ) const {
return xa.scoreEndIndex( antidiagonal ) - xa.scoreOrigin( antidiagonal );
}
-
- // get a pointer into a sequence, taking start and direction into account
- template < typename T >
- static const T* seqPtr( const T* seq, bool isForward, size_t pos ){
- if( isForward ) return seq + pos;
- else return seq - pos - 1;
- }
};
}
diff --git a/src/CyclicSubsetSeed.cc b/src/CyclicSubsetSeed.cc
index 8017fde..bc2f525 100644
--- a/src/CyclicSubsetSeed.cc
+++ b/src/CyclicSubsetSeed.cc
@@ -2,7 +2,7 @@
#include "CyclicSubsetSeed.hh"
#include "CyclicSubsetSeedData.hh"
-#include "io.hh"
+#include "zio.hh"
#include "stringify.hh"
#include <algorithm> // sort
#include <sstream>
@@ -23,7 +23,7 @@ std::string CyclicSubsetSeed::stringFromName( const std::string& name ){
if( name == subsetSeeds[i].name )
return subsetSeeds[i].text;
- return slurp( name );
+ return slurp( name.c_str() );
}
std::string
@@ -122,8 +122,9 @@ void CyclicSubsetSeed::appendPosition( std::istream& inputLine,
std::string subset;
for( size_t i = 0; i < inputWord.size(); ++i ){
- uchar upper = std::toupper( inputWord[i] );
- uchar lower = std::tolower( inputWord[i] );
+ uchar c = inputWord[i];
+ uchar upper = std::toupper(c);
+ uchar lower = std::tolower(c);
addLetter( &numbersToSubsets[0], upper, subsetNum, letterCode );
subset += upper;
if( !isMaskLowercase && lower != upper ){
diff --git a/src/LastEvaluer.cc b/src/LastEvaluer.cc
index 72b1dae..372123a 100644
--- a/src/LastEvaluer.cc
+++ b/src/LastEvaluer.cc
@@ -6,6 +6,7 @@
#include <algorithm>
#include <cstring>
+#include <iostream>
#define COUNTOF(a) (sizeof (a) / sizeof *(a))
@@ -263,7 +264,8 @@ void LastEvaluer::init(const char *matrixName,
int insEpen,
int frameshiftCost,
const GeneticCode &geneticCode,
- bool isStandardGeneticCode) {
+ bool isStandardGeneticCode,
+ int verbosity) {
const double lambdaTolerance = 0.01;
const double kTolerance = 0.05;
const double maxMegabytes = 500;
@@ -359,14 +361,20 @@ void LastEvaluer::init(const char *matrixName,
evaluer.set_gapped_computation_parameters_simplified(maxSeconds);
for (int i = 0; ; ++i) {
double t = Sls::default_importance_sampling_temperature + 0.01 * i;
+ if (verbosity > 0) std::cerr << "try temperature=" << t << " ";
try {
evaluer.initGapped(alphabetSize, &matrix[0],
letterFreqs2, letterFreqs1,
delOpen, delEpen, insOpen, insEpen,
true, lambdaTolerance, kTolerance,
0, maxMegabytes, randomSeed, t);
+ if (verbosity > 0) std::cerr << "OK\n";
break;
} catch (const Sls::error& e) {
+ if (verbosity > 0) std::cerr << "NG\n";
+ if (verbosity > 1) {
+ std::cerr << "ALP: " << e.error_code << ": " << e.st;
+ }
if (i == 20) throw;
}
}
diff --git a/src/LastEvaluer.hh b/src/LastEvaluer.hh
index eee084e..4b05d1f 100644
--- a/src/LastEvaluer.hh
+++ b/src/LastEvaluer.hh
@@ -50,7 +50,8 @@ public:
int insEpen,
int frameshiftCost,
const GeneticCode &geneticCode,
- bool isStandardGeneticCode);
+ bool isStandardGeneticCode,
+ int verbosity);
void setSearchSpace(double databaseLength, // number of database letters
double numOfStrands) { // 1 or 2
diff --git a/src/LastdbArguments.cc b/src/LastdbArguments.cc
index 95060ec..66535f7 100644
--- a/src/LastdbArguments.cc
+++ b/src/LastdbArguments.cc
@@ -31,6 +31,7 @@ LastdbArguments::LastdbArguments() :
tantanSetting(0),
isCaseSensitive(false),
seedPatterns(0),
+ strand(1),
volumeSize(-1),
indexStep(1),
minimizerWindow(1),
@@ -66,9 +67,11 @@ Advanced Options (default settings):\n\
+ stringify(indexStep) + ")\n\
-W: use \"minimum\" positions in sliding windows of W consecutive positions ("
+ stringify(minimizerWindow) + ")\n\
+-S: strand: 0=reverse, 1=forward, 2=both ("
+ + stringify(strand) + ")\n\
-s: volume size (unlimited)\n\
-Q: input format: 0=fasta, 1=fastq-sanger, 2=fastq-solexa, 3=fastq-illumina ("
- + stringify(inputFormat) + ")\n\
+ + stringify(inputFormat) + ")\n\
-P: number of parallel threads ("
+ stringify(numOfThreads) + ")\n\
-m: seed pattern\n\
@@ -86,8 +89,10 @@ Report bugs to: last-align (ATmark) googlegroups (dot) com\n\
LAST home page: http://last.cbrc.jp/\n\
";
+ static const char sOpts[] = "hVpR:cm:S:s:w:W:P:u:a:i:b:C:xvQ:";
+
int c;
- while( (c = myGetopt(argc, argv, "hVpR:cm:s:w:W:P:u:a:i:b:C:xvQ:")) != -1 ) {
+ while ((c = myGetopt(argc, argv, sOpts)) != -1) {
switch(c){
case 'h':
std::cout << help;
@@ -113,6 +118,10 @@ LAST home page: http://last.cbrc.jp/\n\
case 'm':
seedPatterns.push_back(optarg);
break;
+ case 'S':
+ unstringify( strand, optarg );
+ if( strand < 0 || strand > 2 ) badopt( c, optarg );
+ break;
case 's':
unstringifySize( volumeSize, optarg );
break;
diff --git a/src/LastdbArguments.hh b/src/LastdbArguments.hh
index 4eb25db..95d7a72 100644
--- a/src/LastdbArguments.hh
+++ b/src/LastdbArguments.hh
@@ -34,6 +34,7 @@ struct LastdbArguments{
int tantanSetting;
bool isCaseSensitive;
std::vector< std::string > seedPatterns;
+ int strand;
size_t volumeSize;
size_t indexStep;
size_t minimizerWindow;
diff --git a/src/MultiSequence.cc b/src/MultiSequence.cc
index 1f1d05a..463ca8a 100644
--- a/src/MultiSequence.cc
+++ b/src/MultiSequence.cc
@@ -19,12 +19,21 @@ void MultiSequence::initForAppending( indexT padSizeIn ){
}
void MultiSequence::reinitForAppending(){
- seq.v.erase( seq.v.begin(), seq.v.begin() + ends.v.back() - padSize );
- names.v.erase( names.v.begin(),
- names.v.begin() + nameEnds.v[ finishedSequences() ] );
+ size_t n = finishedSequences();
+ size_t s = padBeg(n);
+
+ seq.v.erase(seq.v.begin(), seq.v.begin() + s);
+ names.v.erase(names.v.begin(), names.v.begin() + nameEnds.v[n]);
ends.v.resize(1);
nameEnds.v.resize(1);
if( !names.v.empty() ) nameEnds.v.push_back( names.v.size() );
+
+ qualityScores.v.erase(qualityScores.v.begin(),
+ qualityScores.v.begin() + s * qualsPerLetter());
+
+ if (!pssm.empty()) {
+ pssm.erase(pssm.begin(), pssm.begin() + s * scoreMatrixRowSize);
+ }
}
void MultiSequence::fromFiles( const std::string& baseName, indexT seqCount,
@@ -91,34 +100,71 @@ MultiSequence::appendFromFasta( std::istream& stream, indexT maxSeqLen ){
++inpos;
}
- if( isFinishable(maxSeqLen) ) finish();
+ if (isRoomToAppendPad(maxSeqLen)) finish();
return stream;
}
-void MultiSequence::finish(){
- assert( !isFinished() );
- seq.v.insert( seq.v.end(), padSize, ' ' );
- ends.v.push_back( seq.v.size() );
- assert( ends.v.back() == seq.v.size() );
+MultiSequence::indexT MultiSequence::whichSequence( indexT coordinate ) const{
+ const indexT* u = std::upper_bound( ends.begin(), ends.end(), coordinate );
+ assert( u != ends.begin() && u != ends.end() );
+ return u - ends.begin() - 1;
}
-void MultiSequence::unfinish(){
- assert( isFinished() );
- ends.v.pop_back();
- seq.v.erase( seq.v.end() - padSize, seq.v.end() );
+static void reverseComplementPssm(int *beg, int *end,
+ const uchar *complement) {
+ while (beg < end) {
+ end -= scoreMatrixRowSize;
+ for (unsigned i = 0; i < scoreMatrixRowSize; ++i) {
+ unsigned j = complement[i];
+ if (beg < end || i < j) std::swap(beg[i], end[j]);
+ }
+ beg += scoreMatrixRowSize;
+ }
}
-bool MultiSequence::isFinishable( indexT maxSeqLen ) const{
- return seq.v.size() + padSize <= maxSeqLen;
-}
+void MultiSequence::reverseComplementOneSequence(indexT seqNum,
+ const uchar *complement) {
+ size_t b = seqBeg(seqNum);
+ size_t e = seqEnd(seqNum);
-MultiSequence::indexT MultiSequence::whichSequence( indexT coordinate ) const{
- const indexT* u = std::upper_bound( ends.begin(), ends.end(), coordinate );
- assert( u != ends.begin() && u != ends.end() );
- return u - ends.begin() - 1;
+ uchar *s = seqWriter();
+ std::reverse(s + b, s + e);
+ for (size_t i = b; i < e; ++i) {
+ s[i] = complement[s[i]];
+ }
+
+ reverse(qualityScores.v.begin() + b * qualsPerLetter(),
+ qualityScores.v.begin() + e * qualsPerLetter());
+
+ if (!pssm.empty()) {
+ int *p = &pssm[0];
+ reverseComplementPssm(p + b * scoreMatrixRowSize,
+ p + e * scoreMatrixRowSize, complement);
+ }
+
+ char &strandChar = names.v[nameEnds.v[seqNum + 1] - 1];
+ strandChar = "\n\t"[strandChar == '\n'];
}
-std::string MultiSequence::seqName( indexT seqNum ) const{
- return std::string( names.begin() + nameEnds[ seqNum ],
- names.begin() + nameEnds[ seqNum + 1 ] );
+void MultiSequence::duplicateOneSequence(indexT seqNum) {
+ indexT nameBeg = nameEnds[seqNum];
+ indexT nameEnd = nameEnds[seqNum + 1];
+ for (indexT i = nameBeg; i < nameEnd; ++i) {
+ names.v.push_back(names.v[i]);
+ }
+ finishName();
+
+ size_t b = seqBeg(seqNum);
+ size_t e = padEnd(seqNum);
+
+ for (size_t i = b; i < e; ++i) {
+ seq.v.push_back(seq.v[i]);
+ }
+ ends.v.push_back(seq.v.size());
+
+ for (size_t i = b * qualsPerLetter(); i < e * qualsPerLetter(); ++i) {
+ qualityScores.v.push_back(qualityScores.v[i]);
+ }
+
+ assert(pssm.empty()); // implement this if & when needed
}
diff --git a/src/MultiSequence.hh b/src/MultiSequence.hh
index 3876058..465634a 100644
--- a/src/MultiSequence.hh
+++ b/src/MultiSequence.hh
@@ -29,6 +29,12 @@ class MultiSequence{
// re-initialize, but keep the last sequence if it is unfinished
void reinitForAppending();
+ void eraseAllButTheLastSequence() {
+ ends.v.pop_back();
+ reinitForAppending();
+ ends.v.push_back(seq.v.size());
+ }
+
// read seqCount finished sequences, and their names, from binary files
void fromFiles( const std::string& baseName, indexT seqCount,
size_t qualitiesPerLetter );
@@ -53,21 +59,12 @@ class MultiSequence{
const uchar* lettersToNumbers,
bool isMaskLowercase );
- // finish the last sequence: add final pad and end coordinate
- void finish();
-
- // unfinish the last sequence: remove final pad and end coordinate
- void unfinish();
-
// did we finish reading the last sequence?
bool isFinished() const{ return ends.size() == nameEnds.size(); }
// how many finished sequences are there?
indexT finishedSequences() const{ return ends.size() - 1; }
- // total length of finished sequences plus delimiters
- indexT finishedSize() const{ return ends.back(); }
-
// total length of finished and unfinished sequences plus delimiters
size_t unfinishedSize() const{ return seq.size(); }
@@ -80,7 +77,21 @@ class MultiSequence{
indexT padEnd( indexT seqNum ) const{ return ends[seqNum+1]; }
indexT seqLen( indexT seqNum ) const{ return seqEnd(seqNum)-seqBeg(seqNum); }
indexT padLen( indexT seqNum ) const{ return padEnd(seqNum)-padBeg(seqNum); }
- std::string seqName( indexT seqNum ) const;
+
+ std::string seqName(indexT seqNum) const {
+ const char *n = names.begin();
+ indexT b = nameEnds[seqNum];
+ indexT e = nameEnds[seqNum + 1];
+ if (e > b && n[e-1] <= ' ') --e;
+ return std::string(n + b, n + e);
+ }
+
+ char strand(indexT seqNum) const {
+ const char *n = names.begin();
+ indexT b = nameEnds[seqNum];
+ indexT e = nameEnds[seqNum + 1];
+ return (e > b && n[e-1] == '\t') ? '-' : '+';
+ }
// get a pointer to the start of the sequence data
const uchar* seqReader() const{ return seq.begin(); }
@@ -94,19 +105,17 @@ class MultiSequence{
: reinterpret_cast< const ScoreMatrixRow* >( &pssm[0] );
}
- /* */ ScoreMatrixRow* pssmWriter() {
- return pssm.empty() ? 0
- : reinterpret_cast< ScoreMatrixRow* >( &pssm[0] );
- }
-
// get a pointer to the start of the quality data
const uchar* qualityReader() const{ return qualityScores.begin(); }
- /***/ uchar* qualityWriter() { return &qualityScores.v[0]; }
// How many quality scores are there per letter? There might be
// none at all, one per letter, or several (e.g. 4) per letter.
size_t qualsPerLetter() const { return qualityScoresPerLetter; }
+ void reverseComplementOneSequence(indexT seqNum, const uchar *complement);
+
+ void duplicateOneSequence(indexT seqNum);
+
private:
indexT padSize; // number of delimiter chars between sequences
VectorOrMmap<uchar> seq; // concatenated sequences
@@ -140,57 +149,30 @@ class MultiSequence{
void addName(const std::string &name) { // add a new sequence name
names.v.insert(names.v.end(), name.begin(), name.end());
+ names.v.push_back('\n');
finishName();
}
- void finishQual() { // add delimiter to the end of the quality scores
+ void appendQualPad() { // add delimiter to the end of the quality scores
uchar padQualityScore = 64; // should never be used, but a valid value
size_t s = padSize * qualityScoresPerLetter;
qualityScores.v.insert(qualityScores.v.end(), s, padQualityScore);
}
- void finishPssm() { // add delimiter to the end of the PSSM
+ void appendPssmPad() { // add delimiter to the end of the PSSM
pssm.insert(pssm.end(), padSize * scoreMatrixRowSize, -INF);
}
- // can we finish the last sequence and stay within the memory limit?
- bool isFinishable( indexT maxSeqLen ) const;
-};
-
-inline void reverseComplementPssm(ScoreMatrixRow *beg, ScoreMatrixRow *end,
- const uchar *complement) {
- while (beg < end) {
- --end;
- for (unsigned i = 0; i < scoreMatrixRowSize; ++i) {
- unsigned j = complement[i];
- if (beg < end || i < j) std::swap((*beg)[i], (*end)[j]);
- }
- ++beg;
+ bool isRoomToAppendPad(indexT maxSeqLen) const {
+ return seq.v.size() <= maxSeqLen && padSize <= maxSeqLen - seq.v.size();
}
-}
-
-inline void reverseComplementOneSequence(MultiSequence &m,
- const uchar *complement,
- size_t seqNum) {
- size_t b = m.seqBeg(seqNum);
- size_t e = m.seqEnd(seqNum);
- uchar *s = m.seqWriter();
- std::reverse(s + b, s + e);
- for (size_t i = b; i < e; ++i) {
- s[i] = complement[s[i]];
- }
-
- size_t qpl = m.qualsPerLetter();
- if (qpl) {
- std::reverse(m.qualityWriter() + b * qpl, m.qualityWriter() + e * qpl);
- }
-
- ScoreMatrixRow *p = m.pssmWriter();
- if (p) {
- reverseComplementPssm(p + b, p + e, complement);
+ // finish the last sequence: add final pad and end coordinate
+ void finish() {
+ seq.v.insert(seq.v.end(), padSize, ' ');
+ ends.v.push_back(seq.v.size());
}
-}
+};
// Divide the sequences into a given number of roughly-equally-sized
// chunks, and return the first sequence in the Nth chunk.
@@ -198,7 +180,7 @@ inline size_t firstSequenceInChunk(const MultiSequence &m,
size_t numOfChunks, size_t chunkNum) {
size_t numOfSeqs = m.finishedSequences();
size_t beg = m.seqBeg(0);
- size_t end = m.padEnd(numOfSeqs - 1) - 1;
+ size_t end = m.seqBeg(numOfSeqs) - 1;
unsigned long long len = end - beg; // try to avoid overflow
size_t pos = beg + len * chunkNum / numOfChunks;
size_t seqNum = m.whichSequence(pos);
diff --git a/src/MultiSequenceQual.cc b/src/MultiSequenceQual.cc
index 415fa14..d83f7da 100644
--- a/src/MultiSequenceQual.cc
+++ b/src/MultiSequenceQual.cc
@@ -6,9 +6,6 @@
#include <cctype> // toupper
#include <limits> // numeric_limits
-// make C++ tolerable:
-#define CI(type) std::vector<type>::const_iterator
-
#define ERR(x) throw std::runtime_error(x)
using namespace cbrc;
@@ -17,12 +14,7 @@ std::istream&
MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
// initForAppending:
qualityScoresPerLetter = 1;
- if( qualityScores.v.empty() ) finishQual();
-
- // reinitForAppending:
- if( qualityScores.v.size() > seq.v.size() )
- qualityScores.v.erase( qualityScores.v.begin(),
- qualityScores.v.end() - seq.v.size() );
+ if( qualityScores.v.empty() ) appendQualPad();
if( isFinished() ){
uchar c = '@';
@@ -46,9 +38,9 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
if( seq.v.size() != qualityScores.v.size() ) ERR( "bad FASTQ data" );
}
- if( isFinishable(maxSeqLen) ){
+ if (isRoomToAppendPad(maxSeqLen)) {
finish();
- finishQual();
+ appendQualPad();
}
return stream;
@@ -57,16 +49,9 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
std::istream&
MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen,
unsigned alphSize, const uchar decode[] ){
- size_t qualSize = seq.v.size() * alphSize;
-
// initForAppending:
qualityScoresPerLetter = alphSize;
- if( qualityScores.v.empty() ) finishQual();
-
- // reinitForAppending:
- if( qualityScores.v.size() > qualSize )
- qualityScores.v.erase( qualityScores.v.begin(),
- qualityScores.v.end() - qualSize );
+ if( qualityScores.v.empty() ) appendQualPad();
if( isFinished() ){
std::string line;
@@ -78,6 +63,8 @@ MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen,
std::string name = stringify( ++lineCount );
addName(name);
+ size_t oldSize = qualityScores.v.size();
+
std::istringstream iss(line);
int q;
while( iss >> q ){
@@ -86,18 +73,19 @@ MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen,
qualityScores.v.push_back( q + 64 ); // ASCII-encode the quality score
}
- if( qualityScores.v.size() % alphSize != 0 ) ERR( "bad PRB data" );
+ size_t newSize = qualityScores.v.size();
+ if (newSize % qualityScoresPerLetter != 0) ERR("bad PRB data");
- for( CI(uchar) i = qualityScores.v.begin() + qualSize;
- i < qualityScores.v.end(); i += alphSize ){
- unsigned maxIndex = std::max_element( i, i + alphSize ) - i;
+ for (size_t i = oldSize; i < newSize; i += qualityScoresPerLetter) {
+ const uchar *q = &qualityScores.v[i];
+ unsigned maxIndex = std::max_element(q, q + qualityScoresPerLetter) - q;
seq.v.push_back( decode[ maxIndex ] );
}
}
- if( isFinishable(maxSeqLen) ){
+ if (isRoomToAppendPad(maxSeqLen)) {
finish();
- finishQual();
+ appendQualPad();
}
return stream;
@@ -118,7 +106,8 @@ std::istream& MultiSequence::readPssmHeader( std::istream& stream ){
while( iss >> word ){
if( word.size() == 1 ){
- uchar letter = std::toupper( word[0] );
+ uchar c = word[0];
+ uchar letter = std::toupper(c);
// allow for PSI-BLAST format, with repeated letters:
if( pssmColumnLetters.size() && pssmColumnLetters[0] == letter ) break;
pssmColumnLetters.push_back(letter);
@@ -140,14 +129,8 @@ std::istream&
MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen,
const uchar* lettersToNumbers,
bool isMaskLowercase ){
- size_t pssmSize = seq.v.size() * scoreMatrixRowSize;
-
// initForAppending:
- if( pssm.empty() ) finishPssm();
-
- // reinitForAppending:
- if( pssm.size() > pssmSize )
- pssm.erase( pssm.begin(), pssm.end() - pssmSize );
+ if( pssm.empty() ) appendPssmPad();
if( isFinished() ){
readPssmHeader(stream);
@@ -181,16 +164,17 @@ MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen,
if( isMaskLowercase ) scores[i] = std::min(scores[i], 0);
if( maskColumn != column ) row[maskColumn] = scores[i];
}
- unsigned delimiterColumn = lettersToNumbers[' '];
+ uchar delimiter = ' ';
+ unsigned delimiterColumn = lettersToNumbers[delimiter];
assert( delimiterColumn < scoreMatrixRowSize );
row[delimiterColumn] = -INF;
stream.ignore( std::numeric_limits<std::streamsize>::max(), '\n' );
}
- if( isFinishable(maxSeqLen) ){
+ if (isRoomToAppendPad(maxSeqLen)) {
finish();
- finishPssm();
+ appendPssmPad();
}
if( !stream.bad() ) stream.clear();
diff --git a/src/ScoreMatrix.cc b/src/ScoreMatrix.cc
index 8c68032..6f984ec 100644
--- a/src/ScoreMatrix.cc
+++ b/src/ScoreMatrix.cc
@@ -2,7 +2,7 @@
#include "ScoreMatrix.hh"
#include "ScoreMatrixData.hh"
-#include "io.hh"
+#include "zio.hh"
#include <sstream>
#include <iomanip>
#include <algorithm> // min, max
@@ -32,7 +32,7 @@ std::string ScoreMatrix::stringFromName( const std::string& name ){
if( n == scoreMatrices[i].name )
return scoreMatrices[i].text;
- return slurp( n );
+ return slurp( n.c_str() );
}
void ScoreMatrix::matchMismatch( int match, int mismatch,
@@ -104,7 +104,8 @@ void ScoreMatrix::init( const uchar encode[] ){
}
// set a hugely negative score for the delimiter symbol:
- uchar z = encode[' '];
+ uchar delimiter = ' ';
+ uchar z = encode[delimiter];
assert( z < MAT );
for( unsigned i = 0; i < MAT; ++i ){
caseSensitive[z][i] = -INF;
diff --git a/src/alp/sls_pvalues.cpp b/src/alp/sls_pvalues.cpp
index 515a41c..2220477 100644
--- a/src/alp/sls_pvalues.cpp
+++ b/src/alp/sls_pvalues.cpp
@@ -1213,14 +1213,14 @@ ALP_set_of_parameters &gumbel_params_)
bool pvalues::assert_Gumbel_parameters(
const ALP_set_of_parameters &par_)//a set of Gumbel parameters
{
- if(par_.lambda<=0||
+ if(!(par_.lambda>0)||
par_.lambda_error<0||
//the parameters C and K_C are not necessary for the P-value calculation
//par_.C<0||
//par_.C_error<0||
- par_.K<=0||
+ !(par_.K>0)||
par_.K_error<0||
par_.a_I<0||
diff --git a/src/io.cc b/src/io.cc
deleted file mode 100644
index f1abcce..0000000
--- a/src/io.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2008, 2009, 2010 Martin C. Frith
-
-#include "io.hh"
-#include <iostream>
-#include <iterator> // istreambuf_iterator
-
-namespace cbrc{
-
-std::istream& openIn( const std::string& fileName, mcf::izstream& ifs ){
- if( fileName == "-" ) return std::cin;
- ifs.open( fileName.c_str() );
- if( !ifs ) throw std::runtime_error("can't open file: " + fileName);
- return ifs;
-}
-
-std::string slurp( const std::string& fileName ){
- mcf::izstream inFileStream;
- std::istream& in = openIn( fileName, inFileStream );
- std::istreambuf_iterator<char> beg(in), end;
- return std::string( beg, end );
-}
-
-} // end namespace cbrc
diff --git a/src/io.hh b/src/io.hh
index c08cd8c..24eadf9 100644
--- a/src/io.hh
+++ b/src/io.hh
@@ -1,14 +1,8 @@
// Copyright 2008, 2009, 2010 Martin C. Frith
-// Generally useful input/output functions, mostly for binary reading
-// and writing of vectors.
+#ifndef IO_HH
+#define IO_HH
-#ifndef IO_H
-#define IO_H
-
-#include "mcf_zstream.hh"
-
-#include <vector>
#include <string>
#include <stdexcept>
#include <fstream>
@@ -16,30 +10,6 @@
namespace cbrc{
-// open an input file, but if the name is "-", just return cin
-std::istream& openIn( const std::string& fileName, mcf::izstream& ifs );
-
-// read a file into a string, but if the name is "-", read cin
-std::string slurp( const std::string& fileName );
-
-template <typename T> // T should be a vector-iterator or a pointer
-void memoryFromStream( T beg, T end, std::istream& s ){
- assert( beg < end );
- enum { CHUNK_SIZE = 1073741824 }; // need to do big reads in chunks: why?
- char * b = reinterpret_cast<char*>(&(*beg));
- char * e = reinterpret_cast<char*>(&(*end));
- while( e - b > CHUNK_SIZE ){
- s.read( b, CHUNK_SIZE );
- b += CHUNK_SIZE;
- }
- s.read( b, e - b );
-}
-
-template<typename T>
-void vectorFromStream( std::vector<T>& v, std::istream& s ){
- memoryFromStream( v.begin(), v.end(), s );
-}
-
template<typename T> // T should be a vector-iterator or a pointer
void memoryToStream( T beg, T end, std::ostream& s ){
assert( beg < end );
@@ -53,26 +23,6 @@ void memoryToStream( T beg, T end, std::ostream& s ){
s.write( b, e - b );
}
-template<typename T>
-void vectorToStream( const std::vector<T>& v, std::ostream& s ){
- memoryToStream( v.begin(), v.end(), s );
-}
-
-template<typename T> // T should be a vector-iterator or a pointer
-void memoryFromBinaryFile( T beg, T end, const std::string& fileName ){
- if( beg == end ) return;
- std::ifstream file( fileName.c_str(), std::ios::binary );
- if( !file ) throw std::runtime_error( "can't open file: " + fileName );
- memoryFromStream( beg, end, file );
- if( !file ) throw std::runtime_error( "can't read file: " + fileName );
-}
-
-template<typename T>
-void vectorFromBinaryFile( std::vector<T>& v,
- const std::string& fileName ){
- memoryFromBinaryFile( v.begin(), v.end(), fileName );
-}
-
template<typename T> // T should be a vector-iterator or a pointer
void memoryToBinaryFile( T beg, T end, const std::string& fileName ){
if( beg == end ) return;
@@ -83,11 +33,6 @@ void memoryToBinaryFile( T beg, T end, const std::string& fileName ){
if( !file ) throw std::runtime_error( "can't write file: " + fileName );
}
-template<typename T>
-void vectorToBinaryFile( const std::vector<T>& v,
- const std::string& fileName ){
- memoryToBinaryFile( v.begin(), v.end(), fileName );
}
-} // end namespace cbrc
-#endif // IO_H
+#endif
diff --git a/src/last-pair-probs.cc b/src/last-pair-probs.cc
index 21adc33..220d3d0 100644
--- a/src/last-pair-probs.cc
+++ b/src/last-pair-probs.cc
@@ -3,7 +3,7 @@
#include "last-pair-probs.hh"
-#include "io.hh"
+#include "zio.hh"
#include "stringify.hh"
#include <algorithm>
@@ -12,7 +12,6 @@
#include <cmath>
#include <cstdlib> // atof
#include <cstring> // strncmp
-#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
@@ -617,8 +616,10 @@ void lastPairProbs(LastPairProbsOptions& opts) {
if (!opts.isFraglen || !opts.isSdev) {
mcf::izstream inFile1, inFile2;
- std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin;
- std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1;
+ std::istream& in1 =
+ (n > 0) ? cbrc::openIn(inputs[0].c_str(), inFile1) : std::cin;
+ std::istream& in2 =
+ (n > 1) ? cbrc::openIn(inputs[1].c_str(), inFile2) : in1;
if (n < 2) skipOneBatchMarker(in1);
std::vector<long> lengths = readQueryPairs1pass(in1, in2, 1.0, 1.0,
opts.circular);
@@ -627,8 +628,10 @@ void lastPairProbs(LastPairProbsOptions& opts) {
if (!opts.estdist) {
mcf::izstream inFile1, inFile2;
- std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin;
- std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1;
+ std::istream& in1 =
+ (n > 0) ? cbrc::openIn(inputs[0].c_str(), inFile1) : std::cin;
+ std::istream& in2 =
+ (n > 1) ? cbrc::openIn(inputs[1].c_str(), inFile2) : in1;
AlignmentParameters params1 = readHeaderOrDie(in1);
AlignmentParameters params2 = (n > 1) ? readHeaderOrDie(in2) : params1;
calculateScorePieces(opts, params1, params2);
diff --git a/src/last.hh b/src/last.hh
new file mode 100644
index 0000000..6fd3f69
--- /dev/null
+++ b/src/last.hh
@@ -0,0 +1,55 @@
+// Copyright 2017 Martin C. Frith
+
+#ifndef LAST_HH
+#define LAST_HH
+
+#include "Alphabet.hh"
+#include "MultiSequence.hh"
+#include "SequenceFormat.hh"
+#include "qualityScoreUtil.hh"
+
+namespace cbrc {
+
+typedef MultiSequence::indexT indexT;
+
+inline void err(const char *s) { throw std::runtime_error(s); }
+
+// Read the next sequence, adding it to the MultiSequence
+inline std::istream &appendSequence(MultiSequence &m, std::istream &in,
+ indexT maxSeqLen, sequenceFormat::Enum f,
+ const Alphabet &a, bool isKeepLowercase,
+ bool isMaskLowercase) {
+ if (m.finishedSequences() == 0) maxSeqLen = -1;
+
+ size_t oldSize = m.unfinishedSize();
+
+ if (f == sequenceFormat::fasta) {
+ m.appendFromFasta(in, maxSeqLen);
+ } else if (f == sequenceFormat::prb) {
+ m.appendFromPrb(in, maxSeqLen, a.size, a.decode);
+ } else if (f == sequenceFormat::pssm) {
+ m.appendFromPssm(in, maxSeqLen, a.encode, isMaskLowercase);
+ } else {
+ m.appendFromFastq(in, maxSeqLen);
+ }
+
+ if (!m.isFinished() && m.finishedSequences() == 0) {
+ err("encountered a sequence that's too long");
+ }
+
+ size_t newSize = m.unfinishedSize();
+
+ // encode the newly-read sequence
+ a.tr(m.seqWriter() + oldSize, m.seqWriter() + newSize, isKeepLowercase);
+
+ if (isPhred(f)) {
+ checkQualityCodes(m.qualityReader() + oldSize,
+ m.qualityReader() + newSize, qualityOffset(f));
+ } // assumes one quality code per letter
+
+ return in;
+}
+
+}
+
+#endif
diff --git a/src/lastal.cc b/src/lastal.cc
index 47bff6e..cda3415 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -2,11 +2,12 @@
// BLAST-like pair-wise sequence alignment, using suffix arrays.
+#include "last.hh"
+
#include "LastalArguments.hh"
#include "QualityPssmMaker.hh"
#include "OneQualityScoreMatrix.hh"
#include "TwoQualityScoreMatrix.hh"
-#include "qualityScoreUtil.hh"
#include "LambdaCalculator.hh"
#include "LastEvaluer.hh"
#include "GeneticCode.hh"
@@ -18,8 +19,6 @@
#include "SegmentPairPot.hh"
#include "SegmentPair.hh"
#include "ScoreMatrix.hh"
-#include "Alphabet.hh"
-#include "MultiSequence.hh"
#include "TantanMasker.hh"
#include "DiagonalTable.hh"
#include "GeneralizedAffineGapCosts.hh"
@@ -27,7 +26,7 @@
#include "gaplessXdrop.hh"
#include "gaplessPssmXdrop.hh"
#include "gaplessTwoQualityXdrop.hh"
-#include "io.hh"
+#include "zio.hh"
#include "stringify.hh"
#include "threadUtil.hh"
#include <iomanip> // setw
@@ -54,7 +53,6 @@ struct LastAligner { // data that changes between queries
};
namespace {
- typedef MultiSequence::indexT indexT;
typedef unsigned long long countT;
LastalArguments args;
@@ -280,13 +278,12 @@ void calculateScoreStatistics( const std::string& matrixName,
p1, p2, isGapped,
gapCosts.delExist, gapCosts.delExtend,
gapCosts.insExist, gapCosts.insExtend,
- args.frameshiftCost, geneticCode, isStandardGeneticCode );
+ args.frameshiftCost, geneticCode, isStandardGeneticCode,
+ args.verbosity );
evaluer.setSearchSpace( refLetters, args.numOfStrands() );
if( args.verbosity > 0 ) evaluer.writeParameters( std::cerr );
}catch( const Sls::error& e ){
LOG( "can't get E-value parameters for this scoring scheme" );
- if( args.verbosity > 1 )
- std::cerr << "ALP: " << e.error_code << ": " << e.st;
}
}
@@ -526,9 +523,9 @@ static void printAndDelete(char *text) {
}
static void writeAlignment(LastAligner &aligner, const Alignment &aln,
- size_t queryNum, char strand, const uchar* querySeq,
+ size_t queryNum, const uchar* querySeq,
const AlignmentExtras &extras = AlignmentExtras()) {
- AlignmentText a = aln.write(text, query, queryNum, strand, querySeq,
+ AlignmentText a = aln.write(text, query, queryNum, querySeq,
args.isTranslated(), alph, evaluer,
args.outputFormat, extras);
if (isCollatedAlignments() || aligners.size() > 1)
@@ -538,11 +535,10 @@ static void writeAlignment(LastAligner &aligner, const Alignment &aln,
}
static void writeSegmentPair(LastAligner &aligner, const SegmentPair &s,
- size_t queryNum, char strand,
- const uchar* querySeq) {
+ size_t queryNum, const uchar* querySeq) {
Alignment a;
a.fromSegmentPair(s);
- writeAlignment(aligner, a, queryNum, strand, querySeq);
+ writeAlignment(aligner, a, queryNum, querySeq);
}
// Find query matches to the suffix array, and do gapless extensions
@@ -597,7 +593,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
if( score < minScoreGapless ) continue;
SegmentPair sp( j - revLen, i - revLen, revLen + fwdLen, score );
dt.addEndpoint( sp.end2(), sp.end1() );
- writeSegmentPair( aligner, sp, queryNum, strand, querySeq );
+ writeSegmentPair( aligner, sp, queryNum, querySeq );
}else{
int fs = dis.forwardGaplessScore( j, i );
int rs = dis.reverseGaplessScore( j, i );
@@ -616,7 +612,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
dt.addEndpoint( sp.end2(), sp.end1() );
if( args.outputType == 1 ){ // we just want gapless alignments
- writeSegmentPair( aligner, sp, queryNum, strand, querySeq );
+ writeSegmentPair( aligner, sp, queryNum, querySeq );
}
else{
gaplessAlns.add(sp); // add the gapless alignment to the pot
@@ -747,7 +743,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
for( size_t i = 0; i < gappedAlns.size(); ++i ){
const Alignment& aln = gappedAlns.items[i];
if( args.outputType < 4 ){
- writeAlignment( aligner, aln, queryNum, strand, querySeq );
+ writeAlignment( aligner, aln, queryNum, querySeq );
}
else{ // calculate match probabilities:
Alignment probAln;
@@ -760,7 +756,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
dis.p, dis.t, dis.i, dis.j, alph, extras,
args.gamma, args.outputType );
assert( aln.score != -INF );
- writeAlignment( aligner, probAln, queryNum, strand, querySeq, extras );
+ writeAlignment( aligner, probAln, queryNum, querySeq, extras );
}
}
}
@@ -953,13 +949,13 @@ void translateAndScan( LastAligner& aligner, size_t queryNum, char strand ){
static void alignOneQuery(LastAligner &aligner,
size_t queryNum, bool isFirstVolume) {
if (args.strand == 2 && !isFirstVolume)
- reverseComplementOneSequence(query, queryAlph.complement, queryNum);
+ query.reverseComplementOneSequence(queryNum, queryAlph.complement);
if (args.strand != 0)
translateAndScan(aligner, queryNum, '+');
if (args.strand == 2 || (args.strand == 0 && isFirstVolume))
- reverseComplementOneSequence(query, queryAlph.complement, queryNum);
+ query.reverseComplementOneSequence(queryNum, queryAlph.complement);
if (args.strand != 1)
translateAndScan(aligner, queryNum, '-');
@@ -1098,40 +1094,6 @@ void writeHeader( countT refSequences, countT refLetters, std::ostream& out ){
}
}
-// Read the next sequence, adding it to the MultiSequence
-std::istream& appendFromFasta( std::istream& in ){
- indexT maxSeqLen = args.batchSize;
- if( maxSeqLen < args.batchSize ) maxSeqLen = indexT(-1);
- if( query.finishedSequences() == 0 ) maxSeqLen = indexT(-1);
-
- size_t oldSize = query.unfinishedSize();
-
- /**/ if( args.inputFormat == sequenceFormat::fasta )
- query.appendFromFasta( in, maxSeqLen );
- else if( args.inputFormat == sequenceFormat::prb )
- query.appendFromPrb( in, maxSeqLen, queryAlph.size, queryAlph.decode );
- else if( args.inputFormat == sequenceFormat::pssm )
- query.appendFromPssm( in, maxSeqLen, queryAlph.encode,
- args.maskLowercase > 1 );
- else
- query.appendFromFastq( in, maxSeqLen );
-
- if( !query.isFinished() && query.finishedSequences() == 0 )
- ERR( "encountered a sequence that's too long" );
-
- // encode the newly-read sequence
- uchar* seq = query.seqWriter();
- size_t newSize = query.unfinishedSize();
- queryAlph.tr( seq + oldSize, seq + newSize, args.isKeepLowercase );
-
- if( isPhred( args.inputFormat ) ) // assumes one quality code per letter:
- checkQualityCodes( query.qualityReader() + oldSize,
- query.qualityReader() + newSize,
- qualityOffset( args.inputFormat ) );
-
- return in;
-}
-
void lastal( int argc, char** argv ){
args.fromArgs( argc, argv );
args.resetCumulativeOptions(); // because we will do fromArgs again
@@ -1209,8 +1171,7 @@ void lastal( int argc, char** argv ){
if( !isMultiVolume ) args.minScoreGapless = minScoreGapless;
if( args.outputType > 0 ) makeQualityScorers();
- queryAlph.tr( query.seqWriter(),
- query.seqWriter() + query.unfinishedSize() );
+ queryAlph.tr(query.seqWriter(), query.seqWriter() + query.seqBeg(0));
if( volumes+1 == 0 ) readIndex( args.lastdbName, refSequences );
@@ -1219,6 +1180,8 @@ void lastal( int argc, char** argv ){
out.precision(3); // print non-integers more compactly
countT queryBatchCount = 0;
countT sequenceCount = 0;
+ indexT maxSeqLen = args.batchSize;
+ if (maxSeqLen < args.batchSize) maxSeqLen = -1;
char defaultInputName[] = "-";
char* defaultInput[] = { defaultInputName, 0 };
@@ -1229,7 +1192,8 @@ void lastal( int argc, char** argv ){
std::istream& in = openIn( *i, inFileStream );
LOG( "reading " << *i << "..." );
- while( appendFromFasta( in ) ){
+ while (appendSequence(query, in, maxSeqLen, args.inputFormat, queryAlph,
+ args.isKeepLowercase, args.maskLowercase > 1)) {
if( query.isFinished() ){
++sequenceCount;
}else{
diff --git a/src/lastdb.cc b/src/lastdb.cc
index daea418..d20572b 100644
--- a/src/lastdb.cc
+++ b/src/lastdb.cc
@@ -3,13 +3,12 @@
// Read fasta-format sequences; construct a suffix array of them; and
// write the results to files.
+#include "last.hh"
+
#include "LastdbArguments.hh"
#include "SubsetSuffixArray.hh"
-#include "Alphabet.hh"
-#include "MultiSequence.hh"
#include "TantanMasker.hh"
-#include "io.hh"
-#include "qualityScoreUtil.hh"
+#include "zio.hh"
#include "stringify.hh"
#include "threadUtil.hh"
#include <stdexcept>
@@ -23,7 +22,6 @@
using namespace cbrc;
-typedef MultiSequence::indexT indexT;
typedef unsigned long long countT;
// Set up an alphabet (e.g. DNA or protein), based on the user options
@@ -185,7 +183,7 @@ void makeVolume( std::vector< CyclicSubsetSeed >& seeds,
const std::string& seedText, const std::string& baseName ){
size_t numOfIndexes = seeds.size();
size_t numOfSequences = multi.finishedSequences();
- size_t textLength = multi.finishedSize();
+ size_t textLength = multi.seqBeg(numOfSequences);
const uchar* seq = multi.seqReader();
std::vector<countT> letterCountsInThisVolume(alph.size);
@@ -252,34 +250,11 @@ static indexT maxLettersPerVolume( const LastdbArguments& args,
return z;
}
-// Read the next sequence, adding it to the MultiSequence
-std::istream&
-appendFromFasta( MultiSequence& multi, indexT maxSeqLen,
- const LastdbArguments& args, const Alphabet& alph,
- std::istream& in ){
- if( multi.finishedSequences() == 0 ) maxSeqLen = indexT(-1);
-
- size_t oldSize = multi.unfinishedSize();
-
- if ( args.inputFormat == sequenceFormat::fasta )
- multi.appendFromFasta( in, maxSeqLen );
- else
- multi.appendFromFastq( in, maxSeqLen );
-
- if( !multi.isFinished() && multi.finishedSequences() == 0 )
- ERR( "encountered a sequence that's too long" );
-
- // encode the newly-read sequence
- uchar* seq = multi.seqWriter();
- size_t newSize = multi.unfinishedSize();
- alph.tr( seq + oldSize, seq + newSize, args.isKeepLowercase );
-
- if( isPhred( args.inputFormat ) ) // assumes one quality code per letter:
- checkQualityCodes( multi.qualityReader() + oldSize,
- multi.qualityReader() + newSize,
- qualityOffset( args.inputFormat ) );
-
- return in;
+static bool isRoomToDuplicateTheLastSequence(const MultiSequence &multi,
+ size_t maxSeqLen) {
+ size_t n = multi.finishedSequences();
+ size_t s = multi.seqBeg(n);
+ return s <= maxSeqLen && s - multi.seqBeg(n - 1) <= maxSeqLen - s;
}
void lastdb( int argc, char** argv ){
@@ -297,16 +272,16 @@ void lastdb( int argc, char** argv ){
unsigned numOfThreads =
decideNumberOfThreads(args.numOfThreads, args.programName, args.verbosity);
Alphabet alph;
- MultiSequence multi;
- std::vector< CyclicSubsetSeed > seeds;
makeAlphabet( alph, args );
TantanMasker tantanMasker;
if( args.tantanSetting )
tantanMasker.init( alph.isProtein(), args.tantanSetting > 1,
alph.letters, alph.encode );
+ std::vector< CyclicSubsetSeed > seeds;
makeSubsetSeeds( seeds, seedText, args, alph );
+ MultiSequence multi;
multi.initForAppending(1);
- alph.tr( multi.seqWriter(), multi.seqWriter() + multi.unfinishedSize() );
+ alph.tr(multi.seqWriter(), multi.seqWriter() + multi.seqBeg(0));
unsigned volumeNumber = 0;
countT sequenceCount = 0;
std::vector<countT> letterCounts( alph.size );
@@ -321,13 +296,31 @@ void lastdb( int argc, char** argv ){
std::istream& in = openIn( *i, inFileStream );
LOG( "reading " << *i << "..." );
- while( appendFromFasta( multi, maxSeqLen, args, alph, in ) ){
+ while (appendSequence(multi, in, maxSeqLen, args.inputFormat, alph,
+ args.isKeepLowercase, 0)) {
if( !args.isProtein && args.userAlphabet.empty() &&
sequenceCount == 0 && isDubiousDna( alph, multi ) ){
std::cerr << args.programName << ": that's some funny-lookin DNA\n";
}
if( multi.isFinished() ){
+ if (args.strand != 1) {
+ if (args.strand == 2) {
+ ++sequenceCount;
+ if (isRoomToDuplicateTheLastSequence(multi, maxSeqLen)) {
+ size_t lastSeq = multi.finishedSequences() - 1;
+ multi.duplicateOneSequence(lastSeq);
+ } else {
+ std::string baseName =
+ args.lastdbName + stringify(volumeNumber++);
+ makeVolume(seeds, multi, args, alph, letterCounts,
+ tantanMasker, numOfThreads, seedText, baseName);
+ multi.eraseAllButTheLastSequence();
+ }
+ }
+ size_t lastSeq = multi.finishedSequences() - 1;
+ multi.reverseComplementOneSequence(lastSeq, alph.complement);
+ }
++sequenceCount;
}
else{
diff --git a/src/makefile b/src/makefile
index de49694..98d6831 100644
--- a/src/makefile
+++ b/src/makefile
@@ -14,15 +14,15 @@ alp/njn_dynprogproblim.o alp/njn_ioutil.o alp/njn_random.o \
alp/sls_falp_alignment_evaluer.o alp/sls_fsa1_pvalues.o \
alp/sls_fsa1_utils.o alp/sls_fsa1.o alp/sls_fsa1_parameters.o
-indexObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \
-ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \
+indexObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \
+ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o \
tantan.o LastdbArguments.o
indexObj4 = MultiSequence.o MultiSequenceQual.o SubsetSuffixArray.o \
SubsetSuffixArraySort.o lastdb.o
alignObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \
-ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \
+ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o \
tantan.o LastalArguments.o GappedXdropAligner.o \
GappedXdropAlignerPssm.o GappedXdropAligner2qual.o \
GappedXdropAligner3frame.o GeneralizedAffineGapCosts.o GeneticCode.o \
@@ -41,7 +41,7 @@ split/cbrc_unsplit_alignment.o split/last-split-main.o
splitObj4 = MultiSequence.o split/cbrc_split_aligner.o \
split/last-split.o
-PPOBJ = last-pair-probs.o last-pair-probs-main.o io.o
+PPOBJ = last-pair-probs.o last-pair-probs-main.o
MBOBJ = last-merge-batches.o
@@ -148,7 +148,7 @@ Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
ScoreMatrixRow.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \
OneQualityScoreMatrix.hh GappedXdropAlignerInl.hh
CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \
- CyclicSubsetSeedData.hh io.hh mcf_zstream.hh stringify.hh
+ CyclicSubsetSeedData.hh zio.hh mcf_zstream.hh stringify.hh
DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh
GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \
ScoreMatrixRow.hh GappedXdropAlignerInl.hh
@@ -176,7 +176,7 @@ LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \
LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \
SequenceFormat.hh stringify.hh getoptUtil.hh version.hh
MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \
- VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh mcf_zstream.hh
+ VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh
MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \
ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
@@ -184,7 +184,7 @@ OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
stringify.hh
QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \
ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
-ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh \
+ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh zio.hh \
mcf_zstream.hh
SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh
SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh
@@ -192,7 +192,7 @@ SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \
SubsetMinimizerFinder.hh CyclicSubsetSeed.hh
SubsetSuffixArray.o SubsetSuffixArray.o8: SubsetSuffixArray.cc SubsetSuffixArray.hh \
CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
- SubsetMinimizerFinder.hh io.hh mcf_zstream.hh
+ SubsetMinimizerFinder.hh io.hh
SubsetSuffixArraySearch.o SubsetSuffixArraySearch.o8: SubsetSuffixArraySearch.cc \
SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
fileMap.hh stringify.hh
@@ -209,29 +209,28 @@ gaplessPssmXdrop.o gaplessPssmXdrop.o8: gaplessPssmXdrop.cc gaplessPssmXdrop.hh
gaplessTwoQualityXdrop.o gaplessTwoQualityXdrop.o8: gaplessTwoQualityXdrop.cc \
gaplessTwoQualityXdrop.hh TwoQualityScoreMatrix.hh ScoreMatrixRow.hh
gaplessXdrop.o gaplessXdrop.o8: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh
-io.o io.o8: io.cc io.hh mcf_zstream.hh
last-pair-probs-main.o last-pair-probs-main.o8: last-pair-probs-main.cc last-pair-probs.hh \
stringify.hh version.hh
-last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh io.hh \
+last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh zio.hh \
mcf_zstream.hh stringify.hh
-lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \
- QualityPssmMaker.hh ScoreMatrixRow.hh OneQualityScoreMatrix.hh \
- TwoQualityScoreMatrix.hh qualityScoreUtil.hh stringify.hh \
+lastal.o lastal.o8: lastal.cc last.hh Alphabet.hh MultiSequence.hh \
+ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
+ SequenceFormat.hh qualityScoreUtil.hh LastalArguments.hh \
+ QualityPssmMaker.hh OneQualityScoreMatrix.hh TwoQualityScoreMatrix.hh \
LambdaCalculator.hh LastEvaluer.hh alp/sls_alignment_evaluer.hpp \
alp/sls_pvalues.hpp alp/sls_basic.hpp alp/sls_falp_alignment_evaluer.hpp \
alp/sls_fsa1_pvalues.hpp GeneticCode.hh SubsetMinimizerFinder.hh \
- SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
- fileMap.hh Centroid.hh GappedXdropAligner.hh \
- GeneralizedAffineGapCosts.hh SegmentPair.hh AlignmentPot.hh Alignment.hh \
- SegmentPairPot.hh ScoreMatrix.hh Alphabet.hh MultiSequence.hh \
+ SubsetSuffixArray.hh CyclicSubsetSeed.hh Centroid.hh \
+ GappedXdropAligner.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \
+ AlignmentPot.hh Alignment.hh SegmentPairPot.hh ScoreMatrix.hh \
TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \
- gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh io.hh \
+ gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh zio.hh \
mcf_zstream.hh threadUtil.hh version.hh
-lastdb.o lastdb.o8: lastdb.cc LastdbArguments.hh SequenceFormat.hh \
- SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
- fileMap.hh stringify.hh Alphabet.hh MultiSequence.hh ScoreMatrixRow.hh \
- TantanMasker.hh tantan.hh io.hh mcf_zstream.hh qualityScoreUtil.hh \
- threadUtil.hh version.hh
+lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh MultiSequence.hh \
+ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
+ SequenceFormat.hh qualityScoreUtil.hh LastdbArguments.hh \
+ SubsetSuffixArray.hh CyclicSubsetSeed.hh TantanMasker.hh tantan.hh \
+ zio.hh mcf_zstream.hh threadUtil.hh version.hh
tantan.o tantan.o8: tantan.cc tantan.hh
last-merge-batches.o: last-merge-batches.c version.hh
alp/njn_dynprogprob.o: alp/njn_dynprogprob.cpp alp/njn_dynprogprob.hpp \
diff --git a/src/mcf_zstream.hh b/src/mcf_zstream.hh
index fd527c5..24c7ec3 100644
--- a/src/mcf_zstream.hh
+++ b/src/mcf_zstream.hh
@@ -4,8 +4,8 @@
// you give it a gzip-compressed file, it will decompress what it
// reads.
-#ifndef MCF_ZSTREAM
-#define MCF_ZSTREAM
+#ifndef MCF_ZSTREAM_HH
+#define MCF_ZSTREAM_HH
#include <zlib.h>
diff --git a/src/split/cbrc_split_aligner.cc b/src/split/cbrc_split_aligner.cc
index 7cfe2c7..fa42d95 100644
--- a/src/split/cbrc_split_aligner.cc
+++ b/src/split/cbrc_split_aligner.cc
@@ -390,7 +390,7 @@ void SplitAligner::traceBack(long viterbiScore,
// procedure for tied scores?
bool isStay = (score == Vmat[ij] + Dmat[ij]);
- if (isStay && alns[i].qstrand == '+') continue;
+ if (isStay && alns[i].isForwardStrand()) continue;
long s = score - spliceEndScore(ij);
long t = s - restartScore;
@@ -736,7 +736,7 @@ void SplitAligner::initSpliceSignals(unsigned i) {
unsigned char *endSignals = &spliceEndSignals[rowBeg];
unsigned dpLen = dpEnd(i) - dpBeg(i);
- if (a.qstrand == '+') {
+ if (a.isForwardStrand()) {
for (unsigned j = 0; j <= dpLen; ++j) {
begSignals[j] = spliceBegSignalFwd(chromBeg + begCoords[j], toUnmasked);
endSignals[j] = spliceEndSignalFwd(chromBeg + endCoords[j], toUnmasked);
@@ -783,7 +783,7 @@ void SplitAligner::spliceBegSignal(char *out,
unsigned alnNum, unsigned queryPos,
bool isSenseStrand) const {
const UnsplitAlignment& a = alns[alnNum];
- bool isForwardStrand = (a.qstrand == '+');
+ bool isForwardStrand = a.isForwardStrand();
StringNumMap::const_iterator f = chromosomeIndex.find(a.rname);
size_t v = f->second % maxGenomeVolumes();
size_t c = f->second / maxGenomeVolumes();
@@ -799,7 +799,7 @@ void SplitAligner::spliceEndSignal(char *out,
unsigned alnNum, unsigned queryPos,
bool isSenseStrand) const {
const UnsplitAlignment& a = alns[alnNum];
- bool isForwardStrand = (a.qstrand == '+');
+ bool isForwardStrand = a.isForwardStrand();
StringNumMap::const_iterator f = chromosomeIndex.find(a.rname);
size_t v = f->second % maxGenomeVolumes();
size_t c = f->second / maxGenomeVolumes();
@@ -1194,6 +1194,8 @@ void SplitAligner::readGenomeVolume(const std::string& baseName,
genome[volumeNumber].fromFiles(baseName, seqCount, 0);
for (unsigned long long i = 0; i < seqCount; ++i) {
+ char s = genome[volumeNumber].strand(i);
+ if (s == '-') continue;
std::string n = genome[volumeNumber].seqName(i);
unsigned long long j = i * maxGenomeVolumes() + volumeNumber;
if (!chromosomeIndex.insert(std::make_pair(n, j)).second)
diff --git a/src/split/cbrc_unsplit_alignment.cc b/src/split/cbrc_unsplit_alignment.cc
index f85d2bd..090b51b 100644
--- a/src/split/cbrc_unsplit_alignment.cc
+++ b/src/split/cbrc_unsplit_alignment.cc
@@ -192,12 +192,13 @@ void UnsplitAlignment::init() {
if (s == 1) {
rstart = start;
rend = start + len;
+ qstrand = (strand == '-') * 2;
rname = i->c_str() + (d - c);
ralign = i->c_str() + (f - c);
} else if (s == 2) {
qstart = start;
qend = start + len;
- qstrand = strand;
+ if (strand == '-') qstrand = 3 - qstrand;
qname = i->c_str() + (d - c);
qalign = i->c_str() + (f - c);
}
diff --git a/src/split/cbrc_unsplit_alignment.hh b/src/split/cbrc_unsplit_alignment.hh
index 76f64b7..aebc114 100644
--- a/src/split/cbrc_unsplit_alignment.hh
+++ b/src/split/cbrc_unsplit_alignment.hh
@@ -34,6 +34,8 @@ public:
UnsplitAlignment(StringIt linesBegIn, StringIt linesEndIn)
: linesBeg(linesBegIn), linesEnd(linesEndIn) { init(); }
void init();
+ bool isForwardStrand() const { return qstrand < 2; }
+ bool isFlipped() const { return qstrand % 2; }
};
void flipMafStrands(StringIt linesBeg, StringIt linesEnd);
diff --git a/src/split/last-split.cc b/src/split/last-split.cc
index 619f54c..fcea62d 100644
--- a/src/split/last-split.cc
+++ b/src/split/last-split.cc
@@ -147,7 +147,7 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa,
}
std::cout << "\n" << std::setprecision(6);
- if (a.qstrand == '-') cbrc::flipMafStrands(s.begin(), s.end());
+ if (a.isFlipped()) cbrc::flipMafStrands(s.begin(), s.end());
if (opts.no_split && a.linesEnd[-1][0] == 'c') s.push_back(a.linesEnd[-1]);
cbrc::printMaf(s);
}
diff --git a/src/version.hh b/src/version.hh
index 0fe2c52..96659c2 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"885"
+"921"
diff --git a/src/zio.hh b/src/zio.hh
new file mode 100644
index 0000000..0602350
--- /dev/null
+++ b/src/zio.hh
@@ -0,0 +1,37 @@
+// Copyright 2017 Martin C. Frith
+
+#ifndef ZIO_HH
+#define ZIO_HH
+
+#include "mcf_zstream.hh"
+
+#include <iostream>
+#include <iterator> // istreambuf_iterator
+#include <stdexcept>
+#include <string>
+
+namespace cbrc {
+
+// open an input file, but if the name is "-", just return cin
+inline std::istream &openIn(const char *fileName, mcf::izstream &z) {
+ if (fileName[0] == '-' && fileName[1] == 0) {
+ return std::cin;
+ }
+ z.open(fileName);
+ if (!z) {
+ throw std::runtime_error(std::string("can't open file: ") + fileName);
+ }
+ return z;
+}
+
+// read a file into a string, but if the name is "-", read cin
+inline std::string slurp(const char *fileName) {
+ mcf::izstream z;
+ std::istream &s = openIn(fileName, z);
+ std::istreambuf_iterator<char> beg(s), end;
+ return std::string(beg, end);
+}
+
+}
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list