[med-svn] [last-align] 01/04: New upstream version 869
Sascha Steinbiss
satta at debian.org
Fri Aug 11 19:32:40 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository last-align.
commit 00a898dda28bae53f453bbe0aaefd78c48ac13c2
Author: Sascha Steinbiss <satta at debian.org>
Date: Fri Aug 11 15:16:56 2017 -0400
New upstream version 869
---
ChangeLog.txt | 198 +++++++++++++-
doc/last-dotplot.html | 126 ++++++++-
doc/last-dotplot.txt | 99 +++++--
doc/last-papers.html | 5 +
doc/last-papers.txt | 8 +
doc/last-train.html | 24 +-
doc/last-train.txt | 21 +-
doc/last-tutorial.html | 28 +-
doc/last-tutorial.txt | 27 +-
doc/lastal.html | 12 +-
doc/lastal.txt | 12 +-
doc/lastdb.html | 2 +-
doc/lastdb.txt | 2 +-
scripts/last-dotplot | 463 ++++++++++++++++++++++++---------
scripts/last-postmask | 8 +-
scripts/last-train | 129 ++++++---
scripts/maf-convert | 52 ++--
src/Alignment.cc | 25 +-
src/AlignmentWrite.cc | 5 +-
src/Centroid.cc | 52 ++--
src/Centroid.hh | 13 +-
src/LastEvaluer.cc | 17 +-
src/LastalArguments.cc | 25 +-
src/LastalArguments.hh | 1 +
src/LastdbArguments.cc | 18 +-
src/alp/sls_alignment_evaluer.cpp | 14 +-
src/alp/sls_alignment_evaluer.hpp | 4 +-
src/alp/sls_alp_data.cpp | 9 +-
src/alp/sls_alp_data.hpp | 4 +
src/alp/sls_basic.hpp | 8 +-
src/alp/sls_falp_alignment_evaluer.cpp | 10 -
src/alp/sls_fsa1_pvalues.cpp | 54 +---
src/alp/sls_fsa1_pvalues.hpp | 21 --
src/alp/sls_pvalues.cpp | 54 +---
src/alp/sls_pvalues.hpp | 20 --
src/getoptUtil.hh | 17 ++
src/lastal.cc | 13 +-
src/makefile | 4 +-
src/split/cbrc_split_aligner.hh | 2 +-
src/split/last-split.cc | 15 +-
src/version.hh | 2 +-
41 files changed, 1113 insertions(+), 510 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 4b51ca3..2203b24 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,9 +1,205 @@
+2017-06-20 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-train.txt, scripts/last-train, test/last-train-test.out,
+ test/last-train-test.sh:
+ Make it easier to feed stdin/pipe to last-train
+ [b73f1146e688] [tip]
+
+ * doc/lastal.txt, src/AlignmentWrite.cc, test/last-test.out:
+ Add raw score to BlastTab+ format
+ [56f960a740a3]
+
+2017-06-15 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-postmask, scripts/last-train:
+ Make last-train read sequence file "-" from stdin
+ [f9ec9c71d72e]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Change last-dotplot's verbosity
+ [5182d8528ce9]
+
+ * src/LastEvaluer.cc, src/alp/sls_alignment_evaluer.cpp,
+ src/alp/sls_alignment_evaluer.hpp, src/alp/sls_alp_data.cpp,
+ src/alp/sls_alp_data.hpp:
+ Enable E-values for some unusual scoring schemes
+ [230d1abe4c5d]
+
+2017-06-02 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-papers.txt, doc/last-tutorial.txt:
+ Update docs
+ [c15cd2ae062d]
+
+ * scripts/last-dotplot:
+ last-dotplot: get bp-per-pixel faster
+ [6a4915d5b5cb]
+
+ * src/LastalArguments.cc, src/LastdbArguments.cc, src/getoptUtil.hh,
+ src/makefile:
+ Fix parsing of command-line options
+ [9afba231262d]
+
+2017-05-18 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ last-dotplot: do e.g. chr7 annotations -> hg19.chr7 only if no
+ ambiguity
+ [a75b439f0cf2]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ last-dotplot: add genePred and rmsk options
+ [6e8ff424ce2b]
+
+ * scripts/last-dotplot:
+ last-dotplot: refactor
+ [1ecdbf2ddb78]
+
+ * scripts/last-dotplot:
+ last-dotplot: get layer from BED score
+ [93d47bb8527f]
+
+ * scripts/last-dotplot:
+ last-dotplot: start implementing "layers"
+ [5e4e3f2cf68b]
+
+ * scripts/last-dotplot:
+ last-dotplot: parse BED more liberally
+ [d086361eabec]
+
+2017-05-17 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ Refactor last-dotplot
+ [09a9d7ef13ae]
+
+2017-05-15 Martin C. Frith <Martin C. Frith>
+
+ * scripts/maf-convert:
+ Make maf-convert to tab faster
+ [0d11825c43b2]
+
+ * doc/last-train.txt, scripts/last-train:
+ Add lastal's -C option to last-train
+ [791ec91ac03c]
+
+2017-05-02 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Add border options to last-dotplot
+ [f3b6c666afad]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Add sequence order options to last-dotplot
+ [5b2acb7fdb3e]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Tweak last-dotplot's documentation
+ [db0ddc92f2ec]
+
+2017-04-18 Martin C. Frith <Martin C. Frith>
+
+ * src/Alignment.cc, src/Centroid.cc, src/Centroid.hh:
+ Refactor
+ [cb934e010567]
+
+ * doc/last-train.txt, scripts/last-train, test/last-train-test.out,
+ test/last-train-test.sh:
+ Improve last-train sampling for long queries, e.g. chromosomes
+ [7c09e0d848f2]
+
+2017-04-04 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ Made last-dotplot print width & height.
+ [3ca7aa4b27a8]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Added last-dotplot options to show sequence lengths.
+ [1f46ab956351]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Added last-dotplot options to show BED features.
+ [16060c00b129]
+
+ * scripts/last-dotplot:
+ Made last-dotplot faster, sometimes.
+ [23de4eb3be1d]
+
+2017-03-09 Martin C. Frith <Martin C. Frith>
+
+ * src/alp/sls_basic.hpp, src/split/cbrc_split_aligner.hh:
+ Fixed compile errors with old compilers.
+ [7872fdade23a]
+
+2017-03-07 Martin C. Frith <Martin C. Frith>
+
+ * doc/lastal.txt, src/LastalArguments.cc, src/LastalArguments.hh,
+ src/lastal.cc, test/last-test.out, test/last-test.sh:
+ Added lastal -N option to quickly get the first hits per query.
+ [b69999b5cd1b]
+
+ * src/alp/sls_alignment_evaluer.cpp, src/alp/sls_basic.hpp,
+ src/alp/sls_falp_alignment_evaluer.cpp,
+ src/alp/sls_fsa1_pvalues.cpp, src/alp/sls_fsa1_pvalues.hpp,
+ src/alp/sls_pvalues.cpp, src/alp/sls_pvalues.hpp:
+ Bugfix: E-values were occasionally negative.
+ [681c5acdf5ca]
+
+2017-03-02 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Enabled sequence ranges for last-dotplot.
+ [85a72978fb7d]
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Added trim options to last-dotplot.
+ [bbc6f00e683b]
+
+ * scripts/last-dotplot:
+ Refactoring.
+ [7b972338fe04]
+
+2017-02-28 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot:
+ Refactoring.
+ [2c3511fa1734]
+
+ * scripts/last-dotplot:
+ Refactoring.
+ [1b4db1600728]
+
+ * scripts/last-dotplot:
+ Refactoring.
+ [cdb1e40c28e4]
+
+ * doc/last-papers.txt, scripts/last-dotplot:
+ Bugfix: dotplot left border misplaced (for some versions of
+ python/PIL).
+ [1514199bb31b]
+
+2017-02-03 Martin C. Frith <Martin C. Frith>
+
+ * doc/lastdb.txt, src/LastdbArguments.cc:
+ Tiny doc/help edits.
+ [ec636cb2fe64]
+
+ * src/split/last-split.cc:
+ Bugfix: rarely, last-split output end-gaps with wrong score/don/acc.
+ [ff7752714cef]
+
+ * scripts/maf-convert, test/90089.maf, test/maf-convert-test.out, test
+ /maf-convert-test.sh:
+ Bugfix(?): made maf-convert to psl trim end-gaps.
+ [c5591feba7a2]
+
2017-01-19 Martin C. Frith <Martin C. Frith>
* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh,
src/split/last-split.cc:
Added don and acc to spliced alignment output.
- [138b43db3812] [tip]
+ [138b43db3812]
* src/split/cbrc_split_aligner.cc, src/split/last-split.cc:
Refactoring.
diff --git a/doc/last-dotplot.html b/doc/last-dotplot.html
index 7f03a03..e1717a0 100644
--- a/doc/last-dotplot.html
+++ b/doc/last-dotplot.html
@@ -342,10 +342,10 @@ sequences with names like "chrUn_random522" like this:</p>
<pre class="literal-block">
last-dotplot -1 'chr[!U]*' -2 'chr[!U]*' alns alns.png
</pre>
-<p>Option "-1" selects sequences from the 1st genome, and "-2" selects
-sequences from the 2nd genome. 'chr[!U]*' is a <em>pattern</em> that
-specifies names starting with "chr", followed by any character except
-U, followed by anything.</p>
+<p>Option "-1" selects sequences from the 1st (horizontal) genome, and
+"-2" selects sequences from the 2nd (vertical) genome. 'chr[!U]*' is
+a <em>pattern</em> that specifies names starting with "chr", followed by any
+character except U, followed by anything.</p>
<table border="1" class="docutils">
<colgroup>
<col width="26%" />
@@ -376,6 +376,11 @@ names starting in "chr" followed by one or two characters:</p>
<pre class="literal-block">
last-dotplot -1 'chr?' -1 'chr??' alns alns.png
</pre>
+<p>You can also specify a sequence range; for example this gets the first
+1000 bases of chr9:</p>
+<pre class="literal-block">
+last-dotplot -1 chr9:0-1000 alns alns.png
+</pre>
</div>
<div class="section" id="options">
<h2>Options</h2>
@@ -388,11 +393,14 @@ last-dotplot -1 'chr?' -1 'chr??' alns alns.png
<kbd><span class="option">-h</span>, <span class="option">--help</span></kbd></td>
<td>Show a help message, with default option values, and exit.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-v</span>, <span class="option">--verbose</span></kbd></td>
+<td>Show progress messages & data about the plot.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-1 <var>PATTERN</var></span>, <span class="option">--seq1=<var>PATTERN</var></span></kbd></td>
-<td>Which sequences to show from the 1st genome.</td></tr>
+<td>Which sequences to show from the 1st (horizontal) genome.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-2 <var>PATTERN</var></span>, <span class="option">--seq2=<var>PATTERN</var></span></kbd></td>
-<td>Which sequences to show from the 2nd genome.</td></tr>
+<td>Which sequences to show from the 2nd (vertical) genome.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-x <var>WIDTH</var></span>, <span class="option">--width=<var>WIDTH</var></span></kbd></td>
<td>Maximum width in pixels.</td></tr>
@@ -400,20 +408,114 @@ last-dotplot -1 'chr?' -1 'chr??' alns alns.png
<kbd><span class="option">-y <var>HEIGHT</var></span>, <span class="option">--height=<var>HEIGHT</var></span></kbd></td>
<td>Maximum height in pixels.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-c <var>COLOR</var></span>, <span class="option">--forwardcolor=<var>COLOR</var></span></kbd></td>
+<td>Color for forward alignments.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">-r <var>COLOR</var></span>, <span class="option">--reversecolor=<var>COLOR</var></span></kbd></td>
+<td>Color for reverse alignments.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--sort1=<var>N</var></span></kbd></td>
+<td>Put the 1st genome's sequences left-to-right in order of: their
+appearance in the input (0), their names (1), their lengths (2).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--sort2=<var>N</var></span></kbd></td>
+<td>Put the 2nd genome's sequences top-to-bottom in order of: their
+appearance in the input (0), their names (1), their lengths (2).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--trim1</span></kbd></td>
+<td>Trim unaligned sequence flanks from the 1st (horizontal) genome.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--trim2</span></kbd></td>
+<td>Trim unaligned sequence flanks from the 2nd (vertical) genome.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--border-pixels=<var>INT</var></span></kbd></td>
+<td>Number of pixels between sequences.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--border-color=<var>COLOR</var></span></kbd></td>
+<td>Color for pixels between sequences.</td></tr>
+</tbody>
+</table>
+</blockquote>
+<div class="section" id="text-options">
+<h3>Text options</h3>
+<blockquote>
+<table class="docutils option-list" frame="void" rules="none">
+<col class="option" />
+<col class="description" />
+<tbody valign="top">
+<tr><td class="option-group">
<kbd><span class="option">-f <var>FILE</var></span>, <span class="option">--fontfile=<var>FILE</var></span></kbd></td>
<td>TrueType or OpenType font file.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-s <var>SIZE</var></span>, <span class="option">--fontsize=<var>SIZE</var></span></kbd></td>
<td>TrueType or OpenType font size.</td></tr>
<tr><td class="option-group">
-<kbd><span class="option">-c <var>COLOR</var></span>, <span class="option">--forwardcolor=<var>COLOR</var></span></kbd></td>
-<td>Color for forward alignments.</td></tr>
+<kbd><span class="option">--lengths1</span></kbd></td>
+<td>Show sequence lengths for the 1st (horizontal) genome.</td></tr>
<tr><td class="option-group">
-<kbd><span class="option">-r <var>COLOR</var></span>, <span class="option">--reversecolor=<var>COLOR</var></span></kbd></td>
-<td>Color for reverse alignments.</td></tr>
+<kbd><span class="option">--lengths2</span></kbd></td>
+<td>Show sequence lengths for the 2nd (vertical) genome.</td></tr>
</tbody>
</table>
</blockquote>
+</div>
+<div class="section" id="annotation-options">
+<h3>Annotation options</h3>
+<p>These options read annotations of sequence segments, and draw them as
+colored horizontal or vertical stripes. This looks good only if the
+annotations are reasonably sparse: e.g. you can't sensibly view 20000
+gene annotations in one small dotplot.</p>
+<blockquote>
+<table class="docutils option-list" frame="void" rules="none">
+<col class="option" />
+<col class="description" />
+<tbody valign="top">
+<tr><td class="option-group">
+<kbd><span class="option">--bed1=<var>FILE</var></span></kbd></td>
+<td>Read <a class="reference external" href="https://genome.ucsc.edu/FAQ/FAQformat.html#format1">BED-format</a>
+annotations for the 1st genome. They are drawn as stripes, with
+coordinates given by the first three BED fields. The color is
+specified by the RGB field if present, else pale red if the
+strand is "+", pale blue if "-", or pale purple.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--bed2=<var>FILE</var></span></kbd></td>
+<td>Read BED-format annotations for the 2nd genome.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--rmsk1=<var>FILE</var></span></kbd></td>
+<td>Read repeat annotations for the 1st genome, in RepeatMasker .out
+or rmsk.txt format. The color is pale purple for "low
+complexity" and "simple repeats", else pale red for "+" strand
+and pale blue for "-" strand.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--rmsk2=<var>FILE</var></span></kbd></td>
+<td>Read repeat annotations for the 2nd genome.</td></tr>
+</tbody>
+</table>
+</blockquote>
+</div>
+<div class="section" id="gene-options">
+<h3>Gene options</h3>
+<blockquote>
+<table class="docutils option-list" frame="void" rules="none">
+<col class="option" />
+<col class="description" />
+<tbody valign="top">
+<tr><td class="option-group">
+<kbd><span class="option">--genePred1=<var>FILE</var></span></kbd></td>
+<td>Read gene annotations for the 1st genome in <a class="reference external" href="https://genome.ucsc.edu/FAQ/FAQformat.html#format9">genePred format</a>.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--genePred2=<var>FILE</var></span></kbd></td>
+<td>Read gene annotations for the 2nd genome.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--exon-color=<var>COLOR</var></span></kbd></td>
+<td>Color for exons.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--cds-color=<var>COLOR</var></span></kbd></td>
+<td>Color for protein-coding regions.</td></tr>
+</tbody>
+</table>
+</blockquote>
+</div>
<div class="section" id="unsequenced-gap-options">
<h3>Unsequenced gap options</h3>
<p>Note: these "gaps" are <em>not</em> alignment gaps (indels): they are regions
@@ -441,6 +543,10 @@ of unknown sequence.</p>
<p>An unsequenced gap will be shown only if it covers at least one whole
pixel.</p>
</div>
+<div class="section" id="colors">
+<h3>Colors</h3>
+<p>Colors can be specified in <a class="reference external" href="http://effbot.org/imagingbook/imagecolor.htm">various ways described here</a>.</p>
+</div>
</div>
</div>
</body>
diff --git a/doc/last-dotplot.txt b/doc/last-dotplot.txt
index 6b093a2..8642e59 100644
--- a/doc/last-dotplot.txt
+++ b/doc/last-dotplot.txt
@@ -27,10 +27,10 @@ sequences with names like "chrUn_random522" like this::
last-dotplot -1 'chr[!U]*' -2 'chr[!U]*' alns alns.png
-Option "-1" selects sequences from the 1st genome, and "-2" selects
-sequences from the 2nd genome. 'chr[!U]*' is a *pattern* that
-specifies names starting with "chr", followed by any character except
-U, followed by anything.
+Option "-1" selects sequences from the 1st (horizontal) genome, and
+"-2" selects sequences from the 2nd (vertical) genome. 'chr[!U]*' is
+a *pattern* that specifies names starting with "chr", followed by any
+character except U, followed by anything.
========== =============================
Pattern Meaning
@@ -49,35 +49,94 @@ names starting in "chr" followed by one or two characters::
last-dotplot -1 'chr?' -1 'chr??' alns alns.png
+You can also specify a sequence range; for example this gets the first
+1000 bases of chr9::
+
+ last-dotplot -1 chr9:0-1000 alns alns.png
+
Options
-------
-h, --help
Show a help message, with default option values, and exit.
-
+ -v, --verbose
+ Show progress messages & data about the plot.
-1 PATTERN, --seq1=PATTERN
- Which sequences to show from the 1st genome.
-
+ Which sequences to show from the 1st (horizontal) genome.
-2 PATTERN, --seq2=PATTERN
- Which sequences to show from the 2nd genome.
-
+ Which sequences to show from the 2nd (vertical) genome.
-x WIDTH, --width=WIDTH
Maximum width in pixels.
-
-y HEIGHT, --height=HEIGHT
Maximum height in pixels.
+ -c COLOR, --forwardcolor=COLOR
+ Color for forward alignments.
+ -r COLOR, --reversecolor=COLOR
+ Color for reverse alignments.
+ --sort1=N
+ Put the 1st genome's sequences left-to-right in order of: their
+ appearance in the input (0), their names (1), their lengths (2).
+ --sort2=N
+ Put the 2nd genome's sequences top-to-bottom in order of: their
+ appearance in the input (0), their names (1), their lengths (2).
+ --trim1
+ Trim unaligned sequence flanks from the 1st (horizontal) genome.
+ --trim2
+ Trim unaligned sequence flanks from the 2nd (vertical) genome.
+ --border-pixels=INT
+ Number of pixels between sequences.
+ --border-color=COLOR
+ Color for pixels between sequences.
+
+Text options
+~~~~~~~~~~~~
-f FILE, --fontfile=FILE
TrueType or OpenType font file.
-
-s SIZE, --fontsize=SIZE
TrueType or OpenType font size.
-
- -c COLOR, --forwardcolor=COLOR
- Color for forward alignments.
-
- -r COLOR, --reversecolor=COLOR
- Color for reverse alignments.
+ --lengths1
+ Show sequence lengths for the 1st (horizontal) genome.
+ --lengths2
+ Show sequence lengths for the 2nd (vertical) genome.
+
+Annotation options
+~~~~~~~~~~~~~~~~~~
+
+These options read annotations of sequence segments, and draw them as
+colored horizontal or vertical stripes. This looks good only if the
+annotations are reasonably sparse: e.g. you can't sensibly view 20000
+gene annotations in one small dotplot.
+
+ --bed1=FILE
+ Read `BED-format
+ <https://genome.ucsc.edu/FAQ/FAQformat.html#format1>`_
+ annotations for the 1st genome. They are drawn as stripes, with
+ coordinates given by the first three BED fields. The color is
+ specified by the RGB field if present, else pale red if the
+ strand is "+", pale blue if "-", or pale purple.
+ --bed2=FILE
+ Read BED-format annotations for the 2nd genome.
+ --rmsk1=FILE
+ Read repeat annotations for the 1st genome, in RepeatMasker .out
+ or rmsk.txt format. The color is pale purple for "low
+ complexity" and "simple repeats", else pale red for "+" strand
+ and pale blue for "-" strand.
+ --rmsk2=FILE
+ Read repeat annotations for the 2nd genome.
+
+Gene options
+~~~~~~~~~~~~
+
+ --genePred1=FILE
+ Read gene annotations for the 1st genome in `genePred format
+ <https://genome.ucsc.edu/FAQ/FAQformat.html#format9>`_.
+ --genePred2=FILE
+ Read gene annotations for the 2nd genome.
+ --exon-color=COLOR
+ Color for exons.
+ --cds-color=COLOR
+ Color for protein-coding regions.
Unsequenced gap options
~~~~~~~~~~~~~~~~~~~~~~~
@@ -96,3 +155,9 @@ of unknown sequence.
An unsequenced gap will be shown only if it covers at least one whole
pixel.
+
+Colors
+~~~~~~
+
+Colors can be specified in `various ways described here
+<http://effbot.org/imagingbook/imagecolor.htm>`_.
diff --git a/doc/last-papers.html b/doc/last-papers.html
index c33a5b4..995bed4 100644
--- a/doc/last-papers.html
+++ b/doc/last-papers.html
@@ -384,6 +384,11 @@ Frith MC, Kawaguchi R. Genome Biology. 2015 16:106.</p>
<p>Describes the split alignment algorithm, and its application to
whole genome alignment.</p>
</li>
+<li><p class="first"><a class="reference external" href="https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btw742">Training alignment parameters for arbitrary sequencers with
+LAST-TRAIN</a>. Hamada M, Ono Y, Asai K Frith MC.
+Bioinformatics. 2017 33(6):926-928.</p>
+<p>Describes last-train.</p>
+</li>
</ol>
<div class="section" id="external-methods">
<h2>External methods</h2>
diff --git a/doc/last-papers.txt b/doc/last-papers.txt
index f89b20a..4a9bf08 100644
--- a/doc/last-papers.txt
+++ b/doc/last-papers.txt
@@ -100,6 +100,14 @@ research to society.
Describes the split alignment algorithm, and its application to
whole genome alignment.
+12. `Training alignment parameters for arbitrary sequencers with
+ LAST-TRAIN`__. Hamada M, Ono Y, Asai K Frith MC.
+ Bioinformatics. 2017 33(6):926-928.
+
+ __ https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btw742
+
+ Describes last-train.
+
External methods
----------------
diff --git a/doc/last-train.html b/doc/last-train.html
index a8aa00f..a4889bd 100644
--- a/doc/last-train.html
+++ b/doc/last-train.html
@@ -330,8 +330,12 @@ lastdb mydb reference.fasta
last-train mydb queries.fasta
</pre>
<p>last-train prints a summary of each alignment step, followed by the
-final score parameters in a format that can be read by <a class="reference external" href="lastal.html#score-options">lastal's -p
+final score parameters, in a format that can be read by <a class="reference external" href="lastal.html#score-options">lastal's -p
option</a>.</p>
+<p>You can also pipe sequences into last-train, for example:</p>
+<pre class="literal-block">
+zcat queries.fasta.gz | last-train mydb
+</pre>
<div class="section" id="options">
<h2>Options</h2>
<blockquote>
@@ -370,9 +374,15 @@ e.g. score(A→G) = score(G→A).</td></tr>
optimize the parameters for low-similarity alignments,
similarly to the BLOSUM matrices.</td></tr>
<tr><td class="option-group">
-<kbd><span class="option">--sample=<var>N</var></span></kbd></td>
-<td>Use only the first N query letters, after randomizing the
-order of sequences. 0 means use all query letters.</td></tr>
+<kbd><span class="option">--sample-number=<var>N</var></span></kbd></td>
+<td>Use N randomly-chosen chunks of the query sequences. The
+queries are chopped into fixed-length chunks (as if they were
+first concatenated into one long sequence). If there are ≤ N
+chunks, all are picked. Otherwise, if the final chunk is
+shorter, it is never picked. 0 means use everything.</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--sample-length=<var>L</var></span></kbd></td>
+<td>Use randomly-chosen chunks of length L.</td></tr>
</tbody>
</table>
</blockquote>
@@ -433,6 +443,12 @@ alignments: they are described in more detail at <a class="reference external" h
1=query. This matters only if the scores lack
reverse-complement symmetry.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-C <var>COUNT</var></span></kbd></td>
+<td>Before extending gapped alignments, discard any gapless
+alignment whose query range lies in COUNT other gapless
+alignments with higher score-per-length. This aims to
+reduce run time.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-T <var>NUMBER</var></span></kbd></td>
<td>Type of alignment: 0=local, 1=overlap.</td></tr>
<tr><td class="option-group">
diff --git a/doc/last-train.txt b/doc/last-train.txt
index 8ee7885..c9fcd93 100644
--- a/doc/last-train.txt
+++ b/doc/last-train.txt
@@ -15,9 +15,13 @@ The usage is like this::
last-train mydb queries.fasta
last-train prints a summary of each alignment step, followed by the
-final score parameters in a format that can be read by `lastal's -p
+final score parameters, in a format that can be read by `lastal's -p
option <lastal.html#score-options>`_.
+You can also pipe sequences into last-train, for example::
+
+ zcat queries.fasta.gz | last-train mydb
+
Options
-------
@@ -40,9 +44,14 @@ Training options
Ignore alignments with > PID% identity. This aims to
optimize the parameters for low-similarity alignments,
similarly to the BLOSUM matrices.
- --sample=N
- Use only the first N query letters, after randomizing the
- order of sequences. 0 means use all query letters.
+ --sample-number=N
+ Use N randomly-chosen chunks of the query sequences. The
+ queries are chopped into fixed-length chunks (as if they were
+ first concatenated into one long sequence). If there are ≤ N
+ chunks, all are picked. Otherwise, if the final chunk is
+ shorter, it is never picked. 0 means use everything.
+ --sample-length=L
+ Use randomly-chosen chunks of length L.
All options below this point are passed to lastal to do the
alignments: they are described in more detail at `<lastal.html>`_.
@@ -69,6 +78,10 @@ Alignment options
-S NUMBER Score matrix applies to forward strand of: 0=reference,
1=query. This matters only if the scores lack
reverse-complement symmetry.
+ -C COUNT Before extending gapped alignments, discard any gapless
+ alignment whose query range lies in COUNT other gapless
+ alignments with higher score-per-length. This aims to
+ reduce run time.
-T NUMBER Type of alignment: 0=local, 1=overlap.
-m COUNT Maximum number of initial matches per query position.
-P COUNT Number of parallel threads.
diff --git a/doc/last-tutorial.html b/doc/last-tutorial.html
index 62f419a..19c9ed9 100644
--- a/doc/last-tutorial.html
+++ b/doc/last-tutorial.html
@@ -484,33 +484,13 @@ the two reads in a pair to match (e.g.) different chromosomes.</p>
</li>
</ul>
</div>
-<div class="section" id="example-8-compare-the-cat-and-rat-genomes">
-<h2>Example 8: Compare the cat and rat genomes</h2>
-<p>If you have ~50 GB of memory and don't mind waiting a few days, this
-is a good way to compare such genomes:</p>
-<pre class="literal-block">
-lastdb -uMAM8 -cR11 catdb cat.fa
-lastal -m100 -E0.05 catdb rat.fa | last-split -m1 > out.maf
-</pre>
-<p>This looks for a unique best alignment for each part of each rat
-chromosome. Omitting -m100 makes it faster but somewhat less
-sensitive. Omitting -uMAM8 reduces the memory use to ~10 GB and makes
-it faster but considerably less sensitive.</p>
-<p>This recipe aligns each rat base-pair to at most one cat base-pair,
-but not necessarily vice-versa. You can get strictly 1-to-1
-alignments by swapping the sequences and running last-split again:</p>
-<pre class="literal-block">
-maf-swap out.maf | last-split -m1 > out2.maf
-</pre>
+<div class="section" id="example-8-compare-the-human-and-mouse-genomes">
+<h2>Example 8: Compare the human and mouse genomes</h2>
+<p>See <a class="reference external" href="https://github.com/mcfrith/last-genome-alignments">here</a>.</p>
</div>
<div class="section" id="example-9-compare-the-human-and-chimp-genomes">
<h2>Example 9: Compare the human and chimp genomes</h2>
-<p>For strongly similar genomes (e.g. 99% identity), something like this
-is more appropriate:</p>
-<pre class="literal-block">
-lastdb -uNEAR -cR11 human human.fa
-lastal -m50 -E0.05 human chimp.fa | last-split -m1 > out.maf
-</pre>
+<p>See <a class="reference external" href="https://github.com/mcfrith/last-genome-alignments">here</a>.</p>
</div>
<div class="section" id="example-10-ambiguity-of-alignment-columns">
<h2>Example 10: Ambiguity of alignment columns</h2>
diff --git a/doc/last-tutorial.txt b/doc/last-tutorial.txt
index cbff52a..32ede6e 100644
--- a/doc/last-tutorial.txt
+++ b/doc/last-tutorial.txt
@@ -175,34 +175,15 @@ Tuning speed, sensitivity, memory and disk usage
* You can `trade off speed, sensitivity, memory and disk usage
<last-tuning.html>`_.
-Example 8: Compare the cat and rat genomes
-------------------------------------------
-
-If you have ~50 GB of memory and don't mind waiting a few days, this
-is a good way to compare such genomes::
-
- lastdb -uMAM8 -cR11 catdb cat.fa
- lastal -m100 -E0.05 catdb rat.fa | last-split -m1 > out.maf
-
-This looks for a unique best alignment for each part of each rat
-chromosome. Omitting -m100 makes it faster but somewhat less
-sensitive. Omitting -uMAM8 reduces the memory use to ~10 GB and makes
-it faster but considerably less sensitive.
-
-This recipe aligns each rat base-pair to at most one cat base-pair,
-but not necessarily vice-versa. You can get strictly 1-to-1
-alignments by swapping the sequences and running last-split again::
+Example 8: Compare the human and mouse genomes
+----------------------------------------------
- maf-swap out.maf | last-split -m1 > out2.maf
+See `here <https://github.com/mcfrith/last-genome-alignments>`_.
Example 9: Compare the human and chimp genomes
----------------------------------------------
-For strongly similar genomes (e.g. 99% identity), something like this
-is more appropriate::
-
- lastdb -uNEAR -cR11 human human.fa
- lastal -m50 -E0.05 human chimp.fa | last-split -m1 > out.maf
+See `here <https://github.com/mcfrith/last-genome-alignments>`_.
Example 10: Ambiguity of alignment columns
------------------------------------------
diff --git a/doc/lastal.html b/doc/lastal.html
index 94f9541..191dde2 100644
--- a/doc/lastal.html
+++ b/doc/lastal.html
@@ -412,9 +412,10 @@ coordinates are one-based. <em>Warning:</em> this is a lossy format,
because it does not show gap positions. <em>Warning:</em> the other
LAST programs cannot read this format. <em>Warning:</em> <a class="reference external" href="last-evalues.html">"bit score"
is not the same as "score"</a>.</p>
-<p><strong>BlastTab+</strong> format is the same as BlastTab, with 2 extra
-columns at the end: length of query sequence and length of
-reference sequence. More columns might be added in future.</p>
+<p><strong>BlastTab+</strong> format is the same as BlastTab, with 3 extra
+columns at the end: length of query sequence, length of
+reference sequence, and (raw) score. More columns might be
+added in future.</p>
<p class="last">For backwards compatibility, a NAME of 0 means TAB and 1 means
MAF.</p>
</td></tr>
@@ -669,6 +670,11 @@ start at one query position, if it gets COUNT successful
extensions, it skips any remaining initial matches starting at
that position.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-N <var>COUNT</var></span></kbd></td>
+<td>Stop after finding COUNT alignments per query strand. This
+makes lastal faster: it quits gapless and gapped extensions as
+soon as it finds COUNT alignments with score ≥ e.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-R <var>DIGITS</var></span></kbd></td>
<td><p class="first">Specify lowercase-marking of repeats, by two digits (e.g. "-R 01"),
with the following meanings.</p>
diff --git a/doc/lastal.txt b/doc/lastal.txt
index d8b2690..db7eeb1 100644
--- a/doc/lastal.txt
+++ b/doc/lastal.txt
@@ -93,9 +93,10 @@ Cosmetic options
LAST programs cannot read this format. *Warning:* `"bit score"
is not the same as "score" <last-evalues.html>`_.
- **BlastTab+** format is the same as BlastTab, with 2 extra
- columns at the end: length of query sequence and length of
- reference sequence. More columns might be added in future.
+ **BlastTab+** format is the same as BlastTab, with 3 extra
+ columns at the end: length of query sequence, length of
+ reference sequence, and (raw) score. More columns might be
+ added in future.
For backwards compatibility, a NAME of 0 means TAB and 1 means
MAF.
@@ -311,6 +312,11 @@ Miscellaneous options
extensions, it skips any remaining initial matches starting at
that position.
+ -N COUNT
+ Stop after finding COUNT alignments per query strand. This
+ makes lastal faster: it quits gapless and gapped extensions as
+ soon as it finds COUNT alignments with score ≥ e.
+
-R DIGITS
Specify lowercase-marking of repeats, by two digits (e.g. "-R 01"),
with the following meanings.
diff --git a/doc/lastdb.html b/doc/lastdb.html
index 1e6fafb..974c588 100644
--- a/doc/lastdb.html
+++ b/doc/lastdb.html
@@ -387,7 +387,7 @@ for ~80% AT-rich genomes.</p>
<td>Soft-mask lowercase letters. This means that, when we compare
these sequences to some other sequences using lastal, lowercase
letters will be excluded from initial matches. This will apply
-to lowercase letters in both sets of sequences.</td></tr>
+to lowercase letters in <em>both</em> sets of sequences.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-u <var>NAME</var></span></kbd></td>
<td><p class="first">Specify a seeding scheme. The -m option will then be ignored.
diff --git a/doc/lastdb.txt b/doc/lastdb.txt
index 0d2d017..c0b5fdb 100644
--- a/doc/lastdb.txt
+++ b/doc/lastdb.txt
@@ -59,7 +59,7 @@ Main Options
-c Soft-mask lowercase letters. This means that, when we compare
these sequences to some other sequences using lastal, lowercase
letters will be excluded from initial matches. This will apply
- to lowercase letters in both sets of sequences.
+ to lowercase letters in *both* sets of sequences.
-u NAME
Specify a seeding scheme. The -m option will then be ignored.
diff --git a/scripts/last-dotplot b/scripts/last-dotplot
index 52becbf..6d4f1d2 100755
--- a/scripts/last-dotplot
+++ b/scripts/last-dotplot
@@ -9,15 +9,36 @@
# according to the number of aligned nt-pairs within it, but the
# result is too faint. How can this be done better?
-import fileinput, fnmatch, itertools, optparse, os, re, sys
+import fnmatch, itertools, optparse, os, re, sys
# Try to make PIL/PILLOW work:
try: from PIL import Image, ImageDraw, ImageFont, ImageColor
except ImportError: import Image, ImageDraw, ImageFont, ImageColor
+def myOpen(fileName): # faster than fileinput
+ if fileName == "-":
+ return sys.stdin
+ return open(fileName)
+
def warn(message):
- prog = os.path.basename(sys.argv[0])
- sys.stderr.write(prog + ": " + message + "\n")
+ if opts.verbose:
+ prog = os.path.basename(sys.argv[0])
+ sys.stderr.write(prog + ": " + message + "\n")
+
+def croppedBlocks(blocks, range1, range2):
+ cropBeg1, cropEnd1 = range1
+ cropBeg2, cropEnd2 = range2
+ if blocks[0][0] < 0: cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
+ if blocks[0][1] < 0: cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
+ for beg1, beg2, size in blocks:
+ b1 = max(cropBeg1, beg1)
+ e1 = min(cropEnd1, beg1 + size)
+ if b1 >= e1: continue
+ offset = beg2 - beg1
+ b2 = max(cropBeg2, b1 + offset)
+ e2 = min(cropEnd2, e1 + offset)
+ if b2 >= e2: continue
+ yield b2 - offset, b2, e2 - b2
def tabBlocks(beg1, beg2, blocks):
'''Get the gapless blocks of an alignment, from LAST tabular format.'''
@@ -78,28 +99,60 @@ def alignmentInput(lines):
yield chr1, seqlen1, chr2, seqlen2, blocks
mafCount = 0
-def isWantedSequenceName(name, patterns):
- if not patterns: return True
+def seqRangeFromText(text):
+ if ":" in text:
+ pattern, interval = text.rsplit(":", 1)
+ if "-" in interval:
+ beg, end = interval.rsplit("-", 1)
+ return pattern, int(beg), int(end) # beg may be negative
+ return text, 0, sys.maxsize
+
+def rangeFromSeqName(seqRanges, name, seqLen):
+ if not seqRanges: return 0, seqLen
base = name.split(".")[-1] # allow for names like hg19.chr7
- for i in patterns:
- if fnmatch.fnmatchcase(name, i): return True
- if fnmatch.fnmatchcase(base, i): return True
- return False
+ for pat, beg, end in seqRanges:
+ if fnmatch.fnmatchcase(name, pat) or fnmatch.fnmatchcase(base, pat):
+ return max(beg, 0), min(end, seqLen)
+ return None
+
+def updateSeqs(isTrim, seqNames, seqLimits, seqName, seqRange, blocks, index):
+ if seqName not in seqLimits:
+ seqNames.append(seqName)
+ if isTrim:
+ beg = blocks[0][index]
+ end = blocks[-1][index] + blocks[-1][2]
+ if beg < 0: beg, end = -end, -beg
+ if seqName in seqLimits:
+ b, e = seqLimits[seqName]
+ seqLimits[seqName] = min(b, beg), max(e, end)
+ else:
+ seqLimits[seqName] = beg, end
+ else:
+ seqLimits[seqName] = seqRange
def readAlignments(fileName, opts):
- '''Get alignments and sequence lengths, from MAF or tabular format.'''
+ '''Get alignments and sequence limits, from MAF or tabular format.'''
+ seqRanges1 = map(seqRangeFromText, opts.seq1)
+ seqRanges2 = map(seqRangeFromText, opts.seq2)
+
alignments = []
- seqLengths1 = {}
- seqLengths2 = {}
- lines = fileinput.input(fileName)
- for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines):
- if not isWantedSequenceName(chr1, opts.seq1): continue
- if not isWantedSequenceName(chr2, opts.seq2): continue
- aln = chr1, chr2, blocks
+ seqNames1 = []
+ seqNames2 = []
+ seqLimits1 = {}
+ seqLimits2 = {}
+ lines = myOpen(fileName)
+ for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
+ range1 = rangeFromSeqName(seqRanges1, seqName1, seqLen1)
+ if not range1: continue
+ range2 = rangeFromSeqName(seqRanges2, seqName2, seqLen2)
+ if not range2: continue
+ b = list(croppedBlocks(list(blocks), range1, range2))
+ if not b: continue
+ aln = seqName1, seqName2, b
alignments.append(aln)
- seqLengths1[chr1] = seqlen1
- seqLengths2[chr2] = seqlen2
- return alignments, seqLengths1, seqLengths2
+ updateSeqs(opts.trim1, seqNames1, seqLimits1, seqName1, range1, b, 0)
+ updateSeqs(opts.trim2, seqNames2, seqLimits2, seqName2, range2, b, 1)
+ return alignments, seqNames1, seqNames2, seqLimits1, seqLimits2
def natural_sort_key(my_string):
'''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
@@ -115,34 +168,58 @@ def get_text_sizes(my_strings, font, fontsize, image_mode):
draw = ImageDraw.Draw(im)
return [draw.textsize(i, font=font) for i in my_strings]
-def get_seq_info(seq_size_dic, font, fontsize, image_mode):
+def sizeText(size):
+ suffixes = "bp", "kb", "Mb", "Gb"
+ for i, x in enumerate(suffixes):
+ j = 10 ** (i * 3)
+ if size < j * 10:
+ return "%.2g" % (1.0 * size / j) + x
+ if size < j * 1000 or i == len(suffixes) - 1:
+ return "%.0f" % (1.0 * size / j) + x
+
+def seqNameAndSizeText(seqName, seqSize):
+ return seqName + ": " + sizeText(seqSize)
+
+def getSeqInfo(sortOpt, seqNames, seqLimits,
+ font, fontsize, image_mode, isShowSize):
'''Return miscellaneous information about the sequences.'''
- seq_names = seq_size_dic.keys()
- seq_names.sort(key=natural_sort_key)
- seq_sizes = [seq_size_dic[i] for i in seq_names]
- name_sizes = get_text_sizes(seq_names, font, fontsize, image_mode)
- margin = max(zip(*name_sizes)[1]) # maximum text height
- return seq_names, seq_sizes, name_sizes, margin
+ if sortOpt == 1:
+ seqNames.sort(key=natural_sort_key)
+ seqSizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
+ for i in seqNames:
+ r = seqLimits[i]
+ out = i, str(r[0]), str(r[1])
+ warn("\t".join(out))
+ warn("")
+ if sortOpt == 2:
+ seqRecords = sorted(zip(seqSizes, seqNames), reverse=True)
+ seqSizes = [i[0] for i in seqRecords]
+ seqNames = [i[1] for i in seqRecords]
+ if isShowSize:
+ seqLabels = map(seqNameAndSizeText, seqNames, seqSizes)
+ else:
+ seqLabels = seqNames
+ labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode)
+ margin = max(zip(*labelSizes)[1]) # maximum text height
+ return seqNames, seqSizes, seqLabels, labelSizes, margin
def div_ceil(x, y):
'''Return x / y rounded up.'''
q, r = divmod(x, y)
return q + (r != 0)
-def tot_seq_pix(seq_sizes, bp_per_pix):
- '''Return the total pixels needed for sequences of the given sizes.'''
- return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
-
def get_bp_per_pix(seq_sizes, pix_tween_seqs, pix_limit):
'''Get the minimum bp-per-pixel that fits in the size limit.'''
seq_num = len(seq_sizes)
seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
if seq_pix_limit < seq_num:
raise Exception("can't fit the image: too many sequences?")
- lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
- for bp_per_pix in itertools.count(lower_bound): # slow linear search
- if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
- return bp_per_pix
+ negLimit = -seq_pix_limit
+ negBpPerPix = sum(seq_sizes) // negLimit
+ while True:
+ if sum(i // negBpPerPix for i in seq_sizes) >= negLimit:
+ return -negBpPerPix
+ negBpPerPix -= 1
def get_seq_starts(seq_pix, pix_tween_seqs, margin):
'''Get the start pixel for each sequence.'''
@@ -161,86 +238,174 @@ def get_pix_info(seq_sizes, bp_per_pix, pix_tween_seqs, margin):
tot_pix = seq_starts[-1] + seq_pix[-1]
return seq_pix, seq_starts, tot_pix
-def drawLineForward(hits, width, bp_per_pix, origin, beg1, beg2, size):
+def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
while True:
q1, r1 = divmod(beg1, bp_per_pix)
q2, r2 = divmod(beg2, bp_per_pix)
- hits[origin + q2 * width + q1] |= 1
+ hits[q2 * width + q1] |= 1
next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
if next_pix >= size: break
beg1 += next_pix
beg2 += next_pix
size -= next_pix
-def drawLineReverse(hits, width, bp_per_pix, origin, beg1, beg2, size):
+def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
beg2 = -1 - beg2
while True:
q1, r1 = divmod(beg1, bp_per_pix)
q2, r2 = divmod(beg2, bp_per_pix)
- hits[origin + q2 * width + q1] |= 2
+ hits[q2 * width + q1] |= 2
next_pix = min(bp_per_pix - r1, r2 + 1)
if next_pix >= size: break
beg1 += next_pix
beg2 -= next_pix
size -= next_pix
-def alignmentPixels(width, height, alignments, bp_per_pix,
- seq_start_dic1, seq_start_dic2):
+def alignmentPixels(width, height, alignments, bp_per_pix, origins1, origins2):
hits = [0] * (width * height) # the image data
for seq1, seq2, blocks in alignments:
- seq_start1 = seq_start_dic1[seq1]
- seq_start2 = seq_start_dic2[seq2]
- origin = seq_start2 * width + seq_start1
+ ori1 = origins1[seq1]
+ ori2 = origins2[seq2]
for beg1, beg2, size in blocks:
if beg1 < 0:
beg1 = -(beg1 + size)
beg2 = -(beg2 + size)
if beg2 >= 0:
- drawLineForward(hits, width, bp_per_pix, origin,
- beg1, beg2, size)
+ drawLineForward(hits, width, bp_per_pix,
+ beg1 + ori1, beg2 + ori2, size)
else:
- drawLineReverse(hits, width, bp_per_pix, origin,
- beg1, beg2, size)
+ drawLineReverse(hits, width, bp_per_pix,
+ beg1 + ori1, beg2 - ori2, size)
return hits
def expandedSeqDict(seqDict):
'''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
- newDict = {}
+ newDict = seqDict.copy()
for name, x in seqDict.items():
- base = name.split(".")[-1]
- newDict[name] = x
- newDict[base] = x
+ if "." in name:
+ base = name.split(".")[-1]
+ if base in newDict: # an ambiguous case was found:
+ return seqDict # so give up completely
+ newDict[base] = x
return newDict
+def readBed(fileName, seqLimits):
+ if not fileName: return
+ for line in myOpen(fileName):
+ w = line.split()
+ if not w: continue
+ seqName = w[0]
+ if seqName not in seqLimits: continue
+ beg = int(w[1])
+ end = int(w[2])
+ layer = 900
+ color = "#ffe4ff"
+ if len(w) > 4:
+ if w[4] != ".":
+ layer = float(w[4])
+ if len(w) > 5:
+ if len(w) > 8 and w[8].count(",") == 2:
+ color = "rgb(" + w[8] + ")"
+ elif w[5] == "+":
+ color = "#fff4f4"
+ elif w[5] == "-":
+ color = "#f4f4ff"
+ yield layer, color, seqName, beg, end
+
+def commaSeparatedInts(text):
+ return map(int, text.rstrip(",").split(","))
+
+def readGenePred(opts, fileName, seqLimits):
+ if not fileName: return
+ for line in myOpen(fileName):
+ fields = line.split()
+ if not fields: continue
+ if fields[2] not in "+-": fields = fields[1:]
+ seqName = fields[1]
+ if seqName not in seqLimits: continue
+ #strand = fields[2]
+ cdsBeg = int(fields[5])
+ cdsEnd = int(fields[6])
+ exonBegs = commaSeparatedInts(fields[8])
+ exonEnds = commaSeparatedInts(fields[9])
+ for beg, end in zip(exonBegs, exonEnds):
+ yield 300, opts.exon_color, seqName, beg, end
+ b = max(beg, cdsBeg)
+ e = min(end, cdsEnd)
+ if b < e: yield 400, opts.cds_color, seqName, b, e
+
+def readRmsk(fileName, seqLimits):
+ if not fileName: return
+ for line in myOpen(fileName):
+ fields = line.split()
+ if len(fields) == 17: # rmsk.txt
+ seqName = fields[5]
+ if seqName not in seqLimits: continue # do this ASAP for speed
+ beg = int(fields[6])
+ end = int(fields[7])
+ strand = fields[9]
+ repeatClass = fields[11]
+ elif len(fields) == 15: # .out
+ seqName = fields[4]
+ if seqName not in seqLimits: continue
+ beg = int(fields[5]) - 1
+ end = int(fields[6])
+ strand = fields[8]
+ repeatClass = fields[10]
+ else:
+ continue
+ if repeatClass in ("Low_complexity", "Simple_repeat"):
+ yield 200, "#ffe4ff", seqName, beg, end
+ elif strand == "+":
+ yield 100, "#fff4f4", seqName, beg, end
+ else:
+ yield 100, "#f4f4ff", seqName, beg, end
+
def isExtraFirstGapField(fields):
return fields[4].isdigit()
-def readGaps(fileName):
+def readGaps(opts, fileName, seqLimits):
'''Read locations of unsequenced gaps, from an agp or gap file.'''
if not fileName: return
- for line in fileinput.input(fileName):
+ for line in myOpen(fileName):
w = line.split()
if not w or w[0][0] == "#": continue
if isExtraFirstGapField(w): w = w[1:]
if w[4] not in "NU": continue
seqName = w[0]
+ if seqName not in seqLimits: continue
end = int(w[2])
beg = end - int(w[5]) # zero-based coordinate
- bridgedText = w[7]
- yield seqName, beg, end, bridgedText
-
-def drawUnsequencedGaps(im, gaps, start_dic, margin, limit, isTop, bridgedText,
- bp_per_pix, color):
- '''Draw rectangles representing unsequenced gaps.'''
- for seqName, beg, end, b in gaps:
- if b != bridgedText: continue
- if seqName not in start_dic: continue
- origin = start_dic[seqName]
- b = div_ceil(beg, bp_per_pix) # use fully-covered pixels only
- e = end // bp_per_pix
- if e <= b: continue
- if isTop: box = origin + b, margin, origin + e, limit
- else: box = margin, origin + b, limit, origin + e
+ if w[7] == "yes":
+ yield 3000, opts.bridged_color, seqName, beg, end
+ else:
+ yield 2000, opts.unbridged_color, seqName, beg, end
+
+def bedBoxes(beds, seqLimits, origins, margin, edge, isTop, bpPerPix):
+ for layer, color, seqName, beg, end in beds:
+ cropBeg, cropEnd = seqLimits[seqName]
+ beg = max(beg, cropBeg)
+ end = min(end, cropEnd)
+ if beg >= end: continue
+ ori = origins[seqName]
+ if layer <= 1000:
+ # include partly-covered pixels
+ b = (ori + beg) // bpPerPix
+ e = div_ceil(ori + end, bpPerPix)
+ else:
+ # exclude partly-covered pixels
+ b = div_ceil(ori + beg, bpPerPix)
+ e = (ori + end) // bpPerPix
+ if e <= b: continue
+ if isTop:
+ box = b, margin, e, edge
+ else:
+ box = margin, b, edge, e
+ yield layer, color, box
+
+def drawAnnotations(im, boxes):
+ # xxx use partial transparency for different-color overlaps?
+ for layer, color, box in boxes:
im.paste(color, box)
def make_label(text, text_size, range_start, range_size):
@@ -260,25 +425,28 @@ def get_nonoverlapping_labels(labels, label_space):
nonoverlapping_labels.append(i)
return nonoverlapping_labels
-def get_axis_image(seq_names, name_sizes, seq_starts, seq_pix,
+def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix,
font, image_mode, opts):
'''Make an image of axis labels.'''
min_pos = seq_starts[0]
max_pos = seq_starts[-1] + seq_pix[-1]
height = max(zip(*name_sizes)[1])
- labels = [make_label(i, j, k, l) for i, j, k, l in
- zip(seq_names, name_sizes, seq_starts, seq_pix)]
+ labels = map(make_label, seqNames, name_sizes, seq_starts, seq_pix)
labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
labels.sort()
labels = get_nonoverlapping_labels(labels, opts.label_space)
image_size = max_pos, height
- im = Image.new(image_mode, image_size, opts.border_shade)
+ im = Image.new(image_mode, image_size, opts.border_color)
draw = ImageDraw.Draw(im)
for i in labels:
position = i[1], 0
draw.text(position, i[3], font=font, fill=opts.text_color)
return im
+def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
+ for i, j in zip(seqNames, seq_starts):
+ yield i, bp_per_pix * j - seqLimits[i][0]
+
def lastDotplot(opts, args):
if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize)
else: font = ImageFont.load_default()
@@ -290,52 +458,64 @@ def lastDotplot(opts, args):
overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
warn("reading alignments...")
- alignments, seq_size_dic1, seq_size_dic2 = readAlignments(args[0], opts)
+ alignmentInfo = readAlignments(args[0], opts)
+ alignments, seqNames1, seqNames2, seqLimits1, seqLimits2 = alignmentInfo
warn("done")
-
if not alignments: raise Exception("there are no alignments")
- seq_info1 = get_seq_info(seq_size_dic1, font, opts.fontsize, image_mode)
- seq_info2 = get_seq_info(seq_size_dic2, font, opts.fontsize, image_mode)
- seq_names1, seq_sizes1, name_sizes1, margin1 = seq_info1
- seq_names2, seq_sizes2, name_sizes2, margin2 = seq_info2
+ i1 = getSeqInfo(opts.sort1, seqNames1, seqLimits1,
+ font, opts.fontsize, image_mode, opts.lengths1)
+ seqNames1, seqSizes1, seqLabels1, labelSizes1, margin1 = i1
+
+ i2 = getSeqInfo(opts.sort2, seqNames2, seqLimits2,
+ font, opts.fontsize, image_mode, opts.lengths2)
+ seqNames2, seqSizes2, seqLabels2, labelSizes2, margin2 = i2
warn("choosing bp per pixel...")
pix_limit1 = opts.width - margin1
pix_limit2 = opts.height - margin2
- bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.pix_tween_seqs, pix_limit1)
- bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.pix_tween_seqs, pix_limit2)
- bp_per_pix = max(bp_per_pix1, bp_per_pix2)
- warn("bp per pixel = " + str(bp_per_pix))
-
- seq_pix1, seq_starts1, width = get_pix_info(seq_sizes1, bp_per_pix,
- opts.pix_tween_seqs, margin1)
- seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, bp_per_pix,
- opts.pix_tween_seqs, margin2)
- seq_start_dic1 = dict(zip(seq_names1, seq_starts1))
- seq_start_dic2 = dict(zip(seq_names2, seq_starts2))
+ bpPerPix1 = get_bp_per_pix(seqSizes1, opts.border_pixels, pix_limit1)
+ bpPerPix2 = get_bp_per_pix(seqSizes2, opts.border_pixels, pix_limit2)
+ bpPerPix = max(bpPerPix1, bpPerPix2)
+ warn("bp per pixel = " + str(bpPerPix))
+
+ seq_pix1, seq_starts1, width = get_pix_info(seqSizes1, bpPerPix,
+ opts.border_pixels, margin1)
+ seq_pix2, seq_starts2, height = get_pix_info(seqSizes2, bpPerPix,
+ opts.border_pixels, margin2)
+ warn("width: " + str(width))
+ warn("height: " + str(height))
+
+ origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bpPerPix))
+ origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bpPerPix))
warn("processing alignments...")
- hits = alignmentPixels(width, height, alignments, bp_per_pix,
- seq_start_dic1, seq_start_dic2)
+ hits = alignmentPixels(width, height, alignments, bpPerPix,
+ origins1, origins2)
warn("done")
image_size = width, height
im = Image.new(image_mode, image_size, opts.background_color)
- start_dic1 = expandedSeqDict(seq_start_dic1)
- start_dic2 = expandedSeqDict(seq_start_dic2)
- gaps1 = list(readGaps(opts.gap1))
- gaps2 = list(readGaps(opts.gap2))
- # draw bridged gaps first, then unbridged gaps on top:
- drawUnsequencedGaps(im, gaps1, start_dic1, margin2, height, True, "yes",
- bp_per_pix, opts.bridged_color)
- drawUnsequencedGaps(im, gaps2, start_dic2, margin1, width, False, "yes",
- bp_per_pix, opts.bridged_color)
- drawUnsequencedGaps(im, gaps1, start_dic1, margin2, height, True, "no",
- bp_per_pix, opts.unbridged_color)
- drawUnsequencedGaps(im, gaps2, start_dic2, margin1, width, False, "no",
- bp_per_pix, opts.unbridged_color)
+ seqLimits1 = expandedSeqDict(seqLimits1)
+ seqLimits2 = expandedSeqDict(seqLimits2)
+ origins1 = expandedSeqDict(origins1)
+ origins2 = expandedSeqDict(origins2)
+
+ beds1 = itertools.chain(readBed(opts.bed1, seqLimits1),
+ readRmsk(opts.rmsk1, seqLimits1),
+ readGenePred(opts, opts.genePred1, seqLimits1),
+ readGaps(opts, opts.gap1, seqLimits1))
+ b1 = bedBoxes(beds1, seqLimits1, origins1, margin2, height, True, bpPerPix)
+
+ beds2 = itertools.chain(readBed(opts.bed2, seqLimits2),
+ readRmsk(opts.rmsk2, seqLimits2),
+ readGenePred(opts, opts.genePred2, seqLimits2),
+ readGaps(opts, opts.gap2, seqLimits2))
+ b2 = bedBoxes(beds2, seqLimits2, origins2, margin1, width, False, bpPerPix)
+
+ boxes = sorted(itertools.chain(b1, b2))
+ drawAnnotations(im, boxes)
for i in range(height):
for j in range(width):
@@ -346,21 +526,21 @@ def lastDotplot(opts, args):
elif store_value == 3: im.putpixel(xy, overlap_color)
if opts.fontsize != 0:
- axis1 = get_axis_image(seq_names1, name_sizes1, seq_starts1, seq_pix1,
+ axis1 = get_axis_image(seqLabels1, labelSizes1, seq_starts1, seq_pix1,
font, image_mode, opts)
- axis2 = get_axis_image(seq_names2, name_sizes2, seq_starts2, seq_pix2,
+ axis2 = get_axis_image(seqLabels2, labelSizes2, seq_starts2, seq_pix2,
font, image_mode, opts)
- axis2 = axis2.rotate(270, expand=1)
+ axis2 = axis2.transpose(Image.ROTATE_270) # !!! bug hotspot
im.paste(axis1, (0, 0))
im.paste(axis2, (0, 0))
for i in seq_starts1[1:]:
- box = i - opts.pix_tween_seqs, margin2, i, height
- im.paste(opts.border_shade, box)
+ box = i - opts.border_pixels, margin2, i, height
+ im.paste(opts.border_color, box)
for i in seq_starts2[1:]:
- box = margin1, i - opts.pix_tween_seqs, width, i
- im.paste(opts.border_shade, box)
+ box = margin1, i - opts.border_pixels, width, i
+ im.paste(opts.border_color, box)
im.save(args[1])
@@ -371,23 +551,72 @@ if __name__ == "__main__":
or: ..."""
description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
op = optparse.OptionParser(usage=usage, description=description)
+ op.add_option("-v", "--verbose", action="count",
+ help="show progress messages & data about the plot")
op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
+ default=[],
help="which sequences to show from the 1st genome")
op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
+ default=[],
help="which sequences to show from the 2nd genome")
# Replace "width" & "height" with a single "length" option?
op.add_option("-x", "--width", type="int", default=1000,
help="maximum width in pixels (default: %default)")
op.add_option("-y", "--height", type="int", default=1000,
help="maximum height in pixels (default: %default)")
- op.add_option("-f", "--fontfile", metavar="FILE",
- help="TrueType or OpenType font file")
- op.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
- help="TrueType or OpenType font size (default: %default)")
op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
help="color for forward alignments (default: %default)")
op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
help="color for reverse alignments (default: %default)")
+ op.add_option("--sort1", type="int", default=1, metavar="N",
+ help="genome1 sequence order: 0=input order, 1=name order, "
+ "2=length order (default=%default)")
+ op.add_option("--sort2", type="int", default=1, metavar="N",
+ help="genome2 sequence order: 0=input order, 1=name order, "
+ "2=length order (default=%default)")
+ op.add_option("--trim1", action="store_true",
+ help="trim unaligned sequence flanks from the 1st genome")
+ op.add_option("--trim2", action="store_true",
+ help="trim unaligned sequence flanks from the 2nd genome")
+ op.add_option("--border-pixels", metavar="INT", type="int", default=1,
+ help="number of pixels between sequences (default=%default)")
+ op.add_option("--border-color", metavar="COLOR", default="#dcdcdc",
+ help="color for pixels between sequences (default=%default)")
+ # xxx --margin-color?
+
+ og = optparse.OptionGroup(op, "Text options")
+ og.add_option("-f", "--fontfile", metavar="FILE",
+ help="TrueType or OpenType font file")
+ og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
+ help="TrueType or OpenType font size (default: %default)")
+ og.add_option("--lengths1", action="store_true",
+ help="show sequence lengths for the 1st (horizontal) genome")
+ og.add_option("--lengths2", action="store_true",
+ help="show sequence lengths for the 2nd (vertical) genome")
+ op.add_option_group(og)
+
+ og = optparse.OptionGroup(op, "Annotation options")
+ og.add_option("--bed1", metavar="FILE",
+ help="read genome1 annotations from BED file")
+ og.add_option("--bed2", metavar="FILE",
+ help="read genome2 annotations from BED file")
+ og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
+ "RepeatMasker .out or rmsk.txt file")
+ og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
+ "RepeatMasker .out or rmsk.txt file")
+ op.add_option_group(og)
+
+ og = optparse.OptionGroup(op, "Gene options")
+ og.add_option("--genePred1", metavar="FILE",
+ help="read genome1 genes from genePred file")
+ og.add_option("--genePred2", metavar="FILE",
+ help="read genome2 genes from genePred file")
+ og.add_option("--exon-color", metavar="COLOR", default="#dfd",
+ help="color for exons (default=%default)")
+ og.add_option("--cds-color", metavar="COLOR", default="#bdb",
+ help="color for protein-coding regions (default=%default)")
+ op.add_option_group(og)
+
og = optparse.OptionGroup(op, "Unsequenced gap options")
og.add_option("--gap1", metavar="FILE",
help="read genome1 unsequenced gaps from agp or gap file")
@@ -403,8 +632,6 @@ if __name__ == "__main__":
opts.text_color = "black"
opts.background_color = "white"
- opts.pix_tween_seqs = 2 # number of border pixels between sequences
- opts.border_shade = 239, 239, 239 # the shade of grey for border pixels
opts.label_space = 5 # minimum number of pixels between axis labels
try: lastDotplot(opts, args)
diff --git a/scripts/last-postmask b/scripts/last-postmask
index 6e7b9d4..669be10 100755
--- a/scripts/last-postmask
+++ b/scripts/last-postmask
@@ -56,12 +56,12 @@ def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
print line,
print
-def doOneFile(file):
+def doOneFile(lines):
scoreMatrix = []
maf = []
seqs = []
- for line in file:
+ for line in lines:
if line[0] == "#":
print line,
w = line.split()
@@ -96,8 +96,8 @@ def lastPostmask(args):
if i == "-":
doOneFile(sys.stdin)
else:
- with open(i) as file:
- doOneFile(file)
+ with open(i) as f:
+ doOneFile(f)
else:
doOneFile(sys.stdin)
diff --git a/scripts/last-train b/scripts/last-train
index de39072..7fe5947 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -3,14 +3,33 @@
import math, optparse, os, random, signal, subprocess, sys, tempfile
+def myOpen(fileName): # faster than fileinput
+ if fileName == "-":
+ return sys.stdin
+ return open(fileName)
+
+def randomSample(things, sampleSize):
+ """Randomly get sampleSize things (or all if fewer)."""
+ reservoir = [] # "reservoir sampling" algorithm
+ for i, x in enumerate(things):
+ if i < sampleSize:
+ reservoir.append(x)
+ else:
+ r = random.randrange(i + 1)
+ if r < sampleSize:
+ reservoir[r] = x
+ return reservoir
+
def writeWords(outFile, words):
outFile.write(" ".join(words) + "\n")
def seqInput(fileNames):
+ if not fileNames:
+ fileNames = ["-"]
for name in fileNames:
- with open(name) as file:
+ with myOpen(name) as f:
seqType = 0
- for line in file:
+ for line in f:
if seqType == 0:
if line[0] == ">":
seqType = 1
@@ -32,40 +51,66 @@ def seqInput(fileNames):
lineType = (lineType + 1) % 4
if seqType == 1: yield "".join(seq), ""
-def randomNumbersAndLengths(lengths, sampleSize):
- numbersAndLengths = list(enumerate(lengths))
- random.shuffle(numbersAndLengths)
- for i, x in numbersAndLengths:
- if x < sampleSize:
- yield i, x
- sampleSize -= x
- else:
- yield i, sampleSize
- break
-
-def writeSeqSample(numbersAndLengths, seqInput, outfile):
- j = 0
- for i, x in enumerate(seqInput):
- if j < len(numbersAndLengths) and numbersAndLengths[j][0] == i:
- seqlen = numbersAndLengths[j][1]
- seq, qual = x
- if qual:
- outfile.write("@" + str(i) + "\n")
- outfile.write(seq[:seqlen])
- outfile.write("\n+\n")
- outfile.write(qual[:seqlen])
- else:
- outfile.write(">" + str(i) + "\n")
- outfile.write(seq[:seqlen])
- outfile.write("\n")
- j += 1
+def isGoodChunk(chunk):
+ for i in chunk:
+ for j in i[3]:
+ if j not in "Nn":
+ return True
+ return False
+
+def chunkInput(opts, sequences):
+ chunkCount = 0
+ chunk = []
+ wantedLength = opts.sample_length
+ for i, x in enumerate(sequences):
+ seq, qual = x
+ if all(i in "Nn" for i in seq): continue
+ seqLength = len(seq)
+ beg = 0
+ while beg < seqLength:
+ length = min(wantedLength, seqLength - beg)
+ end = beg + length
+ segment = i, beg, end, seq[beg:end], qual[beg:end]
+ chunk.append(segment)
+ wantedLength -= length
+ if not wantedLength:
+ if isGoodChunk(chunk):
+ yield chunk
+ chunkCount += 1
+ chunk = []
+ wantedLength = opts.sample_length
+ beg = end
+ if chunk and chunkCount < opts.sample_number:
+ yield chunk
+
+def writeSegment(outfile, segment):
+ if not segment: return
+ i, beg, end, seq, qual = segment
+ name = str(i) + ":" + str(beg)
+ if qual:
+ outfile.write("@" + name + "\n")
+ outfile.write(seq)
+ outfile.write("\n+\n")
+ outfile.write(qual)
+ else:
+ outfile.write(">" + name + "\n")
+ outfile.write(seq)
+ outfile.write("\n")
def getSeqSample(opts, queryFiles, outfile):
- lengths = (len(i[0]) for i in seqInput(queryFiles))
- sampleSize = int(round(opts.sample))
- numbersAndLengths = sorted(randomNumbersAndLengths(lengths, sampleSize))
- writeSeqSample(numbersAndLengths, seqInput(queryFiles), outfile)
- print "# sampled sequences:", len(numbersAndLengths)
+ sequences = seqInput(queryFiles)
+ chunks = chunkInput(opts, sequences)
+ sample = randomSample(chunks, opts.sample_number)
+ sample.sort()
+ x = None
+ for chunk in sample:
+ for y in chunk:
+ if x and y[0] == x[0] and y[1] == x[2]:
+ x = x[0], x[1], y[2], x[3] + y[3], x[4] + y[4]
+ else:
+ writeSegment(outfile, x)
+ x = y
+ writeSegment(outfile, x)
def scaleFromHeader(lines):
for line in lines:
@@ -290,6 +335,7 @@ def fixedLastalArgs(opts):
if opts.E: x.append("-E" + opts.E)
if opts.s: x.append("-s" + opts.s)
if opts.S: x.append("-S" + opts.S)
+ if opts.C: x.append("-C" + opts.C)
if opts.T: x.append("-T" + opts.T)
if opts.m: x.append("-m" + opts.m)
if opts.P: x.append("-P" + opts.P)
@@ -365,7 +411,7 @@ def doTraining(opts, args):
writeMatrixWithLetters(sys.stdout, externalMatrix, "")
def lastTrain(opts, args):
- if opts.sample:
+ if opts.sample_number:
random.seed(math.pi)
refName = args[0]
queryFiles = args[1:]
@@ -392,8 +438,10 @@ if __name__ == "__main__":
help="force insertion/deletion symmetry")
og.add_option("--pid", type="float", default=100, help=
"skip alignments with > PID% identity (default: %default)")
- og.add_option("--sample", type="float", default=1000000, metavar="N",
- help="use a random sample of N letters (default: %default)")
+ og.add_option("--sample-number", type="int", default=500, metavar="N",
+ help="number of random sequence samples (default: %default)")
+ og.add_option("--sample-length", type="int", default=2000, metavar="L",
+ help="length of each sample (default: %default)")
op.add_option_group(og)
og = optparse.OptionGroup(op, "Initial parameter options")
og.add_option("-r", metavar="SCORE",
@@ -418,6 +466,8 @@ if __name__ == "__main__":
og.add_option("-S", metavar="NUMBER", default="1", help=
"score matrix applies to forward strand of: " +
"0=reference, 1=query (default: %default)")
+ og.add_option("-C", metavar="COUNT", help=
+ "omit gapless alignments in COUNT others with > score-per-length")
og.add_option("-T", metavar="NUMBER",
help="type of alignment: 0=local, 1=overlap (default: 0)")
og.add_option("-m", metavar="COUNT", help=
@@ -428,7 +478,10 @@ if __name__ == "__main__":
help="input format: 0=fasta, 1=fastq-sanger")
op.add_option_group(og)
(opts, args) = op.parse_args()
- if len(args) < 2: op.error("I need a lastdb index and query sequences")
+ if len(args) < 1:
+ op.error("I need a lastdb index and query sequences")
+ if not opts.sample_number and (len(args) < 2 or "-" in args):
+ op.error("sorry, can't use stdin when --sample-number=0")
if not opts.p and (not opts.Q or opts.Q == "0"):
if not opts.r: opts.r = "5"
if not opts.q: opts.q = "5"
diff --git a/scripts/maf-convert b/scripts/maf-convert
index 7b1d92c..850f3b1 100755
--- a/scripts/maf-convert
+++ b/scripts/maf-convert
@@ -25,9 +25,6 @@ def isMatch(alignmentColumn):
if i.upper() != first: return False
return True
-def isGapless(alignmentColumn):
- return "-" not in alignmentColumn
-
def gapRunCount(row):
"""Get the number of runs of gap characters."""
return sum(k == "-" for k, v in groupby(row))
@@ -44,17 +41,27 @@ def insertSize(row, letterSize):
"""Get the length of sequence included in the row."""
return (len(row) - row.count("-")) * letterSize - 4 * row.count("/") - 2 * row.count("\\")
-def insertSizeText(row, letterSize):
- return str(insertSize(row, letterSize))
-
def matchAndInsertSizes(alignmentColumns, letterSizes):
"""Get sizes of gapless blocks, and of the inserts between them."""
- for k, v in groupby(alignmentColumns, isGapless):
- if k:
- yield str(sum(1 for i in v))
+ letterSizeA, letterSizeB = letterSizes
+ delSize = insSize = subSize = 0
+ for x, y in alignmentColumns:
+ if x == "-":
+ if subSize:
+ if delSize or insSize: yield str(delSize) + ":" + str(insSize)
+ yield str(subSize)
+ delSize = insSize = subSize = 0
+ insSize += symbolSize(y, letterSizeB)
+ elif y == "-":
+ if subSize:
+ if delSize or insSize: yield str(delSize) + ":" + str(insSize)
+ yield str(subSize)
+ delSize = insSize = subSize = 0
+ delSize += symbolSize(x, letterSizeA)
else:
- yield ":".join(imap(insertSizeText,
- alignmentRowsFromColumns(v), letterSizes))
+ subSize += 1
+ if delSize or insSize: yield str(delSize) + ":" + str(insSize)
+ if subSize: yield str(subSize)
##### Routines for reading MAF format: #####
@@ -265,12 +272,10 @@ def pslCommaString(things):
# UCSC software seems to prefer a trailing comma
return ",".join(map(str, things)) + ","
-def pslEnds(headMafFields, tailMafFields):
- seqName, seqLen, strand, letterSize, beg = headMafFields[0:5]
- end = tailMafFields[5]
+def pslEnds(seqLen, strand, beg, end):
if strand == "-":
- beg, end = seqLen - end, seqLen - beg
- return seqName, seqLen, strand, letterSize, beg, end
+ return seqLen - end, seqLen - beg
+ return beg, end
def writePsl(opts, mafs):
matchCounts = [0] * 4
@@ -278,13 +283,19 @@ def writePsl(opts, mafs):
matches, mismatches, repMatches, nCount = matchCounts
numGaplessColumns = sum(matchCounts)
- headA, headB = mafs[0][1]
- tailA, tailB = mafs[-1][1]
+ if not blocks:
+ return
+
+ fieldsA, fieldsB = mafs[0][1]
+ headSize, headBegA, headBegB = blocks[0]
+ tailSize, tailBegA, tailBegB = blocks[-1]
- seqNameA, seqLenA, strandA, letterSizeA, begA, endA = pslEnds(headA, tailA)
+ seqNameA, seqLenA, strandA, letterSizeA = fieldsA[0:4]
+ begA, endA = pslEnds(seqLenA, strandA, headBegA, tailBegA + tailSize)
baseInsertA = endA - begA - numGaplessColumns * letterSizeA
- seqNameB, seqLenB, strandB, letterSizeB, begB, endB = pslEnds(headB, tailB)
+ seqNameB, seqLenB, strandB, letterSizeB = fieldsB[0:4]
+ begB, endB = pslEnds(seqLenB, strandB, headBegB, tailBegB + tailSize)
baseInsertB = endB - begB - numGaplessColumns * letterSizeB
numInsertA, numInsertB = pslNumInserts(blocks, letterSizeA, letterSizeB)
@@ -828,7 +839,6 @@ if __name__ == "__main__":
op.error("need file (not pipe) with option -d")
try: mafConvert(opts, args)
- except KeyboardInterrupt: pass # avoid silly error message
except Exception, e:
prog = os.path.basename(sys.argv[0])
sys.exit(prog + ": error: " + str(e))
diff --git a/src/Alignment.cc b/src/Alignment.cc
index ec2db83..0ef6c6c 100644
--- a/src/Alignment.cc
+++ b/src/Alignment.cc
@@ -262,6 +262,29 @@ bool Alignment::hasGoodSegment(const uchar *seq1, const uchar *seq2,
return false;
}
+static void getColumnAmbiguities(const Centroid& centroid,
+ std::vector<uchar>& ambiguityCodes,
+ const std::vector<SegmentPair>& chunks,
+ bool isForward) {
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ const SegmentPair& x = chunks[i];
+ centroid.getMatchAmbiguities(ambiguityCodes, x.end1(), x.end2(), x.size);
+ size_t j = i + 1;
+ bool isNext = (j < chunks.size());
+ size_t end1 = isNext ? chunks[j].end1() : 0;
+ size_t end2 = isNext ? chunks[j].end2() : 0;
+ // ASSUMPTION: if there is an insertion adjacent to a deletion,
+ // the deletion will get printed first.
+ if (isForward) {
+ centroid.getInsertAmbiguities(ambiguityCodes, x.beg2(), end2);
+ centroid.getDeleteAmbiguities(ambiguityCodes, x.beg1(), end1);
+ } else {
+ centroid.getDeleteAmbiguities(ambiguityCodes, x.beg1(), end1);
+ centroid.getInsertAmbiguities(ambiguityCodes, x.beg2(), end2);
+ }
+ }
+}
+
void Alignment::extend( std::vector< SegmentPair >& chunks,
std::vector< uchar >& ambiguityCodes,
Centroid& centroid,
@@ -361,7 +384,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
centroid.traceback( chunks, gamma );
}
- centroid.getColumnAmbiguities( ambiguityCodes, chunks, isForward );
+ getColumnAmbiguities( centroid, ambiguityCodes, chunks, isForward );
extras.fullScore += centroid.logPartitionFunction();
if( outputType == 7 ){
diff --git a/src/AlignmentWrite.cc b/src/AlignmentWrite.cc
index bfbd53e..51b7a32 100644
--- a/src/AlignmentWrite.cc
+++ b/src/AlignmentWrite.cc
@@ -457,12 +457,13 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
}
IntText s1(seqLen1);
IntText s2(seqLen2);
+ IntText sc(score);
size_t s =
n2.size() + n1.size() + mp.size() + as.size() + mm.size() + go.size() +
b2.size() + e2.size() + b1.size() + e1.size() + 10;
if (evaluer.isGood()) s += ev.size() + bs.size() + 2;
- if (isExtraColumns) s += s1.size() + s2.size() + 2;
+ if (isExtraColumns) s += s1.size() + s2.size() + sc.size() + 3;
char *text = new char[s + 1];
Writer w(text);
@@ -470,7 +471,7 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
w << n2 << t << n1 << t << mp << t << as << t << mm << t << go << t
<< b2 << t << e2 << t << b1 << t << e1;
if (evaluer.isGood()) w << t << ev << t << bs;
- if (isExtraColumns) w << t << s2 << t << s1;
+ if (isExtraColumns) w << t << s2 << t << s1 << t << sc;
w << '\n' << '\0';
return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand, score,
diff --git a/src/Centroid.cc b/src/Centroid.cc
index 42c5286..36edd35 100644
--- a/src/Centroid.cc
+++ b/src/Centroid.cc
@@ -696,45 +696,27 @@ namespace cbrc{
return static_cast<uchar>(k);
}
- static void getGapAmbiguities( std::vector<uchar>& ambiguityCodes,
- const std::vector<double>& probs,
- size_t rbeg, size_t rend ){
- for( size_t i = rbeg; i > rend; --i ){
- ambiguityCodes.push_back( asciiProbability( probs[ i ] ) );
+ void Centroid::getMatchAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq1end, size_t seq2end,
+ size_t length) const {
+ while (length) {
+ size_t d = xa.diag(seq1end + seq2end, seq1end);
+ double p = fM[d] * bM[d];
+ ambiguityCodes.push_back(asciiProbability(p));
+ --seq1end; --seq2end; --length;
}
}
- // Added by MCF:
- void Centroid::getColumnAmbiguities( std::vector<uchar>& ambiguityCodes,
- const std::vector<SegmentPair>& chunks,
- bool isForward ){
- for( CI(SegmentPair) i = chunks.begin(); i < chunks.end(); ++i ){
- size_t seq1pos = i->end1();
- size_t seq2pos = i->end2();
-
- for( size_t j = 0; j < i->size; ++j ){
- size_t d = xa.diag( seq1pos + seq2pos, seq1pos );
- double p = fM[d] * bM[d];
- ambiguityCodes.push_back( asciiProbability(p) );
- --seq1pos;
- --seq2pos;
- }
-
- CI(SegmentPair) j = i + 1;
- size_t end1 = (j < chunks.end()) ? j->end1() : 0;
- size_t end2 = (j < chunks.end()) ? j->end2() : 0;
+ void Centroid::getDeleteAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq1end, size_t seq1beg) const {
+ for (size_t i = seq1end; i > seq1beg; --i)
+ ambiguityCodes.push_back(asciiProbability(mD[i]));
+ }
- // ASSUMPTION: if there is an insertion adjacent to a deletion,
- // the deletion will get printed first.
- if( isForward ){
- getGapAmbiguities( ambiguityCodes, mI, seq2pos, end2 );
- getGapAmbiguities( ambiguityCodes, mD, seq1pos, end1 );
- }
- else{
- getGapAmbiguities( ambiguityCodes, mD, seq1pos, end1 );
- getGapAmbiguities( ambiguityCodes, mI, seq2pos, end2 );
- }
- }
+ void Centroid::getInsertAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq2end, size_t seq2beg) const {
+ for (size_t i = seq2end; i > seq2beg; --i)
+ ambiguityCodes.push_back(asciiProbability(mI[i]));
}
double Centroid::logPartitionFunction() const{
diff --git a/src/Centroid.hh b/src/Centroid.hh
index 516bbb3..9c3b8f5 100644
--- a/src/Centroid.hh
+++ b/src/Centroid.hh
@@ -64,10 +64,15 @@ namespace cbrc{
double dp_ama( double gamma );
void traceback_ama( std::vector< SegmentPair >& chunks, double gamma ) const;
- // Added by MCF: get the probability of each column in the alignment:
- void getColumnAmbiguities( std::vector< uchar >& ambiguityCodes,
- const std::vector< SegmentPair >& chunks,
- bool isForward );
+ void getMatchAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq1end, size_t seq2end,
+ size_t length) const;
+
+ void getDeleteAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq1end, size_t seq1beg) const;
+
+ void getInsertAmbiguities(std::vector<uchar>& ambiguityCodes,
+ size_t seq2end, size_t seq2beg) const;
double logPartitionFunction() const; // a.k.a. full score, forward score
diff --git a/src/LastEvaluer.cc b/src/LastEvaluer.cc
index 2c7ac4c..b1951ca 100644
--- a/src/LastEvaluer.cc
+++ b/src/LastEvaluer.cc
@@ -357,10 +357,19 @@ void LastEvaluer::init(const char *matrixName,
if (isGapped) {
evaluer.set_gapped_computation_parameters_simplified(maxSeconds);
- evaluer.initGapped(alphabetSize, &matrix[0], letterFreqs2, letterFreqs1,
- delOpen, delEpen, insOpen, insEpen,
- true, lambdaTolerance, kTolerance,
- 0, maxMegabytes, randomSeed);
+ for (int i = 0; ; ++i) {
+ double t = Sls::default_importance_sampling_temperature + 0.01 * i;
+ try {
+ evaluer.initGapped(alphabetSize, &matrix[0],
+ letterFreqs2, letterFreqs1,
+ delOpen, delEpen, insOpen, insEpen,
+ true, lambdaTolerance, kTolerance,
+ 0, maxMegabytes, randomSeed, t);
+ break;
+ } catch (const Sls::error& e) {
+ if (i == 10) throw;
+ }
+ }
} else {
evaluer.initGapless(alphabetSize, &matrix[0],
letterFreqs2, letterFreqs1);
diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc
index 709ce9d..8d389ff 100644
--- a/src/LastalArguments.cc
+++ b/src/LastalArguments.cc
@@ -2,7 +2,7 @@
#include "LastalArguments.hh"
#include "stringify.hh"
-#include <unistd.h> // getopt
+#include "getoptUtil.hh"
#include <algorithm> // max
#include <iostream>
#include <sstream>
@@ -74,6 +74,7 @@ LastalArguments::LastalArguments() :
maxHitDepth(-1),
oneHitMultiplicity(10),
maxGaplessAlignmentsPerQueryPosition(0), // depends on oneHitMultiplicity
+ maxAlignmentsPerQueryStrand(-1),
cullingLimitForGaplessAlignments(0),
cullingLimitForFinalAlignments(0),
queryStep(1),
@@ -144,6 +145,7 @@ Miscellaneous options (default settings):\n\
-T: type of alignment: 0=local, 1=overlap ("
+ stringify(globality) + ")\n\
-n: maximum gapless alignments per query position (infinity if m=0, else m)\n\
+-N: stop after the first N alignments per query strand\n\
-R: repeat-marking options (the same as was used for lastdb)\n\
-u: mask lowercase during extensions: 0=never, 1=gapless,\n\
2=gapless+postmask, 3=always (2 if lastdb -c and Q<5, else 0)\n\
@@ -165,10 +167,9 @@ Report bugs to: last-align (ATmark) googlegroups (dot) com\n\
LAST home page: http://last.cbrc.jp/\n\
";
- optind = 1; // allows us to scan arguments more than once(???)
int c;
const char optionString[] = "hVvf:" "r:q:p:a:b:A:B:c:F:x:y:z:d:e:" "D:E:"
- "s:S:MT:m:l:L:n:C:K:k:W:i:P:R:u:w:t:g:G:j:Q:";
+ "s:S:MT:m:l:L:n:N:C:K:k:W:i:P:R:u:w:t:g:G:j:Q:";
while( (c = myGetopt(argc, argv, optionString)) != -1 ){
switch(c){
case 'h':
@@ -285,6 +286,9 @@ LAST home page: http://last.cbrc.jp/\n\
unstringify( maxGaplessAlignmentsPerQueryPosition, optarg );
if( maxGaplessAlignmentsPerQueryPosition <= 0 ) badopt( c, optarg );
break;
+ case 'N':
+ unstringify( maxAlignmentsPerQueryStrand, optarg );
+ break;
case 'C':
unstringify( cullingLimitForGaplessAlignments, optarg );
break;
@@ -363,11 +367,14 @@ LAST home page: http://last.cbrc.jp/\n\
if( isGreedy && maskLowercase == 3 )
ERR( "can't combine option -M with option -u 3" );
- if( optionsOnly ) return;
- if( optind >= argc )
- ERR( "please give me a database name and sequence file(s)\n\n" + usage );
- lastdbName = argv[optind++];
- inputStart = optind;
+ if( !optionsOnly ){
+ if( optind >= argc )
+ ERR( "please give me a database name and sequence file(s)\n\n" + usage );
+ lastdbName = argv[optind++];
+ inputStart = optind;
+ }
+
+ resetGetopt();
}
void LastalArguments::fromLine( const std::string& line ){
@@ -595,6 +602,8 @@ void LastalArguments::writeCommented( std::ostream& stream ) const{
if( maxHitDepth + 1 > 0 )
stream << " L=" << maxHitDepth;
stream << " n=" << maxGaplessAlignmentsPerQueryPosition;
+ if( maxAlignmentsPerQueryStrand + 1 > 0 )
+ stream << " N=" << maxAlignmentsPerQueryStrand;
if( cullingLimitForGaplessAlignments )
stream << " C=" << cullingLimitForGaplessAlignments;
if( cullingLimitForFinalAlignments )
diff --git a/src/LastalArguments.hh b/src/LastalArguments.hh
index 708f999..f4cc87c 100644
--- a/src/LastalArguments.hh
+++ b/src/LastalArguments.hh
@@ -87,6 +87,7 @@ struct LastalArguments{
size_t maxHitDepth;
size_t oneHitMultiplicity;
size_t maxGaplessAlignmentsPerQueryPosition;
+ size_t maxAlignmentsPerQueryStrand;
size_t cullingLimitForGaplessAlignments;
size_t cullingLimitForFinalAlignments;
size_t queryStep;
diff --git a/src/LastdbArguments.cc b/src/LastdbArguments.cc
index 7393685..95060ec 100644
--- a/src/LastdbArguments.cc
+++ b/src/LastdbArguments.cc
@@ -2,7 +2,7 @@
#include "LastdbArguments.hh"
#include "stringify.hh"
-#include <unistd.h> // getopt
+#include "getoptUtil.hh"
#include <iostream>
#include <stdexcept>
#include <cstdlib> // EXIT_SUCCESS
@@ -56,7 +56,7 @@ Main Options:\n\
-p: interpret the sequences as proteins\n\
-R: repeat-marking options (default="
+ stringify(isKeepLowercase) + stringify(tantanSetting) + ")\n\
--c: soft-mask lowercase letters\n\
+-c: soft-mask lowercase letters (in reference *and* query sequences)\n\
-u: seeding scheme (default: YASS for DNA, else exact-match seeds)";
std::string help = usage + "\n\
@@ -86,7 +86,6 @@ Report bugs to: last-align (ATmark) googlegroups (dot) com\n\
LAST home page: http://last.cbrc.jp/\n\
";
- optind = 1; // allows us to scan arguments more than once(???)
int c;
while( (c = myGetopt(argc, argv, "hVpR:cm:s:w:W:P:u:a:i:b:C:xvQ:")) != -1 ) {
switch(c){
@@ -159,11 +158,14 @@ LAST home page: http://last.cbrc.jp/\n\
}
}
- if( isOptionsOnly ) return;
- if( optind >= argc )
- ERR( "please give me an output name and sequence file(s)\n\n" + usage );
- lastdbName = argv[optind++];
- inputStart = optind;
+ if( !isOptionsOnly ){
+ if( optind >= argc )
+ ERR( "please give me an output name and sequence file(s)\n\n" + usage );
+ lastdbName = argv[optind++];
+ inputStart = optind;
+ }
+
+ resetGetopt();
}
void LastdbArguments::fromLine( const std::string& line ){
diff --git a/src/alp/sls_alignment_evaluer.cpp b/src/alp/sls_alignment_evaluer.cpp
index 8fd6f1a..aa7adc1 100644
--- a/src/alp/sls_alignment_evaluer.cpp
+++ b/src/alp/sls_alignment_evaluer.cpp
@@ -365,7 +365,8 @@ double eps_lambda_,//relative error for the parameter lambda
double eps_K_,//relative error for the parameter K
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in Mb
-long randomSeed_)//randomizaton seed
+long randomSeed_,//randomizaton seed
+double temperature_)
{
struct_for_randomization *randomization_parameters=NULL;
@@ -512,6 +513,7 @@ long randomSeed_)//randomizaton seed
letterFreqs1_normalized,
letterFreqs2_normalized,
+ temperature_,
max_time_,//maximum allowed calculation time in seconds
max_mem_,//maximum allowed memory usage in MB
eps_lambda_,//relative error for lambda calculation
@@ -1014,11 +1016,6 @@ double seqlen2_) const//length of sequence #2
E,
area_res,
- pvalues_obj.a_normal,
- pvalues_obj.b_normal,
- pvalues_obj.h_normal,
- pvalues_obj.N_normal,
- pvalues_obj.p_normal,
area_is_1_flag,
compute_only_area);
@@ -1093,11 +1090,6 @@ double &evalue_) const//resulted E-value
evalue_,
area,
- pvalues_obj.a_normal,
- pvalues_obj.b_normal,
- pvalues_obj.h_normal,
- pvalues_obj.N_normal,
- pvalues_obj.p_normal,
area_is_1_flag);
diff --git a/src/alp/sls_alignment_evaluer.hpp b/src/alp/sls_alignment_evaluer.hpp
index 4a398ae..a113f4e 100644
--- a/src/alp/sls_alignment_evaluer.hpp
+++ b/src/alp/sls_alignment_evaluer.hpp
@@ -40,6 +40,7 @@ Contents: library functions of main routines
#include <math.h>
namespace Sls {
+ const double default_importance_sampling_temperature = 1.07;
class AlignmentEvaluer {
@@ -100,7 +101,8 @@ namespace Sls {
double eps_K_,//relative error for the parameter K
double max_time_,//maximum allowed calculation time in seconds;
double max_mem_,//maximum allowed memory usage in Mb
- long randomSeed_);//randomizaton seed
+ long randomSeed_,//randomizaton seed
+ double temperature_=default_importance_sampling_temperature);
//Initializes Gumbel parameters using precalculated values:
diff --git a/src/alp/sls_alp_data.cpp b/src/alp/sls_alp_data.cpp
index 6d32520..73f5676 100644
--- a/src/alp/sls_alp_data.cpp
+++ b/src/alp/sls_alp_data.cpp
@@ -216,6 +216,7 @@ long int epen_,//gap extension penalty
long int epen1_,//gap extension penalty for a gap in the sequence #1
long int epen2_,//gap extension penalty for a gap in the sequence #2
+double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
@@ -294,6 +295,7 @@ bool insertions_after_deletions_)//if true, then insertions after deletions are
d_epen,
+ temperature_,
d_number_of_AA,
d_smatr,
d_RR1,
@@ -382,6 +384,7 @@ long int epen2_,//gap extension penalty for a gap in the sequence #2
string smatr_file_name_,//scoring matrix file name
string RR1_file_name_,//probabilities1 file name
string RR2_file_name_,//probabilities2 file name
+double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
@@ -443,6 +446,7 @@ bool insertions_after_deletions_)//if true, then insertions after deletions are
epen1_,//gap extension penalty for a gap in the sequence #1
epen2_,//gap extension penalty for a gap in the sequence #2
+ temperature_,
max_time_,//maximum allowed calculation time in seconds
max_mem_,//maximum allowed memory usage in MB
eps_lambda_,//relative error for lambda calculation
@@ -488,6 +492,7 @@ const long *const *substitutionScoreMatrix_,
const double *letterFreqs1_,
const double *letterFreqs2_,
+double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
@@ -581,6 +586,7 @@ double max_time_with_computation_parameters_)//maximum allowed time in seconds f
epen1_,//gap extension penalty for a gap in the sequence #1
epen2_,//gap extension penalty for a gap in the sequence #2
+ temperature_,
max_time_,//maximum allowed calculation time in seconds
max_mem_,//maximum allowed memory usage in MB
eps_lambda_,//relative error for lambda calculation
@@ -1128,6 +1134,7 @@ long int open_,
long int epen_,
+double temperature_,
long int number_of_AA_,
long int **smatr_,
double *RR1_,
@@ -1272,7 +1279,7 @@ double *RR2_)
//cout<<"\nUngapped lambda is "<<d_ungap_lambda<<endl;
- d_lambda*=1.07;
+ d_lambda*=temperature_;
};
diff --git a/src/alp/sls_alp_data.hpp b/src/alp/sls_alp_data.hpp
index 39a81de..2fcd777 100644
--- a/src/alp/sls_alp_data.hpp
+++ b/src/alp/sls_alp_data.hpp
@@ -245,6 +245,7 @@ namespace Sls {
long int epen_,
+ double temperature_,
long int number_of_AA_,
long int **smatr_,
double *RR1_,
@@ -309,6 +310,7 @@ namespace Sls {
std::string smatr_file_name_,//scoring matrix file name
std::string RR1_file_name_,//probabilities1 file name
std::string RR2_file_name_,//probabilities2 file name
+ double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
@@ -332,6 +334,7 @@ namespace Sls {
const double *letterFreqs1_,
const double *letterFreqs2_,
+ double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
@@ -368,6 +371,7 @@ namespace Sls {
long int epen1_,//gap extension penalty for a gap in the sequence #1
long int epen2_,//gap extension penalty for a gap in the sequence #2
+ double temperature_,
double max_time_,//maximum allowed calculation time in seconds
double max_mem_,//maximum allowed memory usage in MB
double eps_lambda_,//relative error for lambda calculation
diff --git a/src/alp/sls_basic.hpp b/src/alp/sls_basic.hpp
index d289ffd..f6e4353 100644
--- a/src/alp/sls_basic.hpp
+++ b/src/alp/sls_basic.hpp
@@ -48,7 +48,8 @@ Contents: Some basic functions and types
#endif
#include <iomanip>
-#include <cmath>
+#include <cmath> // ?
+#include <math.h>
#include <string>
namespace Sls {
@@ -189,6 +190,11 @@ namespace Sls {
static double one_minus_exp_function(
double y_);
+ static double normal_probability(double x_)
+ {
+ return 0.5*erfc(-sqrt(0.5)*x_);
+ }
+
static double normal_probability(
double x_,
double eps_);
diff --git a/src/alp/sls_falp_alignment_evaluer.cpp b/src/alp/sls_falp_alignment_evaluer.cpp
index a51f84d..113f485 100644
--- a/src/alp/sls_falp_alignment_evaluer.cpp
+++ b/src/alp/sls_falp_alignment_evaluer.cpp
@@ -977,11 +977,6 @@ double seqlen2_) const//length of sequence #2
E,
area_res,
- pvalues_obj.a_normal,
- pvalues_obj.b_normal,
- pvalues_obj.h_normal,
- pvalues_obj.N_normal,
- pvalues_obj.p_normal,
area_is_1_flag,
compute_only_area);
@@ -1057,11 +1052,6 @@ double &evalue_) const//resulted E-value
evalue_,
area,
- pvalues_obj.a_normal,
- pvalues_obj.b_normal,
- pvalues_obj.h_normal,
- pvalues_obj.N_normal,
- pvalues_obj.p_normal,
area_is_1_flag);
}
diff --git a/src/alp/sls_fsa1_pvalues.cpp b/src/alp/sls_fsa1_pvalues.cpp
index 8b1dcc5..0ba0770 100644
--- a/src/alp/sls_fsa1_pvalues.cpp
+++ b/src/alp/sls_fsa1_pvalues.cpp
@@ -106,13 +106,6 @@ double &E_,
double &E_error_,
double &area_,
-
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_)
{
@@ -181,8 +174,6 @@ bool &area_is_1_flag_)
tau_hat_error_=0;
};
- double eps=0.000001;
-
double m_li_y_error=0;
double m_li_y=0;
@@ -218,7 +209,7 @@ bool &area_is_1_flag_)
};
- double P_m_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,m_F,eps);
+ double P_m_F=sls_basic::normal_probability(m_F);
double P_m_F_error=const_val*exp(-0.5*m_F*m_F)*m_F_error;
double E_m_F=-const_val*exp(-0.5*m_F*m_F);
@@ -270,7 +261,7 @@ bool &area_is_1_flag_)
n_F=n_lj_y/sqrt_vj_y;
};
- double P_n_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,n_F,eps);
+ double P_n_F=sls_basic::normal_probability(n_F);
double P_n_F_error=const_val*exp(-0.5*n_F*n_F)*n_F_error;
double E_n_F=-const_val*exp(-0.5*n_F*n_F);
@@ -385,13 +376,6 @@ double &P_,
double &E_,
double &area_,
-
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_,
bool compute_only_area_)
{
@@ -438,8 +422,6 @@ bool compute_only_area_)
tau_hat_=0;
};
- double eps=0.000001;
-
double m_li_y=0;
double tmp=ai_hat_*y_+bi_hat_;
@@ -463,7 +445,7 @@ bool compute_only_area_)
};
- double P_m_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,m_F,eps);
+ double P_m_F=sls_basic::normal_probability(m_F);
double E_m_F=-const_val*exp(-0.5*m_F*m_F);
@@ -498,7 +480,7 @@ bool compute_only_area_)
n_F=n_lj_y/sqrt_vj_y;
};
- double P_n_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,n_F,eps);
+ double P_n_F=sls_basic::normal_probability(n_F);
double E_n_F=-const_val*exp(-0.5*n_F*n_F);
@@ -576,12 +558,6 @@ double &P_error_,
double &E_,
double &E_error_,
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_)
{
long int dim=par_.m_LambdaSbs.size();
@@ -696,13 +672,6 @@ bool &area_is_1_flag_)
E_tmp,
area_tmp,
-
- a_normal_,
- b_normal_,
- h_normal_,
- N_normal_,
- p_normal_,
-
area_is_1_flag_);
P_values[i]=P_tmp;
@@ -880,11 +849,6 @@ bool read_Sbs_par_flag)
E,
area,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
@@ -906,11 +870,6 @@ bool read_Sbs_par_flag)
E_tmp,
E_error,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
@@ -954,11 +913,6 @@ bool read_Sbs_par_flag)
E_error,
area,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
P_value_error=P_error;
diff --git a/src/alp/sls_fsa1_pvalues.hpp b/src/alp/sls_fsa1_pvalues.hpp
index c32cac0..77fe8ff 100644
--- a/src/alp/sls_fsa1_pvalues.hpp
+++ b/src/alp/sls_fsa1_pvalues.hpp
@@ -206,13 +206,6 @@ namespace Sls {
double &E_error_,
double &area_,
-
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_);
static void compute_tmp_values(FALP_set_of_parameters &par_);
@@ -229,13 +222,6 @@ namespace Sls {
double &E_,
double &area_,
-
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_,
bool compute_only_area_=false);
@@ -253,13 +239,6 @@ namespace Sls {
double &E_,
double &E_error_,
-
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_);
diff --git a/src/alp/sls_pvalues.cpp b/src/alp/sls_pvalues.cpp
index 194d696..515a41c 100644
--- a/src/alp/sls_pvalues.cpp
+++ b/src/alp/sls_pvalues.cpp
@@ -60,13 +60,6 @@ double &E_,
double &E_error_,
double &area_,
-
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_)
{
@@ -135,8 +128,6 @@ bool &area_is_1_flag_)
tau_hat_error_=0;
};
- double eps=0.000001;
-
double m_li_y_error=0;
double m_li_y=0;
@@ -171,7 +162,7 @@ bool &area_is_1_flag_)
};
- double P_m_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,m_F,eps);
+ double P_m_F=sls_basic::normal_probability(m_F);
double P_m_F_error=const_val*exp(-0.5*m_F*m_F)*m_F_error;
double E_m_F=-const_val*exp(-0.5*m_F*m_F);
@@ -222,7 +213,7 @@ bool &area_is_1_flag_)
n_F=n_lj_y/sqrt_vj_y;
};
- double P_n_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,n_F,eps);
+ double P_n_F=sls_basic::normal_probability(n_F);
double P_n_F_error=const_val*exp(-0.5*n_F*n_F)*n_F_error;
double E_n_F=-const_val*exp(-0.5*n_F*n_F);
@@ -384,13 +375,6 @@ double &P_,
double &E_,
double &area_,
-
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_,
bool compute_only_area_)
{
@@ -436,8 +420,6 @@ bool compute_only_area_)
tau_hat_=0;
};
- double eps=0.000001;
-
double m_li_y=0;
double tmp=ai_hat_*y_+bi_hat_;
@@ -463,7 +445,7 @@ bool compute_only_area_)
};
- double P_m_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,m_F,eps);
+ double P_m_F=sls_basic::normal_probability(m_F);
double E_m_F=-const_val*exp(-0.5*m_F*m_F);
@@ -497,7 +479,7 @@ bool compute_only_area_)
n_F=n_lj_y/sqrt_vj_y;
};
- double P_n_F=sls_basic::normal_probability(a_normal_,b_normal_,h_normal_,N_normal_,p_normal_,n_F,eps);
+ double P_n_F=sls_basic::normal_probability(n_F);
double E_n_F=-const_val*exp(-0.5*n_F*n_F);
@@ -575,12 +557,6 @@ double &P_error_,
double &E_,
double &E_error_,
-double a_normal_,
-double b_normal_,
-double h_normal_,
-long int N_normal_,
-double *p_normal_,
-
bool &area_is_1_flag_)
{
long int dim=par_.m_LambdaSbs.size();
@@ -693,13 +669,6 @@ bool &area_is_1_flag_)
E_tmp,
area_tmp,
-
- a_normal_,
- b_normal_,
- h_normal_,
- N_normal_,
- p_normal_,
-
area_is_1_flag_);
P_values[i]=P_tmp;
@@ -888,11 +857,6 @@ bool read_Sbs_par_flag)
E,
area,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
@@ -914,11 +878,6 @@ bool read_Sbs_par_flag)
E_tmp,
E_error,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
@@ -962,11 +921,6 @@ bool read_Sbs_par_flag)
E_error,
area,
- a_normal,
- b_normal,
- h_normal,
- N_normal,
- p_normal,
area_is_1_flag);
P_value_error=P_error;
diff --git a/src/alp/sls_pvalues.hpp b/src/alp/sls_pvalues.hpp
index c47dd37..023cd47 100644
--- a/src/alp/sls_pvalues.hpp
+++ b/src/alp/sls_pvalues.hpp
@@ -190,13 +190,6 @@ namespace Sls {
double &E_error_,
double &area_,
-
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_);
static void compute_tmp_values(ALP_set_of_parameters &par_);
@@ -213,13 +206,6 @@ namespace Sls {
double &E_,
double &area_,
-
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_,
bool compute_only_area_=false);
@@ -236,12 +222,6 @@ namespace Sls {
double &E_,
double &E_error_,
- double a_normal_,
- double b_normal_,
- double h_normal_,
- long int N_normal_,
- double *p_normal_,
-
bool &area_is_1_flag_);
diff --git a/src/getoptUtil.hh b/src/getoptUtil.hh
new file mode 100644
index 0000000..25bd328
--- /dev/null
+++ b/src/getoptUtil.hh
@@ -0,0 +1,17 @@
+// Copyright 2017 Martin C. Frith
+
+#ifndef GETOPT_UTIL_HH
+#define GETOPT_UTIL_HH
+
+#include <unistd.h>
+
+inline void resetGetopt() { // XXX fragile voodoo
+#ifdef __GLIBC__
+ optind = 0;
+#else
+ optind = 1;
+ //optreset = 1; // XXX ???
+#endif
+}
+
+#endif
diff --git a/src/lastal.cc b/src/lastal.cc
index 467fad7..0e90c61 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -552,6 +552,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
Dispatcher dis( Phase::gapless, aligner, queryNum, strand, querySeq );
DiagonalTable dt; // record already-covered positions on each diagonal
countT matchCount = 0, gaplessExtensionCount = 0, gaplessAlignmentCount = 0;
+ size_t significantAlignmentCount = 0;
size_t loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum);
size_t loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum);
@@ -588,10 +589,11 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
indexT j = *beg; // coordinate in the reference sequence
if( dt.isCovered( i, j ) ) continue;
++gaplessExtensionCount;
+ int score;
if( isOverlap ){
size_t revLen, fwdLen;
- int score = dis.gaplessOverlap( j, i, revLen, fwdLen );
+ score = dis.gaplessOverlap( j, i, revLen, fwdLen );
if( score < minScoreGapless ) continue;
SegmentPair sp( j - revLen, i - revLen, revLen + fwdLen, score );
dt.addEndpoint( sp.end2(), sp.end1() );
@@ -599,7 +601,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
}else{
int fs = dis.forwardGaplessScore( j, i );
int rs = dis.reverseGaplessScore( j, i );
- int score = fs + rs;
+ score = fs + rs;
// Tried checking the score after isOptimal & addEndpoint,
// but the number of extensions decreased by < 10%, and it
@@ -623,6 +625,12 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
++gaplessAlignmentsPerQueryPosition;
++gaplessAlignmentCount;
+
+ if( score >= args.minScoreGapped &&
+ ++significantAlignmentCount >= args.maxAlignmentsPerQueryStrand ) {
+ i = loopEnd;
+ break;
+ }
}
}
}
@@ -710,6 +718,7 @@ void alignGapped( LastAligner& aligner,
else SegmentPairPot::markAsGood(sp);
++gappedAlignmentCount;
+ if( gappedAlignmentCount >= args.maxAlignmentsPerQueryStrand ) break;
}
LOG2( "gapped extensions=" << gappedExtensionCount );
diff --git a/src/makefile b/src/makefile
index cddf3b7..df3581a 100644
--- a/src/makefile
+++ b/src/makefile
@@ -175,9 +175,9 @@ LastEvaluer.o LastEvaluer.o8: LastEvaluer.cc LastEvaluer.hh ScoreMatrixRow.hh \
alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
GeneticCode.hh
LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \
- SequenceFormat.hh stringify.hh version.hh
+ SequenceFormat.hh stringify.hh getoptUtil.hh version.hh
LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \
- SequenceFormat.hh stringify.hh version.hh
+ SequenceFormat.hh stringify.hh getoptUtil.hh version.hh
MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \
VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh
MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \
diff --git a/src/split/cbrc_split_aligner.hh b/src/split/cbrc_split_aligner.hh
index 6e6885b..ff4bdb7 100644
--- a/src/split/cbrc_split_aligner.hh
+++ b/src/split/cbrc_split_aligner.hh
@@ -212,7 +212,7 @@ private:
void initSpliceSignals(unsigned i);
void initRnameAndStrandIds();
void initRbegsAndEnds();
- static size_t maxGenomeVolumes() { return sizeof genome / sizeof *genome; }
+ size_t maxGenomeVolumes() const { return sizeof genome / sizeof *genome; }
void readGenomeVolume(const std::string& baseName,
size_t seqCount, size_t volumeNumber);
diff --git a/src/split/last-split.cc b/src/split/last-split.cc
index 1dd714e..619f54c 100644
--- a/src/split/last-split.cc
+++ b/src/split/last-split.cc
@@ -83,21 +83,26 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa,
bool isSenseStrand, double senseStrandLogOdds,
const LastSplitOptions& opts,
bool isAlreadySplit) {
+ unsigned qSliceBegTrimmed = qSliceBeg;
+ unsigned qSliceEndTrimmed = qSliceEnd;
unsigned alnBeg, alnEnd;
- cbrc::mafSliceBeg(a.ralign, a.qalign, a.qstart, qSliceBeg, alnBeg);
- cbrc::mafSliceEnd(a.ralign, a.qalign, a.qend, qSliceEnd, alnEnd);
+ cbrc::mafSliceBeg(a.ralign, a.qalign, a.qstart, qSliceBegTrimmed, alnBeg);
+ cbrc::mafSliceEnd(a.ralign, a.qalign, a.qend, qSliceEndTrimmed, alnEnd);
- int score = sa.segmentScore(alnNum, qSliceBeg, qSliceEnd);
+ int score =
+ sa.segmentScore(alnNum, qSliceBeg, qSliceEnd) -
+ sa.segmentScore(alnNum, qSliceBeg, qSliceBegTrimmed) -
+ sa.segmentScore(alnNum, qSliceEndTrimmed, qSliceEnd);
if (score < opts.score) return;
std::vector<double> p;
if (opts.direction != 0) {
- p = sa.marginalProbs(qSliceBeg, alnNum, alnBeg, alnEnd);
+ p = sa.marginalProbs(qSliceBegTrimmed, alnNum, alnBeg, alnEnd);
}
std::vector<double> pRev;
if (opts.direction != 1) {
sa.flipSpliceSignals();
- pRev = sa.marginalProbs(qSliceBeg, alnNum, alnBeg, alnEnd);
+ pRev = sa.marginalProbs(qSliceBegTrimmed, alnNum, alnBeg, alnEnd);
sa.flipSpliceSignals();
}
if (opts.direction == 0) p.swap(pRev);
diff --git a/src/version.hh b/src/version.hh
index d87fb52..7104b42 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"830"
+"869"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list