[med-svn] [last-align] 01/06: New upstream version 885
Charles Plessy
plessy at moszumanska.debian.org
Sun Oct 15 13:22:58 UTC 2017
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch master
in repository last-align.
commit 899a68165f8864d3cfb67aca39ac97a2add9c8ac
Author: Charles Plessy <plessy at debian.org>
Date: Sun Oct 15 22:05:37 2017 +0900
New upstream version 885
---
ChangeLog.txt | 87 +++++++++++++++++++++++++++++++++++++++++++-
doc/last-dotplot.html | 17 +++++++--
doc/last-dotplot.txt | 15 ++++++--
doc/last-parallel.html | 4 +--
doc/last-parallel.txt | 4 +--
doc/last-train.html | 4 +++
doc/last-train.txt | 2 ++
doc/last-tuning.html | 7 ++--
doc/last-tuning.txt | 8 ++---
doc/last.html | 4 +--
doc/last.txt | 4 +--
doc/lastal.html | 13 ++++---
doc/lastal.txt | 9 +++--
doc/lastdb.html | 5 +--
doc/lastdb.txt | 5 +--
doc/maf-convert.html | 12 +++----
doc/maf-convert.txt | 18 +++++-----
scripts/fastq-interleave | 10 ++----
scripts/last-dotplot | 75 +++++++++++++++++++++++---------------
scripts/last-postmask | 23 +++++++-----
scripts/last-train | 79 ++++++++++++++++++++++++++--------------
scripts/maf-convert | 93 +++++++++++++++++++++++++++++++++++-------------
src/Alphabet.cc | 8 -----
src/Alphabet.hh | 3 --
src/LastEvaluer.cc | 2 +-
src/LastalArguments.cc | 33 ++++++++++++-----
src/MultiSequence.cc | 7 ----
src/MultiSequence.hh | 66 +++++++++++++++++++++++++++++++---
src/MultiSequenceQual.cc | 22 ++++--------
src/io.cc | 11 ++----
src/io.hh | 7 ++--
src/last-pair-probs.cc | 4 +--
src/lastal.cc | 32 ++---------------
src/lastdb.cc | 43 ++++++++++------------
src/makefile | 32 ++++++++---------
src/mcf_zstream.hh | 82 ++++++++++++++++++++++++++++++++++++++++++
src/version.hh | 2 +-
37 files changed, 569 insertions(+), 283 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 2203b24..7c46466 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,9 +1,94 @@
+2017-10-10 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-train:
+ Make last-train print the lastal version
+ [c2ce08b5a76a] [tip]
+
+ * src/Alphabet.cc, src/Alphabet.hh, src/MultiSequence.hh,
+ src/lastal.cc, src/lastdb.cc:
+ Refactor
+ [52565bc1c7d7]
+
+ * src/lastdb.cc:
+ Refactor (I think)
+ [02a9cb451bc7]
+
+ * src/MultiSequence.cc, src/MultiSequence.hh,
+ src/MultiSequenceQual.cc, src/lastal.cc:
+ Refactor some code
+ [88fa35342a52]
+
+ * scripts/last-train, test/last-train-test.out, test/last-train-
+ test.sh:
+ Make last-train work with lastdb8
+ [a7ef7848ad1e]
+
+2017-10-06 Martin C. Frith <Martin C. Frith>
+
+ * doc/maf-convert.txt, scripts/maf-convert, test/maf-convert-test.out,
+ test/maf-convert-test.sh:
+ Add maf-convert -> chain
+ [6635b482e6e2]
+
+ * doc/last.txt, src/LastEvaluer.cc:
+ Avoid some "can't get E-value parameters" errors
+ [2c0cb7ef8842]
+
+2017-10-03 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-dotplot.txt, scripts/last-dotplot:
+ Add text-rotation options to last-dotplot
+ [20f5c97a3cfd]
+
+ * scripts/last-dotplot:
+ Fix mixed-up margin sizes in last-dotplot
+ [f5f93e51ef53]
+
+2017-09-27 Martin C. Frith <Martin C. Frith>
+
+ * src/makefile:
+ Try to make the gz compilation work more portably
+ [bf8efe69f160]
+
+2017-09-26 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-dotplot, scripts/last-postmask, scripts/last-train,
+ scripts/maf-convert, test/last-postmask-test.out, test/last-
+ postmask-test.sh, test/last-test.sh:
+ Add gz reading to: dotplot postmask train convert
+ [592295375eb1]
+
+2017-09-22 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-parallel.txt, src/mcf_zstream.hh:
+ Make the gz stuff compile on old computers
+ [0602563ef4e2]
+
+ * src/makefile:
+ Try to make compilation more robust & portable
+ [8a2e4af79c99]
+
+ * doc/lastal.txt, doc/lastdb.txt, scripts/fastq-interleave, src/io.cc,
+ src/io.hh, src/last-pair-probs.cc, src/lastal.cc, src/lastdb.cc,
+ src/makefile, src/mcf_zstream.hh:
+ Enable lastal, lastdb, etc to read .gz files
+ [a530b1ee48f4]
+
+ * doc/last-train.txt, scripts/last-train:
+ Add option -k to last-train
+ [80a016b17089]
+
+ * doc/last-tuning.txt, doc/lastal.txt, src/LastalArguments.cc, test
+ /last-test.out, test/last-test.sh:
+ Add -x % option to lastal
+ [ad7f5f953412]
+
2017-06-20 Martin C. Frith <Martin C. Frith>
* doc/last-train.txt, scripts/last-train, test/last-train-test.out,
test/last-train-test.sh:
Make it easier to feed stdin/pipe to last-train
- [b73f1146e688] [tip]
+ [b73f1146e688]
* doc/lastal.txt, src/AlignmentWrite.cc, test/last-test.out:
Add raw score to BlastTab+ format
diff --git a/doc/last-dotplot.html b/doc/last-dotplot.html
index e1717a0..21cc1d9 100644
--- a/doc/last-dotplot.html
+++ b/doc/last-dotplot.html
@@ -319,8 +319,9 @@ table.field-list { border: thin solid green }
<h1 class="title">last-dotplot</h1>
<p>This script makes a dotplot, a.k.a. Oxford Grid, of pair-wise sequence
-alignments in MAF or LAST tabular format. It requires the Python
-Imaging Library to be installed. It can be used like this:</p>
+alignments in MAF or LAST tabular format. It requires the <a class="reference external" href="https://pillow.readthedocs.io/">Python
+Imaging Library</a> to be installed.
+It can be used like this:</p>
<pre class="literal-block">
last-dotplot my-alignments my-plot.png
</pre>
@@ -330,7 +331,11 @@ last-dotplot alns alns.gif
</pre>
<p>To get a nicer font, try something like:</p>
<pre class="literal-block">
-last-dotplot -f /usr/share/fonts/truetype/freefont/FreeSans.ttf alns alns.png
+last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
+</pre>
+<p>or:</p>
+<pre class="literal-block">
+last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png
</pre>
<p>If the fonts are located somewhere different on your computer, change
this as appropriate.</p>
@@ -450,6 +455,12 @@ appearance in the input (0), their names (1), their lengths (2).</td></tr>
<kbd><span class="option">-s <var>SIZE</var></span>, <span class="option">--fontsize=<var>SIZE</var></span></kbd></td>
<td>TrueType or OpenType font size.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">--rot1=<var>ROT</var></span></kbd></td>
+<td>Text rotation for the 1st genome: h(orizontal) or v(ertical).</td></tr>
+<tr><td class="option-group">
+<kbd><span class="option">--rot2=<var>ROT</var></span></kbd></td>
+<td>Text rotation for the 2nd genome: h(orizontal) or v(ertical).</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">--lengths1</span></kbd></td>
<td>Show sequence lengths for the 1st (horizontal) genome.</td></tr>
<tr><td class="option-group">
diff --git a/doc/last-dotplot.txt b/doc/last-dotplot.txt
index 8642e59..2c1ff46 100644
--- a/doc/last-dotplot.txt
+++ b/doc/last-dotplot.txt
@@ -2,8 +2,9 @@ last-dotplot
============
This script makes a dotplot, a.k.a. Oxford Grid, of pair-wise sequence
-alignments in MAF or LAST tabular format. It requires the Python
-Imaging Library to be installed. It can be used like this::
+alignments in MAF or LAST tabular format. It requires the `Python
+Imaging Library <https://pillow.readthedocs.io/>`_ to be installed.
+It can be used like this::
last-dotplot my-alignments my-plot.png
@@ -13,7 +14,11 @@ The output can be in any format supported by the Imaging Library::
To get a nicer font, try something like::
- last-dotplot -f /usr/share/fonts/truetype/freefont/FreeSans.ttf alns alns.png
+ last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
+
+or::
+
+ last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png
If the fonts are located somewhere different on your computer, change
this as appropriate.
@@ -95,6 +100,10 @@ Text options
TrueType or OpenType font file.
-s SIZE, --fontsize=SIZE
TrueType or OpenType font size.
+ --rot1=ROT
+ Text rotation for the 1st genome: h(orizontal) or v(ertical).
+ --rot2=ROT
+ Text rotation for the 2nd genome: h(orizontal) or v(ertical).
--lengths1
Show sequence lengths for the 1st (horizontal) genome.
--lengths2
diff --git a/doc/last-parallel.html b/doc/last-parallel.html
index 5d2ad15..1cfe214 100644
--- a/doc/last-parallel.html
+++ b/doc/last-parallel.html
@@ -372,11 +372,11 @@ parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > ou
</pre>
<p>Instead of this:</p>
<pre class="literal-block">
-zcat queries.fa.gz | lastal mydb > myalns.maf
+bzcat queries.fa.bz2 | lastal mydb > myalns.maf
</pre>
<p>try this:</p>
<pre class="literal-block">
-zcat queries.fa.gz | parallel-fasta "lastal mydb" > myalns.maf
+bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf
</pre>
<p>Notes:</p>
<ul>
diff --git a/doc/last-parallel.txt b/doc/last-parallel.txt
index 730157f..c98a451 100644
--- a/doc/last-parallel.txt
+++ b/doc/last-parallel.txt
@@ -61,11 +61,11 @@ try this::
Instead of this::
- zcat queries.fa.gz | lastal mydb > myalns.maf
+ bzcat queries.fa.bz2 | lastal mydb > myalns.maf
try this::
- zcat queries.fa.gz | parallel-fasta "lastal mydb" > myalns.maf
+ bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf
Notes:
diff --git a/doc/last-train.html b/doc/last-train.html
index a4889bd..f510a51 100644
--- a/doc/last-train.html
+++ b/doc/last-train.html
@@ -455,6 +455,10 @@ reduce run time.</td></tr>
<kbd><span class="option">-m <var>COUNT</var></span></kbd></td>
<td>Maximum number of initial matches per query position.</td></tr>
<tr><td class="option-group">
+<kbd><span class="option">-k <var>STEP</var></span></kbd></td>
+<td>Look for initial matches starting only at every STEP-th
+position in each query.</td></tr>
+<tr><td class="option-group">
<kbd><span class="option">-P <var>COUNT</var></span></kbd></td>
<td>Number of parallel threads.</td></tr>
<tr><td class="option-group">
diff --git a/doc/last-train.txt b/doc/last-train.txt
index c9fcd93..c965ff3 100644
--- a/doc/last-train.txt
+++ b/doc/last-train.txt
@@ -84,6 +84,8 @@ Alignment options
reduce run time.
-T NUMBER Type of alignment: 0=local, 1=overlap.
-m COUNT Maximum number of initial matches per query position.
+ -k STEP Look for initial matches starting only at every STEP-th
+ position in each query.
-P COUNT Number of parallel threads.
-Q NUMBER Query sequence format: 0=fasta, 1=fastq-sanger.
**Important:** if you use option -Q, last-train will only
diff --git a/doc/last-tuning.html b/doc/last-tuning.html
index a273b3b..a7c04de 100644
--- a/doc/last-tuning.html
+++ b/doc/last-tuning.html
@@ -408,11 +408,8 @@ whose query coordinates lie in those of 2 or more stronger alignments.</p>
<p>This option can make lastal <strong>faster</strong> but <strong>less sensitive</strong>. It
sets the maximum score drop in alignments, in the gapped extension
phase. Lower values make it faster, by quitting unpromising
-extensions sooner. The default aims at best accuracy.</p>
-<p>Unfortunately, the default is a complex function of the other
-parameters and the database size. You can see it in the lastal header
-after "x=", e.g. by running lastal with no queries. Then try, say,
-halving it.</p>
+extensions sooner. The default aims at best accuracy. For example,
+use -x50% to specify 50% of the default value.</p>
</div>
<div class="section" id="id2">
<h3>lastal -M</h3>
diff --git a/doc/last-tuning.txt b/doc/last-tuning.txt
index 6493e62..9fc4c9b 100644
--- a/doc/last-tuning.txt
+++ b/doc/last-tuning.txt
@@ -109,12 +109,8 @@ lastal -x
This option can make lastal **faster** but **less sensitive**. It
sets the maximum score drop in alignments, in the gapped extension
phase. Lower values make it faster, by quitting unpromising
-extensions sooner. The default aims at best accuracy.
-
-Unfortunately, the default is a complex function of the other
-parameters and the database size. You can see it in the lastal header
-after "x=", e.g. by running lastal with no queries. Then try, say,
-halving it.
+extensions sooner. The default aims at best accuracy. For example,
+use -x50% to specify 50% of the default value.
lastal -M
---------
diff --git a/doc/last.html b/doc/last.html
index 7c0f7f7..ef87985 100644
--- a/doc/last.html
+++ b/doc/last.html
@@ -354,8 +354,8 @@ compile the programs, type:</p>
<pre class="literal-block">
make
</pre>
-<p>If your compiler is really old, you might get an error message like
-this:</p>
+<p>You might get some harmless warning messages. If your compiler is
+really old, you might get an error message like this:</p>
<pre class="literal-block">
unrecognized command line option "-std=c++11"
</pre>
diff --git a/doc/last.txt b/doc/last.txt
index 70123f1..20d2102 100644
--- a/doc/last.txt
+++ b/doc/last.txt
@@ -33,8 +33,8 @@ compile the programs, type::
make
-If your compiler is really old, you might get an error message like
-this::
+You might get some harmless warning messages. If your compiler is
+really old, you might get an error message like this::
unrecognized command line option "-std=c++11"
diff --git a/doc/lastal.html b/doc/lastal.html
index 191dde2..22e63d2 100644
--- a/doc/lastal.html
+++ b/doc/lastal.html
@@ -329,9 +329,10 @@ lastal humanDb dna*.fasta > myalns.maf
writes several files whose names begin with <tt class="docutils literal">humanDb</tt>. The lastal
command reads files called <tt class="docutils literal"><span class="pre">dna*.fasta</span></tt>, compares them to
<tt class="docutils literal">humanDb</tt>, and writes alignments to a file called <tt class="docutils literal">myalns.maf</tt>.</p>
-<p>You can also pipe query sequences into lastal, for example:</p>
+<p>You can use gzip (.gz) compressed query files. You can also pipe
+query sequences into lastal, for example:</p>
<pre class="literal-block">
-zcat seqs.fasta.gz | lastal humanDb > myalns.maf
+bzcat seqs.fasta.bz2 | lastal humanDb > myalns.maf
</pre>
<div class="section" id="steps-in-lastal">
<h2>Steps in lastal</h2>
@@ -516,10 +517,14 @@ looks like this:</p>
</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-x <var>DROP</var></span></kbd></td>
-<td>Maximum score drop for gapped alignments. Gapped alignments are
+<td><p class="first">Maximum score drop for gapped alignments. Gapped alignments are
forbidden from having any internal region with score < -DROP.
This serves two purposes: accuracy (avoid spurious internal
-regions in alignments) and speed (the smaller the faster).</td></tr>
+regions in alignments) and speed (the smaller the faster).</p>
+<p class="last">You can specify either a score (e.g. -x20), or a percentage; for
+example, -x50% specifies 50% of the default value (rounded down
+to the nearest integer).</p>
+</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-y <var>DROP</var></span></kbd></td>
<td>Maximum score drop for gapless alignments.</td></tr>
diff --git a/doc/lastal.txt b/doc/lastal.txt
index db7eeb1..476d9f8 100644
--- a/doc/lastal.txt
+++ b/doc/lastal.txt
@@ -13,9 +13,10 @@ writes several files whose names begin with ``humanDb``. The lastal
command reads files called ``dna*.fasta``, compares them to
``humanDb``, and writes alignments to a file called ``myalns.maf``.
-You can also pipe query sequences into lastal, for example::
+You can use gzip (.gz) compressed query files. You can also pipe
+query sequences into lastal, for example::
- zcat seqs.fasta.gz | lastal humanDb > myalns.maf
+ bzcat seqs.fasta.bz2 | lastal humanDb > myalns.maf
Steps in lastal
---------------
@@ -186,6 +187,10 @@ Score options
This serves two purposes: accuracy (avoid spurious internal
regions in alignments) and speed (the smaller the faster).
+ You can specify either a score (e.g. -x20), or a percentage; for
+ example, -x50% specifies 50% of the default value (rounded down
+ to the nearest integer).
+
-y DROP
Maximum score drop for gapless alignments.
diff --git a/doc/lastdb.html b/doc/lastdb.html
index 974c588..4b54522 100644
--- a/doc/lastdb.html
+++ b/doc/lastdb.html
@@ -336,9 +336,10 @@ TTTGGATATG
>My2ndSequence
TTTAGAGGGTTCTTCGGGATT
</pre>
-<p>You can also pipe sequences into lastdb, for example:</p>
+<p>These files may be compressed in gzip (.gz) format. You can also pipe
+sequences into lastdb, for example:</p>
<pre class="literal-block">
-zcat humanChromosome*.fasta.gz | lastdb humanDb
+bzcat humanChromosome*.fasta.bz2 | lastdb humanDb
</pre>
</div>
<div class="section" id="options">
diff --git a/doc/lastdb.txt b/doc/lastdb.txt
index c0b5fdb..38e5fe8 100644
--- a/doc/lastdb.txt
+++ b/doc/lastdb.txt
@@ -21,9 +21,10 @@ like this::
>My2ndSequence
TTTAGAGGGTTCTTCGGGATT
-You can also pipe sequences into lastdb, for example::
+These files may be compressed in gzip (.gz) format. You can also pipe
+sequences into lastdb, for example::
- zcat humanChromosome*.fasta.gz | lastdb humanDb
+ bzcat humanChromosome*.fasta.bz2 | lastdb humanDb
Options
-------
diff --git a/doc/maf-convert.html b/doc/maf-convert.html
index a98040e..295e74e 100644
--- a/doc/maf-convert.html
+++ b/doc/maf-convert.html
@@ -318,9 +318,9 @@ table.field-list { border: thin solid green }
<div class="document" id="maf-convert">
<h1 class="title">maf-convert</h1>
-<p>This script reads alignments in maf format, and writes them in another
-format. It can write them in these formats: axt, blast, blasttab,
-html, psl, sam, tab. You can use it like this:</p>
+<p>This script reads alignments in <a class="reference external" href="http://genome.ucsc.edu/FAQ/FAQformat.html#format5">maf</a> format, and writes them in
+another format. It can write them in these formats: <a class="reference external" href="https://genome.ucsc.edu/goldenPath/help/axt.html">axt</a>, blast,
+blasttab, <a class="reference external" href="https://genome.ucsc.edu/goldenPath/help/chain.html">chain</a>, html, <a class="reference external" href="https://genome.ucsc.edu/FAQ/FAQformat.html#format2">psl</a>, sam, tab. You can use it like this:</p>
<pre class="literal-block">
maf-convert psl my-alignments.maf > my-alignments.psl
</pre>
@@ -328,9 +328,7 @@ maf-convert psl my-alignments.maf > my-alignments.psl
<pre class="literal-block">
... | maf-convert psl > my-alignments.psl
</pre>
-<p>The input should be "multiple alignment format" as described in the
-UCSC Genome FAQ (not "MIRA assembly format" or any other maf).</p>
-<p>This script takes the first (topmost) MAF sequence as the "reference"
+<p>This script takes the first (topmost) maf sequence as the "reference"
/ "subject" / "target", and the second sequence as the "query".</p>
<p>For html: if the input includes probability lines starting with 'p',
then the output will be coloured by column probability. (To get lines
@@ -351,7 +349,7 @@ starting with 'p', run lastal with option -j set to 4 or higher.)</p>
nucleotides. This affects psl format only (the first 4
columns).</td></tr>
<tr><td class="option-group">
-<kbd><span class="option">-j</span>, <span class="option">--join=<var>N</var></span></kbd></td>
+<kbd><span class="option">-j <var>N</var></span>, <span class="option">--join=<var>N</var></span></kbd></td>
<td>Join neighboring alignments if they are co-linear and
separated by at most N letters. This affects psl format
only.</td></tr>
diff --git a/doc/maf-convert.txt b/doc/maf-convert.txt
index f78acee..d2d3f4d 100644
--- a/doc/maf-convert.txt
+++ b/doc/maf-convert.txt
@@ -1,9 +1,9 @@
maf-convert
===========
-This script reads alignments in maf format, and writes them in another
-format. It can write them in these formats: axt, blast, blasttab,
-html, psl, sam, tab. You can use it like this::
+This script reads alignments in maf_ format, and writes them in
+another format. It can write them in these formats: axt_, blast,
+blasttab, chain_, html, psl_, sam, tab. You can use it like this::
maf-convert psl my-alignments.maf > my-alignments.psl
@@ -11,16 +11,18 @@ It's often convenient to pipe in the input, like this::
... | maf-convert psl > my-alignments.psl
-The input should be "multiple alignment format" as described in the
-UCSC Genome FAQ (not "MIRA assembly format" or any other maf).
-
-This script takes the first (topmost) MAF sequence as the "reference"
+This script takes the first (topmost) maf sequence as the "reference"
/ "subject" / "target", and the second sequence as the "query".
For html: if the input includes probability lines starting with 'p',
then the output will be coloured by column probability. (To get lines
starting with 'p', run lastal with option -j set to 4 or higher.)
+.. _maf: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
+.. _axt: https://genome.ucsc.edu/goldenPath/help/axt.html
+.. _chain: https://genome.ucsc.edu/goldenPath/help/chain.html
+.. _psl: https://genome.ucsc.edu/FAQ/FAQformat.html#format2
+
Options
-------
@@ -32,7 +34,7 @@ Options
nucleotides. This affects psl format only (the first 4
columns).
- -j, --join=N
+ -j N, --join=N
Join neighboring alignments if they are co-linear and
separated by at most N letters. This affects psl format
only.
diff --git a/scripts/fastq-interleave b/scripts/fastq-interleave
index ca33604..dd9174f 100755
--- a/scripts/fastq-interleave
+++ b/scripts/fastq-interleave
@@ -3,6 +3,7 @@
test $# = 2 || {
cat <<EOF
Usage: $0 x.fastq y.fastq
+or: $0 x.fastq.gz y.fastq.gz
Read 2 fastq files, and write them interleaved.
Assumes 1 fastq per 4 lines, i.e. no line wrapping.
@@ -10,10 +11,5 @@ EOF
exit
}
-paste <(cat "$1" | paste - - - -) <(cat "$2" | paste - - - -) | tr '\t' '\n'
-
-# Is this better?
-#paste <(zcat -f "$1"|paste - - - -) <(zcat -f "$2"|paste - - - -)|tr '\t' '\n'
-
-# This does not interpret "-" as stdin:
-#paste <(paste - - - - < "$1") <(paste - - - - < "$2") | tr '\t' '\n'
+paste <(gzip -cdf "$1" | paste - - - -) <(gzip -cdf "$2" | paste - - - -) |
+ tr '\t' '\n'
diff --git a/scripts/last-dotplot b/scripts/last-dotplot
index 6d4f1d2..f29fed2 100755
--- a/scripts/last-dotplot
+++ b/scripts/last-dotplot
@@ -9,6 +9,7 @@
# according to the number of aligned nt-pairs within it, but the
# result is too faint. How can this be done better?
+import gzip
import fnmatch, itertools, optparse, os, re, sys
# Try to make PIL/PILLOW work:
@@ -18,6 +19,8 @@ except ImportError: import Image, ImageDraw, ImageFont, ImageColor
def myOpen(fileName): # faster than fileinput
if fileName == "-":
return sys.stdin
+ if fileName.endswith(".gz"):
+ return gzip.open(fileName)
return open(fileName)
def warn(message):
@@ -160,13 +163,17 @@ def natural_sort_key(my_string):
parts[1::2] = map(int, parts[1::2])
return parts
-def get_text_sizes(my_strings, font, fontsize, image_mode):
+def textDimensions(imageDraw, font, textRot, text):
+ x, y = imageDraw.textsize(text, font=font)
+ return (y, x) if textRot else (x, y)
+
+def get_text_sizes(my_strings, font, fontsize, image_mode, textRot):
'''Get widths & heights, in pixels, of some strings.'''
if fontsize == 0: return [(0, 0) for i in my_strings]
image_size = 1, 1
im = Image.new(image_mode, image_size)
draw = ImageDraw.Draw(im)
- return [draw.textsize(i, font=font) for i in my_strings]
+ return [textDimensions(draw, font, textRot, i) for i in my_strings]
def sizeText(size):
suffixes = "bp", "kb", "Mb", "Gb"
@@ -181,7 +188,7 @@ def seqNameAndSizeText(seqName, seqSize):
return seqName + ": " + sizeText(seqSize)
def getSeqInfo(sortOpt, seqNames, seqLimits,
- font, fontsize, image_mode, isShowSize):
+ font, fontsize, image_mode, isShowSize, textRot):
'''Return miscellaneous information about the sequences.'''
if sortOpt == 1:
seqNames.sort(key=natural_sort_key)
@@ -199,8 +206,9 @@ def getSeqInfo(sortOpt, seqNames, seqLimits,
seqLabels = map(seqNameAndSizeText, seqNames, seqSizes)
else:
seqLabels = seqNames
- labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode)
- margin = max(zip(*labelSizes)[1]) # maximum text height
+ labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode, textRot)
+ margin = max(i[1] for i in labelSizes)
+ # xxx the margin may be too big, because some labels may get omitted
return seqNames, seqSizes, seqLabels, labelSizes, margin
def div_ceil(x, y):
@@ -410,11 +418,11 @@ def drawAnnotations(im, boxes):
def make_label(text, text_size, range_start, range_size):
'''Return an axis label with endpoint & sort-order information.'''
- text_width = text_size[0]
+ text_width, text_height = text_size
label_start = range_start + (range_size - text_width) // 2
label_end = label_start + text_width
sort_key = text_width - range_size
- return sort_key, label_start, label_end, text
+ return sort_key, label_start, label_end, text, text_height
def get_nonoverlapping_labels(labels, label_space):
'''Get a subset of non-overlapping axis labels, greedily.'''
@@ -425,21 +433,23 @@ def get_nonoverlapping_labels(labels, label_space):
nonoverlapping_labels.append(i)
return nonoverlapping_labels
-def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix,
- font, image_mode, opts):
+def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix, textRot,
+ textAln, font, image_mode, opts):
'''Make an image of axis labels.'''
min_pos = seq_starts[0]
max_pos = seq_starts[-1] + seq_pix[-1]
- height = max(zip(*name_sizes)[1])
+ margin = max(i[1] for i in name_sizes)
labels = map(make_label, seqNames, name_sizes, seq_starts, seq_pix)
labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
labels.sort()
- labels = get_nonoverlapping_labels(labels, opts.label_space)
- image_size = max_pos, height
+ minPixTweenLabels = 0 if textRot else opts.label_space
+ labels = get_nonoverlapping_labels(labels, minPixTweenLabels)
+ image_size = (margin, max_pos) if textRot else (max_pos, margin)
im = Image.new(image_mode, image_size, opts.border_color)
draw = ImageDraw.Draw(im)
for i in labels:
- position = i[1], 0
+ base = margin - i[4] if textAln else 0
+ position = (base, i[1]) if textRot else (i[1], base)
draw.text(position, i[3], font=font, fill=opts.text_color)
return im
@@ -463,26 +473,28 @@ def lastDotplot(opts, args):
warn("done")
if not alignments: raise Exception("there are no alignments")
+ textRot1 = "vertical".startswith(opts.rot1)
i1 = getSeqInfo(opts.sort1, seqNames1, seqLimits1,
- font, opts.fontsize, image_mode, opts.lengths1)
- seqNames1, seqSizes1, seqLabels1, labelSizes1, margin1 = i1
+ font, opts.fontsize, image_mode, opts.lengths1, textRot1)
+ seqNames1, seqSizes1, seqLabels1, labelSizes1, tMargin = i1
+ textRot2 = "horizontal".startswith(opts.rot2)
i2 = getSeqInfo(opts.sort2, seqNames2, seqLimits2,
- font, opts.fontsize, image_mode, opts.lengths2)
- seqNames2, seqSizes2, seqLabels2, labelSizes2, margin2 = i2
+ font, opts.fontsize, image_mode, opts.lengths2, textRot2)
+ seqNames2, seqSizes2, seqLabels2, labelSizes2, lMargin = i2
warn("choosing bp per pixel...")
- pix_limit1 = opts.width - margin1
- pix_limit2 = opts.height - margin2
+ pix_limit1 = opts.width - lMargin
+ pix_limit2 = opts.height - tMargin
bpPerPix1 = get_bp_per_pix(seqSizes1, opts.border_pixels, pix_limit1)
bpPerPix2 = get_bp_per_pix(seqSizes2, opts.border_pixels, pix_limit2)
bpPerPix = max(bpPerPix1, bpPerPix2)
warn("bp per pixel = " + str(bpPerPix))
seq_pix1, seq_starts1, width = get_pix_info(seqSizes1, bpPerPix,
- opts.border_pixels, margin1)
+ opts.border_pixels, lMargin)
seq_pix2, seq_starts2, height = get_pix_info(seqSizes2, bpPerPix,
- opts.border_pixels, margin2)
+ opts.border_pixels, tMargin)
warn("width: " + str(width))
warn("height: " + str(height))
@@ -506,13 +518,13 @@ def lastDotplot(opts, args):
readRmsk(opts.rmsk1, seqLimits1),
readGenePred(opts, opts.genePred1, seqLimits1),
readGaps(opts, opts.gap1, seqLimits1))
- b1 = bedBoxes(beds1, seqLimits1, origins1, margin2, height, True, bpPerPix)
+ b1 = bedBoxes(beds1, seqLimits1, origins1, tMargin, height, True, bpPerPix)
beds2 = itertools.chain(readBed(opts.bed2, seqLimits2),
readRmsk(opts.rmsk2, seqLimits2),
readGenePred(opts, opts.genePred2, seqLimits2),
readGaps(opts, opts.gap2, seqLimits2))
- b2 = bedBoxes(beds2, seqLimits2, origins2, margin1, width, False, bpPerPix)
+ b2 = bedBoxes(beds2, seqLimits2, origins2, lMargin, width, False, bpPerPix)
boxes = sorted(itertools.chain(b1, b2))
drawAnnotations(im, boxes)
@@ -527,19 +539,22 @@ def lastDotplot(opts, args):
if opts.fontsize != 0:
axis1 = get_axis_image(seqLabels1, labelSizes1, seq_starts1, seq_pix1,
- font, image_mode, opts)
+ textRot1, False, font, image_mode, opts)
+ if textRot1:
+ axis1 = axis1.transpose(Image.ROTATE_90)
axis2 = get_axis_image(seqLabels2, labelSizes2, seq_starts2, seq_pix2,
- font, image_mode, opts)
- axis2 = axis2.transpose(Image.ROTATE_270) # !!! bug hotspot
+ textRot2, textRot2, font, image_mode, opts)
+ if not textRot2:
+ axis2 = axis2.transpose(Image.ROTATE_270)
im.paste(axis1, (0, 0))
im.paste(axis2, (0, 0))
for i in seq_starts1[1:]:
- box = i - opts.border_pixels, margin2, i, height
+ box = i - opts.border_pixels, tMargin, i, height
im.paste(opts.border_color, box)
for i in seq_starts2[1:]:
- box = margin1, i - opts.border_pixels, width, i
+ box = lMargin, i - opts.border_pixels, width, i
im.paste(opts.border_color, box)
im.save(args[1])
@@ -589,6 +604,10 @@ if __name__ == "__main__":
help="TrueType or OpenType font file")
og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
help="TrueType or OpenType font size (default: %default)")
+ og.add_option("--rot1", metavar="ROT", default="h",
+ help="text rotation for the 1st genome (default=%default)")
+ og.add_option("--rot2", metavar="ROT", default="v",
+ help="text rotation for the 2nd genome (default=%default)")
og.add_option("--lengths1", action="store_true",
help="show sequence lengths for the 1st (horizontal) genome")
og.add_option("--lengths2", action="store_true",
diff --git a/scripts/last-postmask b/scripts/last-postmask
index 669be10..b17f376 100755
--- a/scripts/last-postmask
+++ b/scripts/last-postmask
@@ -12,8 +12,16 @@
# Limitations: doesn't (yet) handle sequence quality data,
# frameshifts, or generalized affine gaps.
+import gzip
import itertools, optparse, os, signal, sys
+def myOpen(fileName):
+ if fileName == "-":
+ return sys.stdin
+ if fileName.endswith(".gz"):
+ return gzip.open(fileName)
+ return open(fileName)
+
def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
defaultScore = min(map(min, matrix))
scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)]
@@ -91,15 +99,12 @@ def doOneFile(lines):
if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
def lastPostmask(args):
- if args:
- for i in args:
- if i == "-":
- doOneFile(sys.stdin)
- else:
- with open(i) as f:
- doOneFile(f)
- else:
- doOneFile(sys.stdin)
+ if not args:
+ args = ["-"]
+ for i in args:
+ f = myOpen(i)
+ doOneFile(f)
+ f.close()
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
diff --git a/scripts/last-train b/scripts/last-train
index 7fe5947..4b360c7 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -1,11 +1,14 @@
#! /usr/bin/env python
# Copyright 2015 Martin C. Frith
+import gzip
import math, optparse, os, random, signal, subprocess, sys, tempfile
def myOpen(fileName): # faster than fileinput
if fileName == "-":
return sys.stdin
+ if fileName.endswith(".gz"):
+ return gzip.open(fileName)
return open(fileName)
def randomSample(things, sampleSize):
@@ -27,29 +30,30 @@ def seqInput(fileNames):
if not fileNames:
fileNames = ["-"]
for name in fileNames:
- with myOpen(name) as f:
- seqType = 0
- for line in f:
- if seqType == 0:
- if line[0] == ">":
- seqType = 1
- seq = []
- elif line[0] == "@":
- seqType = 2
- lineType = 1
- elif seqType == 1: # fasta
- if line[0] == ">":
- yield "".join(seq), ""
- seq = []
- else:
- seq.append(line.rstrip())
- elif seqType == 2: # fastq
- if lineType == 1:
- seq = line.rstrip()
- elif lineType == 3:
- yield seq, line.rstrip()
- lineType = (lineType + 1) % 4
- if seqType == 1: yield "".join(seq), ""
+ f = myOpen(name)
+ seqType = 0
+ for line in f:
+ if seqType == 0:
+ if line[0] == ">":
+ seqType = 1
+ seq = []
+ elif line[0] == "@":
+ seqType = 2
+ lineType = 1
+ elif seqType == 1: # fasta
+ if line[0] == ">":
+ yield "".join(seq), ""
+ seq = []
+ else:
+ seq.append(line.rstrip())
+ elif seqType == 2: # fastq
+ if lineType == 1:
+ seq = line.rstrip()
+ elif lineType == 3:
+ yield seq, line.rstrip()
+ lineType = (lineType + 1) % 4
+ if seqType == 1: yield "".join(seq), ""
+ f.close()
def isGoodChunk(chunk):
for i in chunk:
@@ -112,6 +116,13 @@ def getSeqSample(opts, queryFiles, outfile):
x = y
writeSegment(outfile, x)
+def versionFromHeader(lines):
+ for line in lines:
+ words = line.split()
+ if "version" in words:
+ return words[-1]
+ raise Exception("couldn't read the version")
+
def scaleFromHeader(lines):
for line in lines:
for i in line.split():
@@ -329,8 +340,16 @@ def tryToMakeChildProgramsFindable():
# put srcDir first, to avoid getting older versions of LAST:
os.environ["PATH"] = srcDir + os.pathsep + os.environ["PATH"]
-def fixedLastalArgs(opts):
- x = ["lastal", "-j7"]
+def readLastalProgName(lastdbIndexName):
+ bitsPerInt = "32"
+ with open(lastdbIndexName + ".prj") as f:
+ for line in f:
+ if line.startswith("integersize="):
+ bitsPerInt = line.split("=")[1].strip()
+ return "lastal8" if bitsPerInt == "64" else "lastal"
+
+def fixedLastalArgs(opts, lastalProgName):
+ x = [lastalProgName, "-j7"]
if opts.D: x.append("-D" + opts.D)
if opts.E: x.append("-E" + opts.E)
if opts.s: x.append("-s" + opts.s)
@@ -338,14 +357,16 @@ def fixedLastalArgs(opts):
if opts.C: x.append("-C" + opts.C)
if opts.T: x.append("-T" + opts.T)
if opts.m: x.append("-m" + opts.m)
+ if opts.k: x.append("-k" + opts.k)
if opts.P: x.append("-P" + opts.P)
if opts.Q: x.append("-Q" + opts.Q)
return x
def doTraining(opts, args):
tryToMakeChildProgramsFindable()
+ lastalProgName = readLastalProgName(args[0])
scaleIncrease = 20 # while training, up-scale the scores by this amount
- x = fixedLastalArgs(opts)
+ x = fixedLastalArgs(opts, lastalProgName)
if opts.r: x.append("-r" + opts.r)
if opts.q: x.append("-q" + opts.q)
if opts.p: x.append("-p" + opts.p)
@@ -357,6 +378,7 @@ def doTraining(opts, args):
y = ["last-split", "-n"]
p = subprocess.Popen(x, stdout=subprocess.PIPE)
q = subprocess.Popen(y, stdin=p.stdout, stdout=subprocess.PIPE)
+ lastalVersion = versionFromHeader(q.stdout)
externalScale = scaleFromHeader(q.stdout)
internalScale = externalScale * scaleIncrease
if opts.Q:
@@ -364,6 +386,7 @@ def doTraining(opts, args):
internalMatrix = scaledMatrix(externalMatrix, scaleIncrease)
oldParameters = []
+ print "# lastal version:", lastalVersion
print "# maximum percent identity:", opts.pid
print "# scale of score parameters:", externalScale
print "# scale used while training:", internalScale
@@ -390,7 +413,7 @@ def doTraining(opts, args):
if parameters in oldParameters: break
oldParameters.append(parameters)
internalMatrix = matrixWithLetters(matScores)
- x = fixedLastalArgs(opts)
+ x = fixedLastalArgs(opts, lastalProgName)
x.append("-p-")
x += args
p = subprocess.Popen(x, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
@@ -472,6 +495,8 @@ if __name__ == "__main__":
help="type of alignment: 0=local, 1=overlap (default: 0)")
og.add_option("-m", metavar="COUNT", help=
"maximum initial matches per query position (default: 10)")
+ og.add_option("-k", metavar="STEP", help="use initial matches starting at "
+ "every STEP-th position in each query (default: 1)")
og.add_option("-P", metavar="THREADS",
help="number of parallel threads")
og.add_option("-Q", metavar="NUMBER",
diff --git a/scripts/maf-convert b/scripts/maf-convert
index 850f3b1..97e78b3 100755
--- a/scripts/maf-convert
+++ b/scripts/maf-convert
@@ -7,8 +7,16 @@
# Genome FAQ, not e.g. "MIRA assembly format".
from itertools import *
+import gzip
import math, optparse, os, signal, sys
+def myOpen(fileName):
+ if fileName == "-":
+ return sys.stdin
+ if fileName.endswith(".gz"):
+ return gzip.open(fileName)
+ return open(fileName)
+
def maxlen(s):
return max(map(len, s))
@@ -161,7 +169,7 @@ def writeAxt(maf):
head, body = ranges[0], ranges[1:]
- outWords = [str(axtCounter.next())]
+ outWords = [str(next(axtCounter))]
outWords.extend(head[:3])
for i in body:
outWords.extend(i)
@@ -210,6 +218,42 @@ def mafConvertToTab(opts, lines):
for maf in mafInput(opts, lines):
writeTab(maf)
+##### Routines for converting to chain format: #####
+
+def writeChain(maf):
+ aLine, sLines, qLines, pLines = maf
+
+ score = "0"
+ for i in aLine.split():
+ if i.startswith("score="):
+ score = i[6:]
+
+ outWords = ["chain", score]
+
+ for seqName, seqLen, strand, letterSize, beg, end, row in sLines:
+ x = seqName, str(seqLen), strand, str(beg), str(end)
+ outWords.extend(x)
+
+ outWords.append(str(next(axtCounter) + 1))
+
+ print " ".join(outWords)
+
+ letterSizes = [i[3] for i in sLines]
+ rows = [i[6] for i in sLines]
+ alignmentColumns = izip(*rows)
+ size = "0"
+ for i in matchAndInsertSizes(alignmentColumns, letterSizes):
+ if ":" in i:
+ print size + "\t" + i.replace(":", "\t")
+ size = "0"
+ else:
+ size = i
+ print size
+
+def mafConvertToChain(opts, lines):
+ for maf in mafInput(opts, lines):
+ writeChain(maf)
+
##### Routines for converting to PSL format: #####
def pslBlocks(opts, mafs, outCounts):
@@ -335,16 +379,17 @@ def readGroupId(readGroupItems):
def readSequenceLengths(fileNames):
"""Read name & length of topmost sequence in each maf block."""
for i in fileNames:
- with open(i) as f:
- fields = None
- for line in f:
- if fields:
- if line.isspace():
- fields = None
- else:
- if line[0] == "s":
- fields = line.split()
- yield fields[1], fields[5]
+ f = myOpen(i)
+ fields = None
+ for line in f:
+ if fields:
+ if line.isspace():
+ fields = None
+ else:
+ if line[0] == "s":
+ fields = line.split()
+ yield fields[1], fields[5]
+ f.close()
def naturalSortKey(s):
"""Return a key that sorts strings in "natural" order."""
@@ -372,11 +417,9 @@ def writeSamHeader(opts, fileNames):
print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
if opts.dictfile:
- if opts.dictfile == "-":
- copyDictFile(sys.stdin)
- else:
- with open(opts.dictfile) as f:
- copyDictFile(f)
+ f = myOpen(opts.dictfile)
+ copyDictFile(f)
+ f.close()
if opts.readgroup:
print "@RG\t" + "\t".join(opts.readgroup.split())
@@ -760,6 +803,8 @@ def mafConvertOneFile(opts, formatName, lines):
mafConvertToBlast(opts, lines)
elif isFormat(formatName, "blasttab"):
mafConvertToBlastTab(opts, lines)
+ elif isFormat(formatName, "chain"):
+ mafConvertToChain(opts, lines)
elif isFormat(formatName, "html"):
mafConvertToHtml(opts, lines)
elif isFormat(formatName, "psl"):
@@ -787,15 +832,12 @@ def mafConvert(opts, args):
if isFormat(formatName, "tabular"):
opts.isKeepComments = True
- if fileNames:
- for i in fileNames:
- if i == "-":
- mafConvertOneFile(opts, formatName, sys.stdin)
- else:
- with open(i) as f:
- mafConvertOneFile(opts, formatName, f)
- else:
- mafConvertOneFile(opts, formatName, sys.stdin)
+ if not fileNames:
+ fileNames = ["-"]
+ for i in fileNames:
+ f = myOpen(i)
+ mafConvertOneFile(opts, formatName, f)
+ f.close()
if not opts.noheader:
if isFormat(formatName, "html"):
@@ -809,6 +851,7 @@ if __name__ == "__main__":
%prog axt mafFile(s)
%prog blast mafFile(s)
%prog blasttab mafFile(s)
+ %prog chain mafFile(s)
%prog html mafFile(s)
%prog psl mafFile(s)
%prog sam mafFile(s)
diff --git a/src/Alphabet.cc b/src/Alphabet.cc
index 961b1f6..d0d2d2b 100644
--- a/src/Alphabet.cc
+++ b/src/Alphabet.cc
@@ -51,14 +51,6 @@ char* Alphabet::rtCopy( const uchar* beg, const uchar* end, char* dest ) const{
return dest;
}
-void Alphabet::rc( uchar* beg, uchar* end ) const{
- std::reverse( beg, end );
-
- for( /* noop */; beg < end; ++beg ){
- *beg = complement[ *beg ];
- }
-}
-
void Alphabet::init(){
for( std::string::iterator i = letters.begin(); i < letters.end(); ++i )
*i = std::toupper( *i );
diff --git a/src/Alphabet.hh b/src/Alphabet.hh
index 0ec118c..540c884 100644
--- a/src/Alphabet.hh
+++ b/src/Alphabet.hh
@@ -42,9 +42,6 @@ struct Alphabet{
// return the position after the last written position in dest
char* rtCopy( const uchar* beg, const uchar* end, char* dest ) const;
- // reverse and complement a sequence of numbers, in place
- void rc( uchar* beg, uchar* end ) const;
-
std::string letters; // the "proper" letters, e.g. ACGT for DNA
unsigned size; // same as letters.size(): excludes delimiters
uchar encode[capacity]; // translate ASCII letters to codes (small integers)
diff --git a/src/LastEvaluer.cc b/src/LastEvaluer.cc
index b1951ca..72b1dae 100644
--- a/src/LastEvaluer.cc
+++ b/src/LastEvaluer.cc
@@ -367,7 +367,7 @@ void LastEvaluer::init(const char *matrixName,
0, maxMegabytes, randomSeed, t);
break;
} catch (const Sls::error& e) {
- if (i == 10) throw;
+ if (i == 20) throw;
}
}
} else {
diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc
index 8d389ff..995f2e1 100644
--- a/src/LastalArguments.cc
+++ b/src/LastalArguments.cc
@@ -41,6 +41,18 @@ static char parseOutputFormat( const char* text ){
return 0;
}
+static int parseScoreOrPercent(const std::string &s) {
+ int x;
+ std::istringstream iss(s);
+ if(!(iss >> x) || x < 0) ERR("bad value: " + s);
+ char c;
+ if (iss >> c) {
+ if (c != '%' || iss >> c) ERR("bad value: " + s);
+ x = -x; // negative value indicates percentage (kludge)
+ }
+ return x;
+}
+
namespace cbrc{
LastalArguments::LastalArguments() :
@@ -66,7 +78,7 @@ LastalArguments::LastalArguments() :
gapPairCost(-1), // this means: OFF
frameshiftCost(-1), // this means: ordinary, non-translated alignment
matrixFile(""),
- maxDropGapped(-1), // depends on minScoreGapped & maxDropGapless
+ maxDropGapped(-100), // depends on minScoreGapped & maxDropGapless
maxDropGapless(-1), // depends on the score matrix
maxDropFinal(-1), // depends on maxDropGapped
inputFormat(sequenceFormat::fasta),
@@ -116,8 +128,8 @@ Score options (default settings):\n\
-B: insertion extension cost (b)\n\
-c: unaligned residue pair cost (off)\n\
-F: frameshift cost (off)\n\
--x: maximum score drop for gapped alignments (max[y, e-1])\n\
--y: maximum score drop for gapless alignments (t*10)\n\
+-x: maximum score drop for gapped alignments (e-1)\n\
+-y: maximum score drop for gapless alignments (min[t*10, x])\n\
-z: maximum score drop for final gapped alignments (x)\n\
-d: minimum score for gapless alignments (min[e, t*ln(1000*refSize/n)])\n\
-e: minimum score for gapped alignments\n\
@@ -222,8 +234,7 @@ LAST home page: http://last.cbrc.jp/\n\
if( frameshiftCost < 0 ) badopt( c, optarg );
break;
case 'x':
- unstringify( maxDropGapped, optarg );
- if( maxDropGapped < 0 ) badopt( c, optarg );
+ maxDropGapped = parseScoreOrPercent(optarg);
break;
case 'y':
unstringify( maxDropGapless, optarg );
@@ -525,13 +536,17 @@ To proceed without E-values, set a score threshold with option -e.");
if( temperature < 0 ) temperature = 1 / lambda;
+ if( maxDropGapped < 0 ){
+ int percent = -maxDropGapped;
+ maxDropGapped = std::max( minScoreGapped - 1, 0 );
+ maxDropGapped = std::max( maxDropGapped, maxDropGapless );
+ if (percent != 100) maxDropGapped = maxDropGapped * percent / 100;
+ }
+
if( maxDropGapless < 0 ){ // should it depend on temperature or lambda?
if( temperature < 0 ) maxDropGapless = 0; // shouldn't happen
else maxDropGapless = int( 10.0 * temperature + 0.5 );
- }
-
- if( maxDropGapped < 0 ){
- maxDropGapped = std::max( minScoreGapped - 1, maxDropGapless );
+ maxDropGapless = std::min( maxDropGapless, maxDropGapped );
}
if( maxDropFinal < 0 ) maxDropFinal = maxDropGapped;
diff --git a/src/MultiSequence.cc b/src/MultiSequence.cc
index f9b0077..1f1d05a 100644
--- a/src/MultiSequence.cc
+++ b/src/MultiSequence.cc
@@ -58,13 +58,6 @@ void MultiSequence::toFiles( const std::string& baseName ) const{
baseName + ".qua" );
}
-void MultiSequence::addName( std::string& name ){
- names.v.insert( names.v.end(), name.begin(), name.end() );
- nameEnds.v.push_back( names.v.size() );
- if( nameEnds.v.back() < names.v.size() )
- throw std::runtime_error("the sequence names are too long");
-}
-
std::istream& MultiSequence::readFastaName( std::istream& stream ){
std::string line, word;
getline( stream, line );
diff --git a/src/MultiSequence.hh b/src/MultiSequence.hh
index df388dc..3876058 100644
--- a/src/MultiSequence.hh
+++ b/src/MultiSequence.hh
@@ -17,10 +17,11 @@
namespace cbrc{
+typedef unsigned char uchar;
+
class MultiSequence{
public:
typedef LAST_INT_TYPE indexT;
- typedef unsigned char uchar;
// initialize with leftmost delimiter pad, ready for appending sequences
void initForAppending( indexT padSizeIn );
@@ -130,13 +131,67 @@ class MultiSequence{
// read the letters above PSSM columns, so we know which column is which
std::istream& readPssmHeader( std::istream& stream );
- // add a new sequence name
- void addName( std::string& name );
+ void finishName() { // finish adding a sequence name: store its end coord
+ nameEnds.v.push_back(names.v.size());
+ if (nameEnds.v.back() < names.v.size()) {
+ throw std::runtime_error("the sequence names are too long");
+ }
+ }
+
+ void addName(const std::string &name) { // add a new sequence name
+ names.v.insert(names.v.end(), name.begin(), name.end());
+ finishName();
+ }
+
+ void finishQual() { // add delimiter to the end of the quality scores
+ uchar padQualityScore = 64; // should never be used, but a valid value
+ size_t s = padSize * qualityScoresPerLetter;
+ qualityScores.v.insert(qualityScores.v.end(), s, padQualityScore);
+ }
+
+ void finishPssm() { // add delimiter to the end of the PSSM
+ pssm.insert(pssm.end(), padSize * scoreMatrixRowSize, -INF);
+ }
// can we finish the last sequence and stay within the memory limit?
bool isFinishable( indexT maxSeqLen ) const;
};
+inline void reverseComplementPssm(ScoreMatrixRow *beg, ScoreMatrixRow *end,
+ const uchar *complement) {
+ while (beg < end) {
+ --end;
+ for (unsigned i = 0; i < scoreMatrixRowSize; ++i) {
+ unsigned j = complement[i];
+ if (beg < end || i < j) std::swap((*beg)[i], (*end)[j]);
+ }
+ ++beg;
+ }
+}
+
+inline void reverseComplementOneSequence(MultiSequence &m,
+ const uchar *complement,
+ size_t seqNum) {
+ size_t b = m.seqBeg(seqNum);
+ size_t e = m.seqEnd(seqNum);
+
+ uchar *s = m.seqWriter();
+ std::reverse(s + b, s + e);
+ for (size_t i = b; i < e; ++i) {
+ s[i] = complement[s[i]];
+ }
+
+ size_t qpl = m.qualsPerLetter();
+ if (qpl) {
+ std::reverse(m.qualityWriter() + b * qpl, m.qualityWriter() + e * qpl);
+ }
+
+ ScoreMatrixRow *p = m.pssmWriter();
+ if (p) {
+ reverseComplementPssm(p + b, p + e, complement);
+ }
+}
+
// Divide the sequences into a given number of roughly-equally-sized
// chunks, and return the first sequence in the Nth chunk.
inline size_t firstSequenceInChunk(const MultiSequence &m,
@@ -152,5 +207,6 @@ inline size_t firstSequenceInChunk(const MultiSequence &m,
return (begDistance < endDistance) ? seqNum : seqNum + 1;
}
-} // end namespace cbrc
-#endif // MULTISEQUENCE_HH
+}
+
+#endif
diff --git a/src/MultiSequenceQual.cc b/src/MultiSequenceQual.cc
index 8e33f84..415fa14 100644
--- a/src/MultiSequenceQual.cc
+++ b/src/MultiSequenceQual.cc
@@ -15,12 +15,9 @@ using namespace cbrc;
std::istream&
MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
- const uchar padQualityScore = 64; // should never be used, but a valid value
-
// initForAppending:
qualityScoresPerLetter = 1;
- if( qualityScores.v.empty() )
- qualityScores.v.insert( qualityScores.v.end(), padSize, padQualityScore );
+ if( qualityScores.v.empty() ) finishQual();
// reinitForAppending:
if( qualityScores.v.size() > seq.v.size() )
@@ -51,7 +48,7 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
if( isFinishable(maxSeqLen) ){
finish();
- qualityScores.v.insert( qualityScores.v.end(), padSize, padQualityScore );
+ finishQual();
}
return stream;
@@ -60,15 +57,11 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){
std::istream&
MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen,
unsigned alphSize, const uchar decode[] ){
- const uchar padQualityScore = 64; // should never be used, but a valid value
- size_t qualPadSize = padSize * alphSize;
size_t qualSize = seq.v.size() * alphSize;
// initForAppending:
qualityScoresPerLetter = alphSize;
- if( qualityScores.v.empty() )
- qualityScores.v.insert( qualityScores.v.end(), qualPadSize,
- padQualityScore );
+ if( qualityScores.v.empty() ) finishQual();
// reinitForAppending:
if( qualityScores.v.size() > qualSize )
@@ -104,8 +97,7 @@ MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen,
if( isFinishable(maxSeqLen) ){
finish();
- qualityScores.v.insert( qualityScores.v.end(), qualPadSize,
- padQualityScore );
+ finishQual();
}
return stream;
@@ -148,12 +140,10 @@ std::istream&
MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen,
const uchar* lettersToNumbers,
bool isMaskLowercase ){
- size_t pssmPadSize = padSize * scoreMatrixRowSize;
size_t pssmSize = seq.v.size() * scoreMatrixRowSize;
// initForAppending:
- if( pssm.empty() )
- pssm.insert( pssm.end(), pssmPadSize, -INF );
+ if( pssm.empty() ) finishPssm();
// reinitForAppending:
if( pssm.size() > pssmSize )
@@ -200,7 +190,7 @@ MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen,
if( isFinishable(maxSeqLen) ){
finish();
- pssm.insert( pssm.end(), pssmPadSize, -INF );
+ finishPssm();
}
if( !stream.bad() ) stream.clear();
diff --git a/src/io.cc b/src/io.cc
index 0c8d3d3..f1abcce 100644
--- a/src/io.cc
+++ b/src/io.cc
@@ -6,22 +6,15 @@
namespace cbrc{
-std::istream& openIn( const std::string& fileName, std::ifstream& ifs ){
+std::istream& openIn( const std::string& fileName, mcf::izstream& ifs ){
if( fileName == "-" ) return std::cin;
ifs.open( fileName.c_str() );
if( !ifs ) throw std::runtime_error("can't open file: " + fileName);
return ifs;
}
-std::ostream& openOut( const std::string& fileName, std::ofstream& ofs ){
- if( fileName == "-" ) return std::cout;
- ofs.open( fileName.c_str() );
- if( !ofs ) throw std::runtime_error("can't open file: " + fileName);
- return ofs;
-}
-
std::string slurp( const std::string& fileName ){
- std::ifstream inFileStream;
+ mcf::izstream inFileStream;
std::istream& in = openIn( fileName, inFileStream );
std::istreambuf_iterator<char> beg(in), end;
return std::string( beg, end );
diff --git a/src/io.hh b/src/io.hh
index 9162898..c08cd8c 100644
--- a/src/io.hh
+++ b/src/io.hh
@@ -6,6 +6,8 @@
#ifndef IO_H
#define IO_H
+#include "mcf_zstream.hh"
+
#include <vector>
#include <string>
#include <stdexcept>
@@ -15,10 +17,7 @@
namespace cbrc{
// open an input file, but if the name is "-", just return cin
-std::istream& openIn( const std::string& fileName, std::ifstream& ifs );
-
-// open an output file, but if the name is "-", just return cout
-std::ostream& openOut( const std::string& fileName, std::ofstream& ofs );
+std::istream& openIn( const std::string& fileName, mcf::izstream& ifs );
// read a file into a string, but if the name is "-", read cin
std::string slurp( const std::string& fileName );
diff --git a/src/last-pair-probs.cc b/src/last-pair-probs.cc
index 5d4436d..21adc33 100644
--- a/src/last-pair-probs.cc
+++ b/src/last-pair-probs.cc
@@ -616,7 +616,7 @@ void lastPairProbs(LastPairProbsOptions& opts) {
size_t n = inputs.size();
if (!opts.isFraglen || !opts.isSdev) {
- std::ifstream inFile1, inFile2;
+ mcf::izstream inFile1, inFile2;
std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin;
std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1;
if (n < 2) skipOneBatchMarker(in1);
@@ -626,7 +626,7 @@ void lastPairProbs(LastPairProbsOptions& opts) {
}
if (!opts.estdist) {
- std::ifstream inFile1, inFile2;
+ mcf::izstream inFile1, inFile2;
std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin;
std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1;
AlignmentParameters params1 = readHeaderOrDie(in1);
diff --git a/src/lastal.cc b/src/lastal.cc
index 0e90c61..47bff6e 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -950,42 +950,16 @@ void translateAndScan( LastAligner& aligner, size_t queryNum, char strand ){
cullFinalAlignments( aligner.textAlns, oldNumOfAlns );
}
-static void reverseComplementPssm( size_t queryNum ){
- ScoreMatrixRow* beg = query.pssmWriter() + query.seqBeg(queryNum);
- ScoreMatrixRow* end = query.pssmWriter() + query.seqEnd(queryNum);
-
- while( beg < end ){
- --end;
- for( unsigned i = 0; i < scoreMatrixRowSize; ++i ){
- unsigned j = queryAlph.complement[i];
- if( beg < end || i < j ) std::swap( (*beg)[i], (*end)[j] );
- }
- ++beg;
- }
-}
-
-static void reverseComplementQuery( size_t queryNum ){
- size_t b = query.seqBeg(queryNum);
- size_t e = query.seqEnd(queryNum);
- queryAlph.rc( query.seqWriter() + b, query.seqWriter() + e );
- if( isQuality( args.inputFormat ) ){
- std::reverse( query.qualityWriter() + b * query.qualsPerLetter(),
- query.qualityWriter() + e * query.qualsPerLetter() );
- }else if( args.inputFormat == sequenceFormat::pssm ){
- reverseComplementPssm(queryNum);
- }
-}
-
static void alignOneQuery(LastAligner &aligner,
size_t queryNum, bool isFirstVolume) {
if (args.strand == 2 && !isFirstVolume)
- reverseComplementQuery(queryNum);
+ reverseComplementOneSequence(query, queryAlph.complement, queryNum);
if (args.strand != 0)
translateAndScan(aligner, queryNum, '+');
if (args.strand == 2 || (args.strand == 0 && isFirstVolume))
- reverseComplementQuery(queryNum);
+ reverseComplementOneSequence(query, queryAlph.complement, queryNum);
if (args.strand != 1)
translateAndScan(aligner, queryNum, '-');
@@ -1251,7 +1225,7 @@ void lastal( int argc, char** argv ){
char** inputBegin = argv + args.inputStart;
for( char** i = *inputBegin ? inputBegin : defaultInput; *i; ++i ){
- std::ifstream inFileStream;
+ mcf::izstream inFileStream;
std::istream& in = openIn( *i, inFileStream );
LOG( "reading " << *i << "..." );
diff --git a/src/lastdb.cc b/src/lastdb.cc
index 085d9d3..daea418 100644
--- a/src/lastdb.cc
+++ b/src/lastdb.cc
@@ -180,12 +180,21 @@ static void preprocessSeqs(MultiSequence &multi,
// Make one database volume, from one batch of sequences
void makeVolume( std::vector< CyclicSubsetSeed >& seeds,
MultiSequence& multi, const LastdbArguments& args,
- const Alphabet& alph, const std::vector<countT>& letterCounts,
+ const Alphabet& alph, std::vector<countT>& letterCounts,
const TantanMasker& masker, unsigned numOfThreads,
const std::string& seedText, const std::string& baseName ){
size_t numOfIndexes = seeds.size();
size_t numOfSequences = multi.finishedSequences();
size_t textLength = multi.finishedSize();
+ const uchar* seq = multi.seqReader();
+
+ std::vector<countT> letterCountsInThisVolume(alph.size);
+ alph.count(seq, seq + textLength, &letterCountsInThisVolume[0]);
+ for (unsigned c = 0; c < alph.size; ++c) {
+ letterCounts[c] += letterCountsInThisVolume[c];
+ }
+
+ if (args.isCountsOnly) return;
if( args.tantanSetting ){
LOG( "masking..." );
@@ -194,9 +203,8 @@ void makeVolume( std::vector< CyclicSubsetSeed >& seeds,
LOG( "writing..." );
writePrjFile( baseName + ".prj", args, alph, numOfSequences,
- letterCounts, -1, numOfIndexes, seedText );
+ letterCountsInThisVolume, -1, numOfIndexes, seedText );
multi.toFiles( baseName );
- const uchar* seq = multi.seqReader();
for( unsigned x = 0; x < numOfIndexes; ++x ){
SubsetSuffixArray myIndex;
@@ -244,12 +252,11 @@ static indexT maxLettersPerVolume( const LastdbArguments& args,
return z;
}
-// Read the next sequence, adding it to the MultiSequence and the SuffixArray
+// Read the next sequence, adding it to the MultiSequence
std::istream&
-appendFromFasta( MultiSequence& multi, unsigned numOfIndexes,
+appendFromFasta( MultiSequence& multi, indexT maxSeqLen,
const LastdbArguments& args, const Alphabet& alph,
std::istream& in ){
- indexT maxSeqLen = maxLettersPerVolume( args, numOfIndexes );
if( multi.finishedSequences() == 0 ) maxSeqLen = indexT(-1);
size_t oldSize = multi.unfinishedSize();
@@ -303,18 +310,18 @@ void lastdb( int argc, char** argv ){
unsigned volumeNumber = 0;
countT sequenceCount = 0;
std::vector<countT> letterCounts( alph.size );
- std::vector<countT> letterTotals( alph.size );
+ indexT maxSeqLen = maxLettersPerVolume( args, seeds.size() );
char defaultInputName[] = "-";
char* defaultInput[] = { defaultInputName, 0 };
char** inputBegin = argv + args.inputStart;
for( char** i = *inputBegin ? inputBegin : defaultInput; *i; ++i ){
- std::ifstream inFileStream;
+ mcf::izstream inFileStream;
std::istream& in = openIn( *i, inFileStream );
LOG( "reading " << *i << "..." );
- while( appendFromFasta( multi, seeds.size(), args, alph, in ) ){
+ while( appendFromFasta( multi, maxSeqLen, args, alph, in ) ){
if( !args.isProtein && args.userAlphabet.empty() &&
sequenceCount == 0 && isDubiousDna( alph, multi ) ){
std::cerr << args.programName << ": that's some funny-lookin DNA\n";
@@ -322,30 +329,18 @@ void lastdb( int argc, char** argv ){
if( multi.isFinished() ){
++sequenceCount;
- const uchar* seq = multi.seqReader();
- size_t lastSeq = multi.finishedSequences() - 1;
- size_t beg = multi.seqBeg( lastSeq );
- size_t end = multi.seqEnd( lastSeq );
- alph.count( seq + beg, seq + end, &letterCounts[0] );
- if( args.isCountsOnly ){
- // memory-saving, which seems to be important on 32-bit systems:
- multi.reinitForAppending();
- }
}
else{
std::string baseName = args.lastdbName + stringify(volumeNumber++);
makeVolume( seeds, multi, args, alph, letterCounts,
tantanMasker, numOfThreads, seedText, baseName );
- for( unsigned c = 0; c < alph.size; ++c )
- letterTotals[c] += letterCounts[c];
- letterCounts.assign( alph.size, 0 );
multi.reinitForAppending();
}
}
}
if( multi.finishedSequences() > 0 ){
- if( volumeNumber == 0 ){
+ if( volumeNumber == 0 && !args.isCountsOnly ){
makeVolume( seeds, multi, args, alph, letterCounts,
tantanMasker, numOfThreads, seedText, args.lastdbName );
return;
@@ -355,10 +350,8 @@ void lastdb( int argc, char** argv ){
tantanMasker, numOfThreads, seedText, baseName );
}
- for( unsigned c = 0; c < alph.size; ++c ) letterTotals[c] += letterCounts[c];
-
writePrjFile( args.lastdbName + ".prj", args, alph, sequenceCount,
- letterTotals, volumeNumber, seeds.size(), seedText );
+ letterCounts, volumeNumber, seeds.size(), seedText );
}
int main( int argc, char** argv )
diff --git a/src/makefile b/src/makefile
index df3581a..de49694 100644
--- a/src/makefile
+++ b/src/makefile
@@ -1,6 +1,3 @@
-CXX = g++
-CC = gcc
-
CXXFLAGS = -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef \
-Wcast-align -pedantic -g -std=c++11 -pthread -DHAS_CXX_THREADS
# -Wconversion
@@ -59,19 +56,19 @@ all: $(ALL)
indexAllObj4 = $(indexObj0) $(indexObj4)
lastdb: $(indexAllObj4)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) -lz
indexAllObj8 = $(indexObj0) $(indexObj8)
lastdb8: $(indexAllObj8)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) -lz
alignAllObj4 = $(alignObj0) $(alignObj4)
lastal: $(alignAllObj4)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) -lz
alignAllObj8 = $(alignObj0) $(alignObj8)
lastal8: $(alignAllObj8)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) -lz
splitAllObj4 = $(splitObj0) $(splitObj4)
last-split: $(splitAllObj4)
@@ -82,7 +79,7 @@ last-split8: $(splitAllObj8)
$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8)
last-pair-probs: $(PPOBJ)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) -lz
last-merge-batches: $(MBOBJ)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ)
@@ -151,7 +148,7 @@ Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
ScoreMatrixRow.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \
OneQualityScoreMatrix.hh GappedXdropAlignerInl.hh
CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \
- CyclicSubsetSeedData.hh io.hh stringify.hh
+ CyclicSubsetSeedData.hh io.hh mcf_zstream.hh stringify.hh
DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh
GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \
ScoreMatrixRow.hh GappedXdropAlignerInl.hh
@@ -179,7 +176,7 @@ LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \
LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \
SequenceFormat.hh stringify.hh getoptUtil.hh version.hh
MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \
- VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh
+ VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh mcf_zstream.hh
MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \
ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
@@ -187,14 +184,15 @@ OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
stringify.hh
QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \
ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
-ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh
+ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh \
+ mcf_zstream.hh
SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh
SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh
SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \
SubsetMinimizerFinder.hh CyclicSubsetSeed.hh
SubsetSuffixArray.o SubsetSuffixArray.o8: SubsetSuffixArray.cc SubsetSuffixArray.hh \
CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
- SubsetMinimizerFinder.hh io.hh
+ SubsetMinimizerFinder.hh io.hh mcf_zstream.hh
SubsetSuffixArraySearch.o SubsetSuffixArraySearch.o8: SubsetSuffixArraySearch.cc \
SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
fileMap.hh stringify.hh
@@ -211,11 +209,11 @@ gaplessPssmXdrop.o gaplessPssmXdrop.o8: gaplessPssmXdrop.cc gaplessPssmXdrop.hh
gaplessTwoQualityXdrop.o gaplessTwoQualityXdrop.o8: gaplessTwoQualityXdrop.cc \
gaplessTwoQualityXdrop.hh TwoQualityScoreMatrix.hh ScoreMatrixRow.hh
gaplessXdrop.o gaplessXdrop.o8: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh
-io.o io.o8: io.cc io.hh
+io.o io.o8: io.cc io.hh mcf_zstream.hh
last-pair-probs-main.o last-pair-probs-main.o8: last-pair-probs-main.cc last-pair-probs.hh \
stringify.hh version.hh
last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh io.hh \
- stringify.hh
+ mcf_zstream.hh stringify.hh
lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \
QualityPssmMaker.hh ScoreMatrixRow.hh OneQualityScoreMatrix.hh \
TwoQualityScoreMatrix.hh qualityScoreUtil.hh stringify.hh \
@@ -228,12 +226,12 @@ lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \
SegmentPairPot.hh ScoreMatrix.hh Alphabet.hh MultiSequence.hh \
TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \
gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh io.hh \
- threadUtil.hh version.hh
+ mcf_zstream.hh threadUtil.hh version.hh
lastdb.o lastdb.o8: lastdb.cc LastdbArguments.hh SequenceFormat.hh \
SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
fileMap.hh stringify.hh Alphabet.hh MultiSequence.hh ScoreMatrixRow.hh \
- TantanMasker.hh tantan.hh io.hh qualityScoreUtil.hh threadUtil.hh \
- version.hh
+ TantanMasker.hh tantan.hh io.hh mcf_zstream.hh qualityScoreUtil.hh \
+ threadUtil.hh version.hh
tantan.o tantan.o8: tantan.cc tantan.hh
last-merge-batches.o: last-merge-batches.c version.hh
alp/njn_dynprogprob.o: alp/njn_dynprogprob.cpp alp/njn_dynprogprob.hpp \
diff --git a/src/mcf_zstream.hh b/src/mcf_zstream.hh
new file mode 100644
index 0000000..fd527c5
--- /dev/null
+++ b/src/mcf_zstream.hh
@@ -0,0 +1,82 @@
+// Copyright 2017 Martin C. Frith
+
+// mcf::izstream is similar to std::ifstream. The difference is, if
+// you give it a gzip-compressed file, it will decompress what it
+// reads.
+
+#ifndef MCF_ZSTREAM
+#define MCF_ZSTREAM
+
+#include <zlib.h>
+
+#include <cstdio> // BUFSIZ
+#include <istream>
+#include <streambuf>
+
+namespace mcf {
+
+class zbuf : public std::streambuf {
+public:
+ zbuf() : input(0) {}
+
+ ~zbuf() { close(); }
+
+ bool is_open() const { return input; }
+
+ zbuf *open(const char *fileName) {
+ if (is_open()) return 0;
+ input = gzopen(fileName, "rb");
+ if (!is_open()) return 0;
+ return this;
+ }
+
+ zbuf *close() {
+ if (!is_open()) return 0;
+ int e = gzclose(input);
+ input = 0;
+ return (e == Z_OK || e == Z_BUF_ERROR) ? this : 0;
+ }
+
+protected:
+ int underflow() {
+ if (gptr() == egptr()) {
+ int size = gzread(input, buffer, BUFSIZ);
+ if (size < 0) throw std::ios_base::failure("gzread error");
+ setg(buffer, buffer, buffer + size);
+ }
+ return (gptr() == egptr()) ?
+ traits_type::eof() : traits_type::to_int_type(*gptr());
+ }
+
+private:
+ gzFile input;
+ char buffer[BUFSIZ];
+};
+
+class izstream : public std::istream {
+public:
+ izstream() : std::istream(&buf) {}
+
+ izstream(const char *fileName) : std::istream(&buf) {
+ open(fileName);
+ }
+
+ bool is_open() const { return buf.is_open(); }
+
+ void open(const char *fileName) {
+ // do something special if fileName is "-"?
+ if (!buf.open(fileName)) setstate(failbit);
+ else clear();
+ }
+
+ void close() {
+ if (!buf.close()) setstate(failbit);
+ }
+
+private:
+ zbuf buf;
+};
+
+}
+
+#endif
diff --git a/src/version.hh b/src/version.hh
index 7104b42..0fe2c52 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"869"
+"885"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list