[med-svn] [last-align] 01/03: New upstream version 828
Andreas Tille
tille at debian.org
Mon Jan 9 14:36:00 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository last-align.
commit 28865eb7cc3f830147d0da6bebead15e97e99497
Author: Andreas Tille <tille at debian.org>
Date: Mon Jan 9 14:16:08 2017 +0100
New upstream version 828
---
ChangeLog.txt | 109 +++++++++-
doc/last-split.html | 28 ++-
doc/last-split.txt | 25 ++-
doc/last-tuning.html | 11 +
doc/last-tuning.txt | 13 ++
doc/lastal.html | 5 +
doc/lastal.txt | 7 +
doc/lastdb.html | 42 +++-
doc/lastdb.txt | 41 +++-
makefile | 7 +-
src/Centroid.cc | 438 +++++++++++++++++++++++----------------
src/Centroid.hh | 31 +--
src/DiagonalTable.hh | 9 +-
src/GappedXdropAligner.cc | 21 +-
src/GappedXdropAligner2qual.cc | 21 +-
src/GappedXdropAlignerInl.hh | 7 +
src/GappedXdropAlignerPssm.cc | 21 +-
src/GeneralizedAffineGapCosts.cc | 1 -
src/GeneralizedAffineGapCosts.hh | 7 +-
src/LastalArguments.cc | 2 +-
src/LastalArguments.hh | 20 +-
src/LastdbArguments.cc | 2 +-
src/LastdbArguments.hh | 12 +-
src/MultiSequence.cc | 5 +-
src/MultiSequence.hh | 2 +-
src/SegmentPair.hh | 4 +-
src/SegmentPairPot.cc | 2 +-
src/SegmentPairPot.hh | 2 +-
src/SubsetSuffixArray.cc | 13 +-
src/SubsetSuffixArray.hh | 16 +-
src/SubsetSuffixArraySearch.cc | 18 +-
src/SubsetSuffixArraySort.cc | 9 +-
src/lastal.cc | 34 +--
src/lastdb.cc | 3 +-
src/makefile | 204 ++++++++++--------
src/split/cbrc_split_aligner.cc | 38 ++--
src/split/cbrc_split_aligner.hh | 9 +-
src/split/last-split.cc | 31 +--
src/version.hh | 2 +-
39 files changed, 820 insertions(+), 452 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index c286315..157198b 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,8 +1,115 @@
+2016-12-19 Martin C. Frith <Martin C. Frith>
+
+ * src/Centroid.cc:
+ Just re-ordered some probability calculations.
+ [c13b3ad70ddb] [tip]
+
+ * src/Centroid.cc:
+ Refactoring.
+ [a31864221c04]
+
+ * src/Centroid.cc:
+ Made lastal's probabilistic alignment faster.
+ [d2813c5ef29c]
+
+ * src/Centroid.cc:
+ Refactoring.
+ [3323a24e4185]
+
+ * src/Centroid.cc, src/Centroid.hh:
+ Refactoring.
+ [07185ef5ae09]
+
+ * src/Centroid.cc, src/Centroid.hh:
+ Refactoring.
+ [23d7d4d65391]
+
+2016-12-16 Martin C. Frith <Martin C. Frith>
+
+ * src/Centroid.cc:
+ Made lastal's probabilistic alignment faster, for FASTA queries.
+ [8a5fff275e58]
+
+ * src/Centroid.cc, src/GappedXdropAligner.cc,
+ src/GappedXdropAligner2qual.cc, src/GappedXdropAlignerInl.hh,
+ src/GappedXdropAlignerPssm.cc, src/GeneralizedAffineGapCosts.cc,
+ src/GeneralizedAffineGapCosts.hh:
+ Made lastal with asymmetric gap costs faster (but symmetric gap
+ costs slower?)
+ [370ee034eebc]
+
+ * src/MultiSequence.cc, test/last-test.out, test/last-test.sh:
+ Sped up fasta-reading, added tests.
+ [5be7ba2d1f50]
+
+2016-12-13 Martin C. Frith <Martin C. Frith>
+
+ * makefile:
+ Bugfix: the previous update put junk in the zip-file.
+ [3e3e4e67c69d]
+
+ * .hgignore, doc/last-split.txt, doc/last-tuning.txt, doc/lastal.txt,
+ doc/lastdb.txt, makefile, src/lastal.cc, src/lastdb.cc,
+ src/makefile, src/split/cbrc_split_aligner.cc:
+ Added 8-byte LAST!
+ [d6d939a854fd]
+
+2016-12-09 Martin C. Frith <Martin C. Frith>
+
+ * makefile, src/makefile:
+ Refactoring.
+ [cd7349e81e02]
+
+ * src/makefile:
+ Refactoring.
+ [59c5f7c8532e]
+
+2016-12-08 Martin C. Frith <Martin C. Frith>
+
+ * src/DiagonalTable.hh, src/LastalArguments.hh, src/MultiSequence.hh,
+ src/SegmentPair.hh, src/SegmentPairPot.cc, src/SegmentPairPot.hh,
+ src/SubsetSuffixArray.hh, src/makefile:
+ Allow integers >= 2^32 in even more places.
+ [c24adb93f907]
+
+2016-12-06 Martin C. Frith <Martin C. Frith>
+
+ * src/LastalArguments.hh, src/SubsetSuffixArray.hh,
+ src/SubsetSuffixArraySearch.cc, src/lastal.cc:
+ Allow integers >= 2^32 in still more places.
+ [4d18df171b8d]
+
+ * src/LastdbArguments.cc, src/LastdbArguments.hh,
+ src/SubsetSuffixArray.cc, src/SubsetSuffixArray.hh:
+ Refactoring.
+ [a32c6d227241]
+
+ * src/LastalArguments.hh, src/LastdbArguments.hh,
+ src/SubsetSuffixArray.hh, src/SubsetSuffixArraySearch.cc,
+ src/SubsetSuffixArraySort.cc, src/split/cbrc_split_aligner.cc,
+ src/split/cbrc_split_aligner.hh:
+ Allow integers >= 2^32 in yet more places.
+ [e55f2a3d8569]
+
+ * src/LastalArguments.cc, src/LastalArguments.hh,
+ src/LastdbArguments.hh, src/SubsetSuffixArray.cc,
+ src/SubsetSuffixArray.hh, src/lastal.cc, src/lastdb.cc:
+ Allow integers >= 2^32 in more places.
+ [e8c20a8e0137]
+
+2016-12-02 Martin C. Frith <Martin C. Frith>
+
+ * doc/last-split.txt, src/split/cbrc_split_aligner.cc,
+ src/split/cbrc_split_aligner.hh, src/split/last-split.cc, test/last-
+ split-test.out:
+ Added "sense" to last-split -d2 output.
+ [c332e3b4ece7]
+
2016-11-24 Martin C. Frith <Martin C. Frith>
* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
Made last-split a bit faster.
- [604434e30b9b] [tip]
+ [604434e30b9b]
* src/split/cbrc_split_aligner.cc:
Enabled last-split -g to read multi-volume genomes.
diff --git a/doc/last-split.html b/doc/last-split.html
index 32bad5c..f36c8b1 100644
--- a/doc/last-split.html
+++ b/doc/last-split.html
@@ -339,10 +339,10 @@ lastal -Q1 -D100 db q.fastq | last-split > out.maf
</div>
<div class="section" id="spliced-alignment-of-rna-reads-to-a-genome">
<h3>Spliced alignment of RNA reads to a genome</h3>
-<p>Now we assume that "q.fastq" has reads from RNA forward strands. This
-time, we provide the genome information to last-split, which causes it
-to do spliced instead of split alignment, and also tells it where the
-splice signals are (GT, AG, etc):</p>
+<p>Now we assume that "q.fastq" has reads from RNA forward (sense)
+strands. This time, we provide the genome information to last-split,
+which causes it to do spliced instead of split alignment, and also
+tells it where the splice signals are (GT, AG, etc):</p>
<pre class="literal-block">
lastdb -uNEAR -R01 db genome.fasta
lastal -Q1 -D10 db q.fastq | last-split -g db > out.maf
@@ -357,6 +357,8 @@ between any two places in the genome.</p>
very short parts of a spliced alignment (e.g. short exons). Note that
last-split discards the lowest-significance alignments, but it uses
them to estimate the ambiguity of higher-significance alignments.</p>
+<p>If your reads are from unknown/mixed RNA strands, add -d2 to the
+last-split options.</p>
</div>
<div class="section" id="alignment-of-two-whole-genomes">
<h3>Alignment of two whole genomes</h3>
@@ -577,10 +579,16 @@ from the named genome. NAME should be the name of a lastdb
database.</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-d</span>, <span class="option">--direction=<var>D</var></span></kbd></td>
-<td>Do spliced alignment, and set the strandedness of the
-queries: 0=reverse, 1=forward, 2=unknown/mixed. This
+<td><p class="first">Do spliced alignment, and set the strandedness of the
+queries: 0=antisense, 1=sense, 2=unknown/mixed. This
determines whether forward and/or reverse-complement splice
-signals are used.</td></tr>
+signals are used.</p>
+<p>If you use -d2, the output will have an extra "sense" field,
+indicating the log-odds that the query is sense-stranded:</p>
+<pre class="last literal-block">
+log2[ prob(sense) / prob(antisense) ]
+</pre>
+</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-c</span>, <span class="option">--cis=<var>PROB</var></span></kbd></td>
<td>Do spliced alignment, and set the average probability per
@@ -671,6 +679,12 @@ alignments are oriented to use the forward strand of the query.</p>
</li>
</ul>
</div>
+<div class="section" id="last-split8">
+<h2>last-split8</h2>
+<p>last-split8 is almost identical to last-split. The only difference is
+the -g option: last-split can only read the output of lastdb, whereas
+last-split8 can only read the output of <a class="reference external" href="lastdb.html">lastdb8</a>.</p>
+</div>
<div class="section" id="limitations">
<h2>Limitations</h2>
<p>last-split does not support:</p>
diff --git a/doc/last-split.txt b/doc/last-split.txt
index 0bda7ff..479b0e7 100644
--- a/doc/last-split.txt
+++ b/doc/last-split.txt
@@ -26,10 +26,10 @@ can do the alignment like this::
Spliced alignment of RNA reads to a genome
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Now we assume that "q.fastq" has reads from RNA forward strands. This
-time, we provide the genome information to last-split, which causes it
-to do spliced instead of split alignment, and also tells it where the
-splice signals are (GT, AG, etc)::
+Now we assume that "q.fastq" has reads from RNA forward (sense)
+strands. This time, we provide the genome information to last-split,
+which causes it to do spliced instead of split alignment, and also
+tells it where the splice signals are (GT, AG, etc)::
lastdb -uNEAR -R01 db genome.fasta
lastal -Q1 -D10 db q.fastq | last-split -g db > out.maf
@@ -46,6 +46,9 @@ very short parts of a spliced alignment (e.g. short exons). Note that
last-split discards the lowest-significance alignments, but it uses
them to estimate the ambiguity of higher-significance alignments.
+If your reads are from unknown/mixed RNA strands, add -d2 to the
+last-split options.
+
Alignment of two whole genomes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -185,10 +188,15 @@ Options
-d, --direction=D
Do spliced alignment, and set the strandedness of the
- queries: 0=reverse, 1=forward, 2=unknown/mixed. This
+ queries: 0=antisense, 1=sense, 2=unknown/mixed. This
determines whether forward and/or reverse-complement splice
signals are used.
+ If you use -d2, the output will have an extra "sense" field,
+ indicating the log-odds that the query is sense-stranded::
+
+ log2[ prob(sense) / prob(antisense) ]
+
-c, --cis=PROB
Do spliced alignment, and set the average probability per
base of cis-splicing. The default value roughly fits human
@@ -273,6 +281,13 @@ The following points matter only if you are doing something unusual
* It assumes this score matrix applies to all alignments, when the
alignments are oriented to use the forward strand of the query.
+last-split8
+-----------
+
+last-split8 is almost identical to last-split. The only difference is
+the -g option: last-split can only read the output of lastdb, whereas
+last-split8 can only read the output of `lastdb8 <lastdb.html>`_.
+
Limitations
-----------
diff --git a/doc/last-tuning.html b/doc/last-tuning.html
index 11ec408..a273b3b 100644
--- a/doc/last-tuning.html
+++ b/doc/last-tuning.html
@@ -367,6 +367,17 @@ alphabetically earliest.</p>
<p>The fraction of positions that are "minimum" is roughly: 2 / (W + 1).</p>
</div>
</div>
+<div class="section" id="lastdb8-lastal8">
+<h2>lastdb8 & lastal8</h2>
+<p>If your reference has more than about 4 billion letters, 8-byte LAST
+<em>may</em> be beneficial. Ordinary (4-byte) LAST cannot directly handle so
+much data, so it splits it into volumes, which is inefficient. 8-byte
+LAST can handle such data without voluming, but it uses more memory.</p>
+<p>8-byte LAST combines well with lastdb option -w or -W, which reduce
+memory usage. Something like <tt class="docutils literal">lastdb8 <span class="pre">-W63</span></tt> enables rapid,
+huge-scale homology search, with moderate memory usage, but low
+sensitivity.</p>
+</div>
<div class="section" id="other-options">
<h2>Other options</h2>
<div class="section" id="lastal-m">
diff --git a/doc/last-tuning.txt b/doc/last-tuning.txt
index 4fe79b2..6493e62 100644
--- a/doc/last-tuning.txt
+++ b/doc/last-tuning.txt
@@ -61,6 +61,19 @@ alphabetically earliest.
The fraction of positions that are "minimum" is roughly: 2 / (W + 1).
+lastdb8 & lastal8
+~~~~~~~~~~~~~~~~~
+
+If your reference has more than about 4 billion letters, 8-byte LAST
+*may* be beneficial. Ordinary (4-byte) LAST cannot directly handle so
+much data, so it splits it into volumes, which is inefficient. 8-byte
+LAST can handle such data without voluming, but it uses more memory.
+
+8-byte LAST combines well with lastdb option -w or -W, which reduce
+memory usage. Something like ``lastdb8 -W63`` enables rapid,
+huge-scale homology search, with moderate memory usage, but low
+sensitivity.
+
Other options
~~~~~~~~~~~~~
diff --git a/doc/lastal.html b/doc/lastal.html
index d58d90a..94f9541 100644
--- a/doc/lastal.html
+++ b/doc/lastal.html
@@ -912,6 +912,11 @@ enough memory to hold several volumes simultaneously, or run lastal on
one volume at a time. An efficient scheme is to use a different
computer for each volume.</p>
</div>
+<div class="section" id="lastal8">
+<h2>lastal8</h2>
+<p>lastal8 has identical usage to lastal, and is used with <a class="reference external" href="lastdb.html">lastdb8</a>. lastal cannot read the output of lastdb8, and
+lastal8 cannot read the output of lastdb.</p>
+</div>
</div>
</body>
</html>
diff --git a/doc/lastal.txt b/doc/lastal.txt
index ce62516..d8b2690 100644
--- a/doc/lastal.txt
+++ b/doc/lastal.txt
@@ -538,3 +538,10 @@ Therefore, with parallel processes, you should either ensure you have
enough memory to hold several volumes simultaneously, or run lastal on
one volume at a time. An efficient scheme is to use a different
computer for each volume.
+
+lastal8
+-------
+
+lastal8 has identical usage to lastal, and is used with `lastdb8
+<lastdb.html>`_. lastal cannot read the output of lastdb8, and
+lastal8 cannot read the output of lastdb.
diff --git a/doc/lastdb.html b/doc/lastdb.html
index 7fc9a57..1e6fafb 100644
--- a/doc/lastdb.html
+++ b/doc/lastdb.html
@@ -442,8 +442,8 @@ can use suffixes K, M, and G to specify KibiBytes, MebiBytes,
and GibiBytes (e.g. "-s 5G").</p>
<p>However, the output for one sequence is never split. Since the
output files are several-fold bigger than the input (unless you
-use -w), this means that mammalian chromosomes cannot be
-processed using much less than 2G (unless you use -w).</p>
+use -w or -W), this means that mammalian chromosomes cannot be
+processed using much less than 2G (unless you use -w or -W).</p>
<p class="last">There is a hard upper limit of about 4 billion sequence letters
per volume. Together with the previous point, this means that
lastdb will refuse to process any single sequence longer than
@@ -525,6 +525,44 @@ counting is never case-sensitive.</td></tr>
</blockquote>
</div>
</div>
+<div class="section" id="lastdb8">
+<h2>lastdb8</h2>
+<p>lastdb8 is identical to lastdb, except that it internally uses larger
+(8-byte) integers. This means it can handle more than 4 billion
+sequence letters per volume, but it uses more memory.</p>
+</div>
+<div class="section" id="memory-and-disk-usage">
+<h2>Memory and disk usage</h2>
+<p>Suppose we give lastdb N letters of sequence data, of which M are
+non-masked "real" letters (e.g. excluding N for DNA and X for
+proteins). The output files will include:</p>
+<ul>
+<li><p class="first">The sequences (N bytes).</p>
+</li>
+<li><p class="first">An "index" consisting of:
+positions (4M bytes), and "buckets" (<= M bytes).</p>
+</li>
+<li><p class="first">The sequence names (<em>usually</em> negligible).</p>
+</li>
+</ul>
+<p>This is modified by several options.</p>
+<ul>
+<li><p class="first">-C1 adds M bytes to the index, -C2 adds 2M bytes, and -C3 adds 4M
+bytes.</p>
+</li>
+<li><p class="first">-w STEP: makes the index STEP times smaller.</p>
+</li>
+<li><p class="first">-W SIZE: makes the index about (SIZE+1)/2 times smaller.</p>
+</li>
+<li><p class="first">lastdb8: makes the index twice as big.</p>
+</li>
+<li><p class="first">-u, -m: Multiple patterns multiply the index size. For example,
+<a class="reference external" href="last-seeds.html">MAM8</a> makes it 8 times bigger.</p>
+</li>
+<li><p class="first">-s: does not change the total size, but splits it into volumes.</p>
+</li>
+</ul>
+</div>
<div class="section" id="limitations">
<h2>Limitations</h2>
<p>lastdb can become catastrophically slow for highly redundant
diff --git a/doc/lastdb.txt b/doc/lastdb.txt
index caba336..0d2d017 100644
--- a/doc/lastdb.txt
+++ b/doc/lastdb.txt
@@ -110,8 +110,8 @@ Advanced Options
However, the output for one sequence is never split. Since the
output files are several-fold bigger than the input (unless you
- use -w), this means that mammalian chromosomes cannot be
- processed using much less than 2G (unless you use -w).
+ use -w or -W), this means that mammalian chromosomes cannot be
+ processed using much less than 2G (unless you use -w or -W).
There is a hard upper limit of about 4 billion sequence letters
per volume. Together with the previous point, this means that
@@ -190,6 +190,43 @@ Advanced Options
-V, --version
Show version information, and exit.
+lastdb8
+-------
+
+lastdb8 is identical to lastdb, except that it internally uses larger
+(8-byte) integers. This means it can handle more than 4 billion
+sequence letters per volume, but it uses more memory.
+
+Memory and disk usage
+---------------------
+
+Suppose we give lastdb N letters of sequence data, of which M are
+non-masked "real" letters (e.g. excluding N for DNA and X for
+proteins). The output files will include:
+
+* The sequences (N bytes).
+
+* An "index" consisting of:
+ positions (4M bytes), and "buckets" (<= M bytes).
+
+* The sequence names (*usually* negligible).
+
+This is modified by several options.
+
+* -C1 adds M bytes to the index, -C2 adds 2M bytes, and -C3 adds 4M
+ bytes.
+
+* -w STEP: makes the index STEP times smaller.
+
+* -W SIZE: makes the index about (SIZE+1)/2 times smaller.
+
+* lastdb8: makes the index twice as big.
+
+* -u, -m: Multiple patterns multiply the index size. For example,
+ `MAM8 <last-seeds.html>`_ makes it 8 times bigger.
+
+* -s: does not change the total size, but splits it into volumes.
+
Limitations
-----------
diff --git a/makefile b/makefile
index 545a6b9..7d1bb11 100644
--- a/makefile
+++ b/makefile
@@ -2,12 +2,15 @@ CXXFLAGS = -O3 -std=c++11 -pthread -DHAS_CXX_THREADS
all:
@cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)"
+progs = src/lastdb src/lastal src/last-split src/last-merge-batches \
+src/last-pair-probs src/lastdb8 src/lastal8 src/last-split8
+
prefix = /usr/local
exec_prefix = $(prefix)
bindir = $(exec_prefix)/bin
install: all
mkdir -p $(bindir)
- cp src/last?? src/last-split src/last-merge-batches src/last-pair-probs scripts/* $(bindir)
+ cp $(progs) scripts/* $(bindir)
clean:
@cd src && $(MAKE) clean
@@ -17,7 +20,7 @@ html:
distdir = last-`hg id -n`
-RSYNCFLAGS = -aC --exclude 'last??' --exclude last-split --exclude last-merge-batches --exclude last-pair-probs
+RSYNCFLAGS = -aC --exclude '*8' --exclude 'last??' --exclude last-split --exclude last-merge-batches --exclude last-pair-probs
dist: log html
@cd src && $(MAKE) version.hh CyclicSubsetSeedData.hh ScoreMatrixData.hh
diff --git a/src/Centroid.cc b/src/Centroid.cc
index 792ed45..42c5286 100644
--- a/src/Centroid.cc
+++ b/src/Centroid.cc
@@ -110,11 +110,8 @@ namespace cbrc{
}
void Centroid::initBackwardMatrix(){
- pp.resize( fM.size() );
mD.assign( numAntidiagonals + 2, 0.0 );
mI.assign( numAntidiagonals + 2, 0.0 );
- mX1.assign ( numAntidiagonals + 2, 1.0 );
- mX2.assign ( numAntidiagonals + 2, 1.0 );
size_t n = xa.scoreEndIndex( numAntidiagonals );
bM.assign( n, 0.0 );
@@ -170,11 +167,11 @@ namespace cbrc{
assert( gap.insExist == gap.delExist || eP <= 0.0 );
for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals
- double sum_f = 0.0; // sum of forward values
const size_t seq1beg = seq1start( k );
const size_t seq2pos = k - seq1beg;
- const double scale12 = 1.0 / ( scale[k+1] * scale[k] );
- const double scale1 = 1.0 / scale[k+1];
+ const double scale12 = scale[k+1] * scale[k];
+ const double scale1 = scale[k+1];
+ double sum_f = 0.0; // sum of forward values
const double seE = eE * scale1;
const double seEI = eEI * scale1;
@@ -203,37 +200,57 @@ namespace cbrc{
if (! isPssm) {
const uchar* s2 = seqPtr( seq2, isForward, seq2pos );
- while (1) { // start: inner most loop
- const double xM = *fM2 * scale12;
- const double xD = *fD1 * seE;
- const double xI = *fI1 * seEI;
- const double xP = *fP2 * seP;
- *fD0 = xM * eF + xD + xP;
- *fI0 = (xM + xD) * eFI + xI + xP;
- *fM0 = (xM + xD + xI + xP) * match_score[ *s1 ][ *s2 ];
- *fP0 = xM * eF + xP;
- sum_f += xM;
- if( globality && (isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score)) ){
- Z += xM + xD + xI + xP;
+ if (isAffine) {
+ while (1) {
+ const double xM = *fM2 * scale12;
+ const double xD = *fD1 * seE;
+ const double xI = *fI1 * seEI;
+ *fD0 = xM * eF + xD;
+ *fI0 = (xM + xD) * eFI + xI;
+ *fM0 = (xM + xD + xI) * match_score[ *s1 ][ *s2 ];
+ sum_f += xM;
+ if( globality && (isDelimiter(*s2, *match_score) ||
+ isDelimiter(*s1, *match_score)) ){
+ Z += xM + xD + xI;
+ }
+ if (fM0 == fM0last) break;
+ fM0++; fD0++; fI0++;
+ fM2++; fD1++; fI1++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
}
- if (fM0 == fM0last) break;
- fM0++; fD0++; fI0++; fP0++;
- fM2++; fD1++; fI1++; fP2++;
- s1 += seqIncrement;
- s2 -= seqIncrement;
- } // end: inner most loop
- } // end: if (! isPssm)
- else {
+ } else {
+ while (1) {
+ const double xM = *fM2 * scale12;
+ const double xD = *fD1 * seE;
+ const double xI = *fI1 * seEI;
+ const double xP = *fP2 * seP;
+ *fD0 = xM * eF + xD + xP;
+ *fI0 = (xM + xD) * eFI + xI + xP;
+ *fM0 = (xM + xD + xI + xP) * match_score[ *s1 ][ *s2 ];
+ *fP0 = xM * eF + xP;
+ sum_f += xM;
+ if( globality && (isDelimiter(*s2, *match_score) ||
+ isDelimiter(*s1, *match_score)) ){
+ Z += xM + xD + xI + xP;
+ }
+ if (fM0 == fM0last) break;
+ fM0++; fD0++; fI0++; fP0++;
+ fM2++; fD1++; fI1++; fP2++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
+ }
+ }
+ } else {
const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos );
if (isAffine) {
- while (1) { // start: inner most loop
+ while (1) {
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
- const double xI = *fI1 * seE;
+ const double xI = *fI1 * seEI;
*fD0 = xM * eF + xD;
- *fI0 = (xM + xD) * eF + xI;
+ *fI0 = (xM + xD) * eFI + xI;
*fM0 = (xM + xD + xI) * (*p2)[ *s1 ];
sum_f += xM;
if ( globality && (isDelimiter(0, *p2) ||
@@ -245,9 +262,9 @@ namespace cbrc{
fM2++; fD1++; fI1++;
s1 += seqIncrement;
p2 -= seqIncrement;
- } // end: inner most loop
- }else{
- while (1) { // start: inner most loop
+ }
+ } else {
+ while (1) {
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
const double xI = *fI1 * seEI;
@@ -266,21 +283,22 @@ namespace cbrc{
fM2++; fD1++; fI1++; fP2++;
s1 += seqIncrement;
p2 -= seqIncrement;
- } // end: inner most loop
+ }
}
}
+
if( !globality ) Z += sum_f;
- scale[k+2] = sum_f + 1.0; // seems ugly
- Z /= scale[k+2]; // scaling
+ scale[k+2] = 1.0 / (sum_f + 1.0); // seems ugly
+ Z *= scale[k+2]; // scaling
} // k
+
//std::cout << "# Z=" << Z << std::endl;
assert( Z > 0.0 );
- scale[ numAntidiagonals + 1 ] *= Z; // this causes scaled Z to equal 1
+ scale[ numAntidiagonals + 1 ] /= Z; // this causes scaled Z to equal 1
}
// added by M. Hamada
// compute posterior probabilities while executing backward algorithm
- // posterior probabilities are stored in pp
void Centroid::backward( const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isForward, int globality,
@@ -312,9 +330,9 @@ namespace cbrc{
for( size_t k = numAntidiagonals; k-- > 0; ){
const size_t seq1beg = seq1start( k );
const size_t seq2pos = k - seq1beg;
- const double scale12 = 1.0 / ( scale[k+1] * scale[k] );
- const double scale1 = 1.0 / scale[k+1];
- scaledUnit /= scale[k+2];
+ const double scale12 = scale[k+1] * scale[k];
+ const double scale1 = scale[k+1];
+ scaledUnit *= scale[k+2];
const double seE = eE * scale1;
const double seEI = eEI * scale1;
@@ -326,8 +344,6 @@ namespace cbrc{
const double* bI0 = &bI[ scoreEnd + 1 ];
const double* bP0 = &bP[ scoreEnd + 1 ];
- double* pp0 = &pp[ scoreEnd ];
-
const size_t horiBeg = xa.hori( k, seq1beg );
const size_t vertBeg = xa.vert( k, seq1beg );
const size_t diagBeg = xa.diag( k, seq1beg );
@@ -336,73 +352,98 @@ namespace cbrc{
double* bM2 = &bM[ diagBeg ];
double* bP2 = &bP[ diagBeg ];
- const double* fM2 = &fM[ diagBeg ];
const double* fD1 = &fD[ horiBeg ];
const double* fI1 = &fI[ vertBeg ];
const double* fP2 = &fP[ diagBeg ];
const double* bM0last = bM0 + xa.numCellsAndPads( k ) - 2;
- int i = seq1beg; int j = seq2pos;
+ double* mDout = &mD[ seq1beg ];
+ double* mIout = &mI[ seq2pos ];
const uchar* s1 = seqPtr( seq1, isForward, seq1beg );
if (! isPssm ) {
const uchar* s2 = seqPtr( seq2, isForward, seq2pos );
- while (1) { // inner most loop
- double yM = *bM0 * match_score[ *s1 ][ *s2 ];
- double yD = *bD0;
- double yI = *bI0;
- double yP = *bP0;
- double zM = yM + yD * eF + yI * eFI + yP * eF;
- double zD = yM + yD + yI * eFI;
- double zI = yM + yI;
- double zP = yM + yP + yD + yI;
- if( globality ){
- if( isDelimiter(*s2, *match_score) ||
- isDelimiter(*s1, *match_score) ){
- zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
- zP += scaledUnit;
+ if (isAffine) {
+ while (1) {
+ double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ double yD = *bD0;
+ double yI = *bI0;
+ double zM = yM + yD * eF + yI * eFI;
+ double zD = yM + yD + yI * eFI;
+ double zI = yM + yI;
+ if( globality ){
+ if( isDelimiter(*s2, *match_score) ||
+ isDelimiter(*s1, *match_score) ){
+ zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
+ }
+ }else{
+ zM += scaledUnit;
+ }
+ *bM2 = zM * scale12;
+ *bD1 = zD * seE;
+ *bI1 = zI * seEI;
+
+ *mDout += (*fD1) * (*bD1);
+ *mIout += (*fI1) * (*bI1);
+
+ if (bM0 == bM0last) break;
+ mDout++; mIout--;
+ bM2++; bD1++; bI1++;
+ bM0++; bD0++; bI0++;
+ fD1++; fI1++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
+ }
+ } else {
+ while (1) {
+ double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ double yD = *bD0;
+ double yI = *bI0;
+ double yP = *bP0;
+ double zM = yM + yD * eF + yI * eFI + yP * eF;
+ double zD = yM + yD + yI * eFI;
+ double zI = yM + yI;
+ double zP = yM + yP + yD + yI;
+ if( globality ){
+ if( isDelimiter(*s2, *match_score) ||
+ isDelimiter(*s1, *match_score) ){
+ zM += scaledUnit; zD += scaledUnit; zI += scaledUnit;
+ zP += scaledUnit;
+ }
+ }else{
+ zM += scaledUnit;
}
- }else{
- zM += scaledUnit;
+ *bM2 = zM * scale12;
+ *bD1 = zD * seE;
+ *bI1 = zI * seEI;
+ *bP2 = zP * seP;
+
+ double probp = *fP2 * *bP2;
+ *mDout += (*fD1) * (*bD1) + probp;
+ *mIout += (*fI1) * (*bI1) + probp;
+
+ if (bM0 == bM0last) break;
+ mDout++; mIout--;
+ bM2++; bD1++; bI1++; bP2++;
+ bM0++; bD0++; bI0++; bP0++;
+ fD1++; fI1++; fP2++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
}
- *bM2 = zM * scale12;
- *bD1 = zD * seE;
- *bI1 = zI * seEI;
- *bP2 = zP * seP;
-
- double prob = *fM2 * *bM2;
- *pp0 = prob;
- double probd = *fD1 * *bD1;
- double probi = *fI1 * *bI1;
- double probp = *fP2 * *bP2;
- mD[ i ] += probd + probp;
- mI[ j ] += probi + probp;
- mX1 [ i ] -= ( prob + probd + probp );
- mX2 [ j ] -= ( prob + probi + probp );
-
- if (bM0 == bM0last) break;
- i++; j--;
- bM2++; bD1++; bI1++; bP2++;
- bM0++; bD0++; bI0++; bP0++;
- fM2++; fD1++; fI1++; fP2++;
- pp0++;
- s1 += seqIncrement;
- s2 -= seqIncrement;
}
- }
- else {
+ } else {
const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos );
if (isAffine) {
- while (1) { // inner most loop
+ while (1) {
double yM = *bM0 * ( *p2 )[ *s1 ];
double yD = *bD0;
double yI = *bI0;
- double zM = yM + yD * eF + yI * eF;
- double zD = yM + yD + yI * eF;
+ double zM = yM + yD * eF + yI * eFI;
+ double zD = yM + yD + yI * eFI;
double zI = yM + yI;
if( globality ){
if( isDelimiter(0, *p2) ||
@@ -414,27 +455,20 @@ namespace cbrc{
}
*bM2 = zM * scale12;
*bD1 = zD * seE;
- *bI1 = zI * seE;
+ *bI1 = zI * seEI;
- double prob = *fM2 * *bM2;
- *pp0 = prob;
- double probd = *fD1 * *bD1;
- double probi = *fI1 * *bI1;
- mD[ i ] += probd;
- mI[ j ] += probi;
- mX1 [ i ] -= ( prob + probd );
- mX2 [ j ] -= ( prob + probi );
+ *mDout += (*fD1) * (*bD1);
+ *mIout += (*fI1) * (*bI1);
if (bM0 == bM0last) break;
- i++; j--;
+ mDout++; mIout--;
bM2++; bD1++; bI1++;
bM0++; bD0++; bI0++;
- fM2++; fD1++; fI1++;
- pp0++;
+ fD1++; fI1++;
s1 += seqIncrement;
p2 -= seqIncrement;
}
- }else{
+ } else {
while (1) {
double yM = *bM0 * ( *p2 )[ *s1 ];
double yD = *bD0;
@@ -458,22 +492,15 @@ namespace cbrc{
*bI1 = zI * seEI;
*bP2 = zP * seP;
- double prob = *fM2 * *bM2;
- *pp0 = prob;
- double probd = *fD1 * *bD1;
- double probi = *fI1 * *bI1;
double probp = *fP2 * *bP2;
- mD[ i ] += probd + probp;
- mI[ j ] += probi + probp;
- mX1 [ i ] -= ( prob + probd + probp );
- mX2 [ j ] -= ( prob + probi + probp );
+ *mDout += (*fD1) * (*bD1) + probp;
+ *mIout += (*fI1) * (*bI1) + probp;
if (bM0 == bM0last) break;
- i++; j--;
+ mDout++; mIout--;
bM2++; bD1++; bI1++; bP2++;
bM0++; bD0++; bI0++; bP0++;
- fM2++; fD1++; fI1++; fP2++;
- pp0++;
+ fD1++; fI1++; fP2++;
s1 += seqIncrement;
p2 -= seqIncrement;
}
@@ -506,23 +533,27 @@ namespace cbrc{
for( size_t k = 1; k < numAntidiagonals; ++k ){ // loop over antidiagonals
const size_t scoreEnd = xa.scoreEndIndex( k );
double* X0 = &X[ scoreEnd ];
- const double* P0 = &pp[ scoreEnd ];
- size_t cur = seq1start( k );
+ size_t seq1pos = seq1start( k );
const double* const x0end = X0 + xa.numCellsAndPads( k );
- const double* X1 = &X[xa.hori(k, cur)];
- const double* X2 = &X[xa.diag(k, cur)];
+ const size_t h = xa.hori( k, seq1pos );
+ const size_t d = xa.diag( k, seq1pos );
+ const double* X1 = &X[h];
+ const double* X2 = &X[d];
+ const double* fM2 = &fM[d];
+ const double* bM2 = &bM[d];
*X0++ = -DINF; // add one pad cell
do{
- const double s = ( gamma + 1 ) * ( *P0++ ) - 1;
+ const double matchProb = (*fM2++) * (*bM2++);
+ const double s = ( gamma + 1 ) * matchProb - 1;
const double oldX1 = *X1++; // Added by MCF
const double score = std::max( std::max( oldX1, *X1 ), *X2++ + s );
//assert ( score >= 0 );
- updateScore ( score, k, cur );
+ updateScore ( score, k, seq1pos );
*X0++ = score;
- cur++;
+ seq1pos++;
}while( X0 != x0end );
}
return bestScore;
@@ -537,10 +568,11 @@ namespace cbrc{
size_t oldPos1 = i;
while( k > 0 ){
- const int m =
- maxIndex( diagx( X, k, i ) + ( gamma + 1 ) * cellx( pp, k, i ) - 1,
- horix( X, k, i ),
- vertx( X, k, i ) );
+ const size_t h = xa.hori( k, i );
+ const size_t v = xa.vert( k, i );
+ const size_t d = xa.diag( k, i );
+ const double matchProb = fM[d] * bM[d];
+ const int m = maxIndex( X[d] + (gamma + 1) * matchProb - 1, X[h], X[v] );
if( m == 0 ){
k -= 2;
i -= 1;
@@ -560,28 +592,51 @@ namespace cbrc{
initDecodingMatrix();
+ mX1.assign ( numAntidiagonals + 2, 1.0 );
+ mX2.assign ( numAntidiagonals + 2, 1.0 );
+
+ for (size_t k = 0; k < numAntidiagonals; ++k) {
+ size_t seq1pos = seq1start(k);
+ size_t seq2pos = k - seq1pos;
+ size_t loopBeg = xa.diag(k, seq1pos);
+ size_t loopEnd = loopBeg + xa.numCellsAndPads(k) - 1;
+ for (size_t i = loopBeg; i < loopEnd; ++i) {
+ const double matchProb = fM[i] * bM[i];
+ mX1[seq1pos++] -= matchProb;
+ mX2[seq2pos--] -= matchProb;
+ }
+ }
+
for( size_t k = 1; k < numAntidiagonals; ++k ){ // loop over antidiagonals
const size_t scoreEnd = xa.scoreEndIndex( k );
double* X0 = &X[ scoreEnd ];
- const double* P0 = &pp[ scoreEnd ];
- size_t cur = seq1start( k );
- size_t seq2pos = k - cur;
+ size_t seq1pos = seq1start( k );
+ size_t seq2pos = k - seq1pos;
const double* const x0end = X0 + xa.numCellsAndPads( k );
- const double* X1 = &X[ xa.hori(k, cur) ];
- const double* X2 = &X[ xa.diag(k, cur) ];
+ const size_t h = xa.hori( k, seq1pos );
+ const size_t d = xa.diag( k, seq1pos );
+ const double* X1 = &X[h];
+ const double* X2 = &X[d];
+ const double* fM2 = &fM[d];
+ const double* bM2 = &bM[d];
*X0++ = -DINF; // add one pad cell
do{
- const double s = 2 * gamma * *P0++ - ( mX1[ cur ] + mX2[ seq2pos ] );
+ const double matchProb = (*fM2++) * (*bM2++);
+ const double thisD = mD[seq1pos];
+ const double thisI = mI[seq2pos];
+ const double thisXD = mX1[seq1pos] - thisD;
+ const double thisXI = mX2[seq2pos] - thisI;
+ const double s = 2 * gamma * matchProb - (thisXD + thisXI);
+ const double u = gamma * thisD - thisXD;
+ const double t = gamma * thisI - thisXI;
const double oldX1 = *X1++; // Added by MCF
- const double u = gamma * mD[ cur ] - mX1[ cur ];
- const double t = gamma * mI[ seq2pos ] - mX2[ seq2pos ];
const double score = std::max( std::max( oldX1 + u, *X1 + t), *X2++ + s );
- updateScore ( score, k, cur );
+ updateScore ( score, k, seq1pos );
*X0++ = score;
- cur++;
+ seq1pos++;
seq2pos--;
}while( X0 != x0end );
}
@@ -599,13 +654,18 @@ namespace cbrc{
while( k > 0 ){
const size_t j = k - i;
- const double s = 2 * gamma * cellx( pp, k, i ) - ( mX1[ i ] + mX2[ j ] );
- const double t = gamma * mI[ j ] - mX2[ j ];
- const double u = gamma * mD[ i ] - mX1[ i ];
- const int m =
- maxIndex( diagx( X, k, i ) + s,
- horix( X, k, i ) + u,
- vertx( X, k, i ) + t );
+ const size_t h = xa.hori( k, i );
+ const size_t v = xa.vert( k, i );
+ const size_t d = xa.diag( k, i );
+ const double matchProb = fM[d] * bM[d];
+ const double thisD = mD[i];
+ const double thisI = mI[j];
+ const double thisXD = mX1[i] - thisD;
+ const double thisXI = mX2[j] - thisI;
+ const double s = 2 * gamma * matchProb - (thisXD + thisXI);
+ const double t = gamma * thisI - thisXI;
+ const double u = gamma * thisD - thisXD;
+ const int m = maxIndex( X[d] + s, X[h] + u, X[v] + t );
if( m == 0 ){
k -= 2;
i -= 1;
@@ -653,7 +713,8 @@ namespace cbrc{
size_t seq2pos = i->end2();
for( size_t j = 0; j < i->size; ++j ){
- double p = cellx( pp, seq1pos + seq2pos, seq1pos );
+ size_t d = xa.diag( seq1pos + seq2pos, seq1pos );
+ double p = fM[d] * bM[d];
ambiguityCodes.push_back( asciiProbability(p) );
--seq1pos;
--seq2pos;
@@ -679,7 +740,7 @@ namespace cbrc{
double Centroid::logPartitionFunction() const{
double x = 0.0;
for( size_t k = 0; k < numAntidiagonals; ++k ){
- x += std::log( scale[k+2] );
+ x -= std::log( scale[k+2] );
}
return T * x;
}
@@ -711,8 +772,8 @@ namespace cbrc{
for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals
const size_t seq1beg = seq1start( k );
const size_t seq2pos = k - seq1beg;
- const double scale12 = 1.0 / ( scale[k+1] * scale[k] );
- const double scale1 = 1.0 / scale[k+1];
+ const double scale12 = scale[k+1] * scale[k];
+ const double scale1 = scale[k+1];
const double seE = eE * scale1;
const double seEI = eEI * scale1;
@@ -738,34 +799,59 @@ namespace cbrc{
const double* bM0last = bM0 + xa.numCellsAndPads( k ) - 2;
if (! isPssm ) {
- while (1) { // inner most loop
- const double xM = *fM2 * scale12;
- const double xD = *fD1 * seE;
- const double xI = *fI1 * seEI;
- const double xP = *fP2 * seP;
- const double yM = *bM0 * match_score[ *s1 ][ *s2 ];
- const double yD = *bD0;
- const double yI = *bI0;
- const double yP = *bP0;
- c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM;
- c.MM += xM * yM;
- c.DM += xD * yM;
- c.IM += xI * yM;
- c.PM += xP * yM;
- c.MD += xM * yD * eF;
- c.DD += xD * yD;
- c.PD += xP * yD;
- c.MI += xM * yI * eFI;
- c.DI += xD * yI * eFI;
- c.II += xI * yI;
- c.PI += xP * yI;
- c.MP += xM * yP * eF;
- c.PP += xP * yP;
- if (bM0 == bM0last) break;
- fM2++; fD1++; fI1++; fP2++;
- bM0++; bD0++; bI0++; bP0++;
- s1 += seqIncrement;
- s2 -= seqIncrement;
+ if (isAffine) {
+ while (1) {
+ const double xM = *fM2 * scale12;
+ const double xD = *fD1 * seE;
+ const double xI = *fI1 * seEI;
+ const double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ const double yD = *bD0;
+ const double yI = *bI0;
+ c.emit[*s1][*s2] += (xM + xD + xI) * yM;
+ c.MM += xM * yM;
+ c.DM += xD * yM;
+ c.IM += xI * yM;
+ c.MD += xM * yD * eF;
+ c.DD += xD * yD;
+ c.MI += xM * yI * eFI;
+ c.DI += xD * yI * eFI;
+ c.II += xI * yI;
+ if (bM0 == bM0last) break;
+ fM2++; fD1++; fI1++;
+ bM0++; bD0++; bI0++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
+ }
+ } else {
+ while (1) {
+ const double xM = *fM2 * scale12;
+ const double xD = *fD1 * seE;
+ const double xI = *fI1 * seEI;
+ const double xP = *fP2 * seP;
+ const double yM = *bM0 * match_score[ *s1 ][ *s2 ];
+ const double yD = *bD0;
+ const double yI = *bI0;
+ const double yP = *bP0;
+ c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM;
+ c.MM += xM * yM;
+ c.DM += xD * yM;
+ c.IM += xI * yM;
+ c.PM += xP * yM;
+ c.MD += xM * yD * eF;
+ c.DD += xD * yD;
+ c.PD += xP * yD;
+ c.MI += xM * yI * eFI;
+ c.DI += xD * yI * eFI;
+ c.II += xI * yI;
+ c.PI += xP * yI;
+ c.MP += xM * yP * eF;
+ c.PP += xP * yP;
+ if (bM0 == bM0last) break;
+ fM2++; fD1++; fI1++; fP2++;
+ bM0++; bD0++; bI0++; bP0++;
+ s1 += seqIncrement;
+ s2 -= seqIncrement;
+ }
}
}
else {
@@ -775,7 +861,7 @@ namespace cbrc{
while (1) { // inner most loop
const double xM = *fM2 * scale12;
const double xD = *fD1 * seE;
- const double xI = *fI1 * seE;
+ const double xI = *fI1 * seEI;
const double yM = *bM0 * ( *p2 )[ *s1 ];
const double yD = *bD0;
const double yI = *bI0;
@@ -785,8 +871,8 @@ namespace cbrc{
c.IM += xI * yM;
c.MD += xM * yD * eF;
c.DD += xD * yD;
- c.MI += xM * yI * eF;
- c.DI += xD * yI * eF;
+ c.MI += xM * yI * eFI;
+ c.DI += xD * yI * eFI;
c.II += xI * yI;
if (bM0 == bM0last) break;
fM2++; fD1++; fI1++;
diff --git a/src/Centroid.hh b/src/Centroid.hh
index 14cef9e..516bbb3 100644
--- a/src/Centroid.hh
+++ b/src/Centroid.hh
@@ -101,8 +101,6 @@ namespace cbrc{
dvec_t bI; // b^I(i,j)
dvec_t bP; // b^P(i,j)
- dvec_t pp; // posterior match probabilities
-
dvec_t mD;
dvec_t mI;
dvec_t mX1;
@@ -128,30 +126,6 @@ namespace cbrc{
return xa.scoreEndIndex( antidiagonal ) - xa.scoreOrigin( antidiagonal );
}
- // get DP matrix value at the given position
- double cellx( const dvec_t& matrix,
- size_t antiDiagonal, size_t seq1pos ) const{
- return matrix[ xa.scoreOrigin( antiDiagonal ) + seq1pos ];
- }
-
- // get DP matrix value "left of" the given position
- double horix( const dvec_t& matrix,
- size_t antiDiagonal, size_t seq1pos ) const{
- return matrix[ xa.hori( antiDiagonal, seq1pos ) ];
- }
-
- // get DP matrix value "above" the given position
- double vertx( const dvec_t& matrix,
- size_t antiDiagonal, size_t seq1pos ) const{
- return matrix[ xa.vert( antiDiagonal, seq1pos ) ];
- }
-
- // get DP matrix value "diagonal from" the given position
- double diagx( const dvec_t& matrix,
- size_t antiDiagonal, size_t seq1pos ) const{
- return matrix[ xa.diag( antiDiagonal, seq1pos ) ];
- }
-
// get a pointer into a sequence, taking start and direction into account
template < typename T >
static const T* seqPtr( const T* seq, bool isForward, size_t pos ){
@@ -160,5 +134,6 @@ namespace cbrc{
}
};
-} // end namespace cbrc
-#endif // CENTROID_HH
+}
+
+#endif
diff --git a/src/DiagonalTable.hh b/src/DiagonalTable.hh
index ff3f9df..90acd2f 100644
--- a/src/DiagonalTable.hh
+++ b/src/DiagonalTable.hh
@@ -15,13 +15,15 @@
#ifndef DIAGONALTABLE_HH
#define DIAGONALTABLE_HH
+
+#include <stddef.h>
#include <utility> // pair
#include <vector>
namespace cbrc{
struct DiagonalTable{
- typedef unsigned indexT;
+ typedef LAST_INT_TYPE indexT;
typedef std::pair<indexT, indexT> pairT;
enum { BINS = 256 }; // use a power-of-two for faster modulus (maybe)
@@ -36,5 +38,6 @@ struct DiagonalTable{
std::vector<pairT> hits[BINS];
};
-} // end namespace cbrc
-#endif // DIAGONALTABLE_HH
+}
+
+#endif
diff --git a/src/GappedXdropAligner.cc b/src/GappedXdropAligner.cc
index 4c8a532..479fdbe 100644
--- a/src/GappedXdropAligner.cc
+++ b/src/GappedXdropAligner.cc
@@ -92,10 +92,9 @@ int GappedXdropAligner::align(const uchar *seq1,
int gapUnalignedCost,
int maxScoreDrop,
int maxMatchScore) {
- bool isAffine =
- insExistenceCost == delExistenceCost &&
- insExtensionCost == delExtensionCost &&
- gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost;
+ const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
+ insExistenceCost, insExtensionCost,
+ gapUnalignedCost);
std::size_t maxSeq1begs[] = { 0, 9 };
std::size_t minSeq1ends[] = { 1, 0 };
@@ -155,14 +154,13 @@ int GappedXdropAligner::align(const uchar *seq1,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + scorer[*s1][*s2];
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
@@ -172,14 +170,13 @@ int GappedXdropAligner::align(const uchar *seq1,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + scorer[*s1][*s2];
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
diff --git a/src/GappedXdropAligner2qual.cc b/src/GappedXdropAligner2qual.cc
index 00c1559..b69cb32 100644
--- a/src/GappedXdropAligner2qual.cc
+++ b/src/GappedXdropAligner2qual.cc
@@ -24,10 +24,9 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
int gapUnalignedCost,
int maxScoreDrop,
int maxMatchScore) {
- bool isAffine =
- insExistenceCost == delExistenceCost &&
- insExtensionCost == delExtensionCost &&
- gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost;
+ const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
+ insExistenceCost, insExtensionCost,
+ gapUnalignedCost);
std::size_t maxSeq1begs[] = { 0, 9 };
std::size_t minSeq1ends[] = { 1, 0 };
@@ -88,14 +87,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + scorer(*s1, *s2, *q1, *q2);
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
@@ -105,14 +103,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + scorer(*s1, *s2, *q1, *q2);
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
diff --git a/src/GappedXdropAlignerInl.hh b/src/GappedXdropAlignerInl.hh
index 21f06c5..28ca280 100644
--- a/src/GappedXdropAlignerInl.hh
+++ b/src/GappedXdropAlignerInl.hh
@@ -59,6 +59,13 @@ T whichFrame(std::size_t antidiagonal, T frame0, T frame1, T frame2) {
}
}
+inline bool isAffineGaps(int delExistenceCost, int delExtensionCost,
+ int insExistenceCost, int insExtensionCost,
+ int gapUnalignedCost) {
+ return gapUnalignedCost >= delExtensionCost + insExtensionCost +
+ std::max(delExistenceCost, insExistenceCost);
+}
+
// The next two functions will stop the alignment at delimiters. But
// this is not guaranteed if bestScore > INF / 2. We could avoid this
// restriction by replacing -INF / 2 with bestScore - INF.
diff --git a/src/GappedXdropAlignerPssm.cc b/src/GappedXdropAlignerPssm.cc
index 6d776d3..470e025 100644
--- a/src/GappedXdropAlignerPssm.cc
+++ b/src/GappedXdropAlignerPssm.cc
@@ -16,10 +16,9 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
int gapUnalignedCost,
int maxScoreDrop,
int maxMatchScore) {
- bool isAffine =
- insExistenceCost == delExistenceCost &&
- insExtensionCost == delExtensionCost &&
- gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost;
+ const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
+ insExistenceCost, insExtensionCost,
+ gapUnalignedCost);
std::size_t maxSeq1begs[] = { 0, 9 };
std::size_t minSeq1ends[] = { 1, 0 };
@@ -78,14 +77,13 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + (*s2)[*s1];
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
@@ -95,14 +93,13 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
while (1) {
int x = *x2;
int y = *y1 - delExtensionCost;
- int z = *z1 - delExtensionCost;
+ int z = *z1 - insExtensionCost;
int b = maxValue(x, y, z);
if (b >= minScore) {
updateBest(bestScore, b, antidiagonal, x0, x0base);
*x0 = b + (*s2)[*s1];
- int g = b - delExistenceCost;
- *y0 = maxValue(g, y);
- *z0 = maxValue(g, z);
+ *y0 = maxValue(b - delExistenceCost, y);
+ *z0 = maxValue(b - insExistenceCost, z);
}
else *x0 = *y0 = *z0 = -INF;
if (x0 == x0last) break;
diff --git a/src/GeneralizedAffineGapCosts.cc b/src/GeneralizedAffineGapCosts.cc
index f5fdd32..070a0ab 100644
--- a/src/GeneralizedAffineGapCosts.cc
+++ b/src/GeneralizedAffineGapCosts.cc
@@ -1,7 +1,6 @@
// Copyright 2008, 2009, 2012 Martin C. Frith
#include "GeneralizedAffineGapCosts.hh"
-#include <algorithm>
namespace cbrc{
diff --git a/src/GeneralizedAffineGapCosts.hh b/src/GeneralizedAffineGapCosts.hh
index 6902718..8c8f273 100644
--- a/src/GeneralizedAffineGapCosts.hh
+++ b/src/GeneralizedAffineGapCosts.hh
@@ -19,6 +19,8 @@
#ifndef GENERALIZEDAFFINEGAPCOSTS_HH
#define GENERALIZEDAFFINEGAPCOSTS_HH
+#include <algorithm>
+
namespace cbrc{
struct GeneralizedAffineGapCosts{
@@ -36,8 +38,9 @@ struct GeneralizedAffineGapCosts{
{ return insExist == delExist && insExtend == delExtend; }
// Will standard affine gaps always suffice for maximal alignment scores?
- bool isAffine() const
- { return isSymmetric() && pairExtend >= delExist + 2 * delExtend; }
+ bool isAffine() const{
+ return pairExtend >= std::max(delExist, insExist) + delExtend + insExtend;
+ }
// Return the score of a gap with the given sizes in a pair of
// sequences, considering that it might be either one "generalized"
diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc
index 0a16f11..709ce9d 100644
--- a/src/LastalArguments.cc
+++ b/src/LastalArguments.cc
@@ -405,7 +405,7 @@ void LastalArguments::setDefaultsFromAlphabet( bool isDna, bool isProtein,
int refTantanSetting,
bool isCaseSensitiveSeeds,
bool isVolumes,
- indexT refMinimizerWindow,
+ size_t refMinimizerWindow,
unsigned realNumOfThreads ){
if( strand < 0 ) strand = (isDna || isTranslated()) ? 2 : 1;
diff --git a/src/LastalArguments.hh b/src/LastalArguments.hh
index 9f03b33..708f999 100644
--- a/src/LastalArguments.hh
+++ b/src/LastalArguments.hh
@@ -14,8 +14,6 @@
namespace cbrc{
struct LastalArguments{
- typedef unsigned indexT;
-
// set the parameters to their default values:
LastalArguments();
@@ -38,7 +36,7 @@ struct LastalArguments{
double numLettersInReference,
bool isKeepRefLowercase, int refTantanSetting,
bool isCaseSensitiveSeeds, bool isVolumes,
- indexT refMinimizerWindow,
+ size_t refMinimizerWindow,
unsigned realNumOfThreads );
void setDefaultsFromMatrix( double lambda, int minScore );
@@ -85,17 +83,17 @@ struct LastalArguments{
int maxDropGapless;
int maxDropFinal;
sequenceFormat::Enum inputFormat;
- indexT minHitDepth;
- indexT maxHitDepth;
- indexT oneHitMultiplicity;
- indexT maxGaplessAlignmentsPerQueryPosition;
+ size_t minHitDepth;
+ size_t maxHitDepth;
+ size_t oneHitMultiplicity;
+ size_t maxGaplessAlignmentsPerQueryPosition;
size_t cullingLimitForGaplessAlignments;
size_t cullingLimitForFinalAlignments;
- indexT queryStep;
- indexT minimizerWindow;
- indexT batchSize; // approx size of query sequences to scan in 1 batch
+ size_t queryStep;
+ size_t minimizerWindow;
+ size_t batchSize; // approx size of query sequences to scan in 1 batch
unsigned numOfThreads;
- indexT maxRepeatDistance; // suppress repeats <= this distance apart
+ size_t maxRepeatDistance; // suppress repeats <= this distance apart
double temperature; // probability = exp( score / temperature ) / Z
double gamma; // parameter for gamma-centroid alignment
std::string geneticCodeFile;
diff --git a/src/LastdbArguments.cc b/src/LastdbArguments.cc
index 274d248..7393685 100644
--- a/src/LastdbArguments.cc
+++ b/src/LastdbArguments.cc
@@ -38,7 +38,7 @@ LastdbArguments::LastdbArguments() :
subsetSeedFile(""),
userAlphabet(""),
minSeedLimit(0),
- bucketDepth(indexT(-1)), // means: use the default (adapts to the data)
+ bucketDepth(-1), // means: use the default (adapts to the data)
childTableType(0),
isCountsOnly(false),
verbosity(0),
diff --git a/src/LastdbArguments.hh b/src/LastdbArguments.hh
index 73018af..4eb25db 100644
--- a/src/LastdbArguments.hh
+++ b/src/LastdbArguments.hh
@@ -14,8 +14,6 @@
namespace cbrc{
struct LastdbArguments{
- typedef unsigned indexT;
-
// set the parameters to their default values:
LastdbArguments();
@@ -36,14 +34,14 @@ struct LastdbArguments{
int tantanSetting;
bool isCaseSensitive;
std::vector< std::string > seedPatterns;
- size_t volumeSize; // type?
- indexT indexStep;
- indexT minimizerWindow;
+ size_t volumeSize;
+ size_t indexStep;
+ size_t minimizerWindow;
unsigned numOfThreads;
std::string subsetSeedFile;
std::string userAlphabet;
- indexT minSeedLimit;
- indexT bucketDepth;
+ size_t minSeedLimit;
+ unsigned bucketDepth;
int childTableType;
bool isCountsOnly;
int verbosity;
diff --git a/src/MultiSequence.cc b/src/MultiSequence.cc
index df27f2e..f9b0077 100644
--- a/src/MultiSequence.cc
+++ b/src/MultiSequence.cc
@@ -5,7 +5,6 @@
#include <sstream>
#include <algorithm> // upper_bound
#include <cassert>
-#include <cctype> // isspace
#include <iterator> // istreambuf_iterator
using namespace cbrc;
@@ -91,8 +90,8 @@ MultiSequence::appendFromFasta( std::istream& stream, indexT maxSeqLen ){
std::istreambuf_iterator<char> endpos;
while( inpos != endpos ){
uchar c = *inpos;
- if( c == '>' ) break; // we have hit the next FASTA sequence
- if( !std::isspace(c) ){
+ if( c > ' ' ){ // faster than isspace
+ if( c == '>' ) break; // we have hit the next FASTA sequence
if( seq.v.size() >= maxSeqLen ) break;
seq.v.push_back(c);
}
diff --git a/src/MultiSequence.hh b/src/MultiSequence.hh
index 0c8dab6..df388dc 100644
--- a/src/MultiSequence.hh
+++ b/src/MultiSequence.hh
@@ -19,7 +19,7 @@ namespace cbrc{
class MultiSequence{
public:
- typedef unsigned indexT;
+ typedef LAST_INT_TYPE indexT;
typedef unsigned char uchar;
// initialize with leftmost delimiter pad, ready for appending sequences
diff --git a/src/SegmentPair.hh b/src/SegmentPair.hh
index 97d6357..8b0fb18 100644
--- a/src/SegmentPair.hh
+++ b/src/SegmentPair.hh
@@ -6,10 +6,12 @@
#ifndef SEGMENT_PAIR_HH
#define SEGMENT_PAIR_HH
+#include <stddef.h>
+
namespace cbrc{
struct SegmentPair{
- typedef unsigned indexT;
+ typedef LAST_INT_TYPE indexT;
typedef unsigned char uchar;
SegmentPair(){}
diff --git a/src/SegmentPairPot.cc b/src/SegmentPairPot.cc
index 58328b6..5c7e34d 100644
--- a/src/SegmentPairPot.cc
+++ b/src/SegmentPairPot.cc
@@ -113,7 +113,7 @@ void SegmentPairPot::markAllOverlaps( const std::vector<SegmentPair>& sps ){
}
void SegmentPairPot::markTandemRepeats( const SegmentPair& sp,
- indexT maxDistance ){
+ size_t maxDistance ){
assert( !items.empty() );
// Careful: if we are self-comparing lots of short sequences, there
diff --git a/src/SegmentPairPot.hh b/src/SegmentPairPot.hh
index a8670bf..9890eb9 100644
--- a/src/SegmentPairPot.hh
+++ b/src/SegmentPairPot.hh
@@ -51,7 +51,7 @@ struct SegmentPairPot{
// mark (near-)tandem repeats within sp
// to avoid death by dynamic programming when self-aligning a large sequence
- void markTandemRepeats( const SegmentPair& sp, indexT maxDistance );
+ void markTandemRepeats( const SegmentPair& sp, size_t maxDistance );
// data:
std::vector<SegmentPair> items;
diff --git a/src/SubsetSuffixArray.cc b/src/SubsetSuffixArray.cc
index 880c36c..009494c 100644
--- a/src/SubsetSuffixArray.cc
+++ b/src/SubsetSuffixArray.cc
@@ -10,7 +10,7 @@
using namespace cbrc;
void SubsetSuffixArray::addPositions(const uchar* text, indexT beg, indexT end,
- indexT step, indexT minimizerWindow) {
+ size_t step, size_t minimizerWindow) {
if (beg >= end) return;
assert(step > 0);
const uchar *subsetMap = seed.firstMap();
@@ -115,7 +115,7 @@ void SubsetSuffixArray::toFiles( const std::string& baseName,
memoryToBinaryFile( chibiTable.begin(), chibiTable.end(), fileName );
}
-void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){
+void SubsetSuffixArray::makeBuckets( const uchar* text, unsigned bucketDepth ){
if( bucketDepth+1 == 0 ) bucketDepth = defaultBucketDepth();
makeBucketSteps( bucketDepth );
@@ -126,7 +126,7 @@ void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){
const uchar* textPtr = text + suffixArray[i];
const uchar* subsetMap = seed.firstMap();
indexT bucketIndex = 0;
- indexT depth = 0;
+ unsigned depth = 0;
while( depth < bucketDepth ){
uchar subset = subsetMap[ *textPtr ];
@@ -136,8 +136,7 @@ void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){
}
++textPtr;
++depth;
- indexT step = bucketSteps[depth];
- bucketIndex += subset * step;
+ bucketIndex += subset * bucketSteps[depth];
subsetMap = seed.nextMap( subsetMap );
}
@@ -159,11 +158,11 @@ void SubsetSuffixArray::makeBucketSteps( indexT bucketDepth ){
}
}
-SubsetSuffixArray::indexT SubsetSuffixArray::defaultBucketDepth(){
+unsigned SubsetSuffixArray::defaultBucketDepth(){
indexT maxBucketEntries = suffixArray.size() / 4;
- indexT bucketDepth = 0;
indexT kmerEntries = 1;
indexT bucketEntries = 1;
+ unsigned bucketDepth = 0;
while(true){
indexT nextSubsetCount = seed.subsetCount(bucketDepth);
diff --git a/src/SubsetSuffixArray.hh b/src/SubsetSuffixArray.hh
index c77c858..de1422e 100644
--- a/src/SubsetSuffixArray.hh
+++ b/src/SubsetSuffixArray.hh
@@ -30,7 +30,7 @@ namespace cbrc{
class SubsetSuffixArray{
public:
- typedef unsigned indexT;
+ typedef LAST_INT_TYPE indexT;
CyclicSubsetSeed& getSeed() { return seed; }
const CyclicSubsetSeed& getSeed() const { return seed; }
@@ -41,17 +41,17 @@ public:
// If minimizerWindow > 1 then the positions are added only if they
// are "minimizers" for the given window and seed pattern.
void addPositions( const uchar* text, indexT beg, indexT end,
- indexT step, indexT minimizerWindow );
+ size_t step, size_t minimizerWindow );
// Sort the suffix array (but don't make the buckets).
void sortIndex( const uchar* text,
- indexT maxUnsortedInterval, int childTableType );
+ size_t maxUnsortedInterval, int childTableType );
// Make the buckets. If bucketDepth+1 == 0, then a default
// bucketDepth is used. The default is: the maximum possible
// bucketDepth such that the number of bucket entries is at most 1/4
// the number of suffix array entries.
- void makeBuckets( const uchar* text, indexT bucketDepth );
+ void makeBuckets( const uchar* text, unsigned bucketDepth );
void fromFiles( const std::string& baseName,
bool isMaskLowercase, const uchar letterCode[] );
@@ -66,13 +66,13 @@ public:
// via begPtr and endPtr.
void match( const indexT*& begPtr, const indexT*& endPtr,
const uchar* queryPtr, const uchar* text,
- indexT maxHits, indexT minDepth, indexT maxDepth ) const;
+ size_t maxHits, size_t minDepth, size_t maxDepth ) const;
// Count matches of all sizes (up to maxDepth), starting at the
// given position in the query.
void countMatches( std::vector<unsigned long long>& counts,
const uchar* queryPtr, const uchar* text,
- indexT maxDepth ) const;
+ size_t maxDepth ) const;
private:
CyclicSubsetSeed seed;
@@ -113,9 +113,9 @@ private:
const uchar* textBase, const uchar* subsetMap ) const;
// Return the maximum prefix size covered by the buckets.
- indexT maxBucketPrefix() const { return bucketSteps.size() - 1; }
+ size_t maxBucketPrefix() const { return bucketSteps.size() - 1; }
- indexT defaultBucketDepth();
+ unsigned defaultBucketDepth();
void makeBucketSteps( indexT bucketDepth );
diff --git a/src/SubsetSuffixArraySearch.cc b/src/SubsetSuffixArraySearch.cc
index 2c0efff..a505030 100644
--- a/src/SubsetSuffixArraySearch.cc
+++ b/src/SubsetSuffixArraySearch.cc
@@ -9,17 +9,17 @@ using namespace cbrc;
// could & probably should return the match depth
void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr,
const uchar* queryPtr, const uchar* text,
- indexT maxHits,
- indexT minDepth, indexT maxDepth ) const{
+ size_t maxHits,
+ size_t minDepth, size_t maxDepth ) const{
// the next line is unnecessary, but makes it faster in some cases:
if( maxHits == 0 && minDepth < maxDepth ) minDepth = maxDepth;
- indexT depth = 0;
+ size_t depth = 0;
const uchar* subsetMap = seed.firstMap();
// match using buckets:
- indexT bucketDepth = maxBucketPrefix();
- indexT startDepth = std::min( bucketDepth, maxDepth );
+ size_t bucketDepth = maxBucketPrefix();
+ size_t startDepth = std::min( bucketDepth, maxDepth );
const indexT* bucketPtr = &buckets[0];
while( depth < startDepth ){
@@ -51,7 +51,7 @@ void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr,
// match using binary search:
if( depth < minDepth ){
- indexT d = depth;
+ size_t d = depth;
const uchar* s = subsetMap;
while( depth < minDepth ){
uchar subset = subsetMap[ queryPtr[depth] ];
@@ -85,12 +85,12 @@ void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr,
void SubsetSuffixArray::countMatches( std::vector<unsigned long long>& counts,
const uchar* queryPtr,
const uchar* text,
- indexT maxDepth ) const{
- indexT depth = 0;
+ size_t maxDepth ) const{
+ size_t depth = 0;
const uchar* subsetMap = seed.firstMap();
// match using buckets:
- indexT bucketDepth = maxBucketPrefix();
+ size_t bucketDepth = maxBucketPrefix();
const indexT* bucketPtr = &buckets[0];
indexT beg = 0;
indexT end = suffixArray.size();
diff --git a/src/SubsetSuffixArraySort.cc b/src/SubsetSuffixArraySort.cc
index e460b8c..5fe3a6c 100644
--- a/src/SubsetSuffixArraySort.cc
+++ b/src/SubsetSuffixArraySort.cc
@@ -307,7 +307,7 @@ void SubsetSuffixArray::radixSortN( const uchar* text, const uchar* subsetMap,
}
void SubsetSuffixArray::sortIndex( const uchar* text,
- indexT maxUnsortedInterval,
+ size_t maxUnsortedInterval,
int childTableType ){
const indexT minLength = 1;
@@ -324,18 +324,19 @@ void SubsetSuffixArray::sortIndex( const uchar* text,
indexT depth;
POP( beg, end, depth );
- if( end - beg <= maxUnsortedInterval && depth >= minLength ) continue;
+ size_t interval = end - beg;
+ if( interval <= maxUnsortedInterval && depth >= minLength ) continue;
const uchar* textBase = text + depth;
const uchar* subsetMap = seed.subsetMap(depth);
if( childTableType == 0 ){
- if( end - beg < 10 ){ // ???
+ if( interval < 10 ){ // ???
insertionSort( textBase, seed, beg, end, subsetMap );
continue;
}
}else{
- if( end - beg == 2 ){
+ if( interval == 2 ){
sort2( textBase, beg, subsetMap );
continue;
}
diff --git a/src/lastal.cc b/src/lastal.cc
index 3219c3e..467fad7 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -292,12 +292,13 @@ void calculateScoreStatistics( const std::string& matrixName,
// Read the .prj file for the whole database
void readOuterPrj( const std::string& fileName, unsigned& volumes,
- indexT& refMinimizerWindow, indexT& minSeedLimit,
+ size_t& refMinimizerWindow, size_t& minSeedLimit,
bool& isKeepRefLowercase, int& refTantanSetting,
countT& refSequences, countT& refLetters ){
std::ifstream f( fileName.c_str() );
if( !f ) ERR( "can't open file: " + fileName );
unsigned version = 0;
+ size_t fileBitsPerInt = 32;
std::string trigger = "#lastal";
std::string line, word;
@@ -320,6 +321,7 @@ void readOuterPrj( const std::string& fileName, unsigned& volumes,
if( word == "minimizerwindow" ) iss >> refMinimizerWindow;
if( word == "volumes" ) iss >> volumes;
if( word == "numofindexes" ) iss >> numOfIndexes;
+ if( word == "integersize" ) iss >> fileBitsPerInt;
}
if( f.eof() && !f.bad() ) f.clear();
@@ -331,6 +333,12 @@ void readOuterPrj( const std::string& fileName, unsigned& volumes,
if( !f ) ERR( "can't read file: " + fileName );
if( version < 294 && version > 0)
ERR( "the lastdb files are old: please re-run lastdb" );
+
+ if( fileBitsPerInt != sizeof(indexT) * CHAR_BIT ){
+ if( fileBitsPerInt == 32 ) ERR( "please use lastal for " + fileName );
+ if( fileBitsPerInt == 64 ) ERR( "please use lastal8 for " + fileName );
+ ERR( "weird integersize in " + fileName );
+ }
}
// Read a per-volume .prj file, with info about a database volume
@@ -360,7 +368,7 @@ void writeCounts( std::ostream& out ){
for( indexT i = 0; i < matchCounts.size(); ++i ){
out << query.seqName(i) << '\n';
- for( indexT j = args.minHitDepth; j < matchCounts[i].size(); ++j ){
+ for( size_t j = args.minHitDepth; j < matchCounts[i].size(); ++j ){
out << j << '\t' << matchCounts[i][j] << '\n';
}
@@ -370,12 +378,12 @@ void writeCounts( std::ostream& out ){
// Count all matches, of all sizes, of a query sequence against a suffix array
void countMatches( size_t queryNum, const uchar* querySeq ){
- indexT loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum);
- indexT loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum);
+ size_t loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum);
+ size_t loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum);
if( args.minHitDepth > 1 )
loopEnd -= std::min( args.minHitDepth - 1, loopEnd );
- for( indexT i = loopBeg; i < loopEnd; i += args.queryStep ){
+ for( size_t i = loopBeg; i < loopEnd; i += args.queryStep ){
for( unsigned x = 0; x < numOfIndexes; ++x )
suffixArrays[x].countMatches( matchCounts[queryNum], querySeq + i,
text.seqReader(), args.maxHitDepth );
@@ -545,8 +553,8 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
DiagonalTable dt; // record already-covered positions on each diagonal
countT matchCount = 0, gaplessExtensionCount = 0, gaplessAlignmentCount = 0;
- indexT loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum);
- indexT loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum);
+ size_t loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum);
+ size_t loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum);
if( args.minHitDepth > 1 )
loopEnd -= std::min( args.minHitDepth - 1, loopEnd );
@@ -571,7 +579,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns,
// increase "i" to the delimiter position. This gave a speed-up
// of only 3%, with 34-nt tags.
- indexT gaplessAlignmentsPerQueryPosition = 0;
+ size_t gaplessAlignmentsPerQueryPosition = 0;
for( /* noop */; beg < end; ++beg ){ // loop over suffix-array matches
if( gaplessAlignmentsPerQueryPosition ==
@@ -639,7 +647,7 @@ void alignGapped( LastAligner& aligner,
size_t queryNum, char strand, const uchar* querySeq,
Phase::Enum phase ){
Dispatcher dis( phase, aligner, queryNum, strand, querySeq );
- indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
+ size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
countT gappedExtensionCount = 0, gappedAlignmentCount = 0;
// Redo the gapless extensions, using gapped score parameters.
@@ -714,7 +722,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
size_t queryNum, char strand, const uchar* querySeq ){
Centroid& centroid = aligner.centroid;
Dispatcher dis( Phase::final, aligner, queryNum, strand, querySeq );
- indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
+ size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
if( args.outputType > 3 ){
if( dis.p ){
@@ -751,7 +759,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
static void eraseWeakAlignments(LastAligner &aligner, AlignmentPot &gappedAlns,
size_t queryNum, char strand,
const uchar *querySeq) {
- indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
+ size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0;
Dispatcher dis(Phase::gapless, aligner, queryNum, strand, querySeq);
for (size_t i = 0; i < gappedAlns.size(); ++i) {
Alignment &a = gappedAlns.items[i];
@@ -1146,8 +1154,8 @@ void lastal( int argc, char** argv ){
args.resetCumulativeOptions(); // because we will do fromArgs again
unsigned volumes = unsigned(-1);
- indexT refMinimizerWindow = 1; // assume this value, if not specified
- indexT minSeedLimit = 0;
+ size_t refMinimizerWindow = 1; // assume this value, if not specified
+ size_t minSeedLimit = 0;
countT refSequences = -1;
countT refLetters = -1;
bool isKeepRefLowercase = true;
diff --git a/src/lastdb.cc b/src/lastdb.cc
index 57cf321..085d9d3 100644
--- a/src/lastdb.cc
+++ b/src/lastdb.cc
@@ -38,7 +38,7 @@ bool isDubiousDna( const Alphabet& alph, const MultiSequence& multi ){
const uchar* seq = multi.seqReader() + multi.seqBeg(0);
unsigned dnaCount = 0;
- for( indexT i = 0; i < 100; ++i ){ // look at the first 100 letters
+ for( unsigned i = 0; i < 100; ++i ){ // look at the first 100 letters
uchar c = alph.numbersToUppercase[ seq[i] ];
if( c == alph.size ) return false; // we hit the end of the sequence early
if( c < alph.size || c == alph.encode[ (uchar)'N' ] ) ++dnaCount;
@@ -140,6 +140,7 @@ void writePrjFile( const std::string& fileName, const LastdbArguments& args,
else{
f << "numofindexes=" << numOfIndexes << '\n';
}
+ f << "integersize=" << (sizeof(indexT) * CHAR_BIT) << '\n';
writeLastalOptions( f, seedText );
}
diff --git a/src/makefile b/src/makefile
index efbfb2c..cddf3b7 100644
--- a/src/makefile
+++ b/src/makefile
@@ -8,51 +8,78 @@ CXXFLAGS = -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef \
CFLAGS = -Wall -O2
-DBOBJ = Alphabet.o MultiSequence.o CyclicSubsetSeed.o \
-SubsetSuffixArray.o LastdbArguments.o io.o fileMap.o TantanMasker.o \
-ScoreMatrix.o SubsetMinimizerFinder.o LambdaCalculator.o tantan.o \
-SubsetSuffixArraySort.o MultiSequenceQual.o lastdb.o
-
-ALOBJ = Alphabet.o MultiSequence.o CyclicSubsetSeed.o \
-SubsetSuffixArray.o LastalArguments.o io.o fileMap.o TantanMasker.o \
-ScoreMatrix.o SubsetMinimizerFinder.o tantan.o DiagonalTable.o \
-SegmentPair.o Alignment.o GappedXdropAligner.o SegmentPairPot.o \
-AlignmentPot.o GeneralizedAffineGapCosts.o Centroid.o \
-LambdaCalculator.o TwoQualityScoreMatrix.o OneQualityScoreMatrix.o \
-QualityPssmMaker.o GeneticCode.o LastEvaluer.o GreedyXdropAligner.o \
-gaplessXdrop.o gaplessPssmXdrop.o gaplessTwoQualityXdrop.o \
-SubsetSuffixArraySearch.o AlignmentWrite.o MultiSequenceQual.o \
+alpObj = alp/sls_alignment_evaluer.o alp/sls_pvalues.o \
+alp/sls_alp_sim.o alp/sls_alp_regression.o alp/sls_alp_data.o \
+alp/sls_alp.o alp/sls_basic.o alp/njn_localmaxstatmatrix.o \
+alp/njn_localmaxstat.o alp/njn_localmaxstatutil.o \
+alp/njn_dynprogprob.o alp/njn_dynprogprobproto.o \
+alp/njn_dynprogproblim.o alp/njn_ioutil.o alp/njn_random.o \
+alp/sls_falp_alignment_evaluer.o alp/sls_fsa1_pvalues.o \
+alp/sls_fsa1_utils.o alp/sls_fsa1.o alp/sls_fsa1_parameters.o
+
+indexObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \
+ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \
+tantan.o LastdbArguments.o
+
+indexObj4 = MultiSequence.o MultiSequenceQual.o SubsetSuffixArray.o \
+SubsetSuffixArraySort.o lastdb.o
+
+alignObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \
+ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \
+tantan.o LastalArguments.o GappedXdropAligner.o \
GappedXdropAlignerPssm.o GappedXdropAligner2qual.o \
-GappedXdropAligner3frame.o lastal.o alp/sls_alignment_evaluer.o \
-alp/sls_pvalues.o alp/sls_alp_sim.o alp/sls_alp_regression.o \
-alp/sls_alp_data.o alp/sls_alp.o alp/sls_basic.o \
-alp/njn_localmaxstatmatrix.o alp/njn_localmaxstat.o \
-alp/njn_localmaxstatutil.o alp/njn_dynprogprob.o \
-alp/njn_dynprogprobproto.o alp/njn_dynprogproblim.o alp/njn_ioutil.o \
-alp/njn_random.o alp/sls_falp_alignment_evaluer.o \
-alp/sls_fsa1_pvalues.o alp/sls_fsa1_utils.o alp/sls_fsa1.o \
-alp/sls_fsa1_parameters.o
-
-SPOBJ = Alphabet.o MultiSequence.o fileMap.o split/cbrc_linalg.o \
-split/last-split.o split/cbrc_split_aligner.o split/last-split-main.o \
-split/cbrc_unsplit_alignment.o
+GappedXdropAligner3frame.o GeneralizedAffineGapCosts.o GeneticCode.o \
+GreedyXdropAligner.o LastEvaluer.o OneQualityScoreMatrix.o \
+QualityPssmMaker.o TwoQualityScoreMatrix.o gaplessXdrop.o \
+gaplessPssmXdrop.o gaplessTwoQualityXdrop.o $(alpObj)
+
+alignObj4 = Alignment.o AlignmentPot.o AlignmentWrite.o Centroid.o \
+DiagonalTable.o MultiSequence.o MultiSequenceQual.o SegmentPair.o \
+SegmentPairPot.o SubsetSuffixArray.o SubsetSuffixArraySearch.o \
+lastal.o
+
+splitObj0 = Alphabet.o fileMap.o split/cbrc_linalg.o \
+split/cbrc_unsplit_alignment.o split/last-split-main.o
+
+splitObj4 = MultiSequence.o split/cbrc_split_aligner.o \
+split/last-split.o
PPOBJ = last-pair-probs.o last-pair-probs-main.o io.o
MBOBJ = last-merge-batches.o
-ALL = lastdb lastal last-split last-merge-batches last-pair-probs
+ALL = lastdb lastal last-split last-merge-batches last-pair-probs \
+lastdb8 lastal8 last-split8
+
+indexObj8 = $(indexObj4:.o=.o8)
+alignObj8 = $(alignObj4:.o=.o8)
+splitObj8 = $(splitObj4:.o=.o8)
all: $(ALL)
-lastdb: $(DBOBJ)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(DBOBJ)
+indexAllObj4 = $(indexObj0) $(indexObj4)
+lastdb: $(indexAllObj4)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4)
+
+indexAllObj8 = $(indexObj0) $(indexObj8)
+lastdb8: $(indexAllObj8)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8)
+
+alignAllObj4 = $(alignObj0) $(alignObj4)
+lastal: $(alignAllObj4)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4)
-lastal: $(ALOBJ)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(ALOBJ)
+alignAllObj8 = $(alignObj0) $(alignObj8)
+lastal8: $(alignAllObj8)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8)
-last-split: $(SPOBJ)
- $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(SPOBJ)
+splitAllObj4 = $(splitObj0) $(splitObj4)
+last-split: $(splitAllObj4)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj4)
+
+splitAllObj8 = $(splitObj0) $(splitObj8)
+last-split8: $(splitAllObj8)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8)
last-pair-probs: $(PPOBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ)
@@ -61,19 +88,22 @@ last-merge-batches: $(MBOBJ)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ)
.SUFFIXES:
-.SUFFIXES: .o .c .cc .cpp
+.SUFFIXES: .o .c .cc .cpp .o8
.c.o:
$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $<
.cc.o:
- $(CXX) $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $<
+ $(CXX) -DLAST_INT_TYPE=unsigned $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $<
+
+.cc.o8:
+ $(CXX) -DLAST_INT_TYPE=size_t $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $<
.cpp.o:
$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
clean:
- rm -f $(ALL) *.o */*.o
+ rm -f $(ALL) *.o* */*.o*
CyclicSubsetSeedData.hh: ../data/*.seed
../build/seed-inc.sh ../data/*.seed > $@
@@ -95,96 +125,98 @@ version.hh: FORCE
FORCE:
+fix8 = sed 's/.*\.o/& &8/'
+
depend:
sed '/[m][v]/q' makefile > m
- $(CXX) -MM *.cc >> m
+ $(CXX) -MM *.cc | $(fix8) >> m
$(CC) -MM *.c >> m
$(CXX) -MM alp/*.cpp | sed 's|.*:|alp/&|' >> m
- $(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' >> m
+ $(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' | $(fix8) >> m
mv m makefile
-Alignment.o: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \
+Alignment.o Alignment.o8: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \
Alphabet.hh Centroid.hh GappedXdropAligner.hh \
GeneralizedAffineGapCosts.hh OneQualityScoreMatrix.hh GeneticCode.hh \
GreedyXdropAligner.hh TwoQualityScoreMatrix.hh
-AlignmentPot.o: AlignmentPot.cc AlignmentPot.hh Alignment.hh \
+AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh \
ScoreMatrixRow.hh SegmentPair.hh
-AlignmentWrite.o: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \
+AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \
SegmentPair.hh GeneticCode.hh LastEvaluer.hh \
alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
MultiSequence.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
Alphabet.hh
-Alphabet.o: Alphabet.cc Alphabet.hh
-Centroid.o: Centroid.cc Centroid.hh GappedXdropAligner.hh \
+Alphabet.o Alphabet.o8: Alphabet.cc Alphabet.hh
+Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
ScoreMatrixRow.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \
OneQualityScoreMatrix.hh GappedXdropAlignerInl.hh
-CyclicSubsetSeed.o: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \
+CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \
CyclicSubsetSeedData.hh io.hh stringify.hh
-DiagonalTable.o: DiagonalTable.cc DiagonalTable.hh
-GappedXdropAligner.o: GappedXdropAligner.cc GappedXdropAligner.hh \
+DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh
+GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \
ScoreMatrixRow.hh GappedXdropAlignerInl.hh
-GappedXdropAligner2qual.o: GappedXdropAligner2qual.cc \
+GappedXdropAligner2qual.o GappedXdropAligner2qual.o8: GappedXdropAligner2qual.cc \
GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh \
TwoQualityScoreMatrix.hh
-GappedXdropAligner3frame.o: GappedXdropAligner3frame.cc \
+GappedXdropAligner3frame.o GappedXdropAligner3frame.o8: GappedXdropAligner3frame.cc \
GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh
-GappedXdropAligner3framePssm.o: GappedXdropAligner3framePssm.cc \
+GappedXdropAligner3framePssm.o GappedXdropAligner3framePssm.o8: GappedXdropAligner3framePssm.cc \
GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh
-GappedXdropAlignerPssm.o: GappedXdropAlignerPssm.cc GappedXdropAligner.hh \
+GappedXdropAlignerPssm.o GappedXdropAlignerPssm.o8: GappedXdropAlignerPssm.cc GappedXdropAligner.hh \
ScoreMatrixRow.hh GappedXdropAlignerInl.hh
-GeneralizedAffineGapCosts.o: GeneralizedAffineGapCosts.cc \
+GeneralizedAffineGapCosts.o GeneralizedAffineGapCosts.o8: GeneralizedAffineGapCosts.cc \
GeneralizedAffineGapCosts.hh
-GeneticCode.o: GeneticCode.cc GeneticCode.hh Alphabet.hh
-GreedyXdropAligner.o: GreedyXdropAligner.cc GreedyXdropAligner.hh \
+GeneticCode.o GeneticCode.o8: GeneticCode.cc GeneticCode.hh Alphabet.hh
+GreedyXdropAligner.o GreedyXdropAligner.o8: GreedyXdropAligner.cc GreedyXdropAligner.hh \
ScoreMatrixRow.hh
-LambdaCalculator.o: LambdaCalculator.cc LambdaCalculator.hh
-LastEvaluer.o: LastEvaluer.cc LastEvaluer.hh ScoreMatrixRow.hh \
+LambdaCalculator.o LambdaCalculator.o8: LambdaCalculator.cc LambdaCalculator.hh
+LastEvaluer.o LastEvaluer.o8: LastEvaluer.cc LastEvaluer.hh ScoreMatrixRow.hh \
alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
GeneticCode.hh
-LastalArguments.o: LastalArguments.cc LastalArguments.hh \
+LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \
SequenceFormat.hh stringify.hh version.hh
-LastdbArguments.o: LastdbArguments.cc LastdbArguments.hh \
+LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \
SequenceFormat.hh stringify.hh version.hh
-MultiSequence.o: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \
+MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \
VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh
-MultiSequenceQual.o: MultiSequenceQual.cc MultiSequence.hh \
+MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \
ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
-OneQualityScoreMatrix.o: OneQualityScoreMatrix.cc \
+OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
OneQualityScoreMatrix.hh ScoreMatrixRow.hh qualityScoreUtil.hh \
stringify.hh
-QualityPssmMaker.o: QualityPssmMaker.cc QualityPssmMaker.hh \
+QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \
ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
-ScoreMatrix.o: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh
-SegmentPair.o: SegmentPair.cc SegmentPair.hh
-SegmentPairPot.o: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh
-SubsetMinimizerFinder.o: SubsetMinimizerFinder.cc \
+ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh
+SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh
+SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh
+SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \
SubsetMinimizerFinder.hh CyclicSubsetSeed.hh
-SubsetSuffixArray.o: SubsetSuffixArray.cc SubsetSuffixArray.hh \
+SubsetSuffixArray.o SubsetSuffixArray.o8: SubsetSuffixArray.cc SubsetSuffixArray.hh \
CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
SubsetMinimizerFinder.hh io.hh
-SubsetSuffixArraySearch.o: SubsetSuffixArraySearch.cc \
+SubsetSuffixArraySearch.o SubsetSuffixArraySearch.o8: SubsetSuffixArraySearch.cc \
SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
fileMap.hh stringify.hh
-SubsetSuffixArraySort.o: SubsetSuffixArraySort.cc SubsetSuffixArray.hh \
+SubsetSuffixArraySort.o SubsetSuffixArraySort.o8: SubsetSuffixArraySort.cc SubsetSuffixArray.hh \
CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
-TantanMasker.o: TantanMasker.cc TantanMasker.hh ScoreMatrixRow.hh \
+TantanMasker.o TantanMasker.o8: TantanMasker.cc TantanMasker.hh ScoreMatrixRow.hh \
tantan.hh ScoreMatrix.hh LambdaCalculator.hh
-TwoQualityScoreMatrix.o: TwoQualityScoreMatrix.cc \
+TwoQualityScoreMatrix.o TwoQualityScoreMatrix.o8: TwoQualityScoreMatrix.cc \
TwoQualityScoreMatrix.hh ScoreMatrixRow.hh qualityScoreUtil.hh \
stringify.hh
-fileMap.o: fileMap.cc fileMap.hh stringify.hh
-gaplessPssmXdrop.o: gaplessPssmXdrop.cc gaplessPssmXdrop.hh \
+fileMap.o fileMap.o8: fileMap.cc fileMap.hh stringify.hh
+gaplessPssmXdrop.o gaplessPssmXdrop.o8: gaplessPssmXdrop.cc gaplessPssmXdrop.hh \
ScoreMatrixRow.hh
-gaplessTwoQualityXdrop.o: gaplessTwoQualityXdrop.cc \
+gaplessTwoQualityXdrop.o gaplessTwoQualityXdrop.o8: gaplessTwoQualityXdrop.cc \
gaplessTwoQualityXdrop.hh TwoQualityScoreMatrix.hh ScoreMatrixRow.hh
-gaplessXdrop.o: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh
-io.o: io.cc io.hh
-last-pair-probs-main.o: last-pair-probs-main.cc last-pair-probs.hh \
+gaplessXdrop.o gaplessXdrop.o8: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh
+io.o io.o8: io.cc io.hh
+last-pair-probs-main.o last-pair-probs-main.o8: last-pair-probs-main.cc last-pair-probs.hh \
stringify.hh version.hh
-last-pair-probs.o: last-pair-probs.cc last-pair-probs.hh io.hh \
+last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh io.hh \
stringify.hh
-lastal.o: lastal.cc LastalArguments.hh SequenceFormat.hh \
+lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \
QualityPssmMaker.hh ScoreMatrixRow.hh OneQualityScoreMatrix.hh \
TwoQualityScoreMatrix.hh qualityScoreUtil.hh stringify.hh \
LambdaCalculator.hh LastEvaluer.hh alp/sls_alignment_evaluer.hpp \
@@ -197,12 +229,12 @@ lastal.o: lastal.cc LastalArguments.hh SequenceFormat.hh \
TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \
gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh io.hh \
threadUtil.hh version.hh
-lastdb.o: lastdb.cc LastdbArguments.hh SequenceFormat.hh \
+lastdb.o lastdb.o8: lastdb.cc LastdbArguments.hh SequenceFormat.hh \
SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \
fileMap.hh stringify.hh Alphabet.hh MultiSequence.hh ScoreMatrixRow.hh \
TantanMasker.hh tantan.hh io.hh qualityScoreUtil.hh threadUtil.hh \
version.hh
-tantan.o: tantan.cc tantan.hh
+tantan.o tantan.o8: tantan.cc tantan.hh
last-merge-batches.o: last-merge-batches.c version.hh
alp/njn_dynprogprob.o: alp/njn_dynprogprob.cpp alp/njn_dynprogprob.hpp \
alp/njn_dynprogprobproto.hpp alp/njn_memutil.hpp alp/njn_ioutil.hpp
@@ -275,17 +307,17 @@ alp/sls_fsa1_utils.o: alp/sls_fsa1_utils.cpp alp/sls_fsa1_utils.hpp \
alp/sls_pvalues.o: alp/sls_pvalues.cpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
alp/sls_alp_data.hpp alp/sls_alp_regression.hpp alp/njn_random.hpp \
alp/njn_uniform.hpp alp/sls_normal_distr_array.hpp
-split/cbrc_linalg.o: split/cbrc_linalg.cc split/cbrc_linalg.hh
-split/cbrc_split_aligner.o: split/cbrc_split_aligner.cc \
+split/cbrc_linalg.o split/cbrc_linalg.o8: split/cbrc_linalg.cc split/cbrc_linalg.hh
+split/cbrc_split_aligner.o split/cbrc_split_aligner.o8: split/cbrc_split_aligner.cc \
split/cbrc_split_aligner.hh split/cbrc_unsplit_alignment.hh \
split/cbrc_int_exponentiator.hh Alphabet.hh MultiSequence.hh \
ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
split/cbrc_linalg.hh
-split/cbrc_unsplit_alignment.o: split/cbrc_unsplit_alignment.cc \
+split/cbrc_unsplit_alignment.o split/cbrc_unsplit_alignment.o8: split/cbrc_unsplit_alignment.cc \
split/cbrc_unsplit_alignment.hh
-split/last-split-main.o: split/last-split-main.cc split/last-split.hh \
+split/last-split-main.o split/last-split-main.o8: split/last-split-main.cc split/last-split.hh \
stringify.hh version.hh
-split/last-split.o: split/last-split.cc split/last-split.hh \
+split/last-split.o split/last-split.o8: split/last-split.cc split/last-split.hh \
split/cbrc_split_aligner.hh split/cbrc_unsplit_alignment.hh \
split/cbrc_int_exponentiator.hh Alphabet.hh MultiSequence.hh \
ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
diff --git a/src/split/cbrc_split_aligner.cc b/src/split/cbrc_split_aligner.cc
index c395a5f..bea2d07 100644
--- a/src/split/cbrc_split_aligner.cc
+++ b/src/split/cbrc_split_aligner.cc
@@ -714,8 +714,8 @@ void SplitAligner::initSpliceSignals(unsigned i) {
StringNumMap::const_iterator f = chromosomeIndex.find(a.rname);
if (f == chromosomeIndex.end())
err("can't find " + std::string(a.rname) + " in the genome");
- unsigned v = f->second % maxGenomeVolumes();
- unsigned c = f->second / maxGenomeVolumes();
+ size_t v = f->second % maxGenomeVolumes();
+ size_t c = f->second / maxGenomeVolumes();
const uchar *chromBeg = genome[v].seqReader() + genome[v].seqBeg(c);
const uchar *chromEnd = genome[v].seqReader() + genome[v].seqEnd(c);
if (a.rend > chromEnd - chromBeg)
@@ -916,13 +916,13 @@ void SplitAligner::flipSpliceSignals() {
}
}
-double SplitAligner::spliceSignalStrandProb() const {
+double SplitAligner::spliceSignalStrandLogOdds() const {
assert(rescales.size() == rescalesRev.size());
- double r = 1.0;
+ double logOdds = 0;
for (unsigned j = 0; j < rescales.size(); ++j) {
- r *= rescalesRev[j] / rescales[j];
- } // r might overflow to inf, but that should be OK
- return 1.0 / (1.0 + r);
+ logOdds += std::log(rescales[j] / rescalesRev[j]);
+ }
+ return logOdds;
}
// 1st 1 million reads from SRR359290.fastq:
@@ -1088,8 +1088,9 @@ void SplitAligner::printParameters() const {
static void readPrjFile(const std::string& baseName,
std::string& alphabetLetters,
- unsigned& seqCount,
- unsigned& volumes) {
+ size_t& seqCount,
+ size_t& volumes) {
+ size_t fileBitsPerInt = 32;
seqCount = volumes = -1;
std::string fileName = baseName + ".prj";
@@ -1102,15 +1103,22 @@ static void readPrjFile(const std::string& baseName,
getline(iss, word, '=');
if (word == "alphabet") iss >> alphabetLetters;
if (word == "numofsequences") iss >> seqCount;
- if( word == "volumes" ) iss >> volumes;
+ if (word == "volumes") iss >> volumes;
+ if (word == "integersize") iss >> fileBitsPerInt;
}
if (alphabetLetters != "ACGT") err("can't read file: " + fileName);
+
+ if (fileBitsPerInt != sizeof(MultiSequence::indexT) * CHAR_BIT) {
+ if (fileBitsPerInt == 32) err("please use last-split for " + baseName);
+ if (fileBitsPerInt == 64) err("please use last-split8 for " + baseName);
+ err("weird integersize in " + fileName);
+ }
}
void SplitAligner::readGenomeVolume(const std::string& baseName,
- unsigned seqCount,
- unsigned volumeNumber) {
+ size_t seqCount,
+ size_t volumeNumber) {
if (seqCount + 1 == 0) err("can't read: " + baseName);
genome[volumeNumber].fromFiles(baseName, seqCount, 0);
@@ -1125,14 +1133,14 @@ void SplitAligner::readGenomeVolume(const std::string& baseName,
void SplitAligner::readGenome(const std::string& baseName) {
std::string alphabetLetters;
- unsigned seqCount, volumes;
+ size_t seqCount, volumes;
readPrjFile(baseName, alphabetLetters, seqCount, volumes);
if (volumes + 1 > 0 && volumes > 1) {
if (volumes > maxGenomeVolumes()) err("too many volumes: " + baseName);
- for (unsigned i = 0; i < volumes; ++i) {
+ for (size_t i = 0; i < volumes; ++i) {
std::string b = baseName + stringify(i);
- unsigned c, v;
+ size_t c, v;
readPrjFile(b, alphabetLetters, c, v);
readGenomeVolume(b, c, i);
}
diff --git a/src/split/cbrc_split_aligner.hh b/src/split/cbrc_split_aligner.hh
index 53db5b0..a2ca7ac 100644
--- a/src/split/cbrc_split_aligner.hh
+++ b/src/split/cbrc_split_aligner.hh
@@ -102,9 +102,10 @@ public:
// Toggles between forward and reverse-complement splice signals
void flipSpliceSignals();
- // The probability that the query uses splice signals in the
- // orientation currently set by flipSpliceSignals()
- double spliceSignalStrandProb() const;
+ // This returns log(p / (1-p)), where p is the probability that
+ // the query uses splice signals in the orientation currently set
+ // by flipSpliceSignals()
+ double spliceSignalStrandLogOdds() const;
private:
static const int numQualCodes = 64;
@@ -203,7 +204,7 @@ private:
void initRbegsAndEnds();
static size_t maxGenomeVolumes() { return sizeof genome / sizeof *genome; }
void readGenomeVolume(const std::string& baseName,
- unsigned seqCount, unsigned volumeNumber);
+ size_t seqCount, size_t volumeNumber);
void dpExtensionMinScores(int maxJumpScore,
size_t& minScore1, size_t& minScore2) const;
diff --git a/src/split/last-split.cc b/src/split/last-split.cc
index b18a156..04790bc 100644
--- a/src/split/last-split.cc
+++ b/src/split/last-split.cc
@@ -70,7 +70,7 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa,
const cbrc::UnsplitAlignment& a,
unsigned alnNum,
unsigned qSliceBeg, unsigned qSliceEnd,
- double forwardDirectionProb,
+ double senseStrandLogOdds,
const LastSplitOptions& opts,
bool isAlreadySplit) {
unsigned alnBeg, alnEnd;
@@ -92,9 +92,11 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa,
}
if (opts.direction == 0) p.swap(pRev);
if (opts.direction == 2) {
- double reverseDirectionProb = 1.0 - forwardDirectionProb;
+ double reverseProb = 1 / (1 + std::exp(senseStrandLogOdds));
+ // the exp might overflow to inf, but that should be OK
+ double forwardProb = 1 - reverseProb;
for (unsigned i = 0; i < p.size(); ++i) {
- p[i] = forwardDirectionProb * p[i] + reverseDirectionProb * pRev[i];
+ p[i] = forwardProb * p[i] + reverseProb * pRev[i];
}
}
@@ -115,8 +117,16 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa,
}
std::cout << std::setprecision(mismapPrecision)
- << "a score=" << score << " mismap=" << mismap << "\n"
- << std::setprecision(6);
+ << "a score=" << score << " mismap=" << mismap;
+ if (opts.direction == 2) {
+ double b = senseStrandLogOdds / std::log(2.0);
+ if (b < 0.1 && b > -0.1) b = 0;
+ else if (b > 10) b = std::floor(b + 0.5);
+ else if (b < -10) b = std::ceil(b - 0.5);
+ int sensePrecision = (b < 10 && b > -10) ? 2 : 3;
+ std::cout << std::setprecision(sensePrecision) << " sense=" << b;
+ }
+ std::cout << "\n" << std::setprecision(6);
if (a.qstrand == '-') cbrc::flipMafStrands(s.begin(), s.end());
if (opts.no_split && a.linesEnd[-1][0] == 'c') s.push_back(a.linesEnd[-1]);
cbrc::printMaf(s);
@@ -149,17 +159,14 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg,
sa.flipSpliceSignals();
}
- double forwardDirectionProb = -1;
- if (opts.direction == 2) {
- forwardDirectionProb = sa.spliceSignalStrandProb();
- if (opts.verbose) std::cerr << "\tforwardProb=" << forwardDirectionProb;
- }
+ double senseStrandLogOdds =
+ (opts.direction == 2) ? sa.spliceSignalStrandLogOdds() : 0;
if (opts.no_split) {
if (opts.verbose) std::cerr << "\n";
for (unsigned i = 0; i < end - beg; ++i) {
doOneAlignmentPart(sa, beg[i], i, beg[i].qstart, beg[i].qend,
- forwardDirectionProb, opts, isAlreadySplit);
+ senseStrandLogOdds, opts, isAlreadySplit);
}
} else {
long viterbiScore = LONG_MIN;
@@ -192,7 +199,7 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg,
for (unsigned k = 0; k < alnNums.size(); ++k) {
unsigned i = alnNums[k];
doOneAlignmentPart(sa, beg[i], i, queryBegs[k], queryEnds[k],
- forwardDirectionProb, opts, isAlreadySplit);
+ senseStrandLogOdds, opts, isAlreadySplit);
}
}
}
diff --git a/src/version.hh b/src/version.hh
index 9420baa..25bdae7 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"809"
+"828"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list