[med-svn] [Git][med-team/last-align][upstream] New upstream version 1145
Nilesh Patra
gitlab at salsa.debian.org
Tue Nov 10 21:40:48 GMT 2020
Nilesh Patra pushed to branch upstream at Debian Med / last-align
Commits:
649febce by Nilesh Patra at 2020-11-11T02:56:23+05:30
New upstream version 1145
- - - - -
21 changed files:
- ChangeLog.txt
- doc/last-split.html
- doc/last-split.txt
- src/Alignment.cc
- src/Alignment.hh
- src/AlignmentWrite.cc
- src/Centroid.cc
- src/Centroid.hh
- src/GappedXdropAligner.cc
- src/GappedXdropAligner2qual.cc
- src/GappedXdropAlignerDna.cc
- src/GappedXdropAlignerFrame.cc
- src/GappedXdropAlignerPssm.cc
- src/LastalArguments.cc
- src/lastal.cc
- src/makefile
- + src/mcf_frameshift_xdrop_aligner.cc
- + src/mcf_frameshift_xdrop_aligner.hh
- src/mcf_gap_costs.cc
- src/mcf_gap_costs.hh
- src/version.hh
Changes:
=====================================
ChangeLog.txt
=====================================
@@ -1,8 +1,71 @@
+2020-11-06 Martin C. Frith <Martin C. Frith>
+
+ * src/mcf_frameshift_xdrop_aligner.hh:
+ Try to appease C++ compilers
+ [62a147543f2b] [tip]
+
+ * doc/last-split.txt:
+ Add awk mismap trick to doc
+ [aabc3e86c600]
+
+ * src/GappedXdropAlignerFrame.cc:
+ Refactor
+ [10c6c0ff4ac2]
+
+ * src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+ src/GappedXdropAlignerDna.cc, src/GappedXdropAlignerFrame.cc,
+ src/GappedXdropAlignerPssm.cc:
+ Refactor
+ [0bff03a84a63]
+
+ * src/Alignment.cc, src/Alignment.hh, src/AlignmentWrite.cc,
+ src/LastalArguments.cc, src/lastal.cc, src/makefile,
+ src/mcf_frameshift_xdrop_aligner.cc,
+ src/mcf_frameshift_xdrop_aligner.hh, test/last-test.out, test/last-
+ test.sh:
+ Add probabilities for new-style frameshifts
+ [8b133765d82e]
+
+2020-11-05 Martin C. Frith <Martin C. Frith>
+
+ * src/Alignment.hh, src/Centroid.cc, src/Centroid.hh:
+ Refactor
+ [8291399ebe14]
+
+ * src/Alignment.hh, src/AlignmentWrite.cc, test/last-test.out, test
+ /last-test.sh:
+ Fix protein-codon alignment probabilities
+ [86ac07a750ac]
+
+2020-11-04 Martin C. Frith <Martin C. Frith>
+
+ * src/Centroid.cc, src/lastal.cc, src/mcf_gap_costs.cc,
+ src/mcf_gap_costs.hh:
+ Refactor
+ [8383c3867429]
+
+ * src/Alignment.cc, src/Alignment.hh, src/lastal.cc, src/makefile:
+ Refactor
+ [115ac5aa5bee]
+
+ * src/Alignment.cc, src/AlignmentWrite.cc:
+ Refactor
+ [defdc3f2ccdd]
+
+ * src/Alignment.cc:
+ Refactor
+ [7b420067c0fa]
+
+ * src/Alignment.cc, src/Alignment.hh, src/AlignmentWrite.cc,
+ src/Centroid.cc, src/Centroid.hh, src/lastal.cc:
+ Refactor
+ [60f728268532]
+
2020-10-14 Martin C. Frith <Martin C. Frith>
* scripts/last-dotplot:
last-dotplot: fix crash on long annotation names
- [04663552433d] [tip]
+ [04663552433d]
* scripts/last-dotplot:
last-dotplot: fix crash on annotation names
=====================================
doc/last-split.html
=====================================
@@ -500,6 +500,14 @@ Error probability <= 10 ^ -((ASCII value - 33) / 10)
you run last-split twice, as in the genome alignment recipes, the
mismap is the lowest combined error probability from both "p" lines.)</p>
</div>
+<div class="section" id="tips">
+<h2>Tips</h2>
+<p>To omit alignments with mismap probability > <tt class="docutils literal"><span class="pre">10^-6</span></tt> (say), you can
+use the <tt class="docutils literal"><span class="pre">-m</span></tt> option (see below), or do this:</p>
+<pre class="literal-block">
+awk -F= '/^a/ {i = $3 <= 1e-6} i' out.maf > out2.maf
+</pre>
+</div>
<div class="section" id="split-versus-spliced-alignment">
<h2>Split versus spliced alignment</h2>
<p>Here is a split alignment:</p>
=====================================
doc/last-split.txt
=====================================
@@ -111,6 +111,14 @@ The "mismap" is simply the lowest probability from the "p" line. (If
you run last-split twice, as in the genome alignment recipes, the
mismap is the lowest combined error probability from both "p" lines.)
+Tips
+----
+
+To omit alignments with mismap probability > ``10^-6`` (say), you can
+use the ``-m`` option (see below), or do this::
+
+ awk -F= '/^a/ {i = $3 <= 1e-6} i' out.maf > out2.maf
+
Split versus spliced alignment
------------------------------
=====================================
src/Alignment.cc
=====================================
@@ -2,54 +2,36 @@
#include "Alignment.hh"
#include "Alphabet.hh"
-#include "Centroid.hh"
#include "GeneticCode.hh"
-#include "GreedyXdropAligner.hh"
#include "TwoQualityScoreMatrix.hh"
-#include <cassert>
+
+#include <assert.h>
// make C++ tolerable:
#define IT(type) std::vector<type>::iterator
using namespace cbrc;
-void Alignment::fromSegmentPair( const SegmentPair& sp ){
- blocks.assign( 1, sp );
- score = sp.score;
-}
-
-static void addExpectedCounts( double* expectedCounts,
- const ExpectedCount& ec,
- const Alphabet& alph ){
+static void addExpectedCounts(double *totalCounts, const ExpectedCount &ec) {
for( unsigned i = 0; i < scoreMatrixRowSize; ++i ){
- unsigned x = alph.numbersToUppercase[i];
- if( x >= alph.size ) continue;
for( unsigned j = 0; j < scoreMatrixRowSize; ++j ){
- unsigned y = alph.numbersToUppercase[j];
- if( y >= alph.size ) continue;
- expectedCounts[ x * alph.size + y ] += ec.emit[i][j];
+ *totalCounts += ec.emit[i][j];
+ ++totalCounts;
}
}
-
- const int numEmissionCounts = alph.size * alph.size;
- double* transitionCounts = &expectedCounts[ numEmissionCounts ];
-
- transitionCounts[0] += ec.toMatch;
- transitionCounts[1] += ec.delNext + ec.delInit; // deleted letter count
- transitionCounts[2] += ec.insNext + ec.insInit; // inserted letter count
- transitionCounts[3] += ec.delInit; // deletion open/close count
- transitionCounts[4] += ec.insInit; // insertion open/close count
+ totalCounts[0] += ec.toMatch;
+ totalCounts[1] += ec.delNext + ec.delInit; // deleted letter count
+ totalCounts[2] += ec.insNext + ec.insInit; // inserted letter count
+ totalCounts[3] += ec.delInit; // deletion open/close count
+ totalCounts[4] += ec.insInit; // insertion open/close count
}
-static void countSeedMatches( double* expectedCounts,
- const uchar* seq1beg, const uchar* seq1end,
- const uchar* seq2beg, const Alphabet& alph ){
- while( seq1beg < seq1end ){
- unsigned x1 = alph.numbersToUppercase[ *seq1beg++ ];
- unsigned x2 = alph.numbersToUppercase[ *seq2beg++ ];
- if( x1 < alph.size && x2 < alph.size )
- ++expectedCounts[ x1 * alph.size + x2 ];
+static void addSeedCounts(const uchar *seq1, const uchar *seq2, size_t size,
+ double *counts) {
+ for (size_t i = 0; i < size; ++i) {
+ ++counts[seq1[i] * scoreMatrixRowSize + seq2[i]];
}
+ counts[scoreMatrixRowSize * scoreMatrixRowSize] += size;
}
// Does x precede and touch y in both sequences?
@@ -57,12 +39,12 @@ static bool isNext( const SegmentPair& x, const SegmentPair& y ){
return x.end1() == y.beg1() && x.end2() == y.beg2();
}
-void Alignment::makeXdrop( Centroid& centroid,
- GreedyXdropAligner& greedyAligner, bool isGreedy,
+void Alignment::makeXdrop( Aligners &aligners, bool isGreedy,
const uchar* seq1, const uchar* seq2, int globality,
const ScoreMatrixRow* scoreMatrix,
int smMax, int smMin,
- const mcf::GapCosts& gap, int maxDrop,
+ const const_dbl_ptr* probMatrix, double scale,
+ const GapCosts& gap, int maxDrop,
size_t frameSize, const ScoreMatrixRow* pssm2,
const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2,
@@ -73,21 +55,17 @@ void Alignment::makeXdrop( Centroid& centroid,
if( outputType == 7 ){
assert( seed.size > 0 ); // makes things easier to understand
- const int numEmissionCounts = alph.size * alph.size;
- const int numTransitionCounts = 5;
- std::vector<double>& expectedCounts = extras.expectedCounts;
- expectedCounts.resize( numEmissionCounts + numTransitionCounts );
- countSeedMatches( &expectedCounts[0],
- seq1 + seed.beg1(), seq1 + seed.end1(),
- seq2 + seed.beg2(), alph );
- expectedCounts[ numEmissionCounts ] += seed.size; // match count
+ const int numOfTransitions = frameSize ? 9 : 5;
+ std::vector<double> &ec = extras.expectedCounts;
+ ec.resize(scoreMatrixRowSize * scoreMatrixRowSize + numOfTransitions);
+ addSeedCounts(seq1 + seed.beg1(), seq2 + seed.beg2(), seed.size, &ec[0]);
}
// extend a gapped alignment in the left/reverse direction from the seed:
std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
- extend( blocks, columnAmbiguityCodes, centroid, greedyAligner, isGreedy,
+ extend( blocks, columnAmbiguityCodes, aligners, isGreedy,
seq1, seq2, seed.beg1(), seed.beg2(), false, globality,
- scoreMatrix, smMax, smMin, maxDrop, gap,
+ scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
frameSize, pssm2, sm2qual, qual1, qual2, alph,
extras, gamma, outputType );
@@ -105,9 +83,9 @@ void Alignment::makeXdrop( Centroid& centroid,
// extend a gapped alignment in the right/forward direction from the seed:
std::vector<SegmentPair> forwardBlocks;
std::vector<char> forwardAmbiguities;
- extend( forwardBlocks, forwardAmbiguities, centroid, greedyAligner, isGreedy,
+ extend( forwardBlocks, forwardAmbiguities, aligners, isGreedy,
seq1, seq2, seed.end1(), seed.end2(), true, globality,
- scoreMatrix, smMax, smMin, maxDrop, gap,
+ scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
frameSize, pssm2, sm2qual, qual1, qual2, alph,
extras, gamma, outputType );
@@ -162,7 +140,7 @@ void Alignment::makeXdrop( Centroid& centroid,
// cost of the gap between x and y
static int gapCost(const SegmentPair &x, const SegmentPair &y,
- const mcf::GapCosts &gapCosts, size_t frameSize) {
+ const GapCosts &gapCosts, size_t frameSize) {
if (gapCosts.isNewFrameshifts()) return x.score;
size_t gapSize1 = y.beg1() - x.end1();
size_t gapSize2, frameshift2;
@@ -174,7 +152,7 @@ static int gapCost(const SegmentPair &x, const SegmentPair &y,
bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2, int globality,
const ScoreMatrixRow* scoreMatrix, int maxDrop,
- const mcf::GapCosts& gapCosts, size_t frameSize,
+ const GapCosts& gapCosts, size_t frameSize,
const ScoreMatrixRow* pssm2,
const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2 ) const{
@@ -216,7 +194,7 @@ bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2, int globality,
bool Alignment::hasGoodSegment(const uchar *seq1, const uchar *seq2,
int minScore, const ScoreMatrixRow *scoreMatrix,
- const mcf::GapCosts &gapCosts, size_t frameSize,
+ const GapCosts &gapCosts, size_t frameSize,
const ScoreMatrixRow *pssm2,
const TwoQualityScoreMatrix &sm2qual,
const uchar *qual1, const uchar *qual2) const {
@@ -273,27 +251,46 @@ static void getColumnCodes(const Centroid& centroid, std::vector<char>& codes,
}
}
+static void getColumnCodes(const FrameshiftXdropAligner &fxa,
+ std::vector<char> &codes,
+ const std::vector<SegmentPair> &chunks) {
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ const SegmentPair &x = chunks[i];
+ for (size_t k = x.size; k-- > 0;) {
+ double p = fxa.matchProb(x.beg1() + k, x.beg2() + k * 3);
+ codes.push_back(asciiProbability(p));
+ }
+ size_t j = i + 1;
+ bool isNext = (j < chunks.size());
+ size_t end1 = isNext ? chunks[j].end1() : 0;
+ size_t end2 = isNext ? chunks[j].beg2() + chunks[j].size * 3 : 0;
+ size_t n1 = x.beg1() - end1;
+ size_t n2 = (x.beg2() - end2 + 1) / 3;
+ codes.insert(codes.end(), n1 + n2, '-');
+ }
+}
+
void Alignment::extend( std::vector< SegmentPair >& chunks,
std::vector< char >& columnCodes,
- Centroid& centroid,
- GreedyXdropAligner& greedyAligner, bool isGreedy,
+ Aligners &aligners, bool isGreedy,
const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isForward, int globality,
const ScoreMatrixRow* sm, int smMax, int smMin,
- int maxDrop, const mcf::GapCosts& gap,
- size_t frameSize,
+ const const_dbl_ptr* probMat, double scale,
+ int maxDrop, const GapCosts& gap, size_t frameSize,
const ScoreMatrixRow* pssm2,
- const TwoQualityScoreMatrix& sm2qual,
+ const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2,
const Alphabet& alph, AlignmentExtras& extras,
double gamma, int outputType ){
- const mcf::GapCosts::Piece &del = gap.delPieces[0];
- const mcf::GapCosts::Piece &ins = gap.insPieces[0];
+ const GapCosts::Piece &del = gap.delPieces[0];
+ const GapCosts::Piece &ins = gap.insPieces[0];
+ Centroid ¢roid = aligners.centroid;
GappedXdropAligner& aligner = centroid.aligner();
+ GreedyXdropAligner &greedyAligner = aligners.greedyAligner;
if( frameSize ){
- assert( outputType < 4 );
assert( !isGreedy );
assert( !globality );
assert( !pssm2 );
@@ -313,7 +310,27 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
while (aligner.getNextChunkFrame(end1, end2, size, gapCost, gap))
chunks.push_back(SegmentPair(end1 - size, end2 - size * 3, size,
gapCost));
+ if (outputType > 3) {
+ FrameshiftXdropAligner &fxa = aligners.frameshiftAligner;
+ double probDropLimit = exp(scale * -maxDrop);
+ double s = fxa.forward(seq1 + start1, seq2 + start2,
+ seq2 + dnaToAa(frame1, frameSize),
+ seq2 + dnaToAa(frame2, frameSize),
+ isForward, probMat, gap, probDropLimit);
+ extras.fullScore += s / scale;
+ fxa.backward(isForward, probMat, gap);
+ getColumnCodes(fxa, columnCodes, chunks);
+ if (outputType == 7) {
+ double *ec = &extras.expectedCounts[0];
+ double *subsCounts[scoreMatrixRowSize];
+ for (int i = 0; i < scoreMatrixRowSize; ++i)
+ subsCounts[i] = ec + i * scoreMatrixRowSize;
+ double *tranCounts = ec + scoreMatrixRowSize * scoreMatrixRowSize;
+ fxa.count(isForward, gap, subsCounts, tranCounts);
+ }
+ }
} else {
+ assert(outputType < 4);
size_t frame2 = isForward ? dnaStart - 1 : dnaStart + 1;
score += aligner.align3(seq1 + start1, seq2 + start2,
seq2 + dnaToAa(frame1, frameSize),
@@ -417,7 +434,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
ExpectedCount ec;
centroid.computeExpectedCounts( seq1, seq2, start1, start2,
isForward, gap, alph.size, ec );
- addExpectedCounts( &extras.expectedCounts[0], ec, alph );
+ addExpectedCounts(&extras.expectedCounts[0], ec);
}
}
}
=====================================
src/Alignment.hh
=====================================
@@ -4,25 +4,28 @@
#ifndef ALIGNMENT_HH
#define ALIGNMENT_HH
-#include "ScoreMatrixRow.hh"
+
+#include "Centroid.hh"
+#include "GreedyXdropAligner.hh"
#include "SegmentPair.hh"
-#include "mcf_gap_costs.hh"
-#include <stddef.h> // size_t
-#include <string>
+#include "mcf_frameshift_xdrop_aligner.hh"
+
#include <vector>
#include <cstring>
namespace cbrc{
-typedef unsigned char uchar;
-
-class GreedyXdropAligner;
class LastEvaluer;
class MultiSequence;
class Alphabet;
-class Centroid;
class TwoQualityScoreMatrix;
+struct Aligners {
+ Centroid centroid;
+ FrameshiftXdropAligner frameshiftAligner;
+ GreedyXdropAligner greedyAligner;
+};
+
struct AlignmentText {
// This holds the final text representation of an alignment, along
// with data for sorting it.
@@ -70,18 +73,21 @@ struct AlignmentExtras {
struct Alignment{
// make a single-block alignment:
- void fromSegmentPair( const SegmentPair& sp );
+ void fromSegmentPair(const SegmentPair &sp) {
+ blocks.assign(1, sp);
+ score = sp.score;
+ }
// Make an Alignment by doing gapped X-drop extension in both
// directions starting from a seed SegmentPair. The resulting
// Alignment might not be "optimal" (see below).
// If outputType > 3: calculates match probabilities.
// If outputType > 4: does gamma-centroid alignment.
- void makeXdrop( Centroid& centroid,
- GreedyXdropAligner& greedyAligner, bool isGreedy,
+ void makeXdrop( Aligners &aligners, bool isGreedy,
const uchar* seq1, const uchar* seq2, int globality,
const ScoreMatrixRow* scoreMatrix, int smMax, int smMin,
- const mcf::GapCosts& gap, int maxDrop, size_t frameSize,
+ const const_dbl_ptr* probMatrix, double scale,
+ const GapCosts& gap, int maxDrop, size_t frameSize,
const ScoreMatrixRow* pssm2,
const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2,
@@ -94,7 +100,7 @@ struct Alignment{
// If "globality" is non-zero, skip the prefix and suffix checks.
bool isOptimal( const uchar* seq1, const uchar* seq2, int globality,
const ScoreMatrixRow* scoreMatrix, int maxDrop,
- const mcf::GapCosts& gapCosts, size_t frameSize,
+ const GapCosts& gapCosts, size_t frameSize,
const ScoreMatrixRow* pssm2,
const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2 ) const;
@@ -102,7 +108,7 @@ struct Alignment{
// Does the Alignment have any segment with score >= minScore?
bool hasGoodSegment(const uchar *seq1, const uchar *seq2,
int minScore, const ScoreMatrixRow *scoreMatrix,
- const mcf::GapCosts &gapCosts, size_t frameSize,
+ const GapCosts &gapCosts, size_t frameSize,
const ScoreMatrixRow *pssm2,
const TwoQualityScoreMatrix &sm2qual,
const uchar *qual1, const uchar *qual2) const;
@@ -130,13 +136,13 @@ struct Alignment{
void extend( std::vector< SegmentPair >& chunks,
std::vector< char >& columnCodes,
- Centroid& centroid,
- GreedyXdropAligner& greedyAligner, bool isGreedy,
+ Aligners &aligners, bool isGreedy,
const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isForward, int globality,
- const ScoreMatrixRow* sm, int smMax, int smMin, int maxDrop,
- const mcf::GapCosts& gap, size_t frameSize,
+ const ScoreMatrixRow* sm, int smMax, int smMin,
+ const const_dbl_ptr* probMat, double scale,
+ int maxDrop, const GapCosts& gap, size_t frameSize,
const ScoreMatrixRow* pssm2,
const TwoQualityScoreMatrix& sm2qual,
const uchar* qual1, const uchar* qual2,
@@ -169,6 +175,9 @@ struct Alignment{
char *writeBotSeq(char *dest, const uchar *seq, const Alphabet &alph,
size_t qualsPerBase, size_t frameSize, bool isCodon) const;
+
+ char *writeColumnProbs(char *dest, const char *probSymbols,
+ size_t frameSize, bool isCodon) const;
};
} // end namespace cbrc
=====================================
src/AlignmentWrite.cc
=====================================
@@ -302,14 +302,33 @@ static char *writeMafHeadQ(char *out,
// Write a "c" line
static void writeMafLineC(std::vector<char> &cLine,
- const std::vector<double> &counts) {
- size_t s = counts.size();
- if (s == 0) return;
- cLine.resize(2 + 32 * s);
+ const std::vector<double> &counts,
+ const Alphabet &alph, bool isCodon) {
+ if (counts.empty()) return;
+ unsigned alphSize2 = isCodon ? 64 : alph.size;
+ unsigned numOfSubstitutions = scoreMatrixRowSize * scoreMatrixRowSize;
+ size_t numOfTransitions = counts.size() - numOfSubstitutions;
+ size_t numOfCounts = alph.size * alphSize2 + numOfTransitions;
+ cLine.resize(2 + 32 * numOfCounts);
char *e = &cLine[0];
*e++ = 'c';
- for (size_t i = 0; i < s; ++i)
- e += std::sprintf(e, " %.3g", counts[i]);
+
+ for (unsigned i = 0; i < alph.size; ++i) {
+ unsigned x = alph.numbersToLowercase[i];
+ for (unsigned j = 0; j < alphSize2; ++j) {
+ unsigned y = isCodon ? j : alph.numbersToLowercase[j];
+ double c = counts[i * scoreMatrixRowSize + j];
+ if (x != i) c += counts[x * scoreMatrixRowSize + j];
+ if (y != j) c += counts[i * scoreMatrixRowSize + y];
+ if (x != i && y != j) c += counts[x * scoreMatrixRowSize + y];
+ e += std::sprintf(e, " %.3g", c);
+ }
+ }
+
+ for (size_t i = 0; i < numOfTransitions; ++i) {
+ e += std::sprintf(e, " %.3g", counts[numOfSubstitutions + i]);
+ }
+
*e++ = '\n';
cLine.resize(e - &cLine[0]);
}
@@ -324,7 +343,7 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
const AlignmentExtras& extras) const {
bool isCodon = (translationType == 2);
double fullScore = extras.fullScore;
- const std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
+ const std::vector<char>& columnProbSymbols = extras.columnAmbiguityCodes;
size_t alnBeg1 = beg1();
size_t alnEnd1 = end1();
@@ -365,14 +384,14 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
size_t sLineLen = 2 + pLineBlankLen + numColumns(frameSize2, isCodon) + 1;
std::vector<char> cLine;
- writeMafLineC(cLine, extras.expectedCounts);
+ writeMafLineC(cLine, extras.expectedCounts, alph, isCodon);
size_t qualsPerBase1 = seq1.qualsPerLetter();
size_t qualsPerBase2 = seq2.qualsPerLetter();
bool isQuals1 = qualsPerBase1 && (translationType != 2);
bool isQuals2 = qualsPerBase2 && (translationType != 1);
- size_t sLineNum = 2 + isQuals1 + isQuals2 + !columnAmbiguityCodes.empty();
+ size_t sLineNum = 2 + isQuals1 + isQuals2 + !columnProbSymbols.empty();
size_t textLen = aLineLen + sLineLen * sLineNum + cLine.size() + 1;
char *text = new char[textLen + 1];
@@ -402,17 +421,17 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
*dest++ = '\n';
}
- Writer w(dest);
-
- if (!columnAmbiguityCodes.empty()) {
+ if (!columnProbSymbols.empty()) {
+ Writer w(dest);
w << 'p' << ' ';
w.fill(pLineBlankLen, ' ');
- w.copy(&columnAmbiguityCodes[0], columnAmbiguityCodes.size());
- w << '\n';
+ dest = w.pointer();
+ dest = writeColumnProbs(dest, &columnProbSymbols[0], frameSize2, isCodon);
+ *dest++ = '\n';
}
+ Writer w(dest);
if (!cLine.empty()) w.copy(&cLine[0], cLine.size());
-
w << '\n' << '\0'; // blank line afterwards
return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
@@ -649,3 +668,42 @@ char *Alignment::writeBotSeq(char *dest, const uchar *seq,
return dest;
}
+
+static char *writeProbSymbols(char *dest, const char *probSymbols,
+ size_t size, int aaLen) {
+ for (size_t i = 0; i < size; ++i)
+ for (int j = 0; j < aaLen; ++j)
+ *dest++ = probSymbols[i];
+ return dest;
+}
+
+char *Alignment::writeColumnProbs(char *dest, const char *probSymbols,
+ size_t frameSize, bool isCodon) const {
+ const int aaLen = isCodon ? 3 : 1;
+
+ for (size_t i = 0; i < blocks.size(); ++i) {
+ const SegmentPair &y = blocks[i];
+ if (i > 0) { // between each pair of aligned blocks:
+ const SegmentPair &x = blocks[i - 1];
+
+ size_t topLen = y.beg1() - x.end1();
+ dest = writeProbSymbols(dest, probSymbols, topLen, aaLen);
+ probSymbols += topLen;
+
+ size_t beg, end, shift;
+ setFrameCoords(beg, end, shift, x.end2(), y.beg2(), frameSize, isCodon);
+ size_t rawLen = end - beg;
+ size_t botLen = (rawLen + aaLen / 3) / aaLen;
+ if (shift) *dest++ = '-';
+ dest = writeProbSymbols(dest, probSymbols, botLen, aaLen);
+ probSymbols += botLen;
+ if (beg < end && rawLen % aaLen == 1) *dest++ = '-';
+ if (beg < end && rawLen % aaLen == 2) --dest;
+ }
+
+ dest = writeProbSymbols(dest, probSymbols, y.size, aaLen);
+ probSymbols += y.size;
+ }
+
+ return dest;
+}
=====================================
src/Centroid.cc
=====================================
@@ -3,12 +3,11 @@
#include "Centroid.hh"
#include "GappedXdropAlignerInl.hh"
+
#include <algorithm>
#include <cassert>
#include <cmath> // for exp
#include <cfloat> // for DBL_MAX
-#include <cstdlib> // for abs
-#include <iomanip>
static const double DINF = DBL_MAX / 2;
@@ -114,14 +113,14 @@ namespace cbrc{
void Centroid::forward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
- const GapCosts& gap, int globality) {
+ const GapCosts& gapCosts, int globality) {
const int seqIncrement = isExtendFwd ? 1 : -1;
initForwardMatrix();
- const double delOpen = EXP(-gap.delPieces[0].openCost / T);
- const double delGrow = EXP(-gap.delPieces[0].growCost / T);
- const double insOpen = EXP(-gap.insPieces[0].openCost / T);
- const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+ const double delOpen = gapCosts.delProbPieces[0].openProb;
+ const double delGrow = gapCosts.delProbPieces[0].growProb;
+ const double insOpen = gapCosts.insProbPieces[0].openProb;
+ const double insGrow = gapCosts.insProbPieces[0].growProb;
const double delInit = delOpen * delGrow; // for 1st letter in a deletion
const double insInit = insOpen * insGrow; // for 1st letter in an insert
@@ -231,14 +230,14 @@ namespace cbrc{
// compute posterior probabilities while executing backward algorithm
void Centroid::backward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
- const GapCosts& gap, int globality) {
+ const GapCosts& gapCosts, int globality) {
const int seqIncrement = isExtendFwd ? 1 : -1;
initBackwardMatrix();
- const double delOpen = EXP(-gap.delPieces[0].openCost / T);
- const double delGrow = EXP(-gap.delPieces[0].growCost / T);
- const double insOpen = EXP(-gap.insPieces[0].openCost / T);
- const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+ const double delOpen = gapCosts.delProbPieces[0].openProb;
+ const double delGrow = gapCosts.delProbPieces[0].growProb;
+ const double insOpen = gapCosts.insProbPieces[0].openProb;
+ const double insGrow = gapCosts.insProbPieces[0].growProb;
const double delInit = delOpen * delGrow; // for 1st letter in a deletion
const double insInit = insOpen * insGrow; // for 1st letter in an insert
@@ -516,21 +515,6 @@ namespace cbrc{
}
}
- // Return an ASCII code representing an error probability. The
- // printable codes are 33--126, but here we use a maximum of 125, so
- // that 126 is reserved for special cases.
- static char asciiProbability( double probCorrect ){
- assert( probCorrect >= 0 );
- //assert( probCorrect <= 1 ); // can fail: floating point is imperfect
- double e = 1 - probCorrect;
- double f = std::max( e, 1e-10 ); // avoid overflow errors
- double g = -10 * std::log10(f);
- int i = static_cast<int>(g); // round fractions down
- int j = i + 33;
- int k = std::min( j, 125 );
- return static_cast<char>(k);
- }
-
void Centroid::getMatchAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq2end,
size_t length) const {
@@ -584,7 +568,7 @@ namespace cbrc{
void Centroid::computeExpectedCounts ( const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isExtendFwd,
- const GapCosts& gap,
+ const GapCosts& gapCosts,
unsigned alphabetSize,
ExpectedCount& c ) const{
seq1 += start1;
@@ -600,10 +584,10 @@ namespace cbrc{
int alphabetSizeIncrement = alphabetSize;
if (!isExtendFwd) alphabetSizeIncrement *= -1;
- const double delOpen = EXP(-gap.delPieces[0].openCost / T);
- const double delGrow = EXP(-gap.delPieces[0].growCost / T);
- const double insOpen = EXP(-gap.insPieces[0].openCost / T);
- const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+ const double delOpen = gapCosts.delProbPieces[0].openProb;
+ const double delGrow = gapCosts.delProbPieces[0].growProb;
+ const double insOpen = gapCosts.insProbPieces[0].openProb;
+ const double insGrow = gapCosts.insProbPieces[0].growProb;
const double delInit = delOpen * delGrow; // for 1st letter in a deletion
const double insInit = insOpen * insGrow; // for 1st letter in an insert
=====================================
src/Centroid.hh
=====================================
@@ -3,10 +3,16 @@
#ifndef CENTROID_HH
#define CENTROID_HH
+
#include "GappedXdropAligner.hh"
#include "mcf_gap_costs.hh"
#include "SegmentPair.hh"
#include "OneQualityScoreMatrix.hh"
+
+#include <assert.h>
+#include <math.h>
+
+#include <algorithm>
#include <vector>
#include <iostream> // for debug
@@ -56,17 +62,17 @@ namespace cbrc{
void doForwardBackwardAlgorithm(const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
- bool isExtendFwd, const GapCosts& gap,
+ bool isExtendFwd, const GapCosts& gapCosts,
int globality) {
seq1 += start1;
seq2 += start2;
const ExpMatrixRow *pssm = isPssm ? pssmExp2 + start2 : 0;
numAntidiagonals = xa.numAntidiagonals();
scale.assign(numAntidiagonals + 2, 1.0);
- forward(seq1, seq2, pssm, isExtendFwd, gap, globality);
+ forward(seq1, seq2, pssm, isExtendFwd, gapCosts, globality);
mD.assign(numAntidiagonals + 2, 0.0);
mI.assign(numAntidiagonals + 2, 0.0);
- backward(seq1, seq2, pssm, isExtendFwd, gap, globality);
+ backward(seq1, seq2, pssm, isExtendFwd, gapCosts, globality);
}
double dp( double gamma );
@@ -97,7 +103,7 @@ namespace cbrc{
// Added by MH (2008/10/10) : compute expected counts for transitions and emissions
void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2, bool isExtendFwd,
- const GapCosts& gap, unsigned alphabetSize,
+ const GapCosts& gapCosts, unsigned alphabetSize,
ExpectedCount& count) const;
private:
@@ -138,11 +144,11 @@ namespace cbrc{
void forward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
- const GapCosts& gap, int globality);
+ const GapCosts& gapCosts, int globality);
void backward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
- const GapCosts& gap, int globality);
+ const GapCosts& gapCosts, int globality);
void initForwardMatrix();
void initBackwardMatrix();
@@ -156,6 +162,20 @@ namespace cbrc{
}
};
+ // Return an ASCII code representing an error probability. The
+ // printable codes are 33--126, but here we use a maximum of 125, so
+ // that 126 is reserved for special cases.
+ inline char asciiProbability(double probCorrect) {
+ assert(probCorrect >= 0);
+ //assert(probCorrect <= 1); // can fail: floating point is imperfect
+ double e = 1 - probCorrect;
+ double f = std::max(e, 1e-10); // avoid overflow errors
+ double g = -10 * log10(f);
+ int i = static_cast<int>(g); // round fractions down
+ int j = i + 33;
+ int k = std::min(j, 125);
+ return static_cast<char>(k);
+ }
}
#endif
=====================================
src/GappedXdropAligner.cc
=====================================
@@ -74,7 +74,7 @@ int GappedXdropAligner::align(const uchar *seq1,
const SimdInt mInsGrowCost = simdFill(insExtensionCost);
const int seqIncrement = isForward ? 1 : -1;
- size_t seq1beg = 0;
+ int numCells = 1;
size_t seq1end = 1;
size_t diagPos = xdropPadLen - 1;
size_t horiPos = xdropPadLen * 2 - 1;
@@ -103,9 +103,7 @@ int GappedXdropAligner::align(const uchar *seq1,
size_t antidiagonal;
for (antidiagonal = 0; /* noop */; ++antidiagonal) {
- int numCells = seq1end - seq1beg;
int n = numCells - 1;
-
const const_int_ptr *s1 = &pssmQueue.fromEnd(n + simdLen);
const uchar *s2 = seq2queue.begin();
@@ -129,7 +127,7 @@ int GappedXdropAligner::align(const uchar *seq1,
const Score *z2 = &zScores[diagPos];
int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
- minScore, b, antidiagonal, seq1beg);
+ minScore, b, antidiagonal, seq1end - numCells);
}
if (globality && isDelimiter1) {
@@ -200,6 +198,7 @@ int GappedXdropAligner::align(const uchar *seq1,
thisPos += numCells;
if (x0[n] > -INF / 2) {
+ ++numCells;
++seq1end;
const int *x = scorer[*seq1];
pssmQueue.push(x, n + simdLen);
@@ -213,10 +212,10 @@ int GappedXdropAligner::align(const uchar *seq1,
seq2 += seqIncrement;
isDelimiter2 = isDelimiter(y, scorer[0]);
} else {
- ++seq1beg;
+ --numCells;
+ if (numCells == 0) break;
++diagPos;
++horiPos;
- if (seq1beg == seq1end) break;
}
}
=====================================
src/GappedXdropAligner2qual.cc
=====================================
@@ -27,7 +27,7 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
int maxMatchScore) {
const int seqIncrement = isForward ? 1 : -1;
- size_t seq1beg = 0;
+ int numCells = 1;
size_t seq1end = 1;
size_t diagPos = xdropPadLen - 1;
size_t horiPos = xdropPadLen * 2 - 1;
@@ -41,15 +41,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
size_t antidiagonal;
for (antidiagonal = 0; /* noop */; ++antidiagonal) {
- int numCells = seq1end - seq1beg;
int n = numCells - 1;
-
- size_t seq2pos = antidiagonal - seq1beg;
-
- const uchar *s1 = isForward ? seq1 + seq1beg : seq1 - seq1beg;
- const uchar *q1 = isForward ? qual1 + seq1beg : qual1 - seq1beg;
- const uchar *s2 = isForward ? seq2 + seq2pos : seq2 - seq2pos;
- const uchar *q2 = isForward ? qual2 + seq2pos : qual2 - seq2pos;
+ const uchar *s1 = seq1;
+ const uchar *q1 = qual1;
+ const uchar *s2 = seq2;
+ const uchar *q2 = qual2;
+ seq2 += seqIncrement;
+ qual2 += seqIncrement;
initAntidiagonal(antidiagonal + 2, seq1end, thisPos, numCells);
thisPos += xdropPadLen;
@@ -73,7 +71,7 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
const Score *z2 = &zScores[diagPos];
int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
- minScore, b, antidiagonal, seq1beg);
+ minScore, b, antidiagonal, seq1end - numCells);
}
if (globality && isDelimiter1) {
@@ -154,14 +152,19 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
thisPos += numCells;
if (x0[0] > -INF / 2) {
+ ++numCells;
++seq1end;
}
if (x0[-n] <= -INF / 2) {
- ++seq1beg;
+ --numCells;
+ if (numCells == 0) break;
++diagPos;
++horiPos;
- if (seq1beg >= seq1end) break;
+ seq1 += seqIncrement;
+ qual1 += seqIncrement;
+ seq2 -= seqIncrement;
+ qual2 -= seqIncrement;
}
}
=====================================
src/GappedXdropAlignerDna.cc
=====================================
@@ -54,7 +54,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0]);
- size_t seq1beg = 0;
+ int numCells = 1;
size_t seq1end = 1;
size_t diagPos = xdropPadLen - 1;
size_t horiPos = xdropPadLen * 2 - 1;
@@ -83,9 +83,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
size_t antidiagonal;
for (antidiagonal = 2; /* noop */; ++antidiagonal) {
- int numCells = seq1end - seq1beg;
int n = numCells - 1;
-
const uchar *s1 = &seq1queue.fromEnd(n + seqLoadLen);
const uchar *s2 = seq2queue.begin();
@@ -199,6 +197,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
thisPos += numCells;
if (x0[n] != droppedTinyScore) {
+ ++numCells;
++seq1end;
uchar x = toUnmasked[*seq1];
seq1queue.push(x, n + seqLoadLen);
@@ -217,10 +216,10 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
isDna = false;
}
} else {
- ++seq1beg;
+ --numCells;
+ if (numCells == 0) break;
++diagPos;
++horiPos;
- if (seq1beg == seq1end) break;
}
}
=====================================
src/GappedXdropAlignerFrame.cc
=====================================
@@ -30,7 +30,10 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
const ScoreMatrixRow *scorer,
const GapCosts &gap,
int maxScoreDrop) {
- const const_uchar_ptr frames[] = {seq2frame0, seq2frame1, seq2frame2};
+ if (!isForward) {
+ --seq1; --seq2frame0; --seq2frame1; --seq2frame2;
+ }
+ const_uchar_ptr frames[] = {seq2frame0, seq2frame1, seq2frame2};
const int seqIncrement = isForward ? 1 : -1;
const int delOpenScore = -gap.delPieces[0].openCost;
@@ -44,7 +47,7 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
int runOfDrops = 2;
int runOfEdges = 0;
- size_t seq1beg = 0;
+ int numCells = 1;
size_t seq1end = 1;
size_t diagPos6 = xdropPadLen - 1;
size_t horiPos5 = xdropPadLen * 2 - 1;
@@ -60,13 +63,9 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
initFrame();
for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
- int numCells = seq1end - seq1beg;
- int n = numCells - 1;
-
- const uchar *seq2 = frames[antidiagonal % 3];
- size_t seq2pos = antidiagonal / 3 - seq1beg;
- const uchar *s1 = isForward ? seq1 + seq1beg : seq1 - seq1beg - 1;
- const uchar *s2 = isForward ? seq2 + seq2pos : seq2 - seq2pos - 1;
+ const uchar *s1 = seq1;
+ const uchar *s2 = frames[antidiagonal % 3];
+ frames[antidiagonal % 3] += seqIncrement;
initAntidiagonal(antidiagonal + 6, seq1end, thisPos, numCells);
thisPos += xdropPadLen;
@@ -114,9 +113,11 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
vertPos1 = thisPos;
thisPos += numCells;
+ int n = numCells - 1;
if (X0[n] > -INF / 2 || runOfEdges) {
++runOfEdges;
if (runOfEdges == 3) {
+ ++numCells;
++seq1end;
runOfEdges = 0;
}
@@ -127,8 +128,12 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
} else {
++runOfDrops;
if (runOfDrops == 3) {
- ++seq1beg;
- if (seq1beg == seq1end) break;
+ --numCells;
+ if (numCells == 0) break;
+ seq1 += seqIncrement;
+ frames[0] -= seqIncrement;
+ frames[1] -= seqIncrement;
+ frames[2] -= seqIncrement;
++diagPos6;
++horiPos5;
++horiPos4;
=====================================
src/GappedXdropAlignerPssm.cc
=====================================
@@ -25,7 +25,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
const SimdInt mInsGrowCost = simdFill(insExtensionCost);
const int seqIncrement = isForward ? 1 : -1;
- size_t seq1beg = 0;
+ int numCells = 1;
size_t seq1end = 1;
size_t diagPos = xdropPadLen - 1;
size_t horiPos = xdropPadLen * 2 - 1;
@@ -54,9 +54,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
size_t antidiagonal;
for (antidiagonal = 0; /* noop */; ++antidiagonal) {
- int numCells = seq1end - seq1beg;
int n = numCells - 1;
-
const uchar *s1 = &seq1queue.fromEnd(n + simdLen);
const const_int_ptr *s2 = &pssmQueue.fromEnd(1);
@@ -80,7 +78,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
const Score *z2 = &zScores[diagPos];
int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
- minScore, b, antidiagonal, seq1beg);
+ minScore, b, antidiagonal, seq1end - numCells);
}
if (globality && isDelimiter1) {
@@ -151,6 +149,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
thisPos += numCells;
if (x0[n] > -INF / 2) {
+ ++numCells;
++seq1end;
uchar x = *seq;
seq1queue.push(x, n + simdLen);
@@ -165,10 +164,10 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
pssm += seqIncrement;
isDelimiter2 = isDelimiter(0, y);
} else {
- ++seq1beg;
+ --numCells;
+ if (numCells == 0) break;
++diagPos;
++horiPos;
- if (seq1beg == seq1end) break;
}
}
=====================================
src/LastalArguments.cc
=====================================
@@ -382,7 +382,10 @@ LAST home page: http://last.cbrc.jp/\n\
if( isTranslated() && inputFormat == 5 )
ERR( "can't combine option -F with option -Q 5" );
- if( isFrameshift() && outputType > 3 )
+ if( frameshiftCosts.size() == 1 && frameshiftCosts[0] > 0 && outputType > 3 )
+ ERR( "can't combine option -F > 0 with option -j > 3" );
+
+ if( isFrameshift() && outputType > 4 && outputType < 7 )
ERR( "can't combine option -F > 0 with option -j > 3" );
if( isFrameshift() && globality == 1 )
=====================================
src/lastal.cc
=====================================
@@ -12,15 +12,12 @@
#include "GeneticCode.hh"
#include "SubsetMinimizerFinder.hh"
#include "SubsetSuffixArray.hh"
-#include "Centroid.hh"
#include "AlignmentPot.hh"
#include "Alignment.hh"
#include "SegmentPairPot.hh"
-#include "SegmentPair.hh"
#include "ScoreMatrix.hh"
#include "TantanMasker.hh"
#include "DiagonalTable.hh"
-#include "GreedyXdropAligner.hh"
#include "gaplessXdrop.hh"
#include "gaplessPssmXdrop.hh"
#include "gaplessTwoQualityXdrop.hh"
@@ -28,6 +25,9 @@
#include "zio.hh"
#include "stringify.hh"
#include "threadUtil.hh"
+
+#include <math.h>
+
#include <iomanip> // setw
#include <iostream>
#include <fstream>
@@ -45,21 +45,26 @@ static void warn( const char* programName, const char* s ){
using namespace cbrc;
struct LastAligner { // data that changes between queries
- Centroid centroid;
- GreedyXdropAligner greedyAligner;
+ Aligners engines;
std::vector<int> qualityPssm;
std::vector<AlignmentText> textAlns;
};
struct SubstitutionMatrices {
- int scores[scoreMatrixRowSize][scoreMatrixRowSize];
- int scoresMasked[scoreMatrixRowSize][scoreMatrixRowSize];
mcf::SubstitutionMatrixStats stats;
- OneQualityScoreMatrix oneQual;
- OneQualityScoreMatrix oneQualMasked;
OneQualityExpMatrix oneQualExp;
QualityPssmMaker maker;
+
+ int scores[scoreMatrixRowSize][scoreMatrixRowSize];
+ double ratiosMatrix[scoreMatrixRowSize][scoreMatrixRowSize];
+ double *ratios[scoreMatrixRowSize];
+ OneQualityScoreMatrix oneQual;
TwoQualityScoreMatrix twoQual;
+
+ int scoresMasked[scoreMatrixRowSize][scoreMatrixRowSize];
+ double ratiosMaskedMatrix[scoreMatrixRowSize][scoreMatrixRowSize];
+ double *ratiosMasked[scoreMatrixRowSize];
+ OneQualityScoreMatrix oneQualMasked;
TwoQualityScoreMatrix twoQualMasked;
};
@@ -118,7 +123,9 @@ calculateSubstitutionScoreMatrixStatistics(const std::string &matrixName) {
}
} else {
if (scoreMatrix.isCodonCols()) {
- LOG("can't calculate probabilities");
+ static const char msg[] = "can't calculate probabilities";
+ if (args.outputType > 3) ERR(msg); // xxx what if temperature is given?
+ LOG(msg);
return;
} else if (args.temperature < 0) {
const char *canonicalMatrixName = ScoreMatrix::canonicalName(matrixName);
@@ -175,9 +182,9 @@ void makeScoreMatrix( const std::string& matrixName,
}
scoreMatrix.init(alph.encode);
+ const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
if (args.outputType > 0) {
calculateSubstitutionScoreMatrixStatistics(matrixName);
- const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
if (!stats.isBad()) {
scoreMatrix.addAmbiguousScores(alph.letters == alph.dna,
args.ambiguousLetterOpt % 2,
@@ -208,6 +215,20 @@ void makeScoreMatrix( const std::string& matrixName,
complementMatrix(fwdMatrices.scores, revMatrices.scores);
complementMatrix(fwdMatrices.scoresMasked, revMatrices.scoresMasked);
}
+
+ double s = stats.lambda();
+ for (unsigned i = 0; i < scoreMatrixRowSize; ++i) {
+ fwdMatrices.ratios[i] = fwdMatrices.ratiosMatrix[i];
+ revMatrices.ratios[i] = revMatrices.ratiosMatrix[i];
+ fwdMatrices.ratiosMasked[i] = fwdMatrices.ratiosMaskedMatrix[i];
+ revMatrices.ratiosMasked[i] = revMatrices.ratiosMaskedMatrix[i];
+ for (unsigned j = 0; j < scoreMatrixRowSize; ++j) {
+ fwdMatrices.ratios[i][j] = exp(s * fwdMatrices.scores[i][j]);
+ revMatrices.ratios[i][j] = exp(s * revMatrices.scores[i][j]);
+ fwdMatrices.ratiosMasked[i][j] = exp(s * fwdMatrices.scoresMasked[i][j]);
+ revMatrices.ratiosMasked[i][j] = exp(s * revMatrices.scoresMasked[i][j]);
+ }
+ }
}
void makeQualityScorers(SubstitutionMatrices &m, bool isCheck) {
@@ -443,6 +464,7 @@ struct Dispatcher{
const uchar* j; // the query quality data
const ScoreMatrixRow* p; // the query PSSM
const ScoreMatrixRow* m; // the score matrix
+ const const_dbl_ptr* r; // the substitution probability ratios
const TwoQualityScoreMatrix& t;
int d; // the maximum score drop
int z;
@@ -455,6 +477,7 @@ struct Dispatcher{
j( getQueryQual(queryNum) ),
p( getQueryPssm(aligner, queryNum) ),
m( isMaskLowercase(e) ? matrices.scoresMasked : matrices.scores ),
+ r( isMaskLowercase(e) ? matrices.ratiosMasked : matrices.ratios ),
t( isMaskLowercase(e) ? matrices.twoQualMasked : matrices.twoQual ),
d( (e == Phase::gapless) ? args.maxDropGapless :
(e == Phase::gapped ) ? args.maxDropGapped : args.maxDropFinal ),
@@ -719,9 +742,10 @@ void alignGapped( LastAligner& aligner,
shrinkToLongestIdenticalRun( aln.seed, dis );
// do gapped extension from each end of the seed:
- aln.makeXdrop(aligner.centroid, aligner.greedyAligner, args.isGreedy,
+ aln.makeXdrop(aligner.engines, args.isGreedy,
dis.a, dis.b, args.globality, dis.m,
- scoreMatrix.maxScore, scoreMatrix.minScore, gapCosts,
+ scoreMatrix.maxScore, scoreMatrix.minScore,
+ dis.r, matrices.stats.lambda(), gapCosts,
dis.d, frameSize, dis.p, dis.t, dis.i, dis.j, alph, extras);
++gappedExtensionCount;
@@ -753,7 +777,7 @@ void alignGapped( LastAligner& aligner,
void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
size_t queryNum, const SubstitutionMatrices &matrices,
const uchar* querySeq ){
- Centroid& centroid = aligner.centroid;
+ Centroid& centroid = aligner.engines.centroid;
Dispatcher dis(Phase::final, aligner, queryNum, matrices, querySeq);
size_t queryLen = query.padLen(queryNum);
size_t frameSize = args.isFrameshift() ? (queryLen / 3) : 0;
@@ -786,9 +810,10 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
Alignment probAln;
AlignmentExtras extras;
probAln.seed = aln.seed;
- probAln.makeXdrop(centroid, aligner.greedyAligner, args.isGreedy,
+ probAln.makeXdrop(aligner.engines, args.isGreedy,
dis.a, dis.b, args.globality, dis.m,
- scoreMatrix.maxScore, scoreMatrix.minScore, gapCosts,
+ scoreMatrix.maxScore, scoreMatrix.minScore,
+ dis.r, matrices.stats.lambda(), gapCosts,
dis.d, frameSize, dis.p, dis.t, dis.i, dis.j, alph,
extras, args.gamma, args.outputType);
assert( aln.score != -INF );
@@ -1232,7 +1257,8 @@ void lastal( int argc, char** argv ){
makeScoreMatrix( matrixName, matrixFile );
gapCosts.assign(args.delOpenCosts, args.delGrowCosts,
args.insOpenCosts, args.insGrowCosts,
- args.frameshiftCosts, args.gapPairCost);
+ args.frameshiftCosts, args.gapPairCost,
+ fwdMatrices.stats.lambda());
if( args.isTranslated() ){
if( isDna ) // allow user-defined alphabet
=====================================
src/makefile
=====================================
@@ -33,11 +33,11 @@ ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o \
dna_words_finder.o fileMap.o tantan.o LastalArguments.o \
GappedXdropAligner.o GappedXdropAlignerDna.o GappedXdropAlignerPssm.o \
GappedXdropAligner2qual.o GappedXdropAligner3frame.o \
-GappedXdropAlignerFrame.o mcf_gap_costs.o GeneticCode.o \
-GreedyXdropAligner.o LastEvaluer.o OneQualityScoreMatrix.o \
-QualityPssmMaker.o TwoQualityScoreMatrix.o gaplessXdrop.o \
-gaplessPssmXdrop.o gaplessTwoQualityXdrop.o cbrc_linalg.o \
-mcf_substitution_matrix_stats.o $(alpObj)
+GappedXdropAlignerFrame.o mcf_frameshift_xdrop_aligner.o \
+mcf_gap_costs.o GeneticCode.o GreedyXdropAligner.o LastEvaluer.o \
+OneQualityScoreMatrix.o QualityPssmMaker.o TwoQualityScoreMatrix.o \
+gaplessXdrop.o gaplessPssmXdrop.o gaplessTwoQualityXdrop.o \
+cbrc_linalg.o mcf_substitution_matrix_stats.o $(alpObj)
alignObj4 = Alignment.o AlignmentPot.o AlignmentWrite.o Centroid.o \
DiagonalTable.o MultiSequence.o MultiSequenceQual.o SegmentPair.o \
@@ -144,19 +144,26 @@ depend:
$(CXX) -MM alp/*.cpp | sed 's|.*:|alp/&|' >> m
$(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' | $(fix8) >> m
mv m makefile
-Alignment.o Alignment.o8: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \
- mcf_gap_costs.hh Alphabet.hh Centroid.hh GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
- OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh GeneticCode.hh \
- GreedyXdropAligner.hh TwoQualityScoreMatrix.hh
-AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh \
- ScoreMatrixRow.hh SegmentPair.hh mcf_gap_costs.hh
-AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \
- SegmentPair.hh mcf_gap_costs.hh GeneticCode.hh LastEvaluer.hh \
- alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
- alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
- MultiSequence.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
- Alphabet.hh
+Alignment.o Alignment.o8: Alignment.cc Alignment.hh Centroid.hh GappedXdropAligner.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
+ mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh OneQualityScoreMatrix.hh \
+ mcf_substitution_matrix_stats.hh GreedyXdropAligner.hh \
+ mcf_frameshift_xdrop_aligner.hh Alphabet.hh GeneticCode.hh \
+ TwoQualityScoreMatrix.hh
+AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh Centroid.hh \
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_gap_costs.hh mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
+ GreedyXdropAligner.hh mcf_frameshift_xdrop_aligner.hh
+AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh Centroid.hh \
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_gap_costs.hh mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
+ GreedyXdropAligner.hh mcf_frameshift_xdrop_aligner.hh GeneticCode.hh \
+ LastEvaluer.hh alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp \
+ alp/sls_basic.hpp alp/sls_falp_alignment_evaluer.hpp \
+ alp/sls_fsa1_pvalues.hpp MultiSequence.hh VectorOrMmap.hh Mmap.hh \
+ fileMap.hh stringify.hh Alphabet.hh
Alphabet.o Alphabet.o8: Alphabet.cc Alphabet.hh
Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
@@ -251,11 +258,12 @@ lastal.o lastal.o8: lastal.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
mcf_substitution_matrix_stats.hh TwoQualityScoreMatrix.hh LastEvaluer.hh \
alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
- GeneticCode.hh SubsetMinimizerFinder.hh SubsetSuffixArray.hh Centroid.hh \
- GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
- mcf_gap_costs.hh mcf_simd.hh SegmentPair.hh AlignmentPot.hh Alignment.hh \
- SegmentPairPot.hh ScoreMatrix.hh TantanMasker.hh tantan.hh \
- DiagonalTable.hh GreedyXdropAligner.hh gaplessXdrop.hh \
+ GeneticCode.hh SubsetMinimizerFinder.hh SubsetSuffixArray.hh \
+ AlignmentPot.hh Alignment.hh Centroid.hh GappedXdropAligner.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
+ mcf_simd.hh SegmentPair.hh GreedyXdropAligner.hh \
+ mcf_frameshift_xdrop_aligner.hh SegmentPairPot.hh ScoreMatrix.hh \
+ TantanMasker.hh tantan.hh DiagonalTable.hh gaplessXdrop.hh \
gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh zio.hh mcf_zstream.hh \
threadUtil.hh version.hh
lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
@@ -263,6 +271,8 @@ lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
stringify.hh SequenceFormat.hh dna_words_finder.hh qualityScoreUtil.hh \
LastdbArguments.hh SubsetSuffixArray.hh TantanMasker.hh tantan.hh zio.hh \
mcf_zstream.hh threadUtil.hh version.hh
+mcf_frameshift_xdrop_aligner.o mcf_frameshift_xdrop_aligner.o8: mcf_frameshift_xdrop_aligner.cc \
+ mcf_frameshift_xdrop_aligner.hh mcf_gap_costs.hh
mcf_gap_costs.o mcf_gap_costs.o8: mcf_gap_costs.cc mcf_gap_costs.hh
mcf_substitution_matrix_stats.o mcf_substitution_matrix_stats.o8: mcf_substitution_matrix_stats.cc \
mcf_substitution_matrix_stats.hh LambdaCalculator.hh cbrc_linalg.hh
=====================================
src/mcf_frameshift_xdrop_aligner.cc
=====================================
@@ -0,0 +1,379 @@
+// Author: Martin C. Frith 2020
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "mcf_frameshift_xdrop_aligner.hh"
+
+#include <assert.h>
+#include <math.h>
+
+//#include <iostream> // for debugging
+
+namespace mcf {
+
+double FrameshiftXdropAligner::forward(const uchar *protein,
+ const uchar *frame0,
+ const uchar *frame1,
+ const uchar *frame2,
+ bool isRightwardExtension,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts,
+ double probDropLimit) {
+ if (isRightwardExtension) {
+ --protein; --frame0; --frame1; --frame2;
+ }
+ proteinPtr = protein;
+ frames[0] = frame0;
+ frames[1] = frame1;
+ frames[2] = frame2;
+ const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+ const double delOpenProb = gapCosts.delProbPieces[0].openProb;
+ const double insOpenProb = gapCosts.insProbPieces[0].openProb;
+ const double delProb1 = gapCosts.delProb1;
+ const double delProb2 = gapCosts.delProb2;
+ const double delProb3 = gapCosts.delProb3;
+ const double insProb1 = gapCosts.insProb1;
+ const double insProb2 = gapCosts.insProb2;
+ const double insProb3 = gapCosts.insProb3;
+
+ int runOfDrops = 2;
+ int runOfEdges = 0;
+ int numCells = 1;
+ size_t proteinEnd = 1;
+ size_t diagPos6 = padSize - 1; // xxx or 0 ?
+ size_t horiPos5 = diagPos6 + padSize;
+ size_t horiPos4 = horiPos5 + padSize;
+ size_t horiPos3 = horiPos4 + padSize;
+ size_t vertPos2 = horiPos3 + padSize + 1;
+ size_t vertPos1 = vertPos2 + padSize;
+ size_t thisPos = vertPos1; // xxx or + padSize ?
+ size_t antidiagonal = 0;
+
+ double sumOfProbRatios = 0;
+ double logSumOfProbRatios = 0;
+
+ if (xdropShape.empty()) xdropShape.assign(1, thisPos);
+
+ resizeFwdProbsIfSmaller(thisPos);
+ double behindProb = substitutionProbs[*protein][*frame0];
+ assert(behindProb > 0); // XXX might fail?
+ xFwdProbs[padSize - 1] = 1 / behindProb; // xxx or padSize ?
+
+ while (1) {
+ const uchar *s1 = proteinPtr;
+ frames[antidiagonal % 3] += seqIncrement;
+ const uchar *s2 = frames[antidiagonal % 3];
+
+ resizeFwdProbsIfSmaller(thisPos + padSize + numCells);
+ for (int i = 0; i < padSize; ++i) {
+ xFwdProbs[thisPos + i] = 0;
+ yFwdProbs[thisPos + i] = 0;
+ zFwdProbs[thisPos + i] = 0;
+ }
+ thisPos += padSize;
+ Prob *X0 = &xFwdProbs[thisPos];
+ Prob *Y0 = &yFwdProbs[thisPos];
+ Prob *Z0 = &zFwdProbs[thisPos];
+ const Prob *X1 = &xFwdProbs[vertPos1];
+ const Prob *X2 = &xFwdProbs[vertPos2];
+ const Prob *Y3 = &yFwdProbs[horiPos3];
+ const Prob *Z3 = &zFwdProbs[horiPos3 + 1];
+ const Prob *X4 = &xFwdProbs[horiPos4];
+ const Prob *X5 = &xFwdProbs[horiPos5];
+ const Prob *X6 = &xFwdProbs[diagPos6];
+
+ double minProb = sumOfProbRatios * probDropLimit;
+
+ bool isDelimiter2 = (substitutionProbs[0][*s2] <= 0);
+
+ for (int i = 0; i < numCells; ++i) {
+ s2 -= seqIncrement;
+ double x = X6[i] * substitutionProbs[*s1][*s2];
+ double y = X5[i] * delProb1 + X4[i] * delProb2 + Y3[i] * delProb3;
+ double z = X1[i] * insProb1 + X2[i] * insProb2 + Z3[i] * insProb3;
+ double b = x + y * delOpenProb + z * insOpenProb;
+ X0[i] = b;
+ Y0[i] = b + y;
+ Z0[i] = b + z;
+ sumOfProbRatios += b;
+ s1 += seqIncrement;
+ }
+
+ bool isDelimiter1 = (substitutionProbs[*s1][0] <= 0);
+
+ diagPos6 = horiPos5;
+ horiPos5 = horiPos4;
+ horiPos4 = horiPos3;
+ horiPos3 = vertPos2 - 1;
+ vertPos2 = vertPos1;
+ vertPos1 = thisPos;
+ thisPos += numCells;
+
+ ++antidiagonal;
+
+ size_t a2 = antidiagonal * 2;
+ if (xdropShape.size() < a2 + 1) xdropShape.resize(a2 + 1);
+ xdropShape[a2 - 1] = proteinEnd;
+ xdropShape[a2] = thisPos;
+
+ int n = numCells - 1;
+ bool isOkBeg = (X0[0] >= minProb && !isDelimiter2);
+ bool isOkEnd = (X0[n] >= minProb && !isDelimiter1);
+
+ if (isOkEnd || runOfEdges) {
+ ++runOfEdges;
+ if (runOfEdges == 3) {
+ ++numCells;
+ ++proteinEnd;
+ runOfEdges = 0;
+ }
+ }
+
+ if (isOkBeg) {
+ runOfDrops = 0;
+ } else {
+ ++runOfDrops;
+ if (runOfDrops == 3) {
+ --numCells;
+ if (numCells == 0) break;
+ proteinPtr += seqIncrement;
+ frames[0] -= seqIncrement;
+ frames[1] -= seqIncrement;
+ frames[2] -= seqIncrement;
+ ++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+ runOfDrops = 0;
+ }
+ }
+
+ if (antidiagonal % rescaleStep == 0) {
+ assert(sumOfProbRatios > 0);
+ double scale = 1 / sumOfProbRatios;
+ size_t numOfRescales = antidiagonal / rescaleStep;
+ if (rescales.size() < numOfRescales) rescales.resize(numOfRescales);
+ rescales[numOfRescales - 1] = scale;
+ rescaleFwdProbs(begInProbs(antidiagonal - 6), thisPos, scale);
+ logSumOfProbRatios += log(sumOfProbRatios);
+ sumOfProbRatios = 1;
+ }
+ }
+
+ numOfAntidiagonals = antidiagonal;
+ rescaledSumOfProbRatios = sumOfProbRatios;
+ return logSumOfProbRatios + log(sumOfProbRatios);
+}
+
+void FrameshiftXdropAligner::backward(bool isRightwardExtension,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts) {
+ const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+ const double delOpenProb = gapCosts.delProbPieces[0].openProb;
+ const double insOpenProb = gapCosts.insProbPieces[0].openProb;
+ const double delProb1 = gapCosts.delProb1;
+ const double delProb2 = gapCosts.delProb2;
+ const double delProb3 = gapCosts.delProb3;
+ const double insProb1 = gapCosts.insProb1;
+ const double insProb2 = gapCosts.insProb2;
+ const double insProb3 = gapCosts.insProb3;
+
+ size_t diagPos6 = padSize - 1;
+ size_t horiPos5 = diagPos6 + padSize;
+ size_t horiPos4 = horiPos5 + padSize;
+ size_t horiPos3 = horiPos4 + padSize;
+ size_t vertPos2 = horiPos3 + padSize + 1;
+ size_t vertPos1 = vertPos2 + padSize;
+ size_t thisPos = vertPos1;
+ size_t antidiagonal = numOfAntidiagonals;
+
+ double scaledUnit = 1 / rescaledSumOfProbRatios;
+
+ resizeBckProbsIfSmaller(begInProbs(numOfAntidiagonals));
+
+ while (1) {
+ --antidiagonal;
+ const uchar *s1 = proteinPtr;
+ frames[antidiagonal % 3] -= seqIncrement;
+ const uchar *s2 = frames[antidiagonal % 3];
+
+ int numCells = numOfCellsInAntidiagonal(antidiagonal);
+ for (int i = 0; i < padSize; ++i) {
+ xBckProbs[thisPos + i] = 0;
+ yBckProbs[thisPos + i] = 0;
+ zBckProbs[thisPos + i] = 0;
+ }
+ thisPos += padSize;
+ Prob *X0 = &xBckProbs[thisPos];
+ Prob *Y0 = &yBckProbs[thisPos];
+ Prob *Z0 = &zBckProbs[thisPos];
+ const Prob *Z1 = &zBckProbs[vertPos1];
+ const Prob *Z2 = &zBckProbs[vertPos2];
+ const Prob *Y3 = &yBckProbs[horiPos3];
+ const Prob *Z3 = &zBckProbs[horiPos3 + 1];
+ const Prob *Y4 = &yBckProbs[horiPos4];
+ const Prob *Y5 = &yBckProbs[horiPos5];
+ const Prob *X6 = &xBckProbs[diagPos6];
+
+ for (int i = 0; i < numCells; ++i) {
+ double x = X6[i];
+ double y1 = Y5[i] * delProb1;
+ double y2 = Y4[i] * delProb2;
+ double y3 = Y3[i] * delProb3;
+ double z1 = Z1[i] * insProb1;
+ double z2 = Z2[i] * insProb2;
+ double z3 = Z3[i] * insProb3;
+ double b = x + y1 + y2 + y3 + z1 + z2 + z3 + scaledUnit;
+ X0[i] = b * substitutionProbs[*s1][*s2];
+ Y0[i] = b * delOpenProb + y3;
+ Z0[i] = b * insOpenProb + z3;
+ s1 -= seqIncrement;
+ s2 += seqIncrement;
+ }
+
+ diagPos6 = horiPos5;
+ horiPos5 = horiPos4;
+ horiPos4 = horiPos3;
+ horiPos3 = vertPos2 - 1;
+ vertPos2 = vertPos1;
+ vertPos1 = thisPos;
+ thisPos += numCells;
+
+ if (antidiagonal == 0) break;
+
+ if (endInProtein(antidiagonal - 1) < endInProtein(antidiagonal)) {
+ proteinPtr -= seqIncrement;
+ frames[0] += seqIncrement;
+ frames[1] += seqIncrement;
+ frames[2] += seqIncrement;
+ ++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+ }
+
+ size_t offset = 6;
+ if ((antidiagonal + offset) % rescaleStep == 0 &&
+ antidiagonal + offset < numOfAntidiagonals) {
+ double scale = rescales[antidiagonal / rescaleStep];
+ size_t length = begInProbs(antidiagonal + 6) - begInProbs(antidiagonal);
+ rescaleBckProbs(thisPos - length, thisPos, scale);
+ scaledUnit *= scale;
+ }
+ }
+
+ assert(thisPos == begInProbs(numOfAntidiagonals));
+}
+
+void FrameshiftXdropAligner::count(bool isRightwardExtension,
+ const GapCosts &gapCosts,
+ const dbl_ptr *substitutionCounts,
+ double *transitionCounts) {
+ const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+ size_t diagPos6 = padSize - 1;
+ size_t horiPos5 = diagPos6 + padSize;
+ size_t horiPos4 = horiPos5 + padSize;
+ size_t horiPos3 = horiPos4 + padSize;
+ size_t vertPos2 = horiPos3 + padSize + 1;
+ size_t vertPos1 = vertPos2 + padSize;
+ size_t thisPos = vertPos1;
+ size_t antidiagonal = numOfAntidiagonals;
+ size_t totalNumOfCells = begInProbs(numOfAntidiagonals);
+
+ size_t proteinEnd = endInProtein(antidiagonal - 1);
+ if (isRightwardExtension) {
+ proteinPtr += proteinEnd;
+ frames[0] += (antidiagonal + 2) / 3 - (proteinEnd - 1);
+ frames[1] += (antidiagonal + 1) / 3 - (proteinEnd - 1);
+ frames[2] += (antidiagonal + 0) / 3 - (proteinEnd - 1);
+ } else {
+ proteinPtr -= proteinEnd;
+ frames[0] -= (antidiagonal + 2) / 3 - (proteinEnd - 1);
+ frames[1] -= (antidiagonal + 1) / 3 - (proteinEnd - 1);
+ frames[2] -= (antidiagonal + 0) / 3 - (proteinEnd - 1);
+ }
+
+ double del1 = 0; double del2 = 0; double del3 = 0; double delNext = 0;
+ double ins1 = 0; double ins2 = 0; double ins3 = 0; double insNext = 0;
+
+ while (1) {
+ --antidiagonal;
+ const uchar *s1 = proteinPtr;
+ const uchar *s2 = frames[antidiagonal % 3];
+ frames[antidiagonal % 3] -= seqIncrement;
+
+ int numCells = numOfCellsInAntidiagonal(antidiagonal);
+ thisPos += padSize;
+ const Prob *Z1 = &zBckProbs[vertPos1];
+ const Prob *Z2 = &zBckProbs[vertPos2];
+ const Prob *Y3 = &yBckProbs[horiPos3];
+ const Prob *Z3 = &zBckProbs[horiPos3 + 1];
+ const Prob *Y4 = &yBckProbs[horiPos4];
+ const Prob *Y5 = &yBckProbs[horiPos5];
+ const Prob *X6 = &xBckProbs[diagPos6];
+
+ const Prob *fX0 = &xFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+ const Prob *fY0 = &yFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+ const Prob *fZ0 = &zFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+
+ double d1 = 0; double d2 = 0; double d3 = 0; double dNext = 0;
+ double i1 = 0; double i2 = 0; double i3 = 0; double iNext = 0;
+
+ for (int i = 0; i < numCells; ++i) {
+ double matchProb = X6[i] * fX0[-i];
+ substitutionCounts[*s1][*s2] += matchProb;
+ transitionCounts[0] += matchProb;
+ d1 += Y5[i] * fX0[-i];
+ d2 += Y4[i] * fX0[-i];
+ d3 += Y3[i] * fX0[-i];
+ dNext += Y3[i] * fY0[-i];
+ i1 += Z1[i] * fX0[-i];
+ i2 += Z2[i] * fX0[-i];
+ i3 += Z3[i] * fX0[-i];
+ iNext += Z3[i] * fZ0[-i];
+ s1 -= seqIncrement;
+ s2 += seqIncrement;
+ }
+
+ size_t mod = antidiagonal % rescaleStep;
+ if (mod >= rescaleStep - 6 &&
+ antidiagonal - mod + rescaleStep < numOfAntidiagonals) {
+ double mul = 1 / rescales[antidiagonal / rescaleStep];
+ if (mod < rescaleStep - 1) i1 *= mul;
+ if (mod < rescaleStep - 2) i2 *= mul;
+ if (mod < rescaleStep - 3) {
+ i3 *= mul; iNext *= mul; d3 *= mul; dNext *= mul;
+ }
+ if (mod < rescaleStep - 4) d2 *= mul;
+ if (mod < rescaleStep - 5) d1 *= mul;
+ }
+
+ ins1 += i1; ins2 += i2; ins3 += i3; insNext += iNext;
+ del1 += d1; del2 += d2; del3 += d3; delNext += dNext;
+
+ diagPos6 = horiPos5;
+ horiPos5 = horiPos4;
+ horiPos4 = horiPos3;
+ horiPos3 = vertPos2 - 1;
+ vertPos2 = vertPos1;
+ vertPos1 = thisPos;
+ thisPos += numCells;
+
+ if (antidiagonal == 0) break;
+
+ if (endInProtein(antidiagonal - 1) < endInProtein(antidiagonal)) {
+ proteinPtr -= seqIncrement;
+ frames[0] += seqIncrement;
+ frames[1] += seqIncrement;
+ frames[2] += seqIncrement;
+ ++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+ }
+ }
+
+ transitionCounts[1] += gapCosts.insProb1 * ins1;
+ transitionCounts[2] += gapCosts.insProb2 * ins2;
+ transitionCounts[3] += gapCosts.insProb3 * ins3;
+ transitionCounts[4] += gapCosts.insProb3 * (insNext - ins3);
+ transitionCounts[5] += gapCosts.delProb1 * del1;
+ transitionCounts[6] += gapCosts.delProb2 * del2;
+ transitionCounts[7] += gapCosts.delProb3 * del3;
+ transitionCounts[8] += gapCosts.delProb3 * (delNext - del3);
+}
+
+}
=====================================
src/mcf_frameshift_xdrop_aligner.hh
=====================================
@@ -0,0 +1,121 @@
+// Author: Martin C. Frith 2020
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef FRAMESHIFT_XDROP_ALIGNER_HH
+#define FRAMESHIFT_XDROP_ALIGNER_HH
+
+#include "mcf_gap_costs.hh"
+
+#include <stddef.h>
+
+#include <vector>
+
+namespace mcf {
+
+typedef const double *const_dbl_ptr;
+typedef double *dbl_ptr;
+
+class FrameshiftXdropAligner {
+ typedef unsigned char uchar;
+ typedef const uchar *const_uchar_ptr;
+ typedef double Prob;
+
+ enum { padSize = 1 };
+ enum { rescaleStep = 32 };
+
+public:
+ double forward(const uchar *protein,
+ const uchar *frame0, // translated DNA, 0 frame
+ const uchar *frame1, // translated DNA, +1 frame
+ const uchar *frame2, // translated DNA, +2 frame
+ bool isRightwardExtension,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts,
+ double probDropLimit);
+
+ void backward(bool isRightwardExtension,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts);
+
+ void count(bool isRightwardExtension,
+ const GapCosts &gapCosts,
+ const dbl_ptr *substitutionCounts,
+ double *transitionCounts);
+
+ double matchProb(size_t proteinCoordinate, size_t dnaCoordinate) const {
+ size_t antidiagonal = proteinCoordinate * 3 + dnaCoordinate;
+ if (antidiagonal + 6 >= numOfAntidiagonals) return 0;
+ size_t a = antidiagonal * 2;
+ size_t i = xdropShape[a + 2] - xdropShape[a + 1] + proteinCoordinate;
+ if (i < xdropShape[a] + padSize || i >= xdropShape[a + 2]) return 0;
+ size_t b = (antidiagonal + 6) * 2;
+ size_t j = xdropShape[b + 2] - xdropShape[b + 1] + proteinCoordinate + 1;
+ size_t totalNumOfCells = xdropShape[numOfAntidiagonals * 2];
+ return xFwdProbs[i] * xBckProbs[totalNumOfCells - j + padSize*7 - 1];
+ }
+
+private:
+ std::vector<Prob> xFwdProbs;
+ std::vector<Prob> yFwdProbs;
+ std::vector<Prob> zFwdProbs;
+
+ std::vector<Prob> xBckProbs;
+ std::vector<Prob> yBckProbs;
+ std::vector<Prob> zBckProbs;
+
+ std::vector<size_t> xdropShape;
+
+ std::vector<double> rescales;
+
+ double rescaledSumOfProbRatios;
+
+ size_t numOfAntidiagonals;
+
+ const uchar *proteinPtr;
+ const_uchar_ptr frames[3];
+
+ size_t begInProbs(size_t antidiagonal) const
+ { return xdropShape[antidiagonal * 2]; }
+
+ size_t endInProtein(size_t antidiagonal) const
+ { return xdropShape[antidiagonal * 2 + 1]; }
+
+ int numOfCellsInAntidiagonal(size_t antidiagonal) const
+ { return begInProbs(antidiagonal + 1) - begInProbs(antidiagonal) - padSize; }
+
+ void resizeFwdProbsIfSmaller(size_t size) {
+ if (xFwdProbs.size() < size) {
+ xFwdProbs.resize(size);
+ yFwdProbs.resize(size);
+ zFwdProbs.resize(size);
+ }
+ }
+
+ void resizeBckProbsIfSmaller(size_t size) {
+ if (xBckProbs.size() < size) {
+ xBckProbs.resize(size);
+ yBckProbs.resize(size);
+ zBckProbs.resize(size);
+ }
+ }
+
+ void rescaleFwdProbs(size_t beg, size_t end, double scale) {
+ for (size_t i = beg; i < end; ++i) {
+ xFwdProbs[i] *= scale;
+ yFwdProbs[i] *= scale;
+ zFwdProbs[i] *= scale;
+ }
+ }
+
+ void rescaleBckProbs(size_t beg, size_t end, double scale) {
+ for (size_t i = beg; i < end; ++i) {
+ xBckProbs[i] *= scale;
+ yBckProbs[i] *= scale;
+ zBckProbs[i] *= scale;
+ }
+ }
+};
+
+}
+
+#endif
=====================================
src/mcf_gap_costs.cc
=====================================
@@ -5,6 +5,7 @@
#include <stdexcept>
#include <assert.h>
#include <limits.h>
+#include <math.h>
#include <stddef.h> // size_t
static void err(const char *s) {
@@ -26,12 +27,23 @@ static void assignGapCostPieces(const std::vector<int> &openCosts,
assert(!pieces.empty());
}
+static void assignProbs(std::vector<GapCosts::ProbPiece> &probPieces,
+ const std::vector<GapCosts::Piece> &costPieces,
+ double scale) {
+ probPieces.resize(costPieces.size());
+ for (size_t i = 0; i < costPieces.size(); ++i) {
+ probPieces[i].openProb = exp(scale * -costPieces[i].openCost);
+ probPieces[i].growProb = exp(scale * -costPieces[i].growCost);
+ }
+}
+
void GapCosts::assign(const std::vector<int> &delOpenCosts,
const std::vector<int> &delGrowCosts,
const std::vector<int> &insOpenCosts,
const std::vector<int> &insGrowCosts,
const std::vector<int> &frameshiftCosts,
- int unalignedPairCost) {
+ int unalignedPairCost,
+ double scale) {
assignGapCostPieces(delOpenCosts, delGrowCosts, delPieces);
assignGapCostPieces(insOpenCosts, insGrowCosts, insPieces);
if (unalignedPairCost > 0) {
@@ -44,6 +56,9 @@ void GapCosts::assign(const std::vector<int> &delOpenCosts,
pairCost >= delPieces[0].growCost + insPieces[0].growCost +
std::max(delPieces[0].openCost, insPieces[0].openCost));
+ assignProbs(delProbPieces, delPieces, scale);
+ assignProbs(insProbPieces, insPieces, scale);
+
if (frameshiftCosts.empty()) {
frameshiftCost = -1;
} else if (frameshiftCosts.size() == 1) {
@@ -57,6 +72,13 @@ void GapCosts::assign(const std::vector<int> &delOpenCosts,
insScore2 = -(2 * insPieces[0].growCost + frameshiftCosts[3]);
insScore3 = -(3 * insPieces[0].growCost);
frameshiftCost = -2;
+
+ delProb1 = exp(scale * delScore1);
+ delProb2 = exp(scale * delScore2);
+ delProb3 = exp(scale * delScore3);
+ insProb1 = exp(scale * insScore1);
+ insProb2 = exp(scale * insScore2);
+ insProb3 = exp(scale * insScore3);
}
}
=====================================
src/mcf_gap_costs.hh
=====================================
@@ -27,6 +27,11 @@ struct GapCosts {
int growCost;
};
+ struct ProbPiece {
+ double openProb;
+ double growProb;
+ };
+
std::vector<Piece> delPieces;
std::vector<Piece> insPieces;
int pairCost;
@@ -43,6 +48,16 @@ struct GapCosts {
bool isAffine;
+ // these are for calculating alignment probabilities:
+ std::vector<ProbPiece> delProbPieces;
+ std::vector<ProbPiece> insProbPieces;
+ double delProb1;
+ double delProb2;
+ double delProb3;
+ double insProb1;
+ double insProb2;
+ double insProb3;
+
// Assign piecewise linear open and grow costs, and one pairCost.
// If unalignedPairCost <= 0, assign non-generalized costs.
// Throw a runtime_error if any growCost is <= 0.
@@ -56,7 +71,8 @@ struct GapCosts {
const std::vector<int> &insOpenCosts,
const std::vector<int> &insGrowCosts,
const std::vector<int> &frameshiftCosts,
- int unalignedPairCost);
+ int unalignedPairCost,
+ double scale); // probability ratio = exp(scale * score)
// The cost of a "gap" consisting of unaligned letters in the query
// and/or reference sequence
=====================================
src/version.hh
=====================================
@@ -1 +1 @@
-"1133"
+"1145"
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/649febce00d36a9bea89dda4805e112d5f9d052e
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/649febce00d36a9bea89dda4805e112d5f9d052e
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201110/10d7ab61/attachment-0001.html>
More information about the debian-med-commit
mailing list